diff --git a/share/man/man9/VOP_ATTRIB.9 b/share/man/man9/VOP_ATTRIB.9
index 45f1e2f1652a..ddfee1f5d1f9 100644
--- a/share/man/man9/VOP_ATTRIB.9
+++ b/share/man/man9/VOP_ATTRIB.9
@@ -1,139 +1,137 @@
 .\" -*- nroff -*-
 .\"
 .\" Copyright (c) 1996 Doug Rabson
 .\"
 .\" All rights reserved.
 .\"
 .\" This program is free software.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY EXPRESS OR
 .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 .\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 .\" IN NO EVENT SHALL THE DEVELOPERS BE LIABLE FOR ANY DIRECT, INDIRECT,
 .\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 .\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 .\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 .\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd August 8, 2020
+.Dd October 2, 2021
 .Dt VOP_ATTRIB 9
 .Os
 .Sh NAME
 .Nm VOP_GETATTR ,
 .Nm VOP_SETATTR
 .Nd get and set attributes on a file or directory
 .Sh SYNOPSIS
 .In sys/param.h
 .In sys/vnode.h
 .Ft int
 .Fn VOP_GETATTR "struct vnode *vp" "struct vattr *vap" "struct ucred *cred"
 .Ft int
 .Fn VOP_SETATTR "struct vnode *vp" "struct vattr *vap" "struct ucred *cred"
 .Ft int
 .Fn VOP_STAT "struct vnode *vp" "struct stat *sb" "struct ucred *active_cred" \
-"struct ucred *file_cred" "struct thread *td"
+"struct ucred *file_cred"
 .Sh DESCRIPTION
 These entry points manipulate various attributes of a file or directory,
 including file permissions, owner, group, size,
 access time and modification time.
 .Pp
 .Fn VOP_STAT
 returns data in a format suitable for the
 .Xr stat 2
 system call and by default is implemented as a wrapper around
 .Fn VOP_GETATTR .
 Filesystems may want to implement their own variant for performance reasons.
 .Pp
 For
 .Fn VOP_GETATTR
 and
 .Fn VOP_SETATTR
 the arguments are:
 .Bl -tag -width cred
 .It Fa vp
 The vnode of the file.
 .It Fa vap
 The attributes of the file.
 .It Fa cred
 The user credentials of the calling thread.
 .El
 .Pp
 For
 .Fn VOP_STAT
 the arguments are:
 .Bl -tag -width active_cred
 .It Fa vp
 The vnode of the file.
 .It Fa sb
 The attributes of the file.
 .It Fa active_cred
 The user credentials of the calling thread.
 .It Fa file_cred
 The credentials installed on the file description pointing to the vnode or NOCRED.
-.It Fa td
-The calling thread.
 .El
 .Pp
 Attributes which are not being modified by
 .Fn VOP_SETATTR
 should be set to the value
 .Dv VNOVAL ;
 .Fn VATTR_NULL
 may be used to clear all the values, and should generally be used to reset
 the contents of
 .Fa *vap
 prior to setting specific values.
 .Sh LOCKS
 Both
 .Fn VOP_GETATTR
 and
 .Fn VOP_STAT
 expect the vnode to be locked on entry and will leave the vnode locked on
 return.
 The lock type can be either shared or exclusive.
 .Pp
 .Fn VOP_SETATTR
 expects the vnode to be locked on entry and will leave the vnode locked on
 return.
 The lock type must be exclusive.
 .Sh RETURN VALUES
 .Fn VOP_GETATTR
 returns 0 if it was able to retrieve the attribute data via
 .Fa *vap ,
 otherwise an appropriate error is returned.
 .Fn VOP_SETATTR
 returns zero if the attributes were changed successfully, otherwise an
 appropriate error is returned.
 .Fn VOP_STAT
 returns 0 if it was able to retrieve the attribute data
 .Fa *sb ,
 otherwise an appropriate error is returned.
 .Sh ERRORS
 .Bl -tag -width Er
 .It Bq Er EPERM
 The file is immutable.
 .It Bq Er EACCES
 The caller does not have permission to modify the file or directory attributes.
 .It Bq Er EROFS
 The file system is read-only.
 .El
 .Sh SEE ALSO
 .Xr VFS 9 ,
 .Xr vnode 9 ,
 .Xr VOP_ACCESS 9
 .Sh AUTHORS
 This manual page was written by
 .An Doug Rabson .
diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
index 331732eb4234..2539b406618f 100644
--- a/sys/compat/linux/linux_event.c
+++ b/sys/compat/linux/linux_event.c
@@ -1,980 +1,979 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2007 Roman Divacky
  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/callout.h>
 #include <sys/capsicum.h>
 #include <sys/types.h>
 #include <sys/user.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/errno.h>
 #include <sys/event.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/selinfo.h>
 #include <sys/specialfd.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/timespec.h>
 #include <sys/eventfd.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_event.h>
 #include <compat/linux/linux_file.h>
 #include <compat/linux/linux_timer.h>
 #include <compat/linux/linux_util.h>
 
 typedef uint64_t	epoll_udata_t;
 
 struct epoll_event {
 	uint32_t	events;
 	epoll_udata_t	data;
 }
 #if defined(__amd64__)
 __attribute__((packed))
 #endif
 ;
 
 #define	LINUX_MAX_EVENTS	(INT_MAX / sizeof(struct epoll_event))
 
 static int	epoll_to_kevent(struct thread *td, int fd,
 		    struct epoll_event *l_event, struct kevent *kevent,
 		    int *nkevents);
 static void	kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event);
 static int	epoll_kev_copyout(void *arg, struct kevent *kevp, int count);
 static int	epoll_kev_copyin(void *arg, struct kevent *kevp, int count);
 static int	epoll_register_kevent(struct thread *td, struct file *epfp,
 		    int fd, int filter, unsigned int flags);
 static int	epoll_fd_registered(struct thread *td, struct file *epfp,
 		    int fd);
 static int	epoll_delete_all_events(struct thread *td, struct file *epfp,
 		    int fd);
 
 struct epoll_copyin_args {
 	struct kevent	*changelist;
 };
 
 struct epoll_copyout_args {
 	struct epoll_event	*leventlist;
 	struct proc		*p;
 	uint32_t		count;
 	int			error;
 };
 
 /* timerfd */
 typedef uint64_t	timerfd_t;
 
 static fo_rdwr_t	timerfd_read;
 static fo_ioctl_t	timerfd_ioctl;
 static fo_poll_t	timerfd_poll;
 static fo_kqfilter_t	timerfd_kqfilter;
 static fo_stat_t	timerfd_stat;
 static fo_close_t	timerfd_close;
 static fo_fill_kinfo_t	timerfd_fill_kinfo;
 
 static struct fileops timerfdops = {
 	.fo_read = timerfd_read,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = timerfd_ioctl,
 	.fo_poll = timerfd_poll,
 	.fo_kqfilter = timerfd_kqfilter,
 	.fo_stat = timerfd_stat,
 	.fo_close = timerfd_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = timerfd_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static void	filt_timerfddetach(struct knote *kn);
 static int	filt_timerfdread(struct knote *kn, long hint);
 
 static struct filterops timerfd_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_timerfddetach,
 	.f_event = filt_timerfdread
 };
 
 struct timerfd {
 	clockid_t	tfd_clockid;
 	struct itimerspec tfd_time;
 	struct callout	tfd_callout;
 	timerfd_t	tfd_count;
 	bool		tfd_canceled;
 	struct selinfo	tfd_sel;
 	struct mtx	tfd_lock;
 };
 
 static void	linux_timerfd_expire(void *);
 static void	linux_timerfd_curval(struct timerfd *, struct itimerspec *);
 
 static int
 epoll_create_common(struct thread *td, int flags)
 {
 
 	return (kern_kqueue(td, flags, NULL));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
 {
 
 	/*
 	 * args->size is unused. Linux just tests it
 	 * and then forgets it as well.
 	 */
 	if (args->size <= 0)
 		return (EINVAL);
 
 	return (epoll_create_common(td, 0));
 }
 #endif
 
 int
 linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
 {
 	int flags;
 
 	if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	flags = 0;
 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
 		flags |= O_CLOEXEC;
 
 	return (epoll_create_common(td, flags));
 }
 
 /* Structure converting function from epoll to kevent. */
 static int
 epoll_to_kevent(struct thread *td, int fd, struct epoll_event *l_event,
     struct kevent *kevent, int *nkevents)
 {
 	uint32_t levents = l_event->events;
 	struct linux_pemuldata *pem;
 	struct proc *p;
 	unsigned short kev_flags = EV_ADD | EV_ENABLE;
 
 	/* flags related to how event is registered */
 	if ((levents & LINUX_EPOLLONESHOT) != 0)
 		kev_flags |= EV_DISPATCH;
 	if ((levents & LINUX_EPOLLET) != 0)
 		kev_flags |= EV_CLEAR;
 	if ((levents & LINUX_EPOLLERR) != 0)
 		kev_flags |= EV_ERROR;
 	if ((levents & LINUX_EPOLLRDHUP) != 0)
 		kev_flags |= EV_EOF;
 
 	/* flags related to what event is registered */
 	if ((levents & LINUX_EPOLL_EVRD) != 0) {
 		EV_SET(kevent, fd, EVFILT_READ, kev_flags, 0, 0, 0);
 		kevent->ext[0] = l_event->data;
 		++kevent;
 		++(*nkevents);
 	}
 	if ((levents & LINUX_EPOLL_EVWR) != 0) {
 		EV_SET(kevent, fd, EVFILT_WRITE, kev_flags, 0, 0, 0);
 		kevent->ext[0] = l_event->data;
 		++kevent;
 		++(*nkevents);
 	}
 	/* zero event mask is legal */
 	if ((levents & (LINUX_EPOLL_EVRD | LINUX_EPOLL_EVWR)) == 0) {
 		EV_SET(kevent++, fd, EVFILT_READ, EV_ADD|EV_DISABLE, 0, 0, 0);
 		++(*nkevents);
 	}
 
 	if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) {
 		p = td->td_proc;
 
 		pem = pem_find(p);
 		KASSERT(pem != NULL, ("epoll proc emuldata not found.\n"));
 
 		LINUX_PEM_XLOCK(pem);
 		if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) {
 			pem->flags |= LINUX_XUNSUP_EPOLL;
 			LINUX_PEM_XUNLOCK(pem);
 			linux_msg(td, "epoll_ctl unsupported flags: 0x%x",
 			    levents);
 		} else
 			LINUX_PEM_XUNLOCK(pem);
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 /*
  * Structure converting function from kevent to epoll. In a case
  * this is called on error in registration we store the error in
  * event->data and pick it up later in linux_epoll_ctl().
  */
 static void
 kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event)
 {
 
 	l_event->data = kevent->ext[0];
 
 	if ((kevent->flags & EV_ERROR) != 0) {
 		l_event->events = LINUX_EPOLLERR;
 		return;
 	}
 
 	/* XXX EPOLLPRI, EPOLLHUP */
 	switch (kevent->filter) {
 	case EVFILT_READ:
 		l_event->events = LINUX_EPOLLIN;
 		if ((kevent->flags & EV_EOF) != 0)
 			l_event->events |= LINUX_EPOLLRDHUP;
 	break;
 	case EVFILT_WRITE:
 		l_event->events = LINUX_EPOLLOUT;
 	break;
 	}
 }
 
 /*
  * Copyout callback used by kevent. This converts kevent
  * events to epoll events and copies them back to the
  * userspace. This is also called on error on registering
  * of the filter.
  */
 static int
 epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct epoll_copyout_args *args;
 	struct epoll_event *eep;
 	int error, i;
 
 	args = (struct epoll_copyout_args*) arg;
 	eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO);
 
 	for (i = 0; i < count; i++)
 		kevent_to_epoll(&kevp[i], &eep[i]);
 
 	error = copyout(eep, args->leventlist, count * sizeof(*eep));
 	if (error == 0) {
 		args->leventlist += count;
 		args->count += count;
 	} else if (args->error == 0)
 		args->error = error;
 
 	free(eep, M_EPOLL);
 	return (error);
 }
 
 /*
  * Copyin callback used by kevent. This copies already
  * converted filters from kernel memory to the kevent
  * internal kernel memory. Hence the memcpy instead of
  * copyin.
  */
 static int
 epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct epoll_copyin_args *args;
 
 	args = (struct epoll_copyin_args*) arg;
 
 	memcpy(kevp, args->changelist, count * sizeof(*kevp));
 	args->changelist += count;
 
 	return (0);
 }
 
 /*
  * Load epoll filter, convert it to kevent filter
  * and load it into kevent subsystem.
  */
 int
 linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
 {
 	struct file *epfp, *fp;
 	struct epoll_copyin_args ciargs;
 	struct kevent kev[2];
 	struct kevent_copyops k_ops = { &ciargs,
 					NULL,
 					epoll_kev_copyin};
 	struct epoll_event le;
 	cap_rights_t rights;
 	int nchanges = 0;
 	int error;
 
 	if (args->op != LINUX_EPOLL_CTL_DEL) {
 		error = copyin(args->event, &le, sizeof(le));
 		if (error != 0)
 			return (error);
 	}
 
 	error = fget(td, args->epfd,
 	    cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE), &epfp);
 	if (error != 0)
 		return (error);
 	if (epfp->f_type != DTYPE_KQUEUE) {
 		error = EINVAL;
 		goto leave1;
 	}
 
 	 /* Protect user data vector from incorrectly supplied fd. */
 	error = fget(td, args->fd,
 		     cap_rights_init_one(&rights, CAP_POLL_EVENT), &fp);
 	if (error != 0)
 		goto leave1;
 
 	/* Linux disallows spying on himself */
 	if (epfp == fp) {
 		error = EINVAL;
 		goto leave0;
 	}
 
 	ciargs.changelist = kev;
 
 	if (args->op != LINUX_EPOLL_CTL_DEL) {
 		error = epoll_to_kevent(td, args->fd, &le, kev, &nchanges);
 		if (error != 0)
 			goto leave0;
 	}
 
 	switch (args->op) {
 	case LINUX_EPOLL_CTL_MOD:
 		error = epoll_delete_all_events(td, epfp, args->fd);
 		if (error != 0)
 			goto leave0;
 		break;
 
 	case LINUX_EPOLL_CTL_ADD:
 		if (epoll_fd_registered(td, epfp, args->fd)) {
 			error = EEXIST;
 			goto leave0;
 		}
 		break;
 
 	case LINUX_EPOLL_CTL_DEL:
 		/* CTL_DEL means unregister this fd with this epoll */
 		error = epoll_delete_all_events(td, epfp, args->fd);
 		goto leave0;
 
 	default:
 		error = EINVAL;
 		goto leave0;
 	}
 
 	error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL);
 
 leave0:
 	fdrop(fp, td);
 
 leave1:
 	fdrop(epfp, td);
 	return (error);
 }
 
 /*
  * Wait for a filter to be triggered on the epoll file descriptor.
  */
 static int
 linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events,
     int maxevents, int timeout, sigset_t *uset)
 {
 	struct epoll_copyout_args coargs;
 	struct kevent_copyops k_ops = { &coargs,
 					epoll_kev_copyout,
 					NULL};
 	struct timespec ts, *tsp;
 	cap_rights_t rights;
 	struct file *epfp;
 	sigset_t omask;
 	int error;
 
 	if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS)
 		return (EINVAL);
 
 	error = fget(td, epfd,
 	    cap_rights_init_one(&rights, CAP_KQUEUE_EVENT), &epfp);
 	if (error != 0)
 		return (error);
 	if (epfp->f_type != DTYPE_KQUEUE) {
 		error = EINVAL;
 		goto leave;
 	}
 	if (uset != NULL) {
 		error = kern_sigprocmask(td, SIG_SETMASK, uset,
 		    &omask, 0);
 		if (error != 0)
 			goto leave;
 		td->td_pflags |= TDP_OLDMASK;
 		/*
 		 * Make sure that ast() is called on return to
 		 * usermode and TDP_OLDMASK is cleared, restoring old
 		 * sigmask.
 		 */
 		thread_lock(td);
 		td->td_flags |= TDF_ASTPENDING;
 		thread_unlock(td);
 	}
 
 	coargs.leventlist = events;
 	coargs.p = td->td_proc;
 	coargs.count = 0;
 	coargs.error = 0;
 
 	/*
 	 * Linux epoll_wait(2) man page states that timeout of -1 causes caller
 	 * to block indefinitely. Real implementation does it if any negative
 	 * timeout value is passed.
 	 */
 	if (timeout >= 0) {
 		/* Convert from milliseconds to timespec. */
 		ts.tv_sec = timeout / 1000;
 		ts.tv_nsec = (timeout % 1000) * 1000000;
 		tsp = &ts;
 	} else {
 		tsp = NULL;
 	}
 
 	error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp);
 	if (error == 0 && coargs.error != 0)
 		error = coargs.error;
 
 	/*
 	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
 	 * Maybe we should translate that but I don't think it matters at all.
 	 */
 	if (error == 0)
 		td->td_retval[0] = coargs.count;
 
 	if (uset != NULL)
 		error = kern_sigprocmask(td, SIG_SETMASK, &omask,
 		    NULL, 0);
 leave:
 	fdrop(epfp, td);
 	return (error);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
 {
 
 	return (linux_epoll_wait_common(td, args->epfd, args->events,
 	    args->maxevents, args->timeout, NULL));
 }
 #endif
 
 int
 linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args)
 {
 	sigset_t mask, *pmask;
 	l_sigset_t lmask;
 	int error;
 
 	if (args->mask != NULL) {
 		if (args->sigsetsize != sizeof(l_sigset_t))
 			return (EINVAL);
 		error = copyin(args->mask, &lmask, sizeof(l_sigset_t));
 		if (error != 0)
 			return (error);
 		linux_to_bsd_sigset(&lmask, &mask);
 		pmask = &mask;
 	} else
 		pmask = NULL;
 	return (linux_epoll_wait_common(td, args->epfd, args->events,
 	    args->maxevents, args->timeout, pmask));
 }
 
 static int
 epoll_register_kevent(struct thread *td, struct file *epfp, int fd, int filter,
     unsigned int flags)
 {
 	struct epoll_copyin_args ciargs;
 	struct kevent kev;
 	struct kevent_copyops k_ops = { &ciargs,
 					NULL,
 					epoll_kev_copyin};
 
 	ciargs.changelist = &kev;
 	EV_SET(&kev, fd, filter, flags, 0, 0, 0);
 
 	return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL));
 }
 
 static int
 epoll_fd_registered(struct thread *td, struct file *epfp, int fd)
 {
 	/*
 	 * Set empty filter flags to avoid accidental modification of already
 	 * registered events. In the case of event re-registration:
 	 * 1. If event does not exists kevent() does nothing and returns ENOENT
 	 * 2. If event does exists, it's enabled/disabled state is preserved
 	 *    but fflags, data and udata fields are overwritten. So we can not
 	 *    set socket lowats and store user's context pointer in udata.
 	 */
 	if (epoll_register_kevent(td, epfp, fd, EVFILT_READ, 0) != ENOENT ||
 	    epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, 0) != ENOENT)
 		return (1);
 
 	return (0);
 }
 
 static int
 epoll_delete_all_events(struct thread *td, struct file *epfp, int fd)
 {
 	int error1, error2;
 
 	error1 = epoll_register_kevent(td, epfp, fd, EVFILT_READ, EV_DELETE);
 	error2 = epoll_register_kevent(td, epfp, fd, EVFILT_WRITE, EV_DELETE);
 
 	/* return 0 if at least one result positive */
 	return (error1 == 0 ? 0 : error2);
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
 linux_eventfd(struct thread *td, struct linux_eventfd_args *args)
 {
 	struct specialfd_eventfd ae;
 
 	bzero(&ae, sizeof(ae));
 	ae.initval = args->initval;
 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
 }
 #endif
 
 int
 linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args)
 {
 	struct specialfd_eventfd ae;
 	int flags;
 
 	if ((args->flags & ~(LINUX_O_CLOEXEC | LINUX_O_NONBLOCK |
 	    LINUX_EFD_SEMAPHORE)) != 0)
 		return (EINVAL);
 	flags = 0;
 	if ((args->flags & LINUX_O_CLOEXEC) != 0)
 		flags |= EFD_CLOEXEC;
 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
 		flags |= EFD_NONBLOCK;
 	if ((args->flags & LINUX_EFD_SEMAPHORE) != 0)
 		flags |= EFD_SEMAPHORE;
 
 	bzero(&ae, sizeof(ae));
 	ae.flags = flags;
 	ae.initval = args->initval;
 	return (kern_specialfd(td, SPECIALFD_EVENTFD, &ae));
 }
 
 int
 linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args)
 {
 	struct filedesc *fdp;
 	struct timerfd *tfd;
 	struct file *fp;
 	clockid_t clockid;
 	int fflags, fd, error;
 
 	if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0)
 		return (EINVAL);
 
 	error = linux_to_native_clockid(&clockid, args->clockid);
 	if (error != 0)
 		return (error);
 	if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC)
 		return (EINVAL);
 
 	fflags = 0;
 	if ((args->flags & LINUX_TFD_CLOEXEC) != 0)
 		fflags |= O_CLOEXEC;
 
 	fdp = td->td_proc->p_fd;
 	error = falloc(td, &fp, &fd, fflags);
 	if (error != 0)
 		return (error);
 
 	tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO);
 	tfd->tfd_clockid = clockid;
 	mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF);
 
 	callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0);
 	knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock);
 
 	fflags = FREAD;
 	if ((args->flags & LINUX_O_NONBLOCK) != 0)
 		fflags |= FNONBLOCK;
 
 	finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops);
 	fdrop(fp, td);
 
 	td->td_retval[0] = fd;
 	return (error);
 }
 
 static int
 timerfd_close(struct file *fp, struct thread *td)
 {
 	struct timerfd *tfd;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (EINVAL);
 
 	timespecclear(&tfd->tfd_time.it_value);
 	timespecclear(&tfd->tfd_time.it_interval);
 
 	callout_drain(&tfd->tfd_callout);
 
 	seldrain(&tfd->tfd_sel);
 	knlist_destroy(&tfd->tfd_sel.si_note);
 
 	fp->f_ops = &badfileops;
 	mtx_destroy(&tfd->tfd_lock);
 	free(tfd, M_EPOLL);
 
 	return (0);
 }
 
 static int
 timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct timerfd *tfd;
 	timerfd_t count;
 	int error;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (EINVAL);
 
 	if (uio->uio_resid < sizeof(timerfd_t))
 		return (EINVAL);
 
 	error = 0;
 	mtx_lock(&tfd->tfd_lock);
 retry:
 	if (tfd->tfd_canceled) {
 		tfd->tfd_count = 0;
 		mtx_unlock(&tfd->tfd_lock);
 		return (ECANCELED);
 	}
 	if (tfd->tfd_count == 0) {
 		if ((fp->f_flag & FNONBLOCK) != 0) {
 			mtx_unlock(&tfd->tfd_lock);
 			return (EAGAIN);
 		}
 		error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0);
 		if (error == 0)
 			goto retry;
 	}
 	if (error == 0) {
 		count = tfd->tfd_count;
 		tfd->tfd_count = 0;
 		mtx_unlock(&tfd->tfd_lock);
 		error = uiomove(&count, sizeof(timerfd_t), uio);
 	} else
 		mtx_unlock(&tfd->tfd_lock);
 
 	return (error);
 }
 
 static int
 timerfd_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct timerfd *tfd;
 	int revents = 0;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (POLLERR);
 
 	mtx_lock(&tfd->tfd_lock);
 	if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0)
 		revents |= events & (POLLIN|POLLRDNORM);
 	if (revents == 0)
 		selrecord(td, &tfd->tfd_sel);
 	mtx_unlock(&tfd->tfd_lock);
 
 	return (revents);
 }
 
 static int
 timerfd_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct timerfd *tfd;
 
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL)
 		return (EINVAL);
 
 	if (kn->kn_filter == EVFILT_READ)
 		kn->kn_fop = &timerfd_rfiltops;
 	else
 		return (EINVAL);
 
 	kn->kn_hook = tfd;
 	knlist_add(&tfd->tfd_sel.si_note, kn, 0);
 
 	return (0);
 }
 
 static void
 filt_timerfddetach(struct knote *kn)
 {
 	struct timerfd *tfd = kn->kn_hook;
 
 	mtx_lock(&tfd->tfd_lock);
 	knlist_remove(&tfd->tfd_sel.si_note, kn, 1);
 	mtx_unlock(&tfd->tfd_lock);
 }
 
 static int
 filt_timerfdread(struct knote *kn, long hint)
 {
 	struct timerfd *tfd = kn->kn_hook;
 
 	return (tfd->tfd_count > 0);
 }
 
 static int
 timerfd_ioctl(struct file *fp, u_long cmd, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 
 	if (fp->f_data == NULL || fp->f_type != DTYPE_LINUXTFD)
 		return (EINVAL);
 
 	switch (cmd) {
 	case FIONBIO:
 	case FIOASYNC:
 		return (0);
 	}
 
 	return (ENOTTY);
 }
 
 static int
-timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
-    struct thread *td)
+timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
 {
 
 	return (ENXIO);
 }
 
 static int
 timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	return (0);
 }
 
 static void
 linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts)
 {
 
 	if (tfd->tfd_clockid == CLOCK_REALTIME)
 		getnanotime(ts);
 	else	/* CLOCK_MONOTONIC */
 		getnanouptime(ts);
 }
 
 static void
 linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots)
 {
 	struct timespec cts;
 
 	linux_timerfd_clocktime(tfd, &cts);
 	*ots = tfd->tfd_time;
 	if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) {
 		timespecsub(&ots->it_value, &cts, &ots->it_value);
 		if (ots->it_value.tv_sec < 0 ||
 		    (ots->it_value.tv_sec == 0 &&
 		     ots->it_value.tv_nsec == 0)) {
 			ots->it_value.tv_sec  = 0;
 			ots->it_value.tv_nsec = 1;
 		}
 	}
 }
 
 int
 linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args)
 {
 	struct l_itimerspec lots;
 	struct itimerspec ots;
 	struct timerfd *tfd;
 	struct file *fp;
 	int error;
 
 	error = fget(td, args->fd, &cap_read_rights, &fp);
 	if (error != 0)
 		return (error);
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	mtx_lock(&tfd->tfd_lock);
 	linux_timerfd_curval(tfd, &ots);
 	mtx_unlock(&tfd->tfd_lock);
 
 	error = native_to_linux_itimerspec(&lots, &ots);
 	if (error == 0)
 		error = copyout(&lots, args->old_value, sizeof(lots));
 
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args)
 {
 	struct l_itimerspec lots;
 	struct itimerspec nts, ots;
 	struct timespec cts, ts;
 	struct timerfd *tfd;
 	struct timeval tv;
 	struct file *fp;
 	int error;
 
 	if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0)
 		return (EINVAL);
 
 	error = copyin(args->new_value, &lots, sizeof(lots));
 	if (error != 0)
 		return (error);
 	error = linux_to_native_itimerspec(&nts, &lots);
 	if (error != 0)
 		return (error);
 
 	error = fget(td, args->fd, &cap_write_rights, &fp);
 	if (error != 0)
 		return (error);
 	tfd = fp->f_data;
 	if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	mtx_lock(&tfd->tfd_lock);
 	if (!timespecisset(&nts.it_value))
 		timespecclear(&nts.it_interval);
 	if (args->old_value != NULL)
 		linux_timerfd_curval(tfd, &ots);
 
 	tfd->tfd_time = nts;
 	tfd->tfd_count = 0;
 	if (timespecisset(&nts.it_value)) {
 		linux_timerfd_clocktime(tfd, &cts);
 		ts = nts.it_value;
 		if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) {
 			timespecadd(&tfd->tfd_time.it_value, &cts,
 				&tfd->tfd_time.it_value);
 		} else {
 			timespecsub(&ts, &cts, &ts);
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
 			linux_timerfd_expire, tfd);
 		tfd->tfd_canceled = false;
 	} else {
 		tfd->tfd_canceled = true;
 		callout_stop(&tfd->tfd_callout);
 	}
 	mtx_unlock(&tfd->tfd_lock);
 
 	if (args->old_value != NULL) {
 		error = native_to_linux_itimerspec(&lots, &ots);
 		if (error == 0)
 			error = copyout(&lots, args->old_value, sizeof(lots));
 	}
 
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 static void
 linux_timerfd_expire(void *arg)
 {
 	struct timespec cts, ts;
 	struct timeval tv;
 	struct timerfd *tfd;
 
 	tfd = (struct timerfd *)arg;
 
 	linux_timerfd_clocktime(tfd, &cts);
 	if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) {
 		if (timespecisset(&tfd->tfd_time.it_interval))
 			timespecadd(&tfd->tfd_time.it_value,
 				    &tfd->tfd_time.it_interval,
 				    &tfd->tfd_time.it_value);
 		else
 			/* single shot timer */
 			timespecclear(&tfd->tfd_time.it_value);
 		if (timespecisset(&tfd->tfd_time.it_value)) {
 			timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
 			callout_reset(&tfd->tfd_callout, tvtohz(&tv),
 				linux_timerfd_expire, tfd);
 		}
 		tfd->tfd_count++;
 		KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0);
 		selwakeup(&tfd->tfd_sel);
 		wakeup(&tfd->tfd_count);
 	} else if (timespecisset(&tfd->tfd_time.it_value)) {
 		timespecsub(&tfd->tfd_time.it_value, &cts, &ts);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		callout_reset(&tfd->tfd_callout, tvtohz(&tv),
 		    linux_timerfd_expire, tfd);
 	}
 }
diff --git a/sys/compat/linuxkpi/common/src/linux_compat.c b/sys/compat/linuxkpi/common/src/linux_compat.c
index 279f7131fc57..7a062ca36eba 100644
--- a/sys/compat/linuxkpi/common/src/linux_compat.c
+++ b/sys/compat/linuxkpi/common/src/linux_compat.c
@@ -1,2654 +1,2653 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013-2021 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/sglist.h>
 #include <sys/sleepqueue.h>
 #include <sys/refcount.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/rwlock.h>
 #include <sys/mman.h>
 #include <sys/stack.h>
 #include <sys/sysent.h>
 #include <sys/time.h>
 #include <sys/user.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 #include <machine/stdarg.h>
 
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/md_var.h>
 #endif
 
 #include <linux/kobject.h>
 #include <linux/cpu.h>
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/cdev.h>
 #include <linux/file.h>
 #include <linux/sysfs.h>
 #include <linux/mm.h>
 #include <linux/io.h>
 #include <linux/vmalloc.h>
 #include <linux/netdevice.h>
 #include <linux/timer.h>
 #include <linux/interrupt.h>
 #include <linux/uaccess.h>
 #include <linux/list.h>
 #include <linux/kthread.h>
 #include <linux/kernel.h>
 #include <linux/compat.h>
 #include <linux/poll.h>
 #include <linux/smp.h>
 #include <linux/wait_bit.h>
 #include <linux/rcupdate.h>
 
 #if defined(__i386__) || defined(__amd64__)
 #include <asm/smp.h>
 #endif
 
 SYSCTL_NODE(_compat, OID_AUTO, linuxkpi, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "LinuxKPI parameters");
 
 int linuxkpi_debug;
 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug, CTLFLAG_RWTUN,
     &linuxkpi_debug, 0, "Set to enable pr_debug() prints. Clear to disable.");
 
 static struct timeval lkpi_net_lastlog;
 static int lkpi_net_curpps;
 static int lkpi_net_maxpps = 99;
 SYSCTL_INT(_compat_linuxkpi, OID_AUTO, net_ratelimit, CTLFLAG_RWTUN,
     &lkpi_net_maxpps, 0, "Limit number of LinuxKPI net messages per second.");
 
 MALLOC_DEFINE(M_KMALLOC, "lkpikmalloc", "Linux kmalloc compat");
 
 #include <linux/rbtree.h>
 /* Undo Linux compat changes. */
 #undef RB_ROOT
 #undef file
 #undef cdev
 #define	RB_ROOT(head)	(head)->rbh_root
 
 static void linux_destroy_dev(struct linux_cdev *);
 static void linux_cdev_deref(struct linux_cdev *ldev);
 static struct vm_area_struct *linux_cdev_handle_find(void *handle);
 
 cpumask_t cpu_online_mask;
 struct kobject linux_class_root;
 struct device linux_root_device;
 struct class linux_class_misc;
 struct list_head pci_drivers;
 struct list_head pci_devices;
 spinlock_t pci_lock;
 
 unsigned long linux_timer_hz_mask;
 
 wait_queue_head_t linux_bit_waitq;
 wait_queue_head_t linux_var_waitq;
 
 int
 panic_cmp(struct rb_node *one, struct rb_node *two)
 {
 	panic("no cmp");
 }
 
 RB_GENERATE(linux_root, rb_node, __entry, panic_cmp);
 
 int
 kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list args)
 {
 	va_list tmp_va;
 	int len;
 	char *old;
 	char *name;
 	char dummy;
 
 	old = kobj->name;
 
 	if (old && fmt == NULL)
 		return (0);
 
 	/* compute length of string */
 	va_copy(tmp_va, args);
 	len = vsnprintf(&dummy, 0, fmt, tmp_va);
 	va_end(tmp_va);
 
 	/* account for zero termination */
 	len++;
 
 	/* check for error */
 	if (len < 1)
 		return (-EINVAL);
 
 	/* allocate memory for string */
 	name = kzalloc(len, GFP_KERNEL);
 	if (name == NULL)
 		return (-ENOMEM);
 	vsnprintf(name, len, fmt, args);
 	kobj->name = name;
 
 	/* free old string */
 	kfree(old);
 
 	/* filter new string */
 	for (; *name != '\0'; name++)
 		if (*name == '/')
 			*name = '!';
 	return (0);
 }
 
 int
 kobject_set_name(struct kobject *kobj, const char *fmt, ...)
 {
 	va_list args;
 	int error;
 
 	va_start(args, fmt);
 	error = kobject_set_name_vargs(kobj, fmt, args);
 	va_end(args);
 
 	return (error);
 }
 
 static int
 kobject_add_complete(struct kobject *kobj, struct kobject *parent)
 {
 	const struct kobj_type *t;
 	int error;
 
 	kobj->parent = parent;
 	error = sysfs_create_dir(kobj);
 	if (error == 0 && kobj->ktype && kobj->ktype->default_attrs) {
 		struct attribute **attr;
 		t = kobj->ktype;
 
 		for (attr = t->default_attrs; *attr != NULL; attr++) {
 			error = sysfs_create_file(kobj, *attr);
 			if (error)
 				break;
 		}
 		if (error)
 			sysfs_remove_dir(kobj);
 	}
 	return (error);
 }
 
 int
 kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...)
 {
 	va_list args;
 	int error;
 
 	va_start(args, fmt);
 	error = kobject_set_name_vargs(kobj, fmt, args);
 	va_end(args);
 	if (error)
 		return (error);
 
 	return kobject_add_complete(kobj, parent);
 }
 
 void
 linux_kobject_release(struct kref *kref)
 {
 	struct kobject *kobj;
 	char *name;
 
 	kobj = container_of(kref, struct kobject, kref);
 	sysfs_remove_dir(kobj);
 	name = kobj->name;
 	if (kobj->ktype && kobj->ktype->release)
 		kobj->ktype->release(kobj);
 	kfree(name);
 }
 
 static void
 linux_kobject_kfree(struct kobject *kobj)
 {
 	kfree(kobj);
 }
 
 static void
 linux_kobject_kfree_name(struct kobject *kobj)
 {
 	if (kobj) {
 		kfree(kobj->name);
 	}
 }
 
 const struct kobj_type linux_kfree_type = {
 	.release = linux_kobject_kfree
 };
 
 static void
 linux_device_release(struct device *dev)
 {
 	pr_debug("linux_device_release: %s\n", dev_name(dev));
 	kfree(dev);
 }
 
 static ssize_t
 linux_class_show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
 	struct class_attribute *dattr;
 	ssize_t error;
 
 	dattr = container_of(attr, struct class_attribute, attr);
 	error = -EIO;
 	if (dattr->show)
 		error = dattr->show(container_of(kobj, struct class, kobj),
 		    dattr, buf);
 	return (error);
 }
 
 static ssize_t
 linux_class_store(struct kobject *kobj, struct attribute *attr, const char *buf,
     size_t count)
 {
 	struct class_attribute *dattr;
 	ssize_t error;
 
 	dattr = container_of(attr, struct class_attribute, attr);
 	error = -EIO;
 	if (dattr->store)
 		error = dattr->store(container_of(kobj, struct class, kobj),
 		    dattr, buf, count);
 	return (error);
 }
 
 static void
 linux_class_release(struct kobject *kobj)
 {
 	struct class *class;
 
 	class = container_of(kobj, struct class, kobj);
 	if (class->class_release)
 		class->class_release(class);
 }
 
 static const struct sysfs_ops linux_class_sysfs = {
 	.show  = linux_class_show,
 	.store = linux_class_store,
 };
 
 const struct kobj_type linux_class_ktype = {
 	.release = linux_class_release,
 	.sysfs_ops = &linux_class_sysfs
 };
 
 static void
 linux_dev_release(struct kobject *kobj)
 {
 	struct device *dev;
 
 	dev = container_of(kobj, struct device, kobj);
 	/* This is the precedence defined by linux. */
 	if (dev->release)
 		dev->release(dev);
 	else if (dev->class && dev->class->dev_release)
 		dev->class->dev_release(dev);
 }
 
 static ssize_t
 linux_dev_show(struct kobject *kobj, struct attribute *attr, char *buf)
 {
 	struct device_attribute *dattr;
 	ssize_t error;
 
 	dattr = container_of(attr, struct device_attribute, attr);
 	error = -EIO;
 	if (dattr->show)
 		error = dattr->show(container_of(kobj, struct device, kobj),
 		    dattr, buf);
 	return (error);
 }
 
 static ssize_t
 linux_dev_store(struct kobject *kobj, struct attribute *attr, const char *buf,
     size_t count)
 {
 	struct device_attribute *dattr;
 	ssize_t error;
 
 	dattr = container_of(attr, struct device_attribute, attr);
 	error = -EIO;
 	if (dattr->store)
 		error = dattr->store(container_of(kobj, struct device, kobj),
 		    dattr, buf, count);
 	return (error);
 }
 
 static const struct sysfs_ops linux_dev_sysfs = {
 	.show  = linux_dev_show,
 	.store = linux_dev_store,
 };
 
 const struct kobj_type linux_dev_ktype = {
 	.release = linux_dev_release,
 	.sysfs_ops = &linux_dev_sysfs
 };
 
 struct device *
 device_create(struct class *class, struct device *parent, dev_t devt,
     void *drvdata, const char *fmt, ...)
 {
 	struct device *dev;
 	va_list args;
 
 	dev = kzalloc(sizeof(*dev), M_WAITOK);
 	dev->parent = parent;
 	dev->class = class;
 	dev->devt = devt;
 	dev->driver_data = drvdata;
 	dev->release = linux_device_release;
 	va_start(args, fmt);
 	kobject_set_name_vargs(&dev->kobj, fmt, args);
 	va_end(args);
 	device_register(dev);
 
 	return (dev);
 }
 
 int
 kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
     struct kobject *parent, const char *fmt, ...)
 {
 	va_list args;
 	int error;
 
 	kobject_init(kobj, ktype);
 	kobj->ktype = ktype;
 	kobj->parent = parent;
 	kobj->name = NULL;
 
 	va_start(args, fmt);
 	error = kobject_set_name_vargs(kobj, fmt, args);
 	va_end(args);
 	if (error)
 		return (error);
 	return kobject_add_complete(kobj, parent);
 }
 
 static void
 linux_kq_lock(void *arg)
 {
 	spinlock_t *s = arg;
 
 	spin_lock(s);
 }
 static void
 linux_kq_unlock(void *arg)
 {
 	spinlock_t *s = arg;
 
 	spin_unlock(s);
 }
 
 static void
 linux_kq_assert_lock(void *arg, int what)
 {
 #ifdef INVARIANTS
 	spinlock_t *s = arg;
 
 	if (what == LA_LOCKED)
 		mtx_assert(&s->m, MA_OWNED);
 	else
 		mtx_assert(&s->m, MA_NOTOWNED);
 #endif
 }
 
 static void
 linux_file_kqfilter_poll(struct linux_file *, int);
 
 struct linux_file *
 linux_file_alloc(void)
 {
 	struct linux_file *filp;
 
 	filp = kzalloc(sizeof(*filp), GFP_KERNEL);
 
 	/* set initial refcount */
 	filp->f_count = 1;
 
 	/* setup fields needed by kqueue support */
 	spin_lock_init(&filp->f_kqlock);
 	knlist_init(&filp->f_selinfo.si_note, &filp->f_kqlock,
 	    linux_kq_lock, linux_kq_unlock, linux_kq_assert_lock);
 
 	return (filp);
 }
 
 void
 linux_file_free(struct linux_file *filp)
 {
 	if (filp->_file == NULL) {
 		if (filp->f_op != NULL && filp->f_op->release != NULL)
 			filp->f_op->release(filp->f_vnode, filp);
 		if (filp->f_shmem != NULL)
 			vm_object_deallocate(filp->f_shmem);
 		kfree_rcu(filp, rcu);
 	} else {
 		/*
 		 * The close method of the character device or file
 		 * will free the linux_file structure:
 		 */
 		_fdrop(filp->_file, curthread);
 	}
 }
 
 static int
 linux_cdev_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
     vm_page_t *mres)
 {
 	struct vm_area_struct *vmap;
 
 	vmap = linux_cdev_handle_find(vm_obj->handle);
 
 	MPASS(vmap != NULL);
 	MPASS(vmap->vm_private_data == vm_obj->handle);
 
 	if (likely(vmap->vm_ops != NULL && offset < vmap->vm_len)) {
 		vm_paddr_t paddr = IDX_TO_OFF(vmap->vm_pfn) + offset;
 		vm_page_t page;
 
 		if (((*mres)->flags & PG_FICTITIOUS) != 0) {
 			/*
 			 * If the passed in result page is a fake
 			 * page, update it with the new physical
 			 * address.
 			 */
 			page = *mres;
 			vm_page_updatefake(page, paddr, vm_obj->memattr);
 		} else {
 			/*
 			 * Replace the passed in "mres" page with our
 			 * own fake page and free up the all of the
 			 * original pages.
 			 */
 			VM_OBJECT_WUNLOCK(vm_obj);
 			page = vm_page_getfake(paddr, vm_obj->memattr);
 			VM_OBJECT_WLOCK(vm_obj);
 
 			vm_page_replace(page, vm_obj, (*mres)->pindex, *mres);
 			*mres = page;
 		}
 		vm_page_valid(page);
 		return (VM_PAGER_OK);
 	}
 	return (VM_PAGER_FAIL);
 }
 
 static int
 linux_cdev_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type,
     vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
 {
 	struct vm_area_struct *vmap;
 	int err;
 
 	/* get VM area structure */
 	vmap = linux_cdev_handle_find(vm_obj->handle);
 	MPASS(vmap != NULL);
 	MPASS(vmap->vm_private_data == vm_obj->handle);
 
 	VM_OBJECT_WUNLOCK(vm_obj);
 
 	linux_set_current(curthread);
 
 	down_write(&vmap->vm_mm->mmap_sem);
 	if (unlikely(vmap->vm_ops == NULL)) {
 		err = VM_FAULT_SIGBUS;
 	} else {
 		struct vm_fault vmf;
 
 		/* fill out VM fault structure */
 		vmf.virtual_address = (void *)(uintptr_t)IDX_TO_OFF(pidx);
 		vmf.flags = (fault_type & VM_PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
 		vmf.pgoff = 0;
 		vmf.page = NULL;
 		vmf.vma = vmap;
 
 		vmap->vm_pfn_count = 0;
 		vmap->vm_pfn_pcount = &vmap->vm_pfn_count;
 		vmap->vm_obj = vm_obj;
 
 		err = vmap->vm_ops->fault(&vmf);
 
 		while (vmap->vm_pfn_count == 0 && err == VM_FAULT_NOPAGE) {
 			kern_yield(PRI_USER);
 			err = vmap->vm_ops->fault(&vmf);
 		}
 	}
 
 	/* translate return code */
 	switch (err) {
 	case VM_FAULT_OOM:
 		err = VM_PAGER_AGAIN;
 		break;
 	case VM_FAULT_SIGBUS:
 		err = VM_PAGER_BAD;
 		break;
 	case VM_FAULT_NOPAGE:
 		/*
 		 * By contract the fault handler will return having
 		 * busied all the pages itself. If pidx is already
 		 * found in the object, it will simply xbusy the first
 		 * page and return with vm_pfn_count set to 1.
 		 */
 		*first = vmap->vm_pfn_first;
 		*last = *first + vmap->vm_pfn_count - 1;
 		err = VM_PAGER_OK;
 		break;
 	default:
 		err = VM_PAGER_ERROR;
 		break;
 	}
 	up_write(&vmap->vm_mm->mmap_sem);
 	VM_OBJECT_WLOCK(vm_obj);
 	return (err);
 }
 
 static struct rwlock linux_vma_lock;
 static TAILQ_HEAD(, vm_area_struct) linux_vma_head =
     TAILQ_HEAD_INITIALIZER(linux_vma_head);
 
 static void
 linux_cdev_handle_free(struct vm_area_struct *vmap)
 {
 	/* Drop reference on vm_file */
 	if (vmap->vm_file != NULL)
 		fput(vmap->vm_file);
 
 	/* Drop reference on mm_struct */
 	mmput(vmap->vm_mm);
 
 	kfree(vmap);
 }
 
 static void
 linux_cdev_handle_remove(struct vm_area_struct *vmap)
 {
 	rw_wlock(&linux_vma_lock);
 	TAILQ_REMOVE(&linux_vma_head, vmap, vm_entry);
 	rw_wunlock(&linux_vma_lock);
 }
 
 static struct vm_area_struct *
 linux_cdev_handle_find(void *handle)
 {
 	struct vm_area_struct *vmap;
 
 	rw_rlock(&linux_vma_lock);
 	TAILQ_FOREACH(vmap, &linux_vma_head, vm_entry) {
 		if (vmap->vm_private_data == handle)
 			break;
 	}
 	rw_runlock(&linux_vma_lock);
 	return (vmap);
 }
 
 static int
 linux_cdev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
 		      vm_ooffset_t foff, struct ucred *cred, u_short *color)
 {
 
 	MPASS(linux_cdev_handle_find(handle) != NULL);
 	*color = 0;
 	return (0);
 }
 
 static void
 linux_cdev_pager_dtor(void *handle)
 {
 	const struct vm_operations_struct *vm_ops;
 	struct vm_area_struct *vmap;
 
 	vmap = linux_cdev_handle_find(handle);
 	MPASS(vmap != NULL);
 
 	/*
 	 * Remove handle before calling close operation to prevent
 	 * other threads from reusing the handle pointer.
 	 */
 	linux_cdev_handle_remove(vmap);
 
 	down_write(&vmap->vm_mm->mmap_sem);
 	vm_ops = vmap->vm_ops;
 	if (likely(vm_ops != NULL))
 		vm_ops->close(vmap);
 	up_write(&vmap->vm_mm->mmap_sem);
 
 	linux_cdev_handle_free(vmap);
 }
 
 static struct cdev_pager_ops linux_cdev_pager_ops[2] = {
   {
 	/* OBJT_MGTDEVICE */
 	.cdev_pg_populate	= linux_cdev_pager_populate,
 	.cdev_pg_ctor	= linux_cdev_pager_ctor,
 	.cdev_pg_dtor	= linux_cdev_pager_dtor
   },
   {
 	/* OBJT_DEVICE */
 	.cdev_pg_fault	= linux_cdev_pager_fault,
 	.cdev_pg_ctor	= linux_cdev_pager_ctor,
 	.cdev_pg_dtor	= linux_cdev_pager_dtor
   },
 };
 
 int
 zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
     unsigned long size)
 {
 	vm_object_t obj;
 	vm_page_t m;
 
 	obj = vma->vm_obj;
 	if (obj == NULL || (obj->flags & OBJ_UNMANAGED) != 0)
 		return (-ENOTSUP);
 	VM_OBJECT_RLOCK(obj);
 	for (m = vm_page_find_least(obj, OFF_TO_IDX(address));
 	    m != NULL && m->pindex < OFF_TO_IDX(address + size);
 	    m = TAILQ_NEXT(m, listq))
 		pmap_remove_all(m);
 	VM_OBJECT_RUNLOCK(obj);
 	return (0);
 }
 
 static struct file_operations dummy_ldev_ops = {
 	/* XXXKIB */
 };
 
 static struct linux_cdev dummy_ldev = {
 	.ops = &dummy_ldev_ops,
 };
 
 #define	LDEV_SI_DTR	0x0001
 #define	LDEV_SI_REF	0x0002
 
 static void
 linux_get_fop(struct linux_file *filp, const struct file_operations **fop,
     struct linux_cdev **dev)
 {
 	struct linux_cdev *ldev;
 	u_int siref;
 
 	ldev = filp->f_cdev;
 	*fop = filp->f_op;
 	if (ldev != NULL) {
 		if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
 			refcount_acquire(&ldev->refs);
 		} else {
 			for (siref = ldev->siref;;) {
 				if ((siref & LDEV_SI_DTR) != 0) {
 					ldev = &dummy_ldev;
 					*fop = ldev->ops;
 					siref = ldev->siref;
 					MPASS((ldev->siref & LDEV_SI_DTR) == 0);
 				} else if (atomic_fcmpset_int(&ldev->siref,
 				    &siref, siref + LDEV_SI_REF)) {
 					break;
 				}
 			}
 		}
 	}
 	*dev = ldev;
 }
 
 static void
 linux_drop_fop(struct linux_cdev *ldev)
 {
 
 	if (ldev == NULL)
 		return;
 	if (ldev->kobj.ktype == &linux_cdev_static_ktype) {
 		linux_cdev_deref(ldev);
 	} else {
 		MPASS(ldev->kobj.ktype == &linux_cdev_ktype);
 		MPASS((ldev->siref & ~LDEV_SI_DTR) != 0);
 		atomic_subtract_int(&ldev->siref, LDEV_SI_REF);
 	}
 }
 
 #define	OPW(fp,td,code) ({			\
 	struct file *__fpop;			\
 	__typeof(code) __retval;		\
 						\
 	__fpop = (td)->td_fpop;			\
 	(td)->td_fpop = (fp);			\
 	__retval = (code);			\
 	(td)->td_fpop = __fpop;			\
 	__retval;				\
 })
 
 static int
 linux_dev_fdopen(struct cdev *dev, int fflags, struct thread *td,
     struct file *file)
 {
 	struct linux_cdev *ldev;
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	int error;
 
 	ldev = dev->si_drv1;
 
 	filp = linux_file_alloc();
 	filp->f_dentry = &filp->f_dentry_store;
 	filp->f_op = ldev->ops;
 	filp->f_mode = file->f_flag;
 	filp->f_flags = file->f_flag;
 	filp->f_vnode = file->f_vnode;
 	filp->_file = file;
 	refcount_acquire(&ldev->refs);
 	filp->f_cdev = ldev;
 
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 
 	if (fop->open != NULL) {
 		error = -fop->open(file->f_vnode, filp);
 		if (error != 0) {
 			linux_drop_fop(ldev);
 			linux_cdev_deref(filp->f_cdev);
 			kfree(filp);
 			return (error);
 		}
 	}
 
 	/* hold on to the vnode - used for fstat() */
 	vhold(filp->f_vnode);
 
 	/* release the file from devfs */
 	finit(file, filp->f_mode, DTYPE_DEV, filp, &linuxfileops);
 	linux_drop_fop(ldev);
 	return (ENXIO);
 }
 
 #define	LINUX_IOCTL_MIN_PTR 0x10000UL
 #define	LINUX_IOCTL_MAX_PTR (LINUX_IOCTL_MIN_PTR + IOCPARM_MAX)
 
 static inline int
 linux_remap_address(void **uaddr, size_t len)
 {
 	uintptr_t uaddr_val = (uintptr_t)(*uaddr);
 
 	if (unlikely(uaddr_val >= LINUX_IOCTL_MIN_PTR &&
 	    uaddr_val < LINUX_IOCTL_MAX_PTR)) {
 		struct task_struct *pts = current;
 		if (pts == NULL) {
 			*uaddr = NULL;
 			return (1);
 		}
 
 		/* compute data offset */
 		uaddr_val -= LINUX_IOCTL_MIN_PTR;
 
 		/* check that length is within bounds */
 		if ((len > IOCPARM_MAX) ||
 		    (uaddr_val + len) > pts->bsd_ioctl_len) {
 			*uaddr = NULL;
 			return (1);
 		}
 
 		/* re-add kernel buffer address */
 		uaddr_val += (uintptr_t)pts->bsd_ioctl_data;
 
 		/* update address location */
 		*uaddr = (void *)uaddr_val;
 		return (1);
 	}
 	return (0);
 }
 
 int
 linux_copyin(const void *uaddr, void *kaddr, size_t len)
 {
 	if (linux_remap_address(__DECONST(void **, &uaddr), len)) {
 		if (uaddr == NULL)
 			return (-EFAULT);
 		memcpy(kaddr, uaddr, len);
 		return (0);
 	}
 	return (-copyin(uaddr, kaddr, len));
 }
 
 int
 linux_copyout(const void *kaddr, void *uaddr, size_t len)
 {
 	if (linux_remap_address(&uaddr, len)) {
 		if (uaddr == NULL)
 			return (-EFAULT);
 		memcpy(uaddr, kaddr, len);
 		return (0);
 	}
 	return (-copyout(kaddr, uaddr, len));
 }
 
 size_t
 linux_clear_user(void *_uaddr, size_t _len)
 {
 	uint8_t *uaddr = _uaddr;
 	size_t len = _len;
 
 	/* make sure uaddr is aligned before going into the fast loop */
 	while (((uintptr_t)uaddr & 7) != 0 && len > 7) {
 		if (subyte(uaddr, 0))
 			return (_len);
 		uaddr++;
 		len--;
 	}
 
 	/* zero 8 bytes at a time */
 	while (len > 7) {
 #ifdef __LP64__
 		if (suword64(uaddr, 0))
 			return (_len);
 #else
 		if (suword32(uaddr, 0))
 			return (_len);
 		if (suword32(uaddr + 4, 0))
 			return (_len);
 #endif
 		uaddr += 8;
 		len -= 8;
 	}
 
 	/* zero fill end, if any */
 	while (len > 0) {
 		if (subyte(uaddr, 0))
 			return (_len);
 		uaddr++;
 		len--;
 	}
 	return (0);
 }
 
 int
 linux_access_ok(const void *uaddr, size_t len)
 {
 	uintptr_t saddr;
 	uintptr_t eaddr;
 
 	/* get start and end address */
 	saddr = (uintptr_t)uaddr;
 	eaddr = (uintptr_t)uaddr + len;
 
 	/* verify addresses are valid for userspace */
 	return ((saddr == eaddr) ||
 	    (eaddr > saddr && eaddr <= VM_MAXUSER_ADDRESS));
 }
 
 /*
  * This function should return either EINTR or ERESTART depending on
  * the signal type sent to this thread:
  */
 static int
 linux_get_error(struct task_struct *task, int error)
 {
 	/* check for signal type interrupt code */
 	if (error == EINTR || error == ERESTARTSYS || error == ERESTART) {
 		error = -linux_schedule_get_interrupt_value(task);
 		if (error == 0)
 			error = EINTR;
 	}
 	return (error);
 }
 
 static int
 linux_file_ioctl_sub(struct file *fp, struct linux_file *filp,
     const struct file_operations *fop, u_long cmd, caddr_t data,
     struct thread *td)
 {
 	struct task_struct *task = current;
 	unsigned size;
 	int error;
 
 	size = IOCPARM_LEN(cmd);
 	/* refer to logic in sys_ioctl() */
 	if (size > 0) {
 		/*
 		 * Setup hint for linux_copyin() and linux_copyout().
 		 *
 		 * Background: Linux code expects a user-space address
 		 * while FreeBSD supplies a kernel-space address.
 		 */
 		task->bsd_ioctl_data = data;
 		task->bsd_ioctl_len = size;
 		data = (void *)LINUX_IOCTL_MIN_PTR;
 	} else {
 		/* fetch user-space pointer */
 		data = *(void **)data;
 	}
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		/* try the compat IOCTL handler first */
 		if (fop->compat_ioctl != NULL) {
 			error = -OPW(fp, td, fop->compat_ioctl(filp,
 			    cmd, (u_long)data));
 		} else {
 			error = ENOTTY;
 		}
 
 		/* fallback to the regular IOCTL handler, if any */
 		if (error == ENOTTY && fop->unlocked_ioctl != NULL) {
 			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
 			    cmd, (u_long)data));
 		}
 	} else
 #endif
 	{
 		if (fop->unlocked_ioctl != NULL) {
 			error = -OPW(fp, td, fop->unlocked_ioctl(filp,
 			    cmd, (u_long)data));
 		} else {
 			error = ENOTTY;
 		}
 	}
 	if (size > 0) {
 		task->bsd_ioctl_data = NULL;
 		task->bsd_ioctl_len = 0;
 	}
 
 	if (error == EWOULDBLOCK) {
 		/* update kqfilter status, if any */
 		linux_file_kqfilter_poll(filp,
 		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
 	} else {
 		error = linux_get_error(task, error);
 	}
 	return (error);
 }
 
 #define	LINUX_POLL_TABLE_NORMAL ((poll_table *)1)
 
 /*
  * This function atomically updates the poll wakeup state and returns
  * the previous state at the time of update.
  */
 static uint8_t
 linux_poll_wakeup_state(atomic_t *v, const uint8_t *pstate)
 {
 	int c, old;
 
 	c = v->counter;
 
 	while ((old = atomic_cmpxchg(v, c, pstate[c])) != c)
 		c = old;
 
 	return (c);
 }
 
 static int
 linux_poll_wakeup_callback(wait_queue_t *wq, unsigned int wq_state, int flags, void *key)
 {
 	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
 		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT, /* NOP */
 		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
 		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_READY,
 		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_READY, /* NOP */
 	};
 	struct linux_file *filp = container_of(wq, struct linux_file, f_wait_queue.wq);
 
 	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
 	case LINUX_FWQ_STATE_QUEUED:
 		linux_poll_wakeup(filp);
 		return (1);
 	default:
 		return (0);
 	}
 }
 
 void
 linux_poll_wait(struct linux_file *filp, wait_queue_head_t *wqh, poll_table *p)
 {
 	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
 		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_NOT_READY,
 		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_NOT_READY, /* NOP */
 		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_QUEUED, /* NOP */
 		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_QUEUED,
 	};
 
 	/* check if we are called inside the select system call */
 	if (p == LINUX_POLL_TABLE_NORMAL)
 		selrecord(curthread, &filp->f_selinfo);
 
 	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
 	case LINUX_FWQ_STATE_INIT:
 		/* NOTE: file handles can only belong to one wait-queue */
 		filp->f_wait_queue.wqh = wqh;
 		filp->f_wait_queue.wq.func = &linux_poll_wakeup_callback;
 		add_wait_queue(wqh, &filp->f_wait_queue.wq);
 		atomic_set(&filp->f_wait_queue.state, LINUX_FWQ_STATE_QUEUED);
 		break;
 	default:
 		break;
 	}
 }
 
 static void
 linux_poll_wait_dequeue(struct linux_file *filp)
 {
 	static const uint8_t state[LINUX_FWQ_STATE_MAX] = {
 		[LINUX_FWQ_STATE_INIT] = LINUX_FWQ_STATE_INIT,	/* NOP */
 		[LINUX_FWQ_STATE_NOT_READY] = LINUX_FWQ_STATE_INIT,
 		[LINUX_FWQ_STATE_QUEUED] = LINUX_FWQ_STATE_INIT,
 		[LINUX_FWQ_STATE_READY] = LINUX_FWQ_STATE_INIT,
 	};
 
 	seldrain(&filp->f_selinfo);
 
 	switch (linux_poll_wakeup_state(&filp->f_wait_queue.state, state)) {
 	case LINUX_FWQ_STATE_NOT_READY:
 	case LINUX_FWQ_STATE_QUEUED:
 	case LINUX_FWQ_STATE_READY:
 		remove_wait_queue(filp->f_wait_queue.wqh, &filp->f_wait_queue.wq);
 		break;
 	default:
 		break;
 	}
 }
 
 void
 linux_poll_wakeup(struct linux_file *filp)
 {
 	/* this function should be NULL-safe */
 	if (filp == NULL)
 		return;
 
 	selwakeup(&filp->f_selinfo);
 
 	spin_lock(&filp->f_kqlock);
 	filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ |
 	    LINUX_KQ_FLAG_NEED_WRITE;
 
 	/* make sure the "knote" gets woken up */
 	KNOTE_LOCKED(&filp->f_selinfo.si_note, 1);
 	spin_unlock(&filp->f_kqlock);
 }
 
 static void
 linux_file_kqfilter_detach(struct knote *kn)
 {
 	struct linux_file *filp = kn->kn_hook;
 
 	spin_lock(&filp->f_kqlock);
 	knlist_remove(&filp->f_selinfo.si_note, kn, 1);
 	spin_unlock(&filp->f_kqlock);
 }
 
 static int
 linux_file_kqfilter_read_event(struct knote *kn, long hint)
 {
 	struct linux_file *filp = kn->kn_hook;
 
 	mtx_assert(&filp->f_kqlock.m, MA_OWNED);
 
 	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_READ) ? 1 : 0);
 }
 
 static int
 linux_file_kqfilter_write_event(struct knote *kn, long hint)
 {
 	struct linux_file *filp = kn->kn_hook;
 
 	mtx_assert(&filp->f_kqlock.m, MA_OWNED);
 
 	return ((filp->f_kqflags & LINUX_KQ_FLAG_NEED_WRITE) ? 1 : 0);
 }
 
 static struct filterops linux_dev_kqfiltops_read = {
 	.f_isfd = 1,
 	.f_detach = linux_file_kqfilter_detach,
 	.f_event = linux_file_kqfilter_read_event,
 };
 
 static struct filterops linux_dev_kqfiltops_write = {
 	.f_isfd = 1,
 	.f_detach = linux_file_kqfilter_detach,
 	.f_event = linux_file_kqfilter_write_event,
 };
 
 static void
 linux_file_kqfilter_poll(struct linux_file *filp, int kqflags)
 {
 	struct thread *td;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	int temp;
 
 	if ((filp->f_kqflags & kqflags) == 0)
 		return;
 
 	td = curthread;
 
 	linux_get_fop(filp, &fop, &ldev);
 	/* get the latest polling state */
 	temp = OPW(filp->_file, td, fop->poll(filp, NULL));
 	linux_drop_fop(ldev);
 
 	spin_lock(&filp->f_kqlock);
 	/* clear kqflags */
 	filp->f_kqflags &= ~(LINUX_KQ_FLAG_NEED_READ |
 	    LINUX_KQ_FLAG_NEED_WRITE);
 	/* update kqflags */
 	if ((temp & (POLLIN | POLLOUT)) != 0) {
 		if ((temp & POLLIN) != 0)
 			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_READ;
 		if ((temp & POLLOUT) != 0)
 			filp->f_kqflags |= LINUX_KQ_FLAG_NEED_WRITE;
 
 		/* make sure the "knote" gets woken up */
 		KNOTE_LOCKED(&filp->f_selinfo.si_note, 0);
 	}
 	spin_unlock(&filp->f_kqlock);
 }
 
 static int
 linux_file_kqfilter(struct file *file, struct knote *kn)
 {
 	struct linux_file *filp;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 	if (filp->f_op->poll == NULL)
 		return (EINVAL);
 
 	spin_lock(&filp->f_kqlock);
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_READ;
 		kn->kn_fop = &linux_dev_kqfiltops_read;
 		kn->kn_hook = filp;
 		knlist_add(&filp->f_selinfo.si_note, kn, 1);
 		error = 0;
 		break;
 	case EVFILT_WRITE:
 		filp->f_kqflags |= LINUX_KQ_FLAG_HAS_WRITE;
 		kn->kn_fop = &linux_dev_kqfiltops_write;
 		kn->kn_hook = filp;
 		knlist_add(&filp->f_selinfo.si_note, kn, 1);
 		error = 0;
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	spin_unlock(&filp->f_kqlock);
 
 	if (error == 0) {
 		linux_set_current(td);
 
 		/* update kqfilter status, if any */
 		linux_file_kqfilter_poll(filp,
 		    LINUX_KQ_FLAG_HAS_READ | LINUX_KQ_FLAG_HAS_WRITE);
 	}
 	return (error);
 }
 
 static int
 linux_file_mmap_single(struct file *fp, const struct file_operations *fop,
     vm_ooffset_t *offset, vm_size_t size, struct vm_object **object,
     int nprot, bool is_shared, struct thread *td)
 {
 	struct task_struct *task;
 	struct vm_area_struct *vmap;
 	struct mm_struct *mm;
 	struct linux_file *filp;
 	vm_memattr_t attr;
 	int error;
 
 	filp = (struct linux_file *)fp->f_data;
 	filp->f_flags = fp->f_flag;
 
 	if (fop->mmap == NULL)
 		return (EOPNOTSUPP);
 
 	linux_set_current(td);
 
 	/*
 	 * The same VM object might be shared by multiple processes
 	 * and the mm_struct is usually freed when a process exits.
 	 *
 	 * The atomic reference below makes sure the mm_struct is
 	 * available as long as the vmap is in the linux_vma_head.
 	 */
 	task = current;
 	mm = task->mm;
 	if (atomic_inc_not_zero(&mm->mm_users) == 0)
 		return (EINVAL);
 
 	vmap = kzalloc(sizeof(*vmap), GFP_KERNEL);
 	vmap->vm_start = 0;
 	vmap->vm_end = size;
 	vmap->vm_pgoff = *offset / PAGE_SIZE;
 	vmap->vm_pfn = 0;
 	vmap->vm_flags = vmap->vm_page_prot = (nprot & VM_PROT_ALL);
 	if (is_shared)
 		vmap->vm_flags |= VM_SHARED;
 	vmap->vm_ops = NULL;
 	vmap->vm_file = get_file(filp);
 	vmap->vm_mm = mm;
 
 	if (unlikely(down_write_killable(&vmap->vm_mm->mmap_sem))) {
 		error = linux_get_error(task, EINTR);
 	} else {
 		error = -OPW(fp, td, fop->mmap(filp, vmap));
 		error = linux_get_error(task, error);
 		up_write(&vmap->vm_mm->mmap_sem);
 	}
 
 	if (error != 0) {
 		linux_cdev_handle_free(vmap);
 		return (error);
 	}
 
 	attr = pgprot2cachemode(vmap->vm_page_prot);
 
 	if (vmap->vm_ops != NULL) {
 		struct vm_area_struct *ptr;
 		void *vm_private_data;
 		bool vm_no_fault;
 
 		if (vmap->vm_ops->open == NULL ||
 		    vmap->vm_ops->close == NULL ||
 		    vmap->vm_private_data == NULL) {
 			/* free allocated VM area struct */
 			linux_cdev_handle_free(vmap);
 			return (EINVAL);
 		}
 
 		vm_private_data = vmap->vm_private_data;
 
 		rw_wlock(&linux_vma_lock);
 		TAILQ_FOREACH(ptr, &linux_vma_head, vm_entry) {
 			if (ptr->vm_private_data == vm_private_data)
 				break;
 		}
 		/* check if there is an existing VM area struct */
 		if (ptr != NULL) {
 			/* check if the VM area structure is invalid */
 			if (ptr->vm_ops == NULL ||
 			    ptr->vm_ops->open == NULL ||
 			    ptr->vm_ops->close == NULL) {
 				error = ESTALE;
 				vm_no_fault = 1;
 			} else {
 				error = EEXIST;
 				vm_no_fault = (ptr->vm_ops->fault == NULL);
 			}
 		} else {
 			/* insert VM area structure into list */
 			TAILQ_INSERT_TAIL(&linux_vma_head, vmap, vm_entry);
 			error = 0;
 			vm_no_fault = (vmap->vm_ops->fault == NULL);
 		}
 		rw_wunlock(&linux_vma_lock);
 
 		if (error != 0) {
 			/* free allocated VM area struct */
 			linux_cdev_handle_free(vmap);
 			/* check for stale VM area struct */
 			if (error != EEXIST)
 				return (error);
 		}
 
 		/* check if there is no fault handler */
 		if (vm_no_fault) {
 			*object = cdev_pager_allocate(vm_private_data, OBJT_DEVICE,
 			    &linux_cdev_pager_ops[1], size, nprot, *offset,
 			    td->td_ucred);
 		} else {
 			*object = cdev_pager_allocate(vm_private_data, OBJT_MGTDEVICE,
 			    &linux_cdev_pager_ops[0], size, nprot, *offset,
 			    td->td_ucred);
 		}
 
 		/* check if allocating the VM object failed */
 		if (*object == NULL) {
 			if (error == 0) {
 				/* remove VM area struct from list */
 				linux_cdev_handle_remove(vmap);
 				/* free allocated VM area struct */
 				linux_cdev_handle_free(vmap);
 			}
 			return (EINVAL);
 		}
 	} else {
 		struct sglist *sg;
 
 		sg = sglist_alloc(1, M_WAITOK);
 		sglist_append_phys(sg,
 		    (vm_paddr_t)vmap->vm_pfn << PAGE_SHIFT, vmap->vm_len);
 
 		*object = vm_pager_allocate(OBJT_SG, sg, vmap->vm_len,
 		    nprot, 0, td->td_ucred);
 
 		linux_cdev_handle_free(vmap);
 
 		if (*object == NULL) {
 			sglist_free(sg);
 			return (EINVAL);
 		}
 	}
 
 	if (attr != VM_MEMATTR_DEFAULT) {
 		VM_OBJECT_WLOCK(*object);
 		vm_object_set_memattr(*object, attr);
 		VM_OBJECT_WUNLOCK(*object);
 	}
 	*offset = 0;
 	return (0);
 }
 
 struct cdevsw linuxcdevsw = {
 	.d_version = D_VERSION,
 	.d_fdopen = linux_dev_fdopen,
 	.d_name = "lkpidev",
 };
 
 static int
 linux_file_read(struct file *file, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	ssize_t bytes;
 	int error;
 
 	error = 0;
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 	/* XXX no support for I/O vectors currently */
 	if (uio->uio_iovcnt != 1)
 		return (EOPNOTSUPP);
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->read != NULL) {
 		bytes = OPW(file, td, fop->read(filp,
 		    uio->uio_iov->iov_base,
 		    uio->uio_iov->iov_len, &uio->uio_offset));
 		if (bytes >= 0) {
 			uio->uio_iov->iov_base =
 			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
 			uio->uio_iov->iov_len -= bytes;
 			uio->uio_resid -= bytes;
 		} else {
 			error = linux_get_error(current, -bytes);
 		}
 	} else
 		error = ENXIO;
 
 	/* update kqfilter status, if any */
 	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_READ);
 	linux_drop_fop(ldev);
 
 	return (error);
 }
 
 static int
 linux_file_write(struct file *file, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	ssize_t bytes;
 	int error;
 
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 	/* XXX no support for I/O vectors currently */
 	if (uio->uio_iovcnt != 1)
 		return (EOPNOTSUPP);
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->write != NULL) {
 		bytes = OPW(file, td, fop->write(filp,
 		    uio->uio_iov->iov_base,
 		    uio->uio_iov->iov_len, &uio->uio_offset));
 		if (bytes >= 0) {
 			uio->uio_iov->iov_base =
 			    ((uint8_t *)uio->uio_iov->iov_base) + bytes;
 			uio->uio_iov->iov_len -= bytes;
 			uio->uio_resid -= bytes;
 			error = 0;
 		} else {
 			error = linux_get_error(current, -bytes);
 		}
 	} else
 		error = ENXIO;
 
 	/* update kqfilter status, if any */
 	linux_file_kqfilter_poll(filp, LINUX_KQ_FLAG_HAS_WRITE);
 
 	linux_drop_fop(ldev);
 
 	return (error);
 }
 
 static int
 linux_file_poll(struct file *file, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	int revents;
 
 	filp = (struct linux_file *)file->f_data;
 	filp->f_flags = file->f_flag;
 	linux_set_current(td);
 	linux_get_fop(filp, &fop, &ldev);
 	if (fop->poll != NULL) {
 		revents = OPW(file, td, fop->poll(filp,
 		    LINUX_POLL_TABLE_NORMAL)) & events;
 	} else {
 		revents = 0;
 	}
 	linux_drop_fop(ldev);
 	return (revents);
 }
 
 static int
 linux_file_close(struct file *file, struct thread *td)
 {
 	struct linux_file *filp;
 	int (*release)(struct inode *, struct linux_file *);
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	int error;
 
 	filp = (struct linux_file *)file->f_data;
 
 	KASSERT(file_count(filp) == 0,
 	    ("File refcount(%d) is not zero", file_count(filp)));
 
 	if (td == NULL)
 		td = curthread;
 
 	error = 0;
 	filp->f_flags = file->f_flag;
 	linux_set_current(td);
 	linux_poll_wait_dequeue(filp);
 	linux_get_fop(filp, &fop, &ldev);
 	/*
 	 * Always use the real release function, if any, to avoid
 	 * leaking device resources:
 	 */
 	release = filp->f_op->release;
 	if (release != NULL)
 		error = -OPW(file, td, release(filp->f_vnode, filp));
 	funsetown(&filp->f_sigio);
 	if (filp->f_vnode != NULL)
 		vdrop(filp->f_vnode);
 	linux_drop_fop(ldev);
 	ldev = filp->f_cdev;
 	if (ldev != NULL)
 		linux_cdev_deref(ldev);
 	linux_synchronize_rcu(RCU_TYPE_REGULAR);
 	kfree(filp);
 
 	return (error);
 }
 
 static int
 linux_file_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *cred,
     struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	struct fiodgname_arg *fgn;
 	const char *p;
 	int error, i;
 
 	error = 0;
 	filp = (struct linux_file *)fp->f_data;
 	filp->f_flags = fp->f_flag;
 	linux_get_fop(filp, &fop, &ldev);
 
 	linux_set_current(td);
 	switch (cmd) {
 	case FIONBIO:
 		break;
 	case FIOASYNC:
 		if (fop->fasync == NULL)
 			break;
 		error = -OPW(fp, td, fop->fasync(0, filp, fp->f_flag & FASYNC));
 		break;
 	case FIOSETOWN:
 		error = fsetown(*(int *)data, &filp->f_sigio);
 		if (error == 0) {
 			if (fop->fasync == NULL)
 				break;
 			error = -OPW(fp, td, fop->fasync(0, filp,
 			    fp->f_flag & FASYNC));
 		}
 		break;
 	case FIOGETOWN:
 		*(int *)data = fgetown(&filp->f_sigio);
 		break;
 	case FIODGNAME:
 #ifdef	COMPAT_FREEBSD32
 	case FIODGNAME_32:
 #endif
 		if (filp->f_cdev == NULL || filp->f_cdev->cdev == NULL) {
 			error = ENXIO;
 			break;
 		}
 		fgn = data;
 		p = devtoname(filp->f_cdev->cdev);
 		i = strlen(p) + 1;
 		if (i > fgn->len) {
 			error = EINVAL;
 			break;
 		}
 		error = copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i);
 		break;
 	default:
 		error = linux_file_ioctl_sub(fp, filp, fop, cmd, data, td);
 		break;
 	}
 	linux_drop_fop(ldev);
 	return (error);
 }
 
 static int
 linux_file_mmap_sub(struct thread *td, vm_size_t objsize, vm_prot_t prot,
     vm_prot_t maxprot, int flags, struct file *fp,
     vm_ooffset_t *foff, const struct file_operations *fop, vm_object_t *objp)
 {
 	/*
 	 * Character devices do not provide private mappings
 	 * of any kind:
 	 */
 	if ((maxprot & VM_PROT_WRITE) == 0 &&
 	    (prot & VM_PROT_WRITE) != 0)
 		return (EACCES);
 	if ((flags & (MAP_PRIVATE | MAP_COPY)) != 0)
 		return (EINVAL);
 
 	return (linux_file_mmap_single(fp, fop, foff, objsize, objp,
 	    (int)prot, (flags & MAP_SHARED) ? true : false, td));
 }
 
 static int
 linux_file_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 	struct linux_file *filp;
 	const struct file_operations *fop;
 	struct linux_cdev *ldev;
 	struct mount *mp;
 	struct vnode *vp;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	int error;
 
 	filp = (struct linux_file *)fp->f_data;
 
 	vp = filp->f_vnode;
 	if (vp == NULL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.
 	 */
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		maxprot = VM_PROT_NONE;
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 	} else
 		maxprot = VM_PROT_EXECUTE;
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_READ;
 	else if ((prot & VM_PROT_READ) != 0)
 		return (EACCES);
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 *
 	 * Note that most character devices always share mappings.
 	 *
 	 * Rely on linux_file_mmap_sub() to fail invalid MAP_PRIVATE
 	 * requests rather than doing it here.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) != 0)
 			maxprot |= VM_PROT_WRITE;
 		else if ((prot & VM_PROT_WRITE) != 0)
 			return (EACCES);
 	}
 	maxprot &= cap_maxprot;
 
 	linux_get_fop(filp, &fop, &ldev);
 	error = linux_file_mmap_sub(td, size, prot, maxprot, flags, fp,
 	    &foff, fop, &object);
 	if (error != 0)
 		goto out;
 
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, FALSE, td);
 	if (error != 0)
 		vm_object_deallocate(object);
 out:
 	linux_drop_fop(ldev);
 	return (error);
 }
 
 static int
-linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
-    struct thread *td)
+linux_file_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
 	struct linux_file *filp;
 	struct vnode *vp;
 	int error;
 
 	filp = (struct linux_file *)fp->f_data;
 	if (filp->f_vnode == NULL)
 		return (EOPNOTSUPP);
 
 	vp = filp->f_vnode;
 
 	vn_lock(vp, LK_SHARED | LK_RETRY);
-	error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td);
+	error = VOP_STAT(vp, sb, curthread->td_ucred, NOCRED);
 	VOP_UNLOCK(vp);
 
 	return (error);
 }
 
 static int
 linux_file_fill_kinfo(struct file *fp, struct kinfo_file *kif,
     struct filedesc *fdp)
 {
 	struct linux_file *filp;
 	struct vnode *vp;
 	int error;
 
 	filp = fp->f_data;
 	vp = filp->f_vnode;
 	if (vp == NULL) {
 		error = 0;
 		kif->kf_type = KF_TYPE_DEV;
 	} else {
 		vref(vp);
 		FILEDESC_SUNLOCK(fdp);
 		error = vn_fill_kinfo_vnode(vp, kif);
 		vrele(vp);
 		kif->kf_type = KF_TYPE_VNODE;
 		FILEDESC_SLOCK(fdp);
 	}
 	return (error);
 }
 
 unsigned int
 linux_iminor(struct inode *inode)
 {
 	struct linux_cdev *ldev;
 
 	if (inode == NULL || inode->v_rdev == NULL ||
 	    inode->v_rdev->si_devsw != &linuxcdevsw)
 		return (-1U);
 	ldev = inode->v_rdev->si_drv1;
 	if (ldev == NULL)
 		return (-1U);
 
 	return (minor(ldev->dev));
 }
 
 struct fileops linuxfileops = {
 	.fo_read = linux_file_read,
 	.fo_write = linux_file_write,
 	.fo_truncate = invfo_truncate,
 	.fo_kqfilter = linux_file_kqfilter,
 	.fo_stat = linux_file_stat,
 	.fo_fill_kinfo = linux_file_fill_kinfo,
 	.fo_poll = linux_file_poll,
 	.fo_close = linux_file_close,
 	.fo_ioctl = linux_file_ioctl,
 	.fo_mmap = linux_file_mmap,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_flags = DFLAG_PASSABLE,
 };
 
 /*
  * Hash of vmmap addresses.  This is infrequently accessed and does not
  * need to be particularly large.  This is done because we must store the
  * caller's idea of the map size to properly unmap.
  */
 struct vmmap {
 	LIST_ENTRY(vmmap)	vm_next;
 	void 			*vm_addr;
 	unsigned long		vm_size;
 };
 
 struct vmmaphd {
 	struct vmmap *lh_first;
 };
 #define	VMMAP_HASH_SIZE	64
 #define	VMMAP_HASH_MASK	(VMMAP_HASH_SIZE - 1)
 #define	VM_HASH(addr)	((uintptr_t)(addr) >> PAGE_SHIFT) & VMMAP_HASH_MASK
 static struct vmmaphd vmmaphead[VMMAP_HASH_SIZE];
 static struct mtx vmmaplock;
 
 static void
 vmmap_add(void *addr, unsigned long size)
 {
 	struct vmmap *vmmap;
 
 	vmmap = kmalloc(sizeof(*vmmap), GFP_KERNEL);
 	mtx_lock(&vmmaplock);
 	vmmap->vm_size = size;
 	vmmap->vm_addr = addr;
 	LIST_INSERT_HEAD(&vmmaphead[VM_HASH(addr)], vmmap, vm_next);
 	mtx_unlock(&vmmaplock);
 }
 
 static struct vmmap *
 vmmap_remove(void *addr)
 {
 	struct vmmap *vmmap;
 
 	mtx_lock(&vmmaplock);
 	LIST_FOREACH(vmmap, &vmmaphead[VM_HASH(addr)], vm_next)
 		if (vmmap->vm_addr == addr)
 			break;
 	if (vmmap)
 		LIST_REMOVE(vmmap, vm_next);
 	mtx_unlock(&vmmaplock);
 
 	return (vmmap);
 }
 
 #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
 void *
 _ioremap_attr(vm_paddr_t phys_addr, unsigned long size, int attr)
 {
 	void *addr;
 
 	addr = pmap_mapdev_attr(phys_addr, size, attr);
 	if (addr == NULL)
 		return (NULL);
 	vmmap_add(addr, size);
 
 	return (addr);
 }
 #endif
 
 void
 iounmap(void *addr)
 {
 	struct vmmap *vmmap;
 
 	vmmap = vmmap_remove(addr);
 	if (vmmap == NULL)
 		return;
 #if defined(__i386__) || defined(__amd64__) || defined(__powerpc__) || defined(__aarch64__)
 	pmap_unmapdev((vm_offset_t)addr, vmmap->vm_size);
 #endif
 	kfree(vmmap);
 }
 
 void *
 vmap(struct page **pages, unsigned int count, unsigned long flags, int prot)
 {
 	vm_offset_t off;
 	size_t size;
 
 	size = count * PAGE_SIZE;
 	off = kva_alloc(size);
 	if (off == 0)
 		return (NULL);
 	vmmap_add((void *)off, size);
 	pmap_qenter(off, pages, count);
 
 	return ((void *)off);
 }
 
 void
 vunmap(void *addr)
 {
 	struct vmmap *vmmap;
 
 	vmmap = vmmap_remove(addr);
 	if (vmmap == NULL)
 		return;
 	pmap_qremove((vm_offset_t)addr, vmmap->vm_size / PAGE_SIZE);
 	kva_free((vm_offset_t)addr, vmmap->vm_size);
 	kfree(vmmap);
 }
 
 static char *
 devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap)
 {
 	unsigned int len;
 	char *p;
 	va_list aq;
 
 	va_copy(aq, ap);
 	len = vsnprintf(NULL, 0, fmt, aq);
 	va_end(aq);
 
 	if (dev != NULL)
 		p = devm_kmalloc(dev, len + 1, gfp);
 	else
 		p = kmalloc(len + 1, gfp);
 	if (p != NULL)
 		vsnprintf(p, len + 1, fmt, ap);
 
 	return (p);
 }
 
 char *
 kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 {
 
 	return (devm_kvasprintf(NULL, gfp, fmt, ap));
 }
 
 char *
 lkpi_devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...)
 {
 	va_list ap;
 	char *p;
 
 	va_start(ap, fmt);
 	p = devm_kvasprintf(dev, gfp, fmt, ap);
 	va_end(ap);
 
 	return (p);
 }
 
 char *
 kasprintf(gfp_t gfp, const char *fmt, ...)
 {
 	va_list ap;
 	char *p;
 
 	va_start(ap, fmt);
 	p = kvasprintf(gfp, fmt, ap);
 	va_end(ap);
 
 	return (p);
 }
 
 static void
 linux_timer_callback_wrapper(void *context)
 {
 	struct timer_list *timer;
 
 	timer = context;
 
 	if (linux_set_current_flags(curthread, M_NOWAIT)) {
 		/* try again later */
 		callout_reset(&timer->callout, 1,
 		    &linux_timer_callback_wrapper, timer);
 		return;
 	}
 
 	timer->function(timer->data);
 }
 
 int
 mod_timer(struct timer_list *timer, int expires)
 {
 	int ret;
 
 	timer->expires = expires;
 	ret = callout_reset(&timer->callout,
 	    linux_timer_jiffies_until(expires),
 	    &linux_timer_callback_wrapper, timer);
 
 	MPASS(ret == 0 || ret == 1);
 
 	return (ret == 1);
 }
 
 void
 add_timer(struct timer_list *timer)
 {
 
 	callout_reset(&timer->callout,
 	    linux_timer_jiffies_until(timer->expires),
 	    &linux_timer_callback_wrapper, timer);
 }
 
 void
 add_timer_on(struct timer_list *timer, int cpu)
 {
 
 	callout_reset_on(&timer->callout,
 	    linux_timer_jiffies_until(timer->expires),
 	    &linux_timer_callback_wrapper, timer, cpu);
 }
 
 int
 del_timer(struct timer_list *timer)
 {
 
 	if (callout_stop(&(timer)->callout) == -1)
 		return (0);
 	return (1);
 }
 
 int
 del_timer_sync(struct timer_list *timer)
 {
 
 	if (callout_drain(&(timer)->callout) == -1)
 		return (0);
 	return (1);
 }
 
 /* greatest common divisor, Euclid equation */
 static uint64_t
 lkpi_gcd_64(uint64_t a, uint64_t b)
 {
 	uint64_t an;
 	uint64_t bn;
 
 	while (b != 0) {
 		an = b;
 		bn = a % b;
 		a = an;
 		b = bn;
 	}
 	return (a);
 }
 
 uint64_t lkpi_nsec2hz_rem;
 uint64_t lkpi_nsec2hz_div = 1000000000ULL;
 uint64_t lkpi_nsec2hz_max;
 
 uint64_t lkpi_usec2hz_rem;
 uint64_t lkpi_usec2hz_div = 1000000ULL;
 uint64_t lkpi_usec2hz_max;
 
 uint64_t lkpi_msec2hz_rem;
 uint64_t lkpi_msec2hz_div = 1000ULL;
 uint64_t lkpi_msec2hz_max;
 
 static void
 linux_timer_init(void *arg)
 {
 	uint64_t gcd;
 
 	/*
 	 * Compute an internal HZ value which can divide 2**32 to
 	 * avoid timer rounding problems when the tick value wraps
 	 * around 2**32:
 	 */
 	linux_timer_hz_mask = 1;
 	while (linux_timer_hz_mask < (unsigned long)hz)
 		linux_timer_hz_mask *= 2;
 	linux_timer_hz_mask--;
 
 	/* compute some internal constants */
 
 	lkpi_nsec2hz_rem = hz;
 	lkpi_usec2hz_rem = hz;
 	lkpi_msec2hz_rem = hz;
 
 	gcd = lkpi_gcd_64(lkpi_nsec2hz_rem, lkpi_nsec2hz_div);
 	lkpi_nsec2hz_rem /= gcd;
 	lkpi_nsec2hz_div /= gcd;
 	lkpi_nsec2hz_max = -1ULL / lkpi_nsec2hz_rem;
 
 	gcd = lkpi_gcd_64(lkpi_usec2hz_rem, lkpi_usec2hz_div);
 	lkpi_usec2hz_rem /= gcd;
 	lkpi_usec2hz_div /= gcd;
 	lkpi_usec2hz_max = -1ULL / lkpi_usec2hz_rem;
 
 	gcd = lkpi_gcd_64(lkpi_msec2hz_rem, lkpi_msec2hz_div);
 	lkpi_msec2hz_rem /= gcd;
 	lkpi_msec2hz_div /= gcd;
 	lkpi_msec2hz_max = -1ULL / lkpi_msec2hz_rem;
 }
 SYSINIT(linux_timer, SI_SUB_DRIVERS, SI_ORDER_FIRST, linux_timer_init, NULL);
 
 void
 linux_complete_common(struct completion *c, int all)
 {
 	int wakeup_swapper;
 
 	sleepq_lock(c);
 	if (all) {
 		c->done = UINT_MAX;
 		wakeup_swapper = sleepq_broadcast(c, SLEEPQ_SLEEP, 0, 0);
 	} else {
 		if (c->done != UINT_MAX)
 			c->done++;
 		wakeup_swapper = sleepq_signal(c, SLEEPQ_SLEEP, 0, 0);
 	}
 	sleepq_release(c);
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Indefinite wait for done != 0 with or without signals.
  */
 int
 linux_wait_for_common(struct completion *c, int flags)
 {
 	struct task_struct *task;
 	int error;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	task = current;
 
 	if (flags != 0)
 		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
 	else
 		flags = SLEEPQ_SLEEP;
 	error = 0;
 	for (;;) {
 		sleepq_lock(c);
 		if (c->done)
 			break;
 		sleepq_add(c, NULL, "completion", flags, 0);
 		if (flags & SLEEPQ_INTERRUPTIBLE) {
 			DROP_GIANT();
 			error = -sleepq_wait_sig(c, 0);
 			PICKUP_GIANT();
 			if (error != 0) {
 				linux_schedule_save_interrupt_value(task, error);
 				error = -ERESTARTSYS;
 				goto intr;
 			}
 		} else {
 			DROP_GIANT();
 			sleepq_wait(c, 0);
 			PICKUP_GIANT();
 		}
 	}
 	if (c->done != UINT_MAX)
 		c->done--;
 	sleepq_release(c);
 
 intr:
 	return (error);
 }
 
 /*
  * Time limited wait for done != 0 with or without signals.
  */
 int
 linux_wait_for_timeout_common(struct completion *c, int timeout, int flags)
 {
 	struct task_struct *task;
 	int end = jiffies + timeout;
 	int error;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	task = current;
 
 	if (flags != 0)
 		flags = SLEEPQ_INTERRUPTIBLE | SLEEPQ_SLEEP;
 	else
 		flags = SLEEPQ_SLEEP;
 
 	for (;;) {
 		sleepq_lock(c);
 		if (c->done)
 			break;
 		sleepq_add(c, NULL, "completion", flags, 0);
 		sleepq_set_timeout(c, linux_timer_jiffies_until(end));
 
 		DROP_GIANT();
 		if (flags & SLEEPQ_INTERRUPTIBLE)
 			error = -sleepq_timedwait_sig(c, 0);
 		else
 			error = -sleepq_timedwait(c, 0);
 		PICKUP_GIANT();
 
 		if (error != 0) {
 			/* check for timeout */
 			if (error == -EWOULDBLOCK) {
 				error = 0;	/* timeout */
 			} else {
 				/* signal happened */
 				linux_schedule_save_interrupt_value(task, error);
 				error = -ERESTARTSYS;
 			}
 			goto done;
 		}
 	}
 	if (c->done != UINT_MAX)
 		c->done--;
 	sleepq_release(c);
 
 	/* return how many jiffies are left */
 	error = linux_timer_jiffies_until(end);
 done:
 	return (error);
 }
 
 int
 linux_try_wait_for_completion(struct completion *c)
 {
 	int isdone;
 
 	sleepq_lock(c);
 	isdone = (c->done != 0);
 	if (c->done != 0 && c->done != UINT_MAX)
 		c->done--;
 	sleepq_release(c);
 	return (isdone);
 }
 
 int
 linux_completion_done(struct completion *c)
 {
 	int isdone;
 
 	sleepq_lock(c);
 	isdone = (c->done != 0);
 	sleepq_release(c);
 	return (isdone);
 }
 
 static void
 linux_cdev_deref(struct linux_cdev *ldev)
 {
 	if (refcount_release(&ldev->refs) &&
 	    ldev->kobj.ktype == &linux_cdev_ktype)
 		kfree(ldev);
 }
 
 static void
 linux_cdev_release(struct kobject *kobj)
 {
 	struct linux_cdev *cdev;
 	struct kobject *parent;
 
 	cdev = container_of(kobj, struct linux_cdev, kobj);
 	parent = kobj->parent;
 	linux_destroy_dev(cdev);
 	linux_cdev_deref(cdev);
 	kobject_put(parent);
 }
 
 static void
 linux_cdev_static_release(struct kobject *kobj)
 {
 	struct cdev *cdev;
 	struct linux_cdev *ldev;
 
 	ldev = container_of(kobj, struct linux_cdev, kobj);
 	cdev = ldev->cdev;
 	if (cdev != NULL) {
 		destroy_dev(cdev);
 		ldev->cdev = NULL;
 	}
 	kobject_put(kobj->parent);
 }
 
 int
 linux_cdev_device_add(struct linux_cdev *ldev, struct device *dev)
 {
 	int ret;
 
 	if (dev->devt != 0) {
 		/* Set parent kernel object. */
 		ldev->kobj.parent = &dev->kobj;
 
 		/*
 		 * Unlike Linux we require the kobject of the
 		 * character device structure to have a valid name
 		 * before calling this function:
 		 */
 		if (ldev->kobj.name == NULL)
 			return (-EINVAL);
 
 		ret = cdev_add(ldev, dev->devt, 1);
 		if (ret)
 			return (ret);
 	}
 	ret = device_add(dev);
 	if (ret != 0 && dev->devt != 0)
 		cdev_del(ldev);
 	return (ret);
 }
 
 void
 linux_cdev_device_del(struct linux_cdev *ldev, struct device *dev)
 {
 	device_del(dev);
 
 	if (dev->devt != 0)
 		cdev_del(ldev);
 }
 
 static void
 linux_destroy_dev(struct linux_cdev *ldev)
 {
 
 	if (ldev->cdev == NULL)
 		return;
 
 	MPASS((ldev->siref & LDEV_SI_DTR) == 0);
 	MPASS(ldev->kobj.ktype == &linux_cdev_ktype);
 
 	atomic_set_int(&ldev->siref, LDEV_SI_DTR);
 	while ((atomic_load_int(&ldev->siref) & ~LDEV_SI_DTR) != 0)
 		pause("ldevdtr", hz / 4);
 
 	destroy_dev(ldev->cdev);
 	ldev->cdev = NULL;
 }
 
 const struct kobj_type linux_cdev_ktype = {
 	.release = linux_cdev_release,
 };
 
 const struct kobj_type linux_cdev_static_ktype = {
 	.release = linux_cdev_static_release,
 };
 
 static void
 linux_handle_ifnet_link_event(void *arg, struct ifnet *ifp, int linkstate)
 {
 	struct notifier_block *nb;
 	struct netdev_notifier_info ni;
 
 	nb = arg;
 	ni.ifp = ifp;
 	ni.dev = (struct net_device *)ifp;
 	if (linkstate == LINK_STATE_UP)
 		nb->notifier_call(nb, NETDEV_UP, &ni);
 	else
 		nb->notifier_call(nb, NETDEV_DOWN, &ni);
 }
 
 static void
 linux_handle_ifnet_arrival_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb;
 	struct netdev_notifier_info ni;
 
 	nb = arg;
 	ni.ifp = ifp;
 	ni.dev = (struct net_device *)ifp;
 	nb->notifier_call(nb, NETDEV_REGISTER, &ni);
 }
 
 static void
 linux_handle_ifnet_departure_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb;
 	struct netdev_notifier_info ni;
 
 	nb = arg;
 	ni.ifp = ifp;
 	ni.dev = (struct net_device *)ifp;
 	nb->notifier_call(nb, NETDEV_UNREGISTER, &ni);
 }
 
 static void
 linux_handle_iflladdr_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb;
 	struct netdev_notifier_info ni;
 
 	nb = arg;
 	ni.ifp = ifp;
 	ni.dev = (struct net_device *)ifp;
 	nb->notifier_call(nb, NETDEV_CHANGEADDR, &ni);
 }
 
 static void
 linux_handle_ifaddr_event(void *arg, struct ifnet *ifp)
 {
 	struct notifier_block *nb;
 	struct netdev_notifier_info ni;
 
 	nb = arg;
 	ni.ifp = ifp;
 	ni.dev = (struct net_device *)ifp;
 	nb->notifier_call(nb, NETDEV_CHANGEIFADDR, &ni);
 }
 
 int
 register_netdevice_notifier(struct notifier_block *nb)
 {
 
 	nb->tags[NETDEV_UP] = EVENTHANDLER_REGISTER(
 	    ifnet_link_event, linux_handle_ifnet_link_event, nb, 0);
 	nb->tags[NETDEV_REGISTER] = EVENTHANDLER_REGISTER(
 	    ifnet_arrival_event, linux_handle_ifnet_arrival_event, nb, 0);
 	nb->tags[NETDEV_UNREGISTER] = EVENTHANDLER_REGISTER(
 	    ifnet_departure_event, linux_handle_ifnet_departure_event, nb, 0);
 	nb->tags[NETDEV_CHANGEADDR] = EVENTHANDLER_REGISTER(
 	    iflladdr_event, linux_handle_iflladdr_event, nb, 0);
 
 	return (0);
 }
 
 int
 register_inetaddr_notifier(struct notifier_block *nb)
 {
 
 	nb->tags[NETDEV_CHANGEIFADDR] = EVENTHANDLER_REGISTER(
 	    ifaddr_event, linux_handle_ifaddr_event, nb, 0);
 	return (0);
 }
 
 int
 unregister_netdevice_notifier(struct notifier_block *nb)
 {
 
 	EVENTHANDLER_DEREGISTER(ifnet_link_event,
 	    nb->tags[NETDEV_UP]);
 	EVENTHANDLER_DEREGISTER(ifnet_arrival_event,
 	    nb->tags[NETDEV_REGISTER]);
 	EVENTHANDLER_DEREGISTER(ifnet_departure_event,
 	    nb->tags[NETDEV_UNREGISTER]);
 	EVENTHANDLER_DEREGISTER(iflladdr_event,
 	    nb->tags[NETDEV_CHANGEADDR]);
 
 	return (0);
 }
 
 int
 unregister_inetaddr_notifier(struct notifier_block *nb)
 {
 
 	EVENTHANDLER_DEREGISTER(ifaddr_event,
 	    nb->tags[NETDEV_CHANGEIFADDR]);
 
 	return (0);
 }
 
 struct list_sort_thunk {
 	int (*cmp)(void *, struct list_head *, struct list_head *);
 	void *priv;
 };
 
 static inline int
 linux_le_cmp(void *priv, const void *d1, const void *d2)
 {
 	struct list_head *le1, *le2;
 	struct list_sort_thunk *thunk;
 
 	thunk = priv;
 	le1 = *(__DECONST(struct list_head **, d1));
 	le2 = *(__DECONST(struct list_head **, d2));
 	return ((thunk->cmp)(thunk->priv, le1, le2));
 }
 
 void
 list_sort(void *priv, struct list_head *head, int (*cmp)(void *priv,
     struct list_head *a, struct list_head *b))
 {
 	struct list_sort_thunk thunk;
 	struct list_head **ar, *le;
 	size_t count, i;
 
 	count = 0;
 	list_for_each(le, head)
 		count++;
 	ar = malloc(sizeof(struct list_head *) * count, M_KMALLOC, M_WAITOK);
 	i = 0;
 	list_for_each(le, head)
 		ar[i++] = le;
 	thunk.cmp = cmp;
 	thunk.priv = priv;
 	qsort_r(ar, count, sizeof(struct list_head *), &thunk, linux_le_cmp);
 	INIT_LIST_HEAD(head);
 	for (i = 0; i < count; i++)
 		list_add_tail(ar[i], head);
 	free(ar, M_KMALLOC);
 }
 
 #if defined(__i386__) || defined(__amd64__)
 int
 linux_wbinvd_on_all_cpus(void)
 {
 
 	pmap_invalidate_cache();
 	return (0);
 }
 #endif
 
 int
 linux_on_each_cpu(void callback(void *), void *data)
 {
 
 	smp_rendezvous(smp_no_rendezvous_barrier, callback,
 	    smp_no_rendezvous_barrier, data);
 	return (0);
 }
 
 int
 linux_in_atomic(void)
 {
 
 	return ((curthread->td_pflags & TDP_NOFAULTING) != 0);
 }
 
 struct linux_cdev *
 linux_find_cdev(const char *name, unsigned major, unsigned minor)
 {
 	dev_t dev = MKDEV(major, minor);
 	struct cdev *cdev;
 
 	dev_lock();
 	LIST_FOREACH(cdev, &linuxcdevsw.d_devs, si_list) {
 		struct linux_cdev *ldev = cdev->si_drv1;
 		if (ldev->dev == dev &&
 		    strcmp(kobject_name(&ldev->kobj), name) == 0) {
 			break;
 		}
 	}
 	dev_unlock();
 
 	return (cdev != NULL ? cdev->si_drv1 : NULL);
 }
 
 int
 __register_chrdev(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name,
     const struct file_operations *fops)
 {
 	struct linux_cdev *cdev;
 	int ret = 0;
 	int i;
 
 	for (i = baseminor; i < baseminor + count; i++) {
 		cdev = cdev_alloc();
 		cdev->ops = fops;
 		kobject_set_name(&cdev->kobj, name);
 
 		ret = cdev_add(cdev, makedev(major, i), 1);
 		if (ret != 0)
 			break;
 	}
 	return (ret);
 }
 
 int
 __register_chrdev_p(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name,
     const struct file_operations *fops, uid_t uid,
     gid_t gid, int mode)
 {
 	struct linux_cdev *cdev;
 	int ret = 0;
 	int i;
 
 	for (i = baseminor; i < baseminor + count; i++) {
 		cdev = cdev_alloc();
 		cdev->ops = fops;
 		kobject_set_name(&cdev->kobj, name);
 
 		ret = cdev_add_ext(cdev, makedev(major, i), uid, gid, mode);
 		if (ret != 0)
 			break;
 	}
 	return (ret);
 }
 
 void
 __unregister_chrdev(unsigned int major, unsigned int baseminor,
     unsigned int count, const char *name)
 {
 	struct linux_cdev *cdevp;
 	int i;
 
 	for (i = baseminor; i < baseminor + count; i++) {
 		cdevp = linux_find_cdev(name, major, i);
 		if (cdevp != NULL)
 			cdev_del(cdevp);
 	}
 }
 
 void
 linux_dump_stack(void)
 {
 #ifdef STACK
 	struct stack st;
 
 	stack_zero(&st);
 	stack_save(&st);
 	stack_print(&st);
 #endif
 }
 
 int
 linuxkpi_net_ratelimit(void)
 {
 
 	return (ppsratecheck(&lkpi_net_lastlog, &lkpi_net_curpps,
 	   lkpi_net_maxpps));
 }
 
 #if defined(__i386__) || defined(__amd64__)
 bool linux_cpu_has_clflush;
 #endif
 
 static void
 linux_compat_init(void *arg)
 {
 	struct sysctl_oid *rootoid;
 	int i;
 
 #if defined(__i386__) || defined(__amd64__)
 	linux_cpu_has_clflush = (cpu_feature & CPUID_CLFSH);
 #endif
 	rw_init(&linux_vma_lock, "lkpi-vma-lock");
 
 	rootoid = SYSCTL_ADD_ROOT_NODE(NULL,
 	    OID_AUTO, "sys", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "sys");
 	kobject_init(&linux_class_root, &linux_class_ktype);
 	kobject_set_name(&linux_class_root, "class");
 	linux_class_root.oidp = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(rootoid),
 	    OID_AUTO, "class", CTLFLAG_RD|CTLFLAG_MPSAFE, NULL, "class");
 	kobject_init(&linux_root_device.kobj, &linux_dev_ktype);
 	kobject_set_name(&linux_root_device.kobj, "device");
 	linux_root_device.kobj.oidp = SYSCTL_ADD_NODE(NULL,
 	    SYSCTL_CHILDREN(rootoid), OID_AUTO, "device",
 	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "device");
 	linux_root_device.bsddev = root_bus;
 	linux_class_misc.name = "misc";
 	class_register(&linux_class_misc);
 	INIT_LIST_HEAD(&pci_drivers);
 	INIT_LIST_HEAD(&pci_devices);
 	spin_lock_init(&pci_lock);
 	mtx_init(&vmmaplock, "IO Map lock", NULL, MTX_DEF);
 	for (i = 0; i < VMMAP_HASH_SIZE; i++)
 		LIST_INIT(&vmmaphead[i]);
 	init_waitqueue_head(&linux_bit_waitq);
 	init_waitqueue_head(&linux_var_waitq);
 
 	CPU_COPY(&all_cpus, &cpu_online_mask);
 }
 SYSINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_init, NULL);
 
 static void
 linux_compat_uninit(void *arg)
 {
 	linux_kobject_kfree_name(&linux_class_root);
 	linux_kobject_kfree_name(&linux_root_device.kobj);
 	linux_kobject_kfree_name(&linux_class_misc.kobj);
 
 	mtx_destroy(&vmmaplock);
 	spin_lock_destroy(&pci_lock);
 	rw_destroy(&linux_vma_lock);
 }
 SYSUNINIT(linux_compat, SI_SUB_DRIVERS, SI_ORDER_SECOND, linux_compat_uninit, NULL);
 
 /*
  * NOTE: Linux frequently uses "unsigned long" for pointer to integer
  * conversion and vice versa, where in FreeBSD "uintptr_t" would be
  * used. Assert these types have the same size, else some parts of the
  * LinuxKPI may not work like expected:
  */
 CTASSERT(sizeof(unsigned long) == sizeof(uintptr_t));
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
index a3d67aaa11ba..6ff0ffaf1c43 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -1,299 +1,299 @@
 /*
  * Copyright (c) 2020 iXsystems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_recv.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_file.h>
 #include <sys/buf.h>
 #include <sys/stat.h>
 
 int
 zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
 {
 	struct thread *td;
 	int rc, fd;
 
 	td = curthread;
 	pwd_ensure_dirs();
 	/* 12.x doesn't take a const char * */
 	rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path),
 	    UIO_SYSSPACE, flags, mode);
 	if (rc)
 		return (SET_ERROR(rc));
 	fd = td->td_retval[0];
 	td->td_retval[0] = 0;
 	if (fget(curthread, fd, &cap_no_rights, fpp))
 		kern_close(td, fd);
 	return (0);
 }
 
 void
 zfs_file_close(zfs_file_t *fp)
 {
 	fo_close(fp, curthread);
 }
 
 static int
 zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp,
     ssize_t *resid)
 {
 	ssize_t rc;
 	struct uio auio;
 	struct thread *td;
 	struct iovec aiov;
 
 	td = curthread;
 	aiov.iov_base = (void *)(uintptr_t)buf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_resid = count;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = *offp;
 
 	if ((fp->f_flag & FWRITE) == 0)
 		return (SET_ERROR(EBADF));
 
 	if (fp->f_type == DTYPE_VNODE)
 		bwillwrite();
 
 	rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td);
 	if (rc)
 		return (SET_ERROR(rc));
 	if (resid)
 		*resid = auio.uio_resid;
 	else if (auio.uio_resid)
 		return (SET_ERROR(EIO));
 	*offp += count - auio.uio_resid;
 	return (rc);
 }
 
 int
 zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
 {
 	loff_t off = fp->f_offset;
 	ssize_t rc;
 
 	rc = zfs_file_write_impl(fp, buf, count, &off, resid);
 	if (rc == 0)
 		fp->f_offset = off;
 
 	return (SET_ERROR(rc));
 }
 
 int
 zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
     ssize_t *resid)
 {
 	return (zfs_file_write_impl(fp, buf, count, &off, resid));
 }
 
 static int
 zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp,
     ssize_t *resid)
 {
 	ssize_t rc;
 	struct uio auio;
 	struct thread *td;
 	struct iovec aiov;
 
 	td = curthread;
 	aiov.iov_base = (void *)(uintptr_t)buf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_resid = count;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = *offp;
 
 	if ((fp->f_flag & FREAD) == 0)
 		return (SET_ERROR(EBADF));
 
 	rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td);
 	if (rc)
 		return (SET_ERROR(rc));
 	if (resid)
 		*resid = auio.uio_resid;
 	*offp += count - auio.uio_resid;
 	return (SET_ERROR(0));
 }
 
 int
 zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
 {
 	loff_t off = fp->f_offset;
 	ssize_t rc;
 
 	rc = zfs_file_read_impl(fp, buf, count, &off, resid);
 	if (rc == 0)
 		fp->f_offset = off;
 	return (rc);
 }
 
 int
 zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
     ssize_t *resid)
 {
 	return (zfs_file_read_impl(fp, buf, count, &off, resid));
 }
 
 int
 zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
 {
 	int rc;
 	struct thread *td;
 
 	td = curthread;
 	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
 		return (SET_ERROR(ESPIPE));
 	rc = fo_seek(fp, *offp, whence, td);
 	if (rc == 0)
 		*offp = td->td_uretoff.tdu_off;
 	return (SET_ERROR(rc));
 }
 
 int
 zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
 {
 	struct thread *td;
 	struct stat sb;
 	int rc;
 
 	td = curthread;
 
-	rc = fo_stat(fp, &sb, td->td_ucred, td);
+	rc = fo_stat(fp, &sb, td->td_ucred);
 	if (rc)
 		return (SET_ERROR(rc));
 	zfattr->zfa_size = sb.st_size;
 	zfattr->zfa_mode = sb.st_mode;
 
 	return (0);
 }
 
 static __inline int
 zfs_vop_fsync(vnode_t *vp)
 {
 	struct mount *mp;
 	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
 	VOP_UNLOCK1(vp);
 	vn_finished_write(mp);
 drop:
 	return (SET_ERROR(error));
 }
 
 int
 zfs_file_fsync(zfs_file_t *fp, int flags)
 {
 	if (fp->f_type != DTYPE_VNODE)
 		return (EINVAL);
 
 	return (zfs_vop_fsync(fp->f_vnode));
 }
 
 zfs_file_t *
 zfs_file_get(int fd)
 {
 	struct file *fp;
 
 	if (fget(curthread, fd, &cap_no_rights, &fp))
 		return (NULL);
 
 	return (fp);
 }
 
 void
 zfs_file_put(zfs_file_t *fp)
 {
 	fdrop(fp, curthread);
 }
 
 loff_t
 zfs_file_off(zfs_file_t *fp)
 {
 	return (fp->f_offset);
 }
 
 void *
 zfs_file_private(zfs_file_t *fp)
 {
 	file_t *tmpfp;
 	void *data;
 	int error;
 
 	tmpfp = curthread->td_fpop;
 	curthread->td_fpop = fp;
 	error = devfs_get_cdevpriv(&data);
 	curthread->td_fpop = tmpfp;
 	if (error != 0)
 		return (NULL);
 	return (data);
 }
 
 int
 zfs_file_unlink(const char *fnamep)
 {
 	zfs_uio_seg_t seg = UIO_SYSSPACE;
 	int rc;
 
 #if __FreeBSD_version >= 1300018
 	rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0);
 #elif __FreeBSD_version >= 1202504 || defined(AT_BENEATH)
 	rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
 	    seg, 0, 0);
 #else
 	rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
 	    seg, 0);
 #endif
 	return (SET_ERROR(rc));
 }
diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
index 9a327c02ee96..964d288324b4 100644
--- a/sys/fs/devfs/devfs_vnops.c
+++ b/sys/fs/devfs/devfs_vnops.c
@@ -1,2122 +1,2122 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2004
  *	Poul-Henning Kamp.  All rights reserved.
  * Copyright (c) 1989, 1992-1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kernfs_vnops.c	8.15 (Berkeley) 5/21/95
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43
  *
  * $FreeBSD$
  */
 
 /*
  * TODO:
  *	mkdir: want it ?
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/ttycom.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 static struct vop_vector devfs_vnodeops;
 static struct vop_vector devfs_specops;
 static struct fileops devfs_ops_f;
 
 #include <fs/devfs/devfs.h>
 #include <fs/devfs/devfs_int.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data");
 
 struct mtx	devfs_de_interlock;
 MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF);
 struct sx	clone_drain_lock;
 SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock");
 struct mtx	cdevpriv_mtx;
 MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF);
 
 SYSCTL_DECL(_vfs_devfs);
 
 static int devfs_dotimes;
 SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW,
     &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision");
 
 /*
  * Update devfs node timestamp.  Note that updates are unlocked and
  * stat(2) could see partially updated times.
  */
 static void
 devfs_timestamp(struct timespec *tsp)
 {
 	time_t ts;
 
 	if (devfs_dotimes) {
 		vfs_timestamp(tsp);
 	} else {
 		ts = time_second;
 		if (tsp->tv_sec != ts) {
 			tsp->tv_sec = ts;
 			tsp->tv_nsec = 0;
 		}
 	}
 }
 
 static int
 devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp,
     int *ref)
 {
 	*dswp = devvn_refthread(fp->f_vnode, devp, ref);
 	if (*dswp == NULL || *devp != fp->f_data) {
 		if (*dswp != NULL)
 			dev_relthread(*devp, *ref);
 		return (ENXIO);
 	}
 	KASSERT((*devp)->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp)));
 	if (*dswp == NULL)
 		return (ENXIO);
 	curthread->td_fpop = fp;
 	return (0);
 }
 
 int
 devfs_get_cdevpriv(void **datap)
 {
 	struct file *fp;
 	struct cdev_privdata *p;
 	int error;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (EBADF);
 	p = fp->f_cdevpriv;
 	if (p != NULL) {
 		error = 0;
 		*datap = p->cdpd_data;
 	} else
 		error = ENOENT;
 	return (error);
 }
 
 int
 devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr)
 {
 	struct file *fp;
 	struct cdev_priv *cdp;
 	struct cdev_privdata *p;
 	int error;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (ENOENT);
 	cdp = cdev2priv((struct cdev *)fp->f_data);
 	p = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK);
 	p->cdpd_data = priv;
 	p->cdpd_dtr = priv_dtr;
 	p->cdpd_fp = fp;
 	mtx_lock(&cdevpriv_mtx);
 	if (fp->f_cdevpriv == NULL) {
 		LIST_INSERT_HEAD(&cdp->cdp_fdpriv, p, cdpd_list);
 		fp->f_cdevpriv = p;
 		mtx_unlock(&cdevpriv_mtx);
 		error = 0;
 	} else {
 		mtx_unlock(&cdevpriv_mtx);
 		free(p, M_CDEVPDATA);
 		error = EBUSY;
 	}
 	return (error);
 }
 
 void
 devfs_destroy_cdevpriv(struct cdev_privdata *p)
 {
 
 	mtx_assert(&cdevpriv_mtx, MA_OWNED);
 	KASSERT(p->cdpd_fp->f_cdevpriv == p,
 	    ("devfs_destoy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p));
 	p->cdpd_fp->f_cdevpriv = NULL;
 	LIST_REMOVE(p, cdpd_list);
 	mtx_unlock(&cdevpriv_mtx);
 	(p->cdpd_dtr)(p->cdpd_data);
 	free(p, M_CDEVPDATA);
 }
 
 static void
 devfs_fpdrop(struct file *fp)
 {
 	struct cdev_privdata *p;
 
 	mtx_lock(&cdevpriv_mtx);
 	if ((p = fp->f_cdevpriv) == NULL) {
 		mtx_unlock(&cdevpriv_mtx);
 		return;
 	}
 	devfs_destroy_cdevpriv(p);
 }
 
 void
 devfs_clear_cdevpriv(void)
 {
 	struct file *fp;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return;
 	devfs_fpdrop(fp);
 }
 
 static void
 devfs_usecount_add(struct vnode *vp)
 {
 	struct devfs_dirent *de;
 	struct cdev *dev;
 
 	mtx_lock(&devfs_de_interlock);
 	VI_LOCK(vp);
 	VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp);
 	if (VN_IS_DOOMED(vp)) {
 		goto out_unlock;
 	}
 
 	de = vp->v_data;
 	dev = vp->v_rdev;
 	MPASS(de != NULL);
 	MPASS(dev != NULL);
 	dev->si_usecount++;
 	de->de_usecount++;
 out_unlock:
 	VI_UNLOCK(vp);
 	mtx_unlock(&devfs_de_interlock);
 }
 
 static void
 devfs_usecount_subl(struct vnode *vp)
 {
 	struct devfs_dirent *de;
 	struct cdev *dev;
 
 	mtx_assert(&devfs_de_interlock, MA_OWNED);
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(vp->v_type == VCHR || vp->v_type == VBAD, vp);
 
 	de = vp->v_data;
 	dev = vp->v_rdev;
 	if (de == NULL)
 		return;
 	if (dev == NULL) {
 		MPASS(de->de_usecount == 0);
 		return;
 	}
 	if (dev->si_usecount < de->de_usecount)
 		panic("%s: si_usecount underflow for dev %p "
 		    "(has %ld, dirent has %d)\n",
 		    __func__, dev, dev->si_usecount, de->de_usecount);
 	if (VN_IS_DOOMED(vp)) {
 		dev->si_usecount -= de->de_usecount;
 		de->de_usecount = 0;
 	} else {
 		if (de->de_usecount == 0)
 			panic("%s: de_usecount underflow for dev %p\n",
 			    __func__, dev);
 		dev->si_usecount--;
 		de->de_usecount--;
 	}
 }
 
 static void
 devfs_usecount_sub(struct vnode *vp)
 {
 
 	mtx_lock(&devfs_de_interlock);
 	VI_LOCK(vp);
 	devfs_usecount_subl(vp);
 	VI_UNLOCK(vp);
 	mtx_unlock(&devfs_de_interlock);
 }
 
 static int
 devfs_usecountl(struct vnode *vp)
 {
 
 	VNPASS(vp->v_type == VCHR, vp);
 	mtx_assert(&devfs_de_interlock, MA_OWNED);
 	ASSERT_VI_LOCKED(vp, __func__);
 	return (vp->v_rdev->si_usecount);
 }
 
 int
 devfs_usecount(struct vnode *vp)
 {
 	int count;
 
 	VNPASS(vp->v_type == VCHR, vp);
 	mtx_lock(&devfs_de_interlock);
 	VI_LOCK(vp);
 	count = devfs_usecountl(vp);
 	VI_UNLOCK(vp);
 	mtx_unlock(&devfs_de_interlock);
 	return (count);
 }
 
 void
 devfs_ctty_ref(struct vnode *vp)
 {
 
 	vrefact(vp);
 	devfs_usecount_add(vp);
 }
 
 void
 devfs_ctty_unref(struct vnode *vp)
 {
 
 	devfs_usecount_sub(vp);
 	vrele(vp);
 }
 
 /*
  * On success devfs_populate_vp() returns with dmp->dm_lock held.
  */
 static int
 devfs_populate_vp(struct vnode *vp)
 {
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	int locked;
 
 	ASSERT_VOP_LOCKED(vp, "devfs_populate_vp");
 
 	dmp = VFSTODEVFS(vp->v_mount);
 	if (!devfs_populate_needed(dmp)) {
 		sx_xlock(&dmp->dm_lock);
 		goto out_nopopulate;
 	}
 
 	locked = VOP_ISLOCKED(vp);
 
 	sx_xlock(&dmp->dm_lock);
 	DEVFS_DMP_HOLD(dmp);
 
 	/* Can't call devfs_populate() with the vnode lock held. */
 	VOP_UNLOCK(vp);
 	devfs_populate(dmp);
 
 	sx_xunlock(&dmp->dm_lock);
 	vn_lock(vp, locked | LK_RETRY);
 	sx_xlock(&dmp->dm_lock);
 	if (DEVFS_DMP_DROP(dmp)) {
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ERESTART);
 	}
 out_nopopulate:
 	if (VN_IS_DOOMED(vp)) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ERESTART);
 	}
 	de = vp->v_data;
 	KASSERT(de != NULL,
 	    ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed"));
 	if ((de->de_flags & DE_DOOMED) != 0) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ERESTART);
 	}
 
 	return (0);
 }
 
 static int
 devfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct devfs_mount *dmp;
 	char *buf = ap->a_buf;
 	size_t *buflen = ap->a_buflen;
 	struct devfs_dirent *dd, *de;
 	int i, error;
 
 	dmp = VFSTODEVFS(vp->v_mount);
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	if (vp->v_type != VCHR && vp->v_type != VDIR) {
 		error = ENOENT;
 		goto finished;
 	}
 
 	dd = vp->v_data;
 	if (vp->v_type == VDIR && dd == dmp->dm_rootdir) {
 		*dvp = vp;
 		vref(*dvp);
 		goto finished;
 	}
 
 	i = *buflen;
 	i -= dd->de_dirent->d_namlen;
 	if (i < 0) {
 		error = ENOMEM;
 		goto finished;
 	}
 	bcopy(dd->de_dirent->d_name, buf + i, dd->de_dirent->d_namlen);
 	*buflen = i;
 	de = devfs_parent_dirent(dd);
 	if (de == NULL) {
 		error = ENOENT;
 		goto finished;
 	}
 	mtx_lock(&devfs_de_interlock);
 	*dvp = de->de_vnode;
 	if (*dvp != NULL) {
 		VI_LOCK(*dvp);
 		mtx_unlock(&devfs_de_interlock);
 		vholdl(*dvp);
 		VI_UNLOCK(*dvp);
 		vref(*dvp);
 		vdrop(*dvp);
 	} else {
 		mtx_unlock(&devfs_de_interlock);
 		error = ENOENT;
 	}
 finished:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /*
  * Construct the fully qualified path name relative to the mountpoint.
  * If a NULL cnp is provided, no '/' is appended to the resulting path.
  */
 char *
 devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd,
     struct componentname *cnp)
 {
 	int i;
 	struct devfs_dirent *de;
 
 	sx_assert(&dmp->dm_lock, SA_LOCKED);
 
 	i = SPECNAMELEN;
 	buf[i] = '\0';
 	if (cnp != NULL)
 		i -= cnp->cn_namelen;
 	if (i < 0)
 		 return (NULL);
 	if (cnp != NULL)
 		bcopy(cnp->cn_nameptr, buf + i, cnp->cn_namelen);
 	de = dd;
 	while (de != dmp->dm_rootdir) {
 		if (cnp != NULL || i < SPECNAMELEN) {
 			i--;
 			if (i < 0)
 				 return (NULL);
 			buf[i] = '/';
 		}
 		i -= de->de_dirent->d_namlen;
 		if (i < 0)
 			 return (NULL);
 		bcopy(de->de_dirent->d_name, buf + i,
 		    de->de_dirent->d_namlen);
 		de = devfs_parent_dirent(de);
 		if (de == NULL)
 			return (NULL);
 	}
 	return (buf + i);
 }
 
 static int
 devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp,
 	struct devfs_dirent *de)
 {
 	int not_found;
 
 	not_found = 0;
 	if (de->de_flags & DE_DOOMED)
 		not_found = 1;
 	if (DEVFS_DE_DROP(de)) {
 		KASSERT(not_found == 1, ("DEVFS de dropped but not doomed"));
 		devfs_dirent_free(de);
 	}
 	if (DEVFS_DMP_DROP(dmp)) {
 		KASSERT(not_found == 1,
 			("DEVFS mount struct freed before dirent"));
 		not_found = 2;
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 	}
 	if (not_found == 1 || (drop_dm_lock && not_found != 2))
 		sx_unlock(&dmp->dm_lock);
 	return (not_found);
 }
 
 static void
 devfs_insmntque_dtr(struct vnode *vp, void *arg)
 {
 	struct devfs_dirent *de;
 
 	de = (struct devfs_dirent *)arg;
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = NULL;
 	de->de_vnode = NULL;
 	mtx_unlock(&devfs_de_interlock);
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * devfs_allocv shall be entered with dmp->dm_lock held, and it drops
  * it on return.
  */
 int
 devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode,
     struct vnode **vpp)
 {
 	int error;
 	struct vnode *vp;
 	struct cdev *dev;
 	struct devfs_mount *dmp;
 	struct cdevsw *dsw;
 	enum vgetstate vs;
 
 	dmp = VFSTODEVFS(mp);
 	if (de->de_flags & DE_DOOMED) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ENOENT);
 	}
 loop:
 	DEVFS_DE_HOLD(de);
 	DEVFS_DMP_HOLD(dmp);
 	mtx_lock(&devfs_de_interlock);
 	vp = de->de_vnode;
 	if (vp != NULL) {
 		vs = vget_prep(vp);
 		mtx_unlock(&devfs_de_interlock);
 		sx_xunlock(&dmp->dm_lock);
 		vget_finish(vp, lockmode | LK_RETRY, vs);
 		sx_xlock(&dmp->dm_lock);
 		if (devfs_allocv_drop_refs(0, dmp, de)) {
 			vput(vp);
 			return (ENOENT);
 		}
 		else if (VN_IS_DOOMED(vp)) {
 			mtx_lock(&devfs_de_interlock);
 			if (de->de_vnode == vp) {
 				de->de_vnode = NULL;
 				vp->v_data = NULL;
 			}
 			mtx_unlock(&devfs_de_interlock);
 			vput(vp);
 			goto loop;
 		}
 		sx_xunlock(&dmp->dm_lock);
 		*vpp = vp;
 		return (0);
 	}
 	mtx_unlock(&devfs_de_interlock);
 	if (de->de_dirent->d_type == DT_CHR) {
 		if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) {
 			devfs_allocv_drop_refs(1, dmp, de);
 			return (ENOENT);
 		}
 		dev = &de->de_cdp->cdp_c;
 	} else {
 		dev = NULL;
 	}
 	error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp);
 	if (error != 0) {
 		devfs_allocv_drop_refs(1, dmp, de);
 		printf("devfs_allocv: failed to allocate new vnode\n");
 		return (error);
 	}
 
 	if (de->de_dirent->d_type == DT_CHR) {
 		vp->v_type = VCHR;
 		VI_LOCK(vp);
 		dev_lock();
 		dev_refl(dev);
 		/* XXX: v_rdev should be protect by vnode lock */
 		vp->v_rdev = dev;
 		VNPASS(vp->v_usecount == 1, vp);
 		/* Special casing of ttys for deadfs.  Probably redundant. */
 		dsw = dev->si_devsw;
 		if (dsw != NULL && (dsw->d_flags & D_TTY) != 0)
 			vp->v_vflag |= VV_ISTTY;
 		dev_unlock();
 		VI_UNLOCK(vp);
 		if ((dev->si_flags & SI_ETERNAL) != 0)
 			vp->v_vflag |= VV_ETERNALDEV;
 		vp->v_op = &devfs_specops;
 	} else if (de->de_dirent->d_type == DT_DIR) {
 		vp->v_type = VDIR;
 	} else if (de->de_dirent->d_type == DT_LNK) {
 		vp->v_type = VLNK;
 	} else {
 		vp->v_type = VBAD;
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS);
 	VN_LOCK_ASHARE(vp);
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = de;
 	de->de_vnode = vp;
 	mtx_unlock(&devfs_de_interlock);
 	error = insmntque1(vp, mp, devfs_insmntque_dtr, de);
 	if (error != 0) {
 		(void) devfs_allocv_drop_refs(1, dmp, de);
 		return (error);
 	}
 	if (devfs_allocv_drop_refs(0, dmp, de)) {
 		vput(vp);
 		return (ENOENT);
 	}
 #ifdef MAC
 	mac_devfs_vnode_associate(mp, de, vp);
 #endif
 	sx_xunlock(&dmp->dm_lock);
 	*vpp = vp;
 	return (0);
 }
 
 static int
 devfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *de;
 	struct proc *p;
 	int error;
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid,
 	    ap->a_accmode, ap->a_cred);
 	if (error == 0)
 		return (0);
 	if (error != EACCES)
 		return (error);
 	p = ap->a_td->td_proc;
 	/* We do, however, allow access to the controlling terminal */
 	PROC_LOCK(p);
 	if (!(p->p_flag & P_CONTROLT)) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	if (p->p_session->s_ttydp == de->de_cdp)
 		error = 0;
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0,
     "devfs-only flag reuse failed");
 
 static int
 devfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *oldvp;
 	struct thread *td = ap->a_td;
 	struct proc *p;
 	struct cdev *dev = vp->v_rdev;
 	struct cdevsw *dsw;
 	struct devfs_dirent *de = vp->v_data;
 	int dflags, error, ref, vp_locked;
 
 	/*
 	 * XXX: Don't call d_close() if we were called because of
 	 * XXX: insmntque1() failure.
 	 */
 	if (vp->v_data == NULL)
 		return (0);
 
 	/*
 	 * Hack: a tty device that is a controlling terminal
 	 * has a reference from the session structure.
 	 * We cannot easily tell that a character device is
 	 * a controlling terminal, unless it is the closing
 	 * process' controlling terminal.  In that case,
 	 * if the reference count is 2 (this last descriptor
 	 * plus the session), release the reference from the session.
 	 */
 	if (de->de_usecount == 2 && td != NULL) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		if (vp == p->p_session->s_ttyvp) {
 			PROC_UNLOCK(p);
 			oldvp = NULL;
 			sx_xlock(&proctree_lock);
 			if (vp == p->p_session->s_ttyvp) {
 				SESS_LOCK(p->p_session);
 				mtx_lock(&devfs_de_interlock);
 				VI_LOCK(vp);
 				if (devfs_usecountl(vp) == 2 && !VN_IS_DOOMED(vp)) {
 					p->p_session->s_ttyvp = NULL;
 					p->p_session->s_ttydp = NULL;
 					oldvp = vp;
 				}
 				VI_UNLOCK(vp);
 				mtx_unlock(&devfs_de_interlock);
 				SESS_UNLOCK(p->p_session);
 			}
 			sx_xunlock(&proctree_lock);
 			if (oldvp != NULL)
 				devfs_ctty_unref(oldvp);
 		} else
 			PROC_UNLOCK(p);
 	}
 	/*
 	 * We do not want to really close the device if it
 	 * is still in use unless we are trying to close it
 	 * forcibly. Since every use (buffer, vnode, swap, cmap)
 	 * holds a reference to the vnode, and because we mark
 	 * any other vnodes that alias this device, when the
 	 * sum of the reference counts on all the aliased
 	 * vnodes descends to one, we are on last close.
 	 */
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	dflags = 0;
 	mtx_lock(&devfs_de_interlock);
 	VI_LOCK(vp);
 	if (devfs_usecountl(vp) == 1)
 		dflags |= FLASTCLOSE;
 	devfs_usecount_subl(vp);
 	mtx_unlock(&devfs_de_interlock);
 	if (VN_IS_DOOMED(vp)) {
 		/* Forced close. */
 		dflags |= FREVOKE | FNONBLOCK;
 	} else if (dsw->d_flags & D_TRACKCLOSE) {
 		/* Keep device updated on status. */
 	} else if ((dflags & FLASTCLOSE) == 0) {
 		VI_UNLOCK(vp);
 		dev_relthread(dev, ref);
 		return (0);
 	}
 	vholdnz(vp);
 	VI_UNLOCK(vp);
 	vp_locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev)));
 	error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td);
 	dev_relthread(dev, ref);
 	vn_lock(vp, vp_locked | LK_RETRY);
 	vdrop(vp);
 	return (error);
 }
 
 static int
 devfs_close_f(struct file *fp, struct thread *td)
 {
 	int error;
 	struct file *fpop;
 
 	/*
 	 * NB: td may be NULL if this descriptor is closed due to
 	 * garbage collection from a closed UNIX domain socket.
 	 */
 	fpop = curthread->td_fpop;
 	curthread->td_fpop = fp;
 	error = vnops.fo_close(fp, td);
 	curthread->td_fpop = fpop;
 
 	/*
 	 * The f_cdevpriv cannot be assigned non-NULL value while we
 	 * are destroying the file.
 	 */
 	if (fp->f_cdevpriv != NULL)
 		devfs_fpdrop(fp);
 	return (error);
 }
 
 static int
 devfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	struct cdev *dev;
 	struct timeval boottime;
 	int error;
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xunlock(&dmp->dm_lock);
 
 	de = vp->v_data;
 	KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp));
 	if (vp->v_type == VDIR) {
 		de = de->de_dir;
 		KASSERT(de != NULL,
 		    ("Null dir dirent in devfs_getattr vp=%p", vp));
 	}
 	vap->va_uid = de->de_uid;
 	vap->va_gid = de->de_gid;
 	vap->va_mode = de->de_mode;
 	if (vp->v_type == VLNK)
 		vap->va_size = strlen(de->de_symlink);
 	else if (vp->v_type == VDIR)
 		vap->va_size = vap->va_bytes = DEV_BSIZE;
 	else
 		vap->va_size = 0;
 	if (vp->v_type != VDIR)
 		vap->va_bytes = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_type = vp->v_type;
 
 	getboottime(&boottime);
 #define fix(aa)							\
 	do {							\
 		if ((aa).tv_sec <= 3600) {			\
 			(aa).tv_sec = boottime.tv_sec;		\
 			(aa).tv_nsec = boottime.tv_usec * 1000; \
 		}						\
 	} while (0)
 
 	if (vp->v_type != VCHR)  {
 		fix(de->de_atime);
 		vap->va_atime = de->de_atime;
 		fix(de->de_mtime);
 		vap->va_mtime = de->de_mtime;
 		fix(de->de_ctime);
 		vap->va_ctime = de->de_ctime;
 	} else {
 		dev = vp->v_rdev;
 		fix(dev->si_atime);
 		vap->va_atime = dev->si_atime;
 		fix(dev->si_mtime);
 		vap->va_mtime = dev->si_mtime;
 		fix(dev->si_ctime);
 		vap->va_ctime = dev->si_ctime;
 
 		vap->va_rdev = cdev2priv(dev)->cdp_inode;
 	}
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_filerev = 0;
 	vap->va_nlink = de->de_links;
 	vap->va_fileid = de->de_inode;
 
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td)
 {
 	struct file *fpop;
 	int error;
 
 	fpop = td->td_fpop;
 	td->td_fpop = fp;
 	error = vnops.fo_ioctl(fp, com, data, cred, td);
 	td->td_fpop = fpop;
 	return (error);
 }
 
 void *
 fiodgname_buf_get_ptr(void *fgnp, u_long com)
 {
 	union {
 		struct fiodgname_arg	fgn;
 #ifdef COMPAT_FREEBSD32
 		struct fiodgname_arg32	fgn32;
 #endif
 	} *fgnup;
 
 	fgnup = fgnp;
 	switch (com) {
 	case FIODGNAME:
 		return (fgnup->fgn.buf);
 #ifdef COMPAT_FREEBSD32
 	case FIODGNAME_32:
 		return ((void *)(uintptr_t)fgnup->fgn32.buf);
 #endif
 	default:
 		panic("Unhandled ioctl command %ld", com);
 	}
 }
 
 static int
 devfs_ioctl(struct vop_ioctl_args *ap)
 {
 	struct fiodgname_arg *fgn;
 	struct vnode *vpold, *vp;
 	struct cdevsw *dsw;
 	struct thread *td;
 	struct session *sess;
 	struct cdev *dev;
 	int error, ref, i;
 	const char *p;
 	u_long com;
 
 	vp = ap->a_vp;
 	com = ap->a_command;
 	td = ap->a_td;
 
 	dsw = devvn_refthread(vp, &dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(dev)));
 
 	switch (com) {
 	case FIODTYPE:
 		*(int *)ap->a_data = dsw->d_flags & D_TYPEMASK;
 		error = 0;
 		break;
 	case FIODGNAME:
 #ifdef	COMPAT_FREEBSD32
 	case FIODGNAME_32:
 #endif
 		fgn = ap->a_data;
 		p = devtoname(dev);
 		i = strlen(p) + 1;
 		if (i > fgn->len)
 			error = EINVAL;
 		else
 			error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i);
 		break;
 	default:
 		error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td);
 	}
 
 	dev_relthread(dev, ref);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 
 	if (error == 0 && com == TIOCSCTTY) {
 		/*
 		 * Do nothing if reassigning same control tty, or if the
 		 * control tty has already disappeared.  If it disappeared,
 		 * it's because we were racing with TIOCNOTTY.  TIOCNOTTY
 		 * already took care of releasing the old vnode and we have
 		 * nothing left to do.
 		 */
 		sx_slock(&proctree_lock);
 		sess = td->td_proc->p_session;
 		if (sess->s_ttyvp == vp || sess->s_ttyp == NULL) {
 			sx_sunlock(&proctree_lock);
 			return (0);
 		}
 
 		devfs_ctty_ref(vp);
 		SESS_LOCK(sess);
 		vpold = sess->s_ttyvp;
 		sess->s_ttyvp = vp;
 		sess->s_ttydp = cdev2priv(dev);
 		SESS_UNLOCK(sess);
 
 		sx_sunlock(&proctree_lock);
 
 		/* Get rid of reference to old control tty */
 		if (vpold)
 			devfs_ctty_unref(vpold);
 	}
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_kqfilter_f(struct file *fp, struct knote *kn)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	int error, ref;
 	struct file *fpop;
 	struct thread *td;
 
 	td = curthread;
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error)
 		return (error);
 	error = dsw->d_kqfilter(dev, kn);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	return (error);
 }
 
 static inline int
 devfs_prison_check(struct devfs_dirent *de, struct thread *td)
 {
 	struct cdev_priv *cdp;
 	struct ucred *dcr;
 	struct proc *p;
 	int error;
 
 	cdp = de->de_cdp;
 	if (cdp == NULL)
 		return (0);
 	dcr = cdp->cdp_c.si_cred;
 	if (dcr == NULL)
 		return (0);
 
 	error = prison_check(td->td_ucred, dcr);
 	if (error == 0)
 		return (0);
 	/* We do, however, allow access to the controlling terminal */
 	p = td->td_proc;
 	PROC_LOCK(p);
 	if (!(p->p_flag & P_CONTROLT)) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	if (p->p_session->s_ttydp == cdp)
 		error = 0;
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 static int
 devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct thread *td;
 	struct devfs_dirent *de, *dd;
 	struct devfs_dirent **dde;
 	struct devfs_mount *dmp;
 	struct mount *mp;
 	struct cdev *cdev;
 	int error, flags, nameiop, dvplocked;
 	char specname[SPECNAMELEN + 1], *pname;
 
 	td = curthread;
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	mp = dvp->v_mount;
 	dmp = VFSTODEVFS(mp);
 	dd = dvp->v_data;
 	*vpp = NULLVP;
 
 	if ((flags & ISLASTCN) && nameiop == RENAME)
 		return (EOPNOTSUPP);
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT))
 		return (EIO);
 
 	error = vn_dir_check_exec(dvp, cnp);
 	if (error != 0)
 		return (error);
 
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		de = devfs_parent_dirent(dd);
 		if (de == NULL)
 			return (ENOENT);
 		dvplocked = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp);
 		error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK,
 		    vpp);
 		*dm_unlock = 0;
 		vn_lock(dvp, dvplocked | LK_RETRY);
 		return (error);
 	}
 
 	dd = dvp->v_data;
 	de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0);
 	while (de == NULL) {	/* While(...) so we can use break */
 
 		if (nameiop == DELETE)
 			return (ENOENT);
 
 		/*
 		 * OK, we didn't have an entry for the name we were asked for
 		 * so we try to see if anybody can create it on demand.
 		 */
 		pname = devfs_fqpn(specname, dmp, dd, cnp);
 		if (pname == NULL)
 			break;
 
 		cdev = NULL;
 		DEVFS_DMP_HOLD(dmp);
 		sx_xunlock(&dmp->dm_lock);
 		sx_slock(&clone_drain_lock);
 		EVENTHANDLER_INVOKE(dev_clone,
 		    td->td_ucred, pname, strlen(pname), &cdev);
 		sx_sunlock(&clone_drain_lock);
 
 		if (cdev == NULL)
 			sx_xlock(&dmp->dm_lock);
 		else if (devfs_populate_vp(dvp) != 0) {
 			*dm_unlock = 0;
 			sx_xlock(&dmp->dm_lock);
 			if (DEVFS_DMP_DROP(dmp)) {
 				sx_xunlock(&dmp->dm_lock);
 				devfs_unmount_final(dmp);
 			} else
 				sx_xunlock(&dmp->dm_lock);
 			dev_rel(cdev);
 			return (ENOENT);
 		}
 		if (DEVFS_DMP_DROP(dmp)) {
 			*dm_unlock = 0;
 			sx_xunlock(&dmp->dm_lock);
 			devfs_unmount_final(dmp);
 			if (cdev != NULL)
 				dev_rel(cdev);
 			return (ENOENT);
 		}
 
 		if (cdev == NULL)
 			break;
 
 		dev_lock();
 		dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx];
 		if (dde != NULL && *dde != NULL)
 			de = *dde;
 		dev_unlock();
 		dev_rel(cdev);
 		break;
 	}
 
 	if (de == NULL || de->de_flags & DE_WHITEOUT) {
 		if ((nameiop == CREATE || nameiop == RENAME) &&
 		    (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) {
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 		return (ENOENT);
 	}
 
 	if (devfs_prison_check(de, td))
 		return (ENOENT);
 
 	if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		if (*vpp == dvp) {
 			VREF(dvp);
 			*vpp = dvp;
 			return (0);
 		}
 	}
 	error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp);
 	*dm_unlock = 0;
 	return (error);
 }
 
 static int
 devfs_lookup(struct vop_lookup_args *ap)
 {
 	int j;
 	struct devfs_mount *dmp;
 	int dm_unlock;
 
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOTDIR);
 
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	dm_unlock = 1;
 	j = devfs_lookupx(ap, &dm_unlock);
 	if (dm_unlock == 1)
 		sx_xunlock(&dmp->dm_lock);
 	return (j);
 }
 
 static int
 devfs_mknod(struct vop_mknod_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct devfs_dirent *dd, *de;
 	struct devfs_mount *dmp;
 	int error;
 
 	/*
 	 * The only type of node we should be creating here is a
 	 * character device, for anything else return EOPNOTSUPP.
 	 */
 	if (ap->a_vap->va_type != VCHR)
 		return (EOPNOTSUPP);
 	dvp = ap->a_dvp;
 	dmp = VFSTODEVFS(dvp->v_mount);
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dd = dvp->v_data;
 
 	error = ENOENT;
 	sx_xlock(&dmp->dm_lock);
 	TAILQ_FOREACH(de, &dd->de_dlist, de_list) {
 		if (cnp->cn_namelen != de->de_dirent->d_namlen)
 			continue;
 		if (de->de_dirent->d_type == DT_CHR &&
 		    (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0)
 			continue;
 		if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name,
 		    de->de_dirent->d_namlen) != 0)
 			continue;
 		if (de->de_flags & DE_WHITEOUT)
 			break;
 		goto notfound;
 	}
 	if (de == NULL)
 		goto notfound;
 	de->de_flags &= ~DE_WHITEOUT;
 	error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp);
 	return (error);
 notfound:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 devfs_open(struct vop_open_args *ap)
 {
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct cdev *dev = vp->v_rdev;
 	struct file *fp = ap->a_fp;
 	int error, ref, vlocked;
 	struct cdevsw *dsw;
 	struct file *fpop;
 
 	if (vp->v_type == VBLK)
 		return (ENXIO);
 
 	if (dev == NULL)
 		return (ENXIO);
 
 	/* Make this field valid before any I/O in d_open. */
 	if (dev->si_iosize_max == 0)
 		dev->si_iosize_max = DFLTPHYS;
 
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	if (fp == NULL && dsw->d_fdopen != NULL) {
 		dev_relthread(dev, ref);
 		return (ENXIO);
 	}
 
 	if (vp->v_type == VCHR)
 		devfs_usecount_add(vp);
 
 	vlocked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp);
 
 	fpop = td->td_fpop;
 	td->td_fpop = fp;
 	if (fp != NULL) {
 		fp->f_data = dev;
 		fp->f_vnode = vp;
 	}
 	if (dsw->d_fdopen != NULL)
 		error = dsw->d_fdopen(dev, ap->a_mode, td, fp);
 	else
 		error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 	/* Clean up any cdevpriv upon error. */
 	if (error != 0)
 		devfs_clear_cdevpriv();
 	td->td_fpop = fpop;
 
 	vn_lock(vp, vlocked | LK_RETRY);
 	if (error != 0 && vp->v_type == VCHR)
 		devfs_usecount_sub(vp);
 
 	dev_relthread(dev, ref);
 	if (error != 0) {
 		if (error == ERESTART)
 			error = EINTR;
 		return (error);
 	}
 
 #if 0	/* /dev/console */
 	KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp"));
 #else
 	if (fp == NULL)
 		return (error);
 #endif
 	if (fp->f_ops == &badfileops)
 		finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f);
 	return (error);
 }
 
 static int
 devfs_pathconf(struct vop_pathconf_args *ap)
 {
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		return (0);
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	case _PC_LINK_MAX:
 		*ap->a_retval = INT_MAX;
 		return (0);
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		return (0);
 	case _PC_MAX_CANON:
 		if (ap->a_vp->v_vflag & VV_ISTTY) {
 			*ap->a_retval = MAX_CANON;
 			return (0);
 		}
 		return (EINVAL);
 	case _PC_MAX_INPUT:
 		if (ap->a_vp->v_vflag & VV_ISTTY) {
 			*ap->a_retval = MAX_INPUT;
 			return (0);
 		}
 		return (EINVAL);
 	case _PC_VDISABLE:
 		if (ap->a_vp->v_vflag & VV_ISTTY) {
 			*ap->a_retval = _POSIX_VDISABLE;
 			return (0);
 		}
 		return (EINVAL);
 	case _PC_MAC_PRESENT:
 #ifdef MAC
 		/*
 		 * If MAC is enabled, devfs automatically supports
 		 * trivial non-persistant label storage.
 		 */
 		*ap->a_retval = 1;
 #else
 		*ap->a_retval = 0;
 #endif
 		return (0);
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		return (0);
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	/* NOTREACHED */
 }
 
 /* ARGSUSED */
 static int
 devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	int error, ref;
 	struct file *fpop;
 
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0) {
 		error = vnops.fo_poll(fp, events, cred, td);
 		return (error);
 	}
 	error = dsw->d_poll(dev, events, td);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	return(error);
 }
 
 /*
  * Print out the contents of a special device vnode.
  */
 static int
 devfs_print(struct vop_print_args *ap)
 {
 
 	printf("\tdev %s\n", devtoname(ap->a_vp->v_rdev));
 	return (0);
 }
 
 static int
 devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct cdev *dev;
 	int ioflag, error, ref;
 	ssize_t resid;
 	struct cdevsw *dsw;
 	struct file *fpop;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0) {
 		error = vnops.fo_read(fp, uio, cred, flags, td);
 		return (error);
 	}
 	resid = uio->uio_resid;
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 	error = dsw->d_read(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0))
 		devfs_timestamp(&dev->si_atime);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_R);
 	return (error);
 }
 
 static int
 devfs_readdir(struct vop_readdir_args *ap)
 {
 	int error;
 	struct uio *uio;
 	struct dirent *dp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	off_t off;
 	int *tmp_ncookies = NULL;
 
 	if (ap->a_vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	uio = ap->a_uio;
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	/*
 	 * XXX: This is a temporary hack to get around this filesystem not
 	 * supporting cookies. We store the location of the ncookies pointer
 	 * in a temporary variable before calling vfs_subr.c:vfs_read_dirent()
 	 * and set the number of cookies to 0. We then set the pointer to
 	 * NULL so that vfs_read_dirent doesn't try to call realloc() on 
 	 * ap->a_cookies. Later in this function, we restore the ap->a_ncookies
 	 * pointer to its original location before returning to the caller.
 	 */
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
 	}
 
 	dmp = VFSTODEVFS(ap->a_vp->v_mount);
 	if (devfs_populate_vp(ap->a_vp) != 0) {
 		if (tmp_ncookies != NULL)
 			ap->a_ncookies = tmp_ncookies;
 		return (EIO);
 	}
 	error = 0;
 	de = ap->a_vp->v_data;
 	off = 0;
 	TAILQ_FOREACH(dd, &de->de_dlist, de_list) {
 		KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__));
 		if (dd->de_flags & (DE_COVERED | DE_WHITEOUT))
 			continue;
 		if (devfs_prison_check(dd, uio->uio_td))
 			continue;
 		if (dd->de_dirent->d_type == DT_DIR)
 			de = dd->de_dir;
 		else
 			de = dd;
 		dp = dd->de_dirent;
 		MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp));
 		if (dp->d_reclen > uio->uio_resid)
 			break;
 		dp->d_fileno = de->de_inode;
 		/* NOTE: d_off is the offset for the *next* entry. */
 		dp->d_off = off + dp->d_reclen;
 		if (off >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, dp, off);
 			if (error)
 				break;
 		}
 		off += dp->d_reclen;
 	}
 	sx_xunlock(&dmp->dm_lock);
 	uio->uio_offset = off;
 
 	/*
 	 * Restore ap->a_ncookies if it wasn't originally NULL in the first
 	 * place.
 	 */
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 static int
 devfs_readlink(struct vop_readlink_args *ap)
 {
 	struct devfs_dirent *de;
 
 	de = ap->a_vp->v_data;
 	return (uiomove(de->de_symlink, strlen(de->de_symlink), ap->a_uio));
 }
 
 static void
 devfs_reclaiml(struct vnode *vp)
 {
 	struct devfs_dirent *de;
 
 	mtx_assert(&devfs_de_interlock, MA_OWNED);
 	de = vp->v_data;
 	if (de != NULL) {
 		MPASS(de->de_usecount == 0);
 		de->de_vnode = NULL;
 		vp->v_data = NULL;
 	}
 }
 
 static int
 devfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	mtx_lock(&devfs_de_interlock);
 	devfs_reclaiml(vp);
 	mtx_unlock(&devfs_de_interlock);
 	return (0);
 }
 
 static int
 devfs_reclaim_vchr(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp;
 	struct cdev *dev;
 
 	vp = ap->a_vp;
 	MPASS(vp->v_type == VCHR);
 
 	mtx_lock(&devfs_de_interlock);
 	VI_LOCK(vp);
 	devfs_usecount_subl(vp);
 	devfs_reclaiml(vp);
 	mtx_unlock(&devfs_de_interlock);
 	dev_lock();
 	dev = vp->v_rdev;
 	vp->v_rdev = NULL;
 	dev_unlock();
 	VI_UNLOCK(vp);
 	if (dev != NULL)
 		dev_rel(dev);
 	return (0);
 }
 
 static int
 devfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered;
 	struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount);
 
 	ASSERT_VOP_ELOCKED(dvp, "devfs_remove");
 	ASSERT_VOP_ELOCKED(vp, "devfs_remove");
 
 	sx_xlock(&dmp->dm_lock);
 	dd = ap->a_dvp->v_data;
 	de = vp->v_data;
 	if (de->de_cdp == NULL) {
 		TAILQ_REMOVE(&dd->de_dlist, de, de_list);
 		if (de->de_dirent->d_type == DT_LNK) {
 			de_covered = devfs_find(dd, de->de_dirent->d_name,
 			    de->de_dirent->d_namlen, 0);
 			if (de_covered != NULL)
 				de_covered->de_flags &= ~DE_COVERED;
 		}
 		/* We need to unlock dvp because devfs_delete() may lock it. */
 		VOP_UNLOCK(vp);
 		if (dvp != vp)
 			VOP_UNLOCK(dvp);
 		devfs_delete(dmp, de, 0);
 		sx_xunlock(&dmp->dm_lock);
 		if (dvp != vp)
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	} else {
 		de->de_flags |= DE_WHITEOUT;
 		sx_xunlock(&dmp->dm_lock);
 	}
 	return (0);
 }
 
 /*
  * Revoke is called on a tty when a terminal session ends.  The vnode
  * is orphaned by setting v_op to deadfs so we need to let go of it
  * as well so that we create a new one next time around.
  *
  */
 static int
 devfs_revoke(struct vop_revoke_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *vp2;
 	struct cdev *dev;
 	struct cdev_priv *cdp;
 	struct devfs_dirent *de;
 	enum vgetstate vs;
 	u_int i;
 
 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL"));
 
 	dev = vp->v_rdev;
 	cdp = cdev2priv(dev);
 
 	dev_lock();
 	cdp->cdp_inuse++;
 	dev_unlock();
 
 	vhold(vp);
 	vgone(vp);
 	vdrop(vp);
 
 	VOP_UNLOCK(vp);
  loop:
 	for (;;) {
 		mtx_lock(&devfs_de_interlock);
 		dev_lock();
 		vp2 = NULL;
 		for (i = 0; i <= cdp->cdp_maxdirent; i++) {
 			de = cdp->cdp_dirents[i];
 			if (de == NULL)
 				continue;
 
 			vp2 = de->de_vnode;
 			if (vp2 != NULL) {
 				dev_unlock();
 				vs = vget_prep(vp2);
 				mtx_unlock(&devfs_de_interlock);
 				if (vget_finish(vp2, LK_EXCLUSIVE, vs) != 0)
 					goto loop;
 				vhold(vp2);
 				vgone(vp2);
 				vdrop(vp2);
 				vput(vp2);
 				break;
 			}
 		}
 		if (vp2 != NULL) {
 			continue;
 		}
 		dev_unlock();
 		mtx_unlock(&devfs_de_interlock);
 		break;
 	}
 	dev_lock();
 	cdp->cdp_inuse--;
 	if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) {
 		TAILQ_REMOVE(&cdevp_list, cdp, cdp_list);
 		dev_unlock();
 		dev_rel(&cdp->cdp_c);
 	} else
 		dev_unlock();
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	return (0);
 }
 
 static int
 devfs_rioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp;
 	struct devfs_mount *dmp;
 	int error;
 
 	vp = ap->a_vp;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	if (VN_IS_DOOMED(vp)) {
 		VOP_UNLOCK(vp);
 		return (EBADF);
 	}
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xlock(&dmp->dm_lock);
 	VOP_UNLOCK(vp);
 	DEVFS_DMP_HOLD(dmp);
 	devfs_populate(dmp);
 	if (DEVFS_DMP_DROP(dmp)) {
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ENOENT);
 	}
 	error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td);
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 static int
 devfs_rread(struct vop_read_args *ap)
 {
 
 	if (ap->a_vp->v_type != VDIR)
 		return (EINVAL);
 	return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL));
 }
 
 static int
 devfs_setattr(struct vop_setattr_args *ap)
 {
 	struct devfs_dirent *de;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct thread *td;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	td = curthread;
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	c = 0;
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = de->de_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = de->de_gid;
 	else
 		gid = vap->va_gid;
 	if (uid != de->de_uid || gid != de->de_gid) {
 		if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid ||
 		    (gid != de->de_gid && !groupmember(gid, ap->a_cred))) {
 			error = priv_check(td, PRIV_VFS_CHOWN);
 			if (error != 0)
 				goto ret;
 		}
 		de->de_uid = uid;
 		de->de_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (ap->a_cred->cr_uid != de->de_uid) {
 			error = priv_check(td, PRIV_VFS_ADMIN);
 			if (error != 0)
 				goto ret;
 		}
 		de->de_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		error = vn_utimes_perm(vp, vap, ap->a_cred, td);
 		if (error != 0)
 			goto ret;
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_atime = vap->va_atime;
 			else
 				de->de_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_mtime = vap->va_mtime;
 			else
 				de->de_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 
 	if (c) {
 		if (vp->v_type == VCHR)
 			vfs_timestamp(&vp->v_rdev->si_ctime);
 		else
 			vfs_timestamp(&de->de_mtime);
 	}
 
 ret:
 	sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock);
 	return (error);
 }
 
 #ifdef MAC
 static int
 devfs_setlabel(struct vop_setlabel_args *ap)
 {
 	struct vnode *vp;
 	struct devfs_dirent *de;
 
 	vp = ap->a_vp;
 	de = vp->v_data;
 
 	mac_vnode_relabel(ap->a_cred, vp, ap->a_label);
 	mac_devfs_update(vp->v_mount, de, vp);
 
 	return (0);
 }
 #endif
 
 static int
-devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td)
+devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred)
 {
 
-	return (vnops.fo_stat(fp, sb, cred, td));
+	return (vnops.fo_stat(fp, sb, cred));
 }
 
 static int
 devfs_symlink(struct vop_symlink_args *ap)
 {
 	int i, error;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered, *de_dotdot;
 	struct devfs_mount *dmp;
 
 	error = priv_check(curthread, PRIV_DEVFS_SYMLINK);
 	if (error)
 		return(error);
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOENT);
 
 	dd = ap->a_dvp->v_data;
 	de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen);
 	de->de_flags = DE_USER;
 	de->de_uid = 0;
 	de->de_gid = 0;
 	de->de_mode = 0755;
 	de->de_inode = alloc_unr(devfs_inos);
 	de->de_dir = dd;
 	de->de_dirent->d_type = DT_LNK;
 	i = strlen(ap->a_target) + 1;
 	de->de_symlink = malloc(i, M_DEVFS, M_WAITOK);
 	bcopy(ap->a_target, de->de_symlink, i);
 #ifdef MAC
 	mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de);
 #endif
 	de_covered = devfs_find(dd, de->de_dirent->d_name,
 	    de->de_dirent->d_namlen, 0);
 	if (de_covered != NULL) {
 		if ((de_covered->de_flags & DE_USER) != 0) {
 			devfs_delete(dmp, de, DEVFS_DEL_NORECURSE);
 			sx_xunlock(&dmp->dm_lock);
 			return (EEXIST);
 		}
 		KASSERT((de_covered->de_flags & DE_COVERED) == 0,
 		    ("devfs_symlink: entry %p already covered", de_covered));
 		de_covered->de_flags |= DE_COVERED;
 	}
 
 	de_dotdot = TAILQ_FIRST(&dd->de_dlist);		/* "." */
 	de_dotdot = TAILQ_NEXT(de_dotdot, de_list);	/* ".." */
 	TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list);
 	devfs_dir_ref_de(dmp, dd);
 	devfs_rules_apply(dmp, de);
 
 	return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp));
 }
 
 static int
 devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td)
 {
 
 	return (vnops.fo_truncate(fp, length, cred, td));
 }
 
 static int
 devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct cdev *dev;
 	int error, ioflag, ref;
 	ssize_t resid;
 	struct cdevsw *dsw;
 	struct file *fpop;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0) {
 		error = vnops.fo_write(fp, uio, cred, flags, td);
 		return (error);
 	}
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));
 	ioflag = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC);
 	if (ioflag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 
 	resid = uio->uio_resid;
 
 	error = dsw->d_write(dev, uio, ioflag);
 	if (uio->uio_resid != resid || (error == 0 && resid != 0)) {
 		devfs_timestamp(&dev->si_ctime);
 		dev->si_mtime = dev->si_ctime;
 	}
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF_W);
 	return (error);
 }
 
 static int
 devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	struct mount *mp;
 	struct vnode *vp;
 	struct file *fpop;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	int error, ref;
 
 	vp = fp->f_vnode;
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.
 	 */
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		maxprot = VM_PROT_NONE;
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 	} else
 		maxprot = VM_PROT_EXECUTE;
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_READ;
 	else if ((prot & VM_PROT_READ) != 0)
 		return (EACCES);
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 *
 	 * Note that most character devices always share mappings.
 	 * The one exception is that D_MMAP_ANON devices
 	 * (i.e. /dev/zero) permit private writable mappings.
 	 *
 	 * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests
 	 * as well as updating maxprot to permit writing for
 	 * D_MMAP_ANON devices rather than doing that here.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) != 0)
 			maxprot |= VM_PROT_WRITE;
 		else if ((prot & VM_PROT_WRITE) != 0)
 			return (EACCES);
 	}
 	maxprot &= cap_maxprot;
 
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0)
 		return (error);
 
 	error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff,
 	    &object);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	if (error != 0)
 		return (error);
 
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, FALSE, td);
 	if (error != 0)
 		vm_object_deallocate(object);
 	return (error);
 }
 
 dev_t
 dev2udev(struct cdev *x)
 {
 	if (x == NULL)
 		return (NODEV);
 	return (cdev2priv(x)->cdp_inode);
 }
 
 static struct fileops devfs_ops_f = {
 	.fo_read =	devfs_read_f,
 	.fo_write =	devfs_write_f,
 	.fo_truncate =	devfs_truncate_f,
 	.fo_ioctl =	devfs_ioctl_f,
 	.fo_poll =	devfs_poll_f,
 	.fo_kqfilter =	devfs_kqfilter_f,
 	.fo_stat =	devfs_stat_f,
 	.fo_close =	devfs_close_f,
 	.fo_chmod =	vn_chmod,
 	.fo_chown =	vn_chown,
 	.fo_sendfile =	vn_sendfile,
 	.fo_seek =	vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_mmap =	devfs_mmap_f,
 	.fo_flags =	DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 /* Vops for non-CHR vnodes in /dev. */
 static struct vop_vector devfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_rioctl,
 	.vop_lookup =		devfs_lookup,
 	.vop_mknod =		devfs_mknod,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_read =		devfs_rread,
 	.vop_readdir =		devfs_readdir,
 	.vop_readlink =		devfs_readlink,
 	.vop_reclaim =		devfs_reclaim,
 	.vop_remove =		devfs_remove,
 	.vop_revoke =		devfs_revoke,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_symlink =		devfs_symlink,
 	.vop_vptocnp =		devfs_vptocnp,
 	.vop_lock1 =		vop_lock,
 	.vop_unlock =		vop_unlock,
 	.vop_islocked =		vop_islocked,
 };
 VFS_VOP_VECTOR_REGISTER(devfs_vnodeops);
 
 /* Vops for VCHR vnodes in /dev. */
 static struct vop_vector devfs_specops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_bmap =		VOP_PANIC,
 	.vop_close =		devfs_close,
 	.vop_create =		VOP_PANIC,
 	.vop_fsync =		vop_stdfsync,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_ioctl,
 	.vop_link =		VOP_PANIC,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		devfs_open,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_poll =		dead_poll,
 	.vop_print =		devfs_print,
 	.vop_read =		dead_read,
 	.vop_readdir =		VOP_PANIC,
 	.vop_readlink =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_reclaim =		devfs_reclaim_vchr,
 	.vop_remove =		devfs_remove,
 	.vop_rename =		VOP_PANIC,
 	.vop_revoke =		devfs_revoke,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_strategy =		VOP_PANIC,
 	.vop_symlink =		VOP_PANIC,
 	.vop_vptocnp =		devfs_vptocnp,
 	.vop_write =		dead_write,
 	.vop_lock1 =		vop_lock,
 	.vop_unlock =		vop_unlock,
 	.vop_islocked =		vop_islocked,
 };
 VFS_VOP_VECTOR_REGISTER(devfs_specops);
 
 /*
  * Our calling convention to the device drivers used to be that we passed
  * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_ 
  * flags instead since that's what open(), close() and ioctl() takes and
  * we don't really want vnode.h in device drivers.
  * We solved the source compatibility by redefining some vnode flags to
  * be the same as the fcntl ones and by sending down the bitwise OR of
  * the respective fcntl/vnode flags.  These CTASSERTS make sure nobody
  * pulls the rug out under this.
  */
 CTASSERT(O_NONBLOCK == IO_NDELAY);
 CTASSERT(O_FSYNC == IO_SYNC);
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index b10dcc2f9469..55c2a36955a5 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -1,5083 +1,5082 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/selinfo.h>
 #include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/kdb.h>
 #include <sys/smr.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #include <sys/ktrace.h>
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_PWD, "pwd", "Descriptor table vnodes");
 static MALLOC_DEFINE(M_PWDDESC, "pwddesc", "Pwd descriptors");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
 
 MALLOC_DECLARE(M_FADVISE);
 
 static __read_mostly uma_zone_t file_zone;
 static __read_mostly uma_zone_t filedesc0_zone;
 __read_mostly uma_zone_t pwd_zone;
 VFS_SMR_DECLARE;
 
 static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
 		    struct thread *td, bool holdleaders, bool audit);
 static int	fd_first_free(struct filedesc *fdp, int low, int size);
 static void	fdgrowtable(struct filedesc *fdp, int nfd);
 static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
 static int	getmaxfd(struct thread *td);
 static u_long	*filecaps_copy_prep(const struct filecaps *src);
 static void	filecaps_copy_finish(const struct filecaps *src,
 		    struct filecaps *dst, u_long *ioctls);
 static u_long 	*filecaps_free_prep(struct filecaps *fcaps);
 static void	filecaps_free_finish(u_long *ioctls);
 
 static struct pwd *pwd_alloc(void);
 
 /*
  * Each process has:
  *
  * - An array of open file descriptors (fd_ofiles)
  * - An array of file flags (fd_ofileflags)
  * - A bitmap recording which descriptors are in use (fd_map)
  *
  * A process starts out with NDFILE descriptors.  The value of NDFILE has
  * been selected based the historical limit of 20 open files, and an
  * assumption that the majority of processes, especially short-lived
  * processes like shells, will never need more.
  *
  * If this initial allocation is exhausted, a larger descriptor table and
  * map are allocated dynamically, and the pointers in the process's struct
  * filedesc are updated to point to those.  This is repeated every time
  * the process runs out of file descriptors (provided it hasn't hit its
  * resource limit).
  *
  * Since threads may hold references to individual descriptor table
  * entries, the tables are never freed.  Instead, they are placed on a
  * linked list and freed only when the struct filedesc is released.
  */
 #define NDFILE		20
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
 #define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
 #define NDSLOT(x)	((x) / NDENTRIES)
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
  * SLIST entry used to keep track of ofiles which must be reclaimed when
  * the process exits.
  */
 struct freetable {
 	struct fdescenttbl *ft_table;
 	SLIST_ENTRY(freetable) ft_next;
 };
 
 /*
  * Initial allocation: a filedesc structure + the head of SLIST used to
  * keep track of old ofiles + enough space for NDFILE descriptors.
  */
 
 struct fdescenttbl0 {
 	int	fdt_nfiles;
 	struct	filedescent fdt_ofiles[NDFILE];
 };
 
 struct filedesc0 {
 	struct filedesc fd_fd;
 	SLIST_HEAD(, freetable) fd_free;
 	struct	fdescenttbl0 fd_dfiles;
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
 };
 
 /*
  * Descriptor management.
  */
 static int __exclusive_cache_line openfiles; /* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
 void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /*
  * If low >= size, just return low. Otherwise find the first zero bit in the
  * given bitmap, starting at low and not exceeding size - 1. Return size if
  * not found.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, maxoff;
 
 	if (low >= size)
 		return (low);
 
 	off = NDSLOT(low);
 	if (low % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
 		if ((mask &= ~map[off]) != 0UL)
 			return (off * NDENTRIES + ffsl(mask) - 1);
 		++off;
 	}
 	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
 		if (map[off] != ~0UL)
 			return (off * NDENTRIES + ffsl(~map[off]) - 1);
 	return (size);
 }
 
 /*
  * Find the last used fd.
  *
  * Call this variant if fdp can't be modified by anyone else (e.g, during exec).
  * Otherwise use fdlastfile.
  */
 int
 fdlastfile_single(struct filedesc *fdp)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	int off, minoff;
 
 	off = NDSLOT(fdp->fd_nfiles - 1);
 	for (minoff = NDSLOT(0); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
 	return (-1);
 }
 
 int
 fdlastfile(struct filedesc *fdp)
 {
 
 	FILEDESC_LOCK_ASSERT(fdp);
 	return (fdlastfile_single(fdp));
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
 	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
 
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
 /*
  * Mark a file descriptor as used.
  */
 static void
 fdused_init(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
 
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 }
 
 static void
 fdused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fdused_init(fdp, fd);
 	if (fd == fdp->fd_freefile)
 		fdp->fd_freefile++;
 }
 
 /*
  * Mark a file descriptor as unused.
  */
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("fd=%d is still in use", fd));
 
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 }
 
 /*
  * Free a file descriptor.
  *
  * Avoid some work if fdp is about to be destroyed.
  */
 static inline void
 fdefree_last(struct filedescent *fde)
 {
 
 	filecaps_free(&fde->fde_caps);
 }
 
 static inline void
 fdfree(struct filedesc *fdp, int fd)
 {
 	struct filedescent *fde;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 	fde = &fdp->fd_ofiles[fd];
 #ifdef CAPABILITIES
 	seqc_write_begin(&fde->fde_seqc);
 #endif
 	fde->fde_file = NULL;
 #ifdef CAPABILITIES
 	seqc_write_end(&fde->fde_seqc);
 #endif
 	fdefree_last(fde);
 	fdunused(fdp, fd);
 }
 
 /*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdtablesize_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
 {
 #ifdef	RACCT
 	uint64_t lim;
 #endif
 
 	td->td_retval[0] = getmaxfd(td);
 #ifdef	RACCT
 	PROC_LOCK(td->td_proc);
 	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
 	PROC_UNLOCK(td->td_proc);
 	if (lim < td->td_retval[0])
 		td->td_retval[0] = lim;
 #endif
 	return (0);
 }
 
 /*
  * Duplicate a file descriptor to a particular value.
  *
  * Note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup2_args {
 	u_int	from;
 	u_int	to;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup2(struct thread *td, struct dup2_args *uap)
 {
 
 	return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
 }
 
 /*
  * Duplicate a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup_args {
 	u_int	fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup(struct thread *td, struct dup_args *uap)
 {
 
 	return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
 }
 
 /*
  * The file control system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fcntl_args {
 	int	fd;
 	int	cmd;
 	long	arg;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fcntl(struct thread *td, struct fcntl_args *uap)
 {
 
 	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
 }
 
 int
 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
 {
 	struct flock fl;
 	struct __oflock ofl;
 	intptr_t arg1;
 	int error, newcmd;
 
 	error = 0;
 	newcmd = cmd;
 	switch (cmd) {
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
 		/*
 		 * Convert old flock structure to new.
 		 */
 		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
 		fl.l_start = ofl.l_start;
 		fl.l_len = ofl.l_len;
 		fl.l_pid = ofl.l_pid;
 		fl.l_type = ofl.l_type;
 		fl.l_whence = ofl.l_whence;
 		fl.l_sysid = 0;
 
 		switch (cmd) {
 		case F_OGETLK:
 			newcmd = F_GETLK;
 			break;
 		case F_OSETLK:
 			newcmd = F_SETLK;
 			break;
 		case F_OSETLKW:
 			newcmd = F_SETLKW;
 			break;
 		}
 		arg1 = (intptr_t)&fl;
 		break;
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 	case F_SETLK_REMOTE:
 		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
 		arg1 = (intptr_t)&fl;
 		break;
 	default:
 		arg1 = arg;
 		break;
 	}
 	if (error)
 		return (error);
 	error = kern_fcntl(td, fd, newcmd, arg1);
 	if (error)
 		return (error);
 	if (cmd == F_OGETLK) {
 		ofl.l_start = fl.l_start;
 		ofl.l_len = fl.l_len;
 		ofl.l_pid = fl.l_pid;
 		ofl.l_type = fl.l_type;
 		ofl.l_whence = fl.l_whence;
 		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
 	} else if (cmd == F_GETLK) {
 		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
 	}
 	return (error);
 }
 
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
 	struct file *fp, *fp2;
 	struct filedescent *fde;
 	struct proc *p;
 	struct vnode *vp;
 	struct mount *mp;
 	int error, flg, seals, tmp;
 	uint64_t bsize;
 	off_t foffset;
 
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	AUDIT_ARG_FD(cmd);
 	AUDIT_ARG_CMD(cmd);
 	switch (cmd) {
 	case F_DUPFD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
 		break;
 
 	case F_DUPFD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_DUP2FD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
 		break;
 
 	case F_DUP2FD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_GETFD:
 		error = EBADF;
 		FILEDESC_SLOCK(fdp);
 		fde = fdeget_locked(fdp, fd);
 		if (fde != NULL) {
 			td->td_retval[0] =
 			    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 			error = 0;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		error = EBADF;
 		FILEDESC_XLOCK(fdp);
 		fde = fdeget_locked(fdp, fd);
 		if (fde != NULL) {
 			fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
 			    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 			error = 0;
 		}
 		FILEDESC_XUNLOCK(fdp);
 		break;
 
 	case F_GETFL:
 		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETFL, &fp);
 		if (error != 0)
 			break;
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		fdrop(fp, td);
 		break;
 
 	case F_SETFL:
 		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETFL, &fp);
 		if (error != 0)
 			break;
 		if (fp->f_ops == &path_fileops) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
 			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
 		} while (atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		tmp = fp->f_flag & FASYNC;
 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
 		if (error == 0) {
 			fdrop(fp, td);
 			break;
 		}
 		atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		tmp = 0;
 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_GETOWN:
 		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_GETOWN, &fp);
 		if (error != 0)
 			break;
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
 		fdrop(fp, td);
 		break;
 
 	case F_SETOWN:
 		error = fget_fcntl(td, fd, &cap_fcntl_rights, F_SETOWN, &fp);
 		if (error != 0)
 			break;
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLK_REMOTE:
 		error = priv_check(td, PRIV_NFS_LOCKD);
 		if (error != 0)
 			return (error);
 		flg = F_REMOTE;
 		goto do_setlk;
 
 	case F_SETLKW:
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
 	do_setlk:
 		flp = (struct flock *)arg;
 		if ((flg & F_REMOTE) != 0 && flp->l_sysid == 0) {
 			error = EINVAL;
 			break;
 		}
 
 		error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if (foffset < 0 ||
 			    (flp->l_start > 0 &&
 			     foffset > OFF_MAX - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 
 		vp = fp->f_vnode;
 		switch (flp->l_type) {
 		case F_RDLCK:
 			if ((fp->f_flag & FREAD) == 0) {
 				error = EBADF;
 				break;
 			}
 			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
 				PROC_LOCK(p->p_leader);
 				p->p_leader->p_flag |= P_ADVLOCK;
 				PROC_UNLOCK(p->p_leader);
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_WRLCK:
 			if ((fp->f_flag & FWRITE) == 0) {
 				error = EBADF;
 				break;
 			}
 			if ((p->p_leader->p_flag & P_ADVLOCK) == 0) {
 				PROC_LOCK(p->p_leader);
 				p->p_leader->p_flag |= P_ADVLOCK;
 				PROC_UNLOCK(p->p_leader);
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_UNLCK:
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
 			    flp, flg);
 			break;
 		case F_UNLCKSYS:
 			if (flg != F_REMOTE) {
 				error = EINVAL;
 				break;
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCKSYS, flp, flg);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error != 0 || flp->l_type == F_UNLCK ||
 		    flp->l_type == F_UNLCKSYS) {
 			fdrop(fp, td);
 			break;
 		}
 
 		/*
 		 * Check for a race with close.
 		 *
 		 * The vnode is now advisory locked (or unlocked, but this case
 		 * is not really important) as the caller requested.
 		 * We had to drop the filedesc lock, so we need to recheck if
 		 * the descriptor is still valid, because if it was closed
 		 * in the meantime we need to remove advisory lock from the
 		 * vnode - close on any descriptor leading to an advisory
 		 * locked vnode, removes that lock.
 		 * We will return 0 on purpose in that case, as the result of
 		 * successful advisory lock might have been externally visible
 		 * already. This is fine - effectively we pretend to the caller
 		 * that the closing thread was a bit slower and that the
 		 * advisory lock succeeded before the close.
 		 */
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp2);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (fp != fp2) {
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCK, flp, F_POSIX);
 		}
 		fdrop(fp, td);
 		fdrop(fp2, td);
 		break;
 
 	case F_GETLK:
 		error = fget_unlocked(fdp, fd, &cap_flock_rights, &fp);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
 			error = EINVAL;
 			fdrop(fp, td);
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if ((flp->l_start > 0 &&
 			    foffset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			    foffset < OFF_MIN - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 		vp = fp->f_vnode;
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
 		fdrop(fp, td);
 		break;
 
 	case F_ADD_SEALS:
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			break;
 		error = fo_add_seals(fp, arg);
 		fdrop(fp, td);
 		break;
 
 	case F_GET_SEALS:
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			break;
 		if (fo_get_seals(fp, &seals) == 0)
 			td->td_retval[0] = seals;
 		else
 			error = EINVAL;
 		fdrop(fp, td);
 		break;
 
 	case F_RDAHEAD:
 		arg = arg ? 128 * 1024: 0;
 		/* FALLTHROUGH */
 	case F_READAHEAD:
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE || fp->f_ops == &path_fileops) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		vp = fp->f_vnode;
 		if (vp->v_type != VREG) {
 			fdrop(fp, td);
 			error = ENOTTY;
 			break;
 		}
 
 		/*
 		 * Exclusive lock synchronizes against f_seqcount reads and
 		 * writes in sequential_heuristic().
 		 */
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (arg >= 0) {
 			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
 			arg = MIN(arg, INT_MAX - bsize + 1);
 			fp->f_seqcount[UIO_READ] = MIN(IO_SEQMAX,
 			    (arg + bsize - 1) / bsize);
 			atomic_set_int(&fp->f_flag, FRDAHEAD);
 		} else {
 			atomic_clear_int(&fp->f_flag, FRDAHEAD);
 		}
 		VOP_UNLOCK(vp);
 		fdrop(fp, td);
 		break;
 
 	case F_ISUNIONSTACK:
 		/*
 		 * Check if the vnode is part of a union stack (either the
 		 * "union" flag from mount(2) or unionfs).
 		 *
 		 * Prior to introduction of this op libc's readdir would call
 		 * fstatfs(2), in effect unnecessarily copying kilobytes of
 		 * data just to check fs name and a mount flag.
 		 *
 		 * Fixing the code to handle everything in the kernel instead
 		 * is a non-trivial endeavor and has low priority, thus this
 		 * horrible kludge facilitates the current behavior in a much
 		 * cheaper manner until someone(tm) sorts this out.
 		 */
 		error = fget_unlocked(fdp, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		vp = fp->f_vnode;
 		/*
 		 * Since we don't prevent dooming the vnode even non-null mp
 		 * found can become immediately stale. This is tolerable since
 		 * mount points are type-stable (providing safe memory access)
 		 * and any vfs op on this vnode going forward will return an
 		 * error (meaning return value in this case is meaningless).
 		 */
 		mp = atomic_load_ptr(&vp->v_mount);
 		if (__predict_false(mp == NULL)) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		td->td_retval[0] = 0;
 		if (mp->mnt_kern_flag & MNTK_UNIONFS ||
 		    mp->mnt_flag & MNT_UNION)
 			td->td_retval[0] = 1;
 		fdrop(fp, td);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static int
 getmaxfd(struct thread *td)
 {
 
 	return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
 }
 
 /*
  * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
  */
 int
 kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
 {
 	struct filedesc *fdp;
 	struct filedescent *oldfde, *newfde;
 	struct proc *p;
 	struct file *delfp, *oldfp;
 	u_long *oioctls, *nioctls;
 	int error, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	oioctls = NULL;
 
 	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
 	MPASS(mode < FDDUP_LASTMODE);
 
 	AUDIT_ARG_FD(old);
 	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
 
 	/*
 	 * Verify we have a valid descriptor to dup from and possibly to
 	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
 	 * return EINVAL when the new descriptor is out of bounds.
 	 */
 	if (old < 0)
 		return (EBADF);
 	if (new < 0)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 	maxfd = getmaxfd(td);
 	if (new >= maxfd)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 
 	error = EBADF;
 	FILEDESC_XLOCK(fdp);
 	if (fget_locked(fdp, old) == NULL)
 		goto unlock;
 	if (mode == FDDUP_FIXED && old == new) {
 		td->td_retval[0] = new;
 		if (flags & FDDUP_FLAG_CLOEXEC)
 			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
 		error = 0;
 		goto unlock;
 	}
 
 	oldfde = &fdp->fd_ofiles[old];
 	oldfp = oldfde->fde_file;
 	if (!fhold(oldfp))
 		goto unlock;
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
 	 * allocate a new descriptor the usual way.
 	 */
 	switch (mode) {
 	case FDDUP_NORMAL:
 	case FDDUP_FCNTL:
 		if ((error = fdalloc(td, new, &new)) != 0) {
 			fdrop(oldfp, td);
 			goto unlock;
 		}
 		break;
 	case FDDUP_FIXED:
 		if (new >= fdp->fd_nfiles) {
 			/*
 			 * The resource limits are here instead of e.g.
 			 * fdalloc(), because the file descriptor table may be
 			 * shared between processes, so we can't really use
 			 * racct_add()/racct_sub().  Instead of counting the
 			 * number of actually allocated descriptors, just put
 			 * the limit on the size of the file descriptor table.
 			 */
 #ifdef RACCT
 			if (RACCT_ENABLED()) {
 				error = racct_set_unlocked(p, RACCT_NOFILE, new + 1);
 				if (error != 0) {
 					error = EMFILE;
 					fdrop(oldfp, td);
 					goto unlock;
 				}
 			}
 #endif
 			fdgrowtable_exp(fdp, new + 1);
 		}
 		if (!fdisused(fdp, new))
 			fdused(fdp, new);
 		break;
 	default:
 		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
 	}
 
 	KASSERT(old != new, ("new fd is same as old"));
 
 	/* Refetch oldfde because the table may have grown and old one freed. */
 	oldfde = &fdp->fd_ofiles[old];
 	KASSERT(oldfp == oldfde->fde_file,
 	    ("fdt_ofiles shift from growth observed at fd %d",
 	    old));
 
 	newfde = &fdp->fd_ofiles[new];
 	delfp = newfde->fde_file;
 
 	nioctls = filecaps_copy_prep(&oldfde->fde_caps);
 
 	/*
 	 * Duplicate the source descriptor.
 	 */
 #ifdef CAPABILITIES
 	seqc_write_begin(&newfde->fde_seqc);
 #endif
 	oioctls = filecaps_free_prep(&newfde->fde_caps);
 	memcpy(newfde, oldfde, fde_change_size);
 	filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
 	    nioctls);
 	if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
 		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
 	else
 		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
 #ifdef CAPABILITIES
 	seqc_write_end(&newfde->fde_seqc);
 #endif
 	td->td_retval[0] = new;
 
 	error = 0;
 
 	if (delfp != NULL) {
 		(void) closefp(fdp, new, delfp, td, true, false);
 		FILEDESC_UNLOCK_ASSERT(fdp);
 	} else {
 unlock:
 		FILEDESC_XUNLOCK(fdp);
 	}
 
 	filecaps_free_finish(oioctls);
 	return (error);
 }
 
 static void
 sigiofree(struct sigio *sigio)
 {
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 }
 
 static struct sigio *
 funsetown_locked(struct sigio *sigio)
 {
 	struct proc *p;
 	struct pgrp *pg;
 
 	SIGIO_ASSERT_LOCKED();
 
 	if (sigio == NULL)
 		return (NULL);
 	*sigio->sio_myref = NULL;
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		p = sigio->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&p->p_sigiolst, sigio, sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	return (sigio);
 }
 
 /*
  * If sigio is on the list associated with a process or process group,
  * disable signalling from the device, remove sigio from the list and
  * free sigio.
  */
 void
 funsetown(struct sigio **sigiop)
 {
 	struct sigio *sigio;
 
 	/* Racy check, consumers must provide synchronization. */
 	if (*sigiop == NULL)
 		return;
 
 	SIGIO_LOCK();
 	sigio = funsetown_locked(*sigiop);
 	SIGIO_UNLOCK();
 	if (sigio != NULL)
 		sigiofree(sigio);
 }
 
 /*
  * Free a list of sigio structures.  The caller must ensure that new sigio
  * structures cannot be added after this point.  For process groups this is
  * guaranteed using the proctree lock; for processes, the P_WEXIT flag serves
  * as an interlock.
  */
 void
 funsetownlst(struct sigiolst *sigiolst)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	struct sigio *sigio, *tmp;
 
 	/* Racy check. */
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL)
 		return;
 
 	p = NULL;
 	pg = NULL;
 
 	SIGIO_LOCK();
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 
 	/*
 	 * Every entry of the list should belong to a single proc or pgrp.
 	 */
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		sx_assert(&proctree_lock, SX_XLOCKED);
 		PGRP_LOCK(pg);
 	} else /* if (sigio->sio_pgid > 0) */ {
 		p = sigio->sio_proc;
 		PROC_LOCK(p);
 		KASSERT((p->p_flag & P_WEXIT) != 0,
 		    ("%s: process %p is not exiting", __func__, p));
 	}
 
 	SLIST_FOREACH(sigio, sigiolst, sio_pgsigio) {
 		*sigio->sio_myref = NULL;
 		if (pg != NULL) {
 			KASSERT(sigio->sio_pgid < 0,
 			    ("Proc sigio in pgrp sigio list"));
 			KASSERT(sigio->sio_pgrp == pg,
 			    ("Bogus pgrp in sigio list"));
 		} else /* if (p != NULL) */ {
 			KASSERT(sigio->sio_pgid > 0,
 			    ("Pgrp sigio in proc sigio list"));
 			KASSERT(sigio->sio_proc == p,
 			    ("Bogus proc in sigio list"));
 		}
 	}
 
 	if (pg != NULL)
 		PGRP_UNLOCK(pg);
 	else
 		PROC_UNLOCK(p);
 	SIGIO_UNLOCK();
 
 	SLIST_FOREACH_SAFE(sigio, sigiolst, sio_pgsigio, tmp)
 		sigiofree(sigio);
 }
 
 /*
  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
  *
  * After permission checking, add a sigio structure to the sigio list for
  * the process or process group.
  */
 int
 fsetown(pid_t pgid, struct sigio **sigiop)
 {
 	struct proc *proc;
 	struct pgrp *pgrp;
 	struct sigio *osigio, *sigio;
 	int ret;
 
 	if (pgid == 0) {
 		funsetown(sigiop);
 		return (0);
 	}
 
 	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
 	sigio->sio_pgid = pgid;
 	sigio->sio_ucred = crhold(curthread->td_ucred);
 	sigio->sio_myref = sigiop;
 
 	ret = 0;
 	if (pgid > 0) {
 		ret = pget(pgid, PGET_NOTWEXIT | PGET_NOTID | PGET_HOLD, &proc);
 		SIGIO_LOCK();
 		osigio = funsetown_locked(*sigiop);
 		if (ret == 0) {
 			PROC_LOCK(proc);
 			_PRELE(proc);
 			if ((proc->p_flag & P_WEXIT) != 0) {
 				ret = ESRCH;
 			} else if (proc->p_session !=
 			    curthread->td_proc->p_session) {
 				/*
 				 * Policy - Don't allow a process to FSETOWN a
 				 * process in another session.
 				 *
 				 * Remove this test to allow maximum flexibility
 				 * or restrict FSETOWN to the current process or
 				 * process group for maximum safety.
 				 */
 				ret = EPERM;
 			} else {
 				sigio->sio_proc = proc;
 				SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio,
 				    sio_pgsigio);
 			}
 			PROC_UNLOCK(proc);
 		}
 	} else /* if (pgid < 0) */ {
 		sx_slock(&proctree_lock);
 		SIGIO_LOCK();
 		osigio = funsetown_locked(*sigiop);
 		pgrp = pgfind(-pgid);
 		if (pgrp == NULL) {
 			ret = ESRCH;
 		} else {
 			if (pgrp->pg_session != curthread->td_proc->p_session) {
 				/*
 				 * Policy - Don't allow a process to FSETOWN a
 				 * process in another session.
 				 *
 				 * Remove this test to allow maximum flexibility
 				 * or restrict FSETOWN to the current process or
 				 * process group for maximum safety.
 				 */
 				ret = EPERM;
 			} else {
 				sigio->sio_pgrp = pgrp;
 				SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio,
 				    sio_pgsigio);
 			}
 			PGRP_UNLOCK(pgrp);
 		}
 		sx_sunlock(&proctree_lock);
 	}
 	if (ret == 0)
 		*sigiop = sigio;
 	SIGIO_UNLOCK();
 	if (osigio != NULL)
 		sigiofree(osigio);
 	return (ret);
 }
 
 /*
  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
  */
 pid_t
 fgetown(struct sigio **sigiop)
 {
 	pid_t pgid;
 
 	SIGIO_LOCK();
 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
 	SIGIO_UNLOCK();
 	return (pgid);
 }
 
 static int
 closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     bool audit)
 {
 	int error;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	/*
 	 * We now hold the fp reference that used to be owned by the
 	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
 	 * knote_fdclose to prevent a race of the fd getting opened, a knote
 	 * added, and deleteing a knote for the new fd.
 	 */
 	if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
 		knote_fdclose(td, fd);
 
 	/*
 	 * We need to notify mqueue if the object is of type mqueue.
 	 */
 	if (__predict_false(fp->f_type == DTYPE_MQUEUE))
 		mq_fdclose(td, fd, fp);
 	FILEDESC_XUNLOCK(fdp);
 
 #ifdef AUDIT
 	if (AUDITING_TD(td) && audit)
 		audit_sysclose(td, fd, fp);
 #endif
 	error = closef(fp, td);
 
 	/*
 	 * All paths leading up to closefp() will have already removed or
 	 * replaced the fd in the filedesc table, so a restart would not
 	 * operate on the same file.
 	 */
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 static int
 closefp_hl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     bool holdleaders, bool audit)
 {
 	int error;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (holdleaders) {
 		if (td->td_proc->p_fdtol != NULL) {
 			/*
 			 * Ask fdfree() to sleep to ensure that all relevant
 			 * process leaders can be traversed in closef().
 			 */
 			fdp->fd_holdleaderscount++;
 		} else {
 			holdleaders = false;
 		}
 	}
 
 	error = closefp_impl(fdp, fd, fp, td, audit);
 	if (holdleaders) {
 		FILEDESC_XLOCK(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
 		FILEDESC_XUNLOCK(fdp);
 	}
 	return (error);
 }
 
 static int
 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     bool holdleaders, bool audit)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (__predict_false(td->td_proc->p_fdtol != NULL)) {
 		return (closefp_hl(fdp, fd, fp, td, holdleaders, audit));
 	} else {
 		return (closefp_impl(fdp, fd, fp, td, audit));
 	}
 }
 
 /*
  * Close a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct close_args {
 	int     fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_close(struct thread *td, struct close_args *uap)
 {
 
 	return (kern_close(td, uap->fd));
 }
 
 int
 kern_close(struct thread *td, int fd)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 
 	fdp = td->td_proc->p_fd;
 
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, fd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	fdfree(fdp, fd);
 
 	/* closefp() drops the FILEDESC lock for us. */
 	return (closefp(fdp, fd, fp, td, true, true));
 }
 
 int
 kern_close_range(struct thread *td, u_int lowfd, u_int highfd)
 {
 	struct filedesc *fdp;
 	const struct fdescenttbl *fdt;
 	struct file *fp;
 	int fd;
 
 	/*
 	 * Check this prior to clamping; closefrom(3) with only fd 0, 1, and 2
 	 * open should not be a usage error.  From a close_range() perspective,
 	 * close_range(3, ~0U, 0) in the same scenario should also likely not
 	 * be a usage error as all fd above 3 are in-fact already closed.
 	 */
 	if (highfd < lowfd) {
 		return (EINVAL);
 	}
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	fdt = atomic_load_ptr(&fdp->fd_files);
 	highfd = MIN(highfd, fdt->fdt_nfiles - 1);
 	fd = lowfd;
 	if (__predict_false(fd > highfd)) {
 		goto out_locked;
 	}
 	for (;;) {
 		fp = fdt->fdt_ofiles[fd].fde_file;
 		if (fp == NULL) {
 			if (fd == highfd)
 				goto out_locked;
 		} else {
 			fdfree(fdp, fd);
 			(void) closefp(fdp, fd, fp, td, true, true);
 			if (fd == highfd)
 				goto out_unlocked;
 			FILEDESC_XLOCK(fdp);
 			fdt = atomic_load_ptr(&fdp->fd_files);
 		}
 		fd++;
 	}
 out_locked:
 	FILEDESC_XUNLOCK(fdp);
 out_unlocked:
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct close_range_args {
 	u_int	lowfd;
 	u_int	highfd;
 	int	flags;
 };
 #endif
 int
 sys_close_range(struct thread *td, struct close_range_args *uap)
 {
 
 	AUDIT_ARG_FD(uap->lowfd);
 	AUDIT_ARG_CMD(uap->highfd);
 	AUDIT_ARG_FFLAGS(uap->flags);
 
 	/* No flags currently defined */
 	if (uap->flags != 0)
 		return (EINVAL);
 	return (kern_close_range(td, uap->lowfd, uap->highfd));
 }
 
 #ifdef COMPAT_FREEBSD12
 /*
  * Close open file descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd12_closefrom_args {
 	int	lowfd;
 };
 #endif
 /* ARGSUSED */
 int
 freebsd12_closefrom(struct thread *td, struct freebsd12_closefrom_args *uap)
 {
 	u_int lowfd;
 
 	AUDIT_ARG_FD(uap->lowfd);
 
 	/*
 	 * Treat negative starting file descriptor values identical to
 	 * closefrom(0) which closes all files.
 	 */
 	lowfd = MAX(0, uap->lowfd);
 	return (kern_close_range(td, lowfd, ~0U));
 }
 #endif	/* COMPAT_FREEBSD12 */
 
 #if defined(COMPAT_43)
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ofstat_args {
 	int	fd;
 	struct	ostat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
 {
 	struct ostat oub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtstat(&ub, &oub);
 		error = copyout(&oub, uap->sb, sizeof(oub));
 	}
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_FREEBSD11)
 int
 freebsd11_fstat(struct thread *td, struct freebsd11_fstat_args *uap)
 {
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->sb, sizeof(osb));
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD11 */
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstat_args {
 	int	fd;
 	struct	stat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fstat(struct thread *td, struct fstat_args *uap)
 {
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0)
 		error = copyout(&ub, uap->sb, sizeof(ub));
 	return (error);
 }
 
 int
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 
 	error = fget(td, fd, &cap_fstat_rights, &fp);
 	if (__predict_false(error != 0))
 		return (error);
 
 	AUDIT_ARG_FILE(td->td_proc, fp);
 
-	error = fo_stat(fp, sbp, td->td_ucred, td);
+	error = fo_stat(fp, sbp, td->td_ucred);
 	fdrop(fp, td);
 #ifdef __STAT_TIME_T_EXT
 	sbp->st_atim_ext = 0;
 	sbp->st_mtim_ext = 0;
 	sbp->st_ctim_ext = 0;
 	sbp->st_btim_ext = 0;
 #endif
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrstat_error(sbp, error);
 #endif
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD11)
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd11_nfstat_args {
 	int	fd;
 	struct	nstat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 freebsd11_nfstat(struct thread *td, struct freebsd11_nfstat_args *uap)
 {
 	struct nstat nub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		freebsd11_cvtnstat(&ub, &nub);
 		error = copyout(&nub, uap->sb, sizeof(nub));
 	}
 	return (error);
 }
 #endif /* COMPAT_FREEBSD11 */
 
 /*
  * Return pathconf information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fpathconf_args {
 	int	fd;
 	int	name;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
 {
 	long value;
 	int error;
 
 	error = kern_fpathconf(td, uap->fd, uap->name, &value);
 	if (error == 0)
 		td->td_retval[0] = value;
 	return (error);
 }
 
 int
 kern_fpathconf(struct thread *td, int fd, int name, long *valuep)
 {
 	struct file *fp;
 	struct vnode *vp;
 	int error;
 
 	error = fget(td, fd, &cap_fpathconf_rights, &fp);
 	if (error != 0)
 		return (error);
 
 	if (name == _PC_ASYNC_IO) {
 		*valuep = _POSIX_ASYNCHRONOUS_IO;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_PATHCONF(vp, name, valuep);
 		VOP_UNLOCK(vp);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			*valuep = PIPE_BUF;
 			error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
 	}
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Copy filecaps structure allocating memory for ioctls array if needed.
  *
  * The last parameter indicates whether the fdtable is locked. If it is not and
  * ioctls are encountered, copying fails and the caller must lock the table.
  *
  * Note that if the table was not locked, the caller has to check the relevant
  * sequence counter to determine whether the operation was successful.
  */
 bool
 filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
 {
 	size_t size;
 
 	if (src->fc_ioctls != NULL && !locked)
 		return (false);
 	memcpy(dst, src, sizeof(*src));
 	if (src->fc_ioctls == NULL)
 		return (true);
 
 	KASSERT(src->fc_nioctls > 0,
 	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 
 	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
 	memcpy(dst->fc_ioctls, src->fc_ioctls, size);
 	return (true);
 }
 
 static u_long *
 filecaps_copy_prep(const struct filecaps *src)
 {
 	u_long *ioctls;
 	size_t size;
 
 	if (__predict_true(src->fc_ioctls == NULL))
 		return (NULL);
 
 	KASSERT(src->fc_nioctls > 0,
 	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 
 	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	ioctls = malloc(size, M_FILECAPS, M_WAITOK);
 	return (ioctls);
 }
 
 static void
 filecaps_copy_finish(const struct filecaps *src, struct filecaps *dst,
     u_long *ioctls)
 {
 	size_t size;
 
 	*dst = *src;
 	if (__predict_true(src->fc_ioctls == NULL)) {
 		MPASS(ioctls == NULL);
 		return;
 	}
 
 	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	dst->fc_ioctls = ioctls;
 	bcopy(src->fc_ioctls, dst->fc_ioctls, size);
 }
 
 /*
  * Move filecaps structure to the new place and clear the old place.
  */
 void
 filecaps_move(struct filecaps *src, struct filecaps *dst)
 {
 
 	*dst = *src;
 	bzero(src, sizeof(*src));
 }
 
 /*
  * Fill the given filecaps structure with full rights.
  */
 static void
 filecaps_fill(struct filecaps *fcaps)
 {
 
 	CAP_ALL(&fcaps->fc_rights);
 	fcaps->fc_ioctls = NULL;
 	fcaps->fc_nioctls = -1;
 	fcaps->fc_fcntls = CAP_FCNTL_ALL;
 }
 
 /*
  * Free memory allocated within filecaps structure.
  */
 void
 filecaps_free(struct filecaps *fcaps)
 {
 
 	free(fcaps->fc_ioctls, M_FILECAPS);
 	bzero(fcaps, sizeof(*fcaps));
 }
 
 static u_long *
 filecaps_free_prep(struct filecaps *fcaps)
 {
 	u_long *ioctls;
 
 	ioctls = fcaps->fc_ioctls;
 	bzero(fcaps, sizeof(*fcaps));
 	return (ioctls);
 }
 
 static void
 filecaps_free_finish(u_long *ioctls)
 {
 
 	free(ioctls, M_FILECAPS);
 }
 
 /*
  * Validate the given filecaps structure.
  */
 static void
 filecaps_validate(const struct filecaps *fcaps, const char *func)
 {
 
 	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
 	    ("%s: invalid rights", func));
 	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
 	    ("%s: invalid fcntls", func));
 	KASSERT(fcaps->fc_fcntls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
 	    ("%s: fcntls without CAP_FCNTL", func));
 	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
 	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
 	    ("%s: invalid ioctls", func));
 	KASSERT(fcaps->fc_nioctls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
 	    ("%s: ioctls without CAP_IOCTL", func));
 }
 
 static void
 fdgrowtable_exp(struct filedesc *fdp, int nfd)
 {
 	int nfd1;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	nfd1 = fdp->fd_nfiles * 2;
 	if (nfd1 < nfd)
 		nfd1 = nfd;
 	fdgrowtable(fdp, nfd1);
 }
 
 /*
  * Grow the file table to accommodate (at least) nfd descriptors.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft;
 	struct fdescenttbl *ntable;
 	struct fdescenttbl *otable;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap, *omap;
 
 	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
 
 	/* save old values */
 	onfiles = fdp->fd_nfiles;
 	otable = fdp->fd_files;
 	omap = fdp->fd_map;
 
 	/* compute the size of the new table */
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
 	/*
 	 * Allocate a new table.  We need enough space for the number of
 	 * entries, file entries themselves and the struct freetable we will use
 	 * when we decommission the table and place it on the freelist.
 	 * We place the struct freetable in the middle so we don't have
 	 * to worry about padding.
 	 */
 	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
 	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
 	    sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	/* copy the old data */
 	ntable->fdt_nfiles = nnfiles;
 	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
 	    onfiles * sizeof(ntable->fdt_ofiles[0]));
 
 	/*
 	 * Allocate a new map only if the old is not large enough.  It will
 	 * grow at a slower rate than the table as it can map more
 	 * entries than the table can hold.
 	 */
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
 		    M_ZERO | M_WAITOK);
 		/* copy over the old data and update the pointer */
 		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
 		fdp->fd_map = nmap;
 	}
 
 	/*
 	 * Make sure that ntable is correctly initialized before we replace
 	 * fd_files poiner. Otherwise fget_unlocked() may see inconsistent
 	 * data.
 	 */
 	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
 
 	/*
 	 * Free the old file table when not shared by other threads or processes.
 	 * The old file table is considered to be shared when either are true:
 	 * - The process has more than one thread.
 	 * - The file descriptor table has been shared via fdshare().
 	 *
 	 * When shared, the old file table will be placed on a freelist
 	 * which will be processed when the struct filedesc is released.
 	 *
 	 * Note that if onfiles == NDFILE, we're dealing with the original
 	 * static allocation contained within (struct filedesc0 *)fdp,
 	 * which must not be freed.
 	 */
 	if (onfiles > NDFILE) {
 		/*
 		 * Note we may be called here from fdinit while allocating a
 		 * table for a new process in which case ->p_fd points
 		 * elsewhere.
 		 */
 		if (curproc->p_fd != fdp || FILEDESC_IS_ONLY_USER(fdp)) {
 			free(otable, M_FILEDESC);
 		} else {
 			ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
 			fdp0 = (struct filedesc0 *)fdp;
 			ft->ft_table = otable;
 			SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
 		}
 	}
 	/*
 	 * The map does not have the same possibility of threads still
 	 * holding references to it.  So always free it as long as it
 	 * does not reference the original static allocation.
 	 */
 	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 		free(omap, M_FILEDESC);
 }
 
 /*
  * Allocate a file descriptor for the process.
  */
 int
 fdalloc(struct thread *td, int minfd, int *result)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int fd, maxfd, allocfd;
 #ifdef RACCT
 	int error;
 #endif
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;
 
 	maxfd = getmaxfd(td);
 
 	/*
 	 * Search the bitmap for a free descriptor starting at minfd.
 	 * If none is found, grow the file table.
 	 */
 	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 	if (__predict_false(fd >= maxfd))
 		return (EMFILE);
 	if (__predict_false(fd >= fdp->fd_nfiles)) {
 		allocfd = min(fd * 2, maxfd);
 #ifdef RACCT
 		if (RACCT_ENABLED()) {
 			error = racct_set_unlocked(p, RACCT_NOFILE, allocfd);
 			if (error != 0)
 				return (EMFILE);
 		}
 #endif
 		/*
 		 * fd is already equal to first free descriptor >= minfd, so
 		 * we only need to grow the table and we are done.
 		 */
 		fdgrowtable_exp(fdp, allocfd);
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
 	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
 	    ("invalid descriptor %d", fd));
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("file descriptor isn't free"));
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
 }
 
 /*
  * Allocate n file descriptors for the process.
  */
 int
 fdallocn(struct thread *td, int minfd, int *fds, int n)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int i;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	for (i = 0; i < n; i++)
 		if (fdalloc(td, 0, &fds[i]) != 0)
 			break;
 
 	if (i < n) {
 		for (i--; i >= 0; i--)
 			fdunused(fdp, fds[i]);
 		return (EMFILE);
 	}
 
 	return (0);
 }
 
 /*
  * Create a new open file structure and allocate a file descriptor for the
  * process that refers to it.  We add one reference to the file for the
  * descriptor table and one reference for resultfp. This is to prevent us
  * being preempted and the entry in the descriptor table closed after we
  * release the FILEDESC lock.
  */
 int
 falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
     struct filecaps *fcaps)
 {
 	struct file *fp;
 	int error, fd;
 
 	MPASS(resultfp != NULL);
 	MPASS(resultfd != NULL);
 
 	error = _falloc_noinstall(td, &fp, 2);
 	if (__predict_false(error != 0)) {
 		return (error);
 	}
 
 	error = finstall_refed(td, fp, &fd, flags, fcaps);
 	if (__predict_false(error != 0)) {
 		falloc_abort(td, fp);
 		return (error);
 	}
 
 	*resultfp = fp;
 	*resultfd = fd;
 
 	return (0);
 }
 
 /*
  * Create a new open file structure without allocating a file descriptor.
  */
 int
 _falloc_noinstall(struct thread *td, struct file **resultfp, u_int n)
 {
 	struct file *fp;
 	int maxuserfiles = maxfiles - (maxfiles / 20);
 	int openfiles_new;
 	static struct timeval lastfail;
 	static int curfail;
 
 	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
 	MPASS(n > 0);
 
 	openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
 	if ((openfiles_new >= maxuserfiles &&
 	    priv_check(td, PRIV_MAXFILES) != 0) ||
 	    openfiles_new >= maxfiles) {
 		atomic_subtract_int(&openfiles, 1);
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, (%s) "
 			    "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
 		}
 		return (ENFILE);
 	}
 	fp = uma_zalloc(file_zone, M_WAITOK);
 	bzero(fp, sizeof(*fp));
 	refcount_init(&fp->f_count, n);
 	fp->f_cred = crhold(td->td_ucred);
 	fp->f_ops = &badfileops;
 	*resultfp = fp;
 	return (0);
 }
 
 void
 falloc_abort(struct thread *td, struct file *fp)
 {
 
 	/*
 	 * For assertion purposes.
 	 */
 	refcount_init(&fp->f_count, 0);
 	_fdrop(fp, td);
 }
 
 /*
  * Install a file in a file descriptor table.
  */
 void
 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedescent *fde;
 
 	MPASS(fp != NULL);
 	if (fcaps != NULL)
 		filecaps_validate(fcaps, __func__);
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fde = &fdp->fd_ofiles[fd];
 #ifdef CAPABILITIES
 	seqc_write_begin(&fde->fde_seqc);
 #endif
 	fde->fde_file = fp;
 	fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
 	if (fcaps != NULL)
 		filecaps_move(fcaps, &fde->fde_caps);
 	else
 		filecaps_fill(&fde->fde_caps);
 #ifdef CAPABILITIES
 	seqc_write_end(&fde->fde_seqc);
 #endif
 }
 
 int
 finstall_refed(struct thread *td, struct file *fp, int *fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 
 	MPASS(fd != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	error = fdalloc(td, 0, fd);
 	if (__predict_true(error == 0)) {
 		_finstall(fdp, fp, *fd, flags, fcaps);
 	}
 	FILEDESC_XUNLOCK(fdp);
 	return (error);
 }
 
 int
 finstall(struct thread *td, struct file *fp, int *fd, int flags,
     struct filecaps *fcaps)
 {
 	int error;
 
 	MPASS(fd != NULL);
 
 	if (!fhold(fp))
 		return (EBADF);
 	error = finstall_refed(td, fp, fd, flags, fcaps);
 	if (__predict_false(error != 0)) {
 		fdrop(fp, td);
 	}
 	return (error);
 }
 
 /*
  * Build a new filedesc structure from another.
  *
  * If fdp is not NULL, return with it shared locked.
  */
 struct filedesc *
 fdinit(struct filedesc *fdp, bool prepfiles, int *lastfile)
 {
 	struct filedesc0 *newfdp0;
 	struct filedesc *newfdp;
 
 	if (prepfiles)
 		MPASS(lastfile != NULL);
 	else
 		MPASS(lastfile == NULL);
 
 	newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
 	newfdp = &newfdp0->fd_fd;
 
 	/* Create the file descriptor table. */
 	FILEDESC_LOCK_INIT(newfdp);
 	refcount_init(&newfdp->fd_refcnt, 1);
 	refcount_init(&newfdp->fd_holdcnt, 1);
 	newfdp->fd_map = newfdp0->fd_dmap;
 	newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
 	newfdp->fd_files->fdt_nfiles = NDFILE;
 
 	if (fdp == NULL)
 		return (newfdp);
 
 	FILEDESC_SLOCK(fdp);
 	if (!prepfiles) {
 		FILEDESC_SUNLOCK(fdp);
 		return (newfdp);
 	}
 
 	for (;;) {
 		*lastfile = fdlastfile(fdp);
 		if (*lastfile < newfdp->fd_nfiles)
 			break;
 		FILEDESC_SUNLOCK(fdp);
 		fdgrowtable(newfdp, *lastfile + 1);
 		FILEDESC_SLOCK(fdp);
 	}
 
 	return (newfdp);
 }
 
 /*
  * Build a pwddesc structure from another.
  * Copy the current, root, and jail root vnode references.
  *
  * If pdp is not NULL, return with it shared locked.
  */
 struct pwddesc *
 pdinit(struct pwddesc *pdp, bool keeplock)
 {
 	struct pwddesc *newpdp;
 	struct pwd *newpwd;
 
 	newpdp = malloc(sizeof(*newpdp), M_PWDDESC, M_WAITOK | M_ZERO);
 
 	PWDDESC_LOCK_INIT(newpdp);
 	refcount_init(&newpdp->pd_refcount, 1);
 	newpdp->pd_cmask = CMASK;
 
 	if (pdp == NULL) {
 		newpwd = pwd_alloc();
 		smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
 		return (newpdp);
 	}
 
 	PWDDESC_XLOCK(pdp);
 	newpwd = pwd_hold_pwddesc(pdp);
 	smr_serialized_store(&newpdp->pd_pwd, newpwd, true);
 	if (!keeplock)
 		PWDDESC_XUNLOCK(pdp);
 	return (newpdp);
 }
 
 /*
  * Hold either filedesc or pwddesc of the passed process.
  *
  * The process lock is used to synchronize against the target exiting and
  * freeing the data.
  *
  * Clearing can be ilustrated in 3 steps:
  * 1. set the pointer to NULL. Either routine can race against it, hence
  *   atomic_load_ptr.
  * 2. observe the process lock as not taken. Until then fdhold/pdhold can
  *   race to either still see the pointer or find NULL. It is still safe to
  *   grab a reference as clearing is stalled.
  * 3. after the lock is observed as not taken, any fdhold/pdhold calls are
  *   guaranteed to see NULL, making it safe to finish clearing
  */
 static struct filedesc *
 fdhold(struct proc *p)
 {
 	struct filedesc *fdp;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	fdp = atomic_load_ptr(&p->p_fd);
 	if (fdp != NULL)
 		refcount_acquire(&fdp->fd_holdcnt);
 	return (fdp);
 }
 
 static struct pwddesc *
 pdhold(struct proc *p)
 {
 	struct pwddesc *pdp;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	pdp = atomic_load_ptr(&p->p_pd);
 	if (pdp != NULL)
 		refcount_acquire(&pdp->pd_refcount);
 	return (pdp);
 }
 
 static void
 fddrop(struct filedesc *fdp)
 {
 
 	if (refcount_load(&fdp->fd_holdcnt) > 1) {
 		if (refcount_release(&fdp->fd_holdcnt) == 0)
 			return;
 	}
 
 	FILEDESC_LOCK_DESTROY(fdp);
 	uma_zfree(filedesc0_zone, fdp);
 }
 
 static void
 pddrop(struct pwddesc *pdp)
 {
 	struct pwd *pwd;
 
 	if (refcount_release_if_not_last(&pdp->pd_refcount))
 		return;
 
 	PWDDESC_XLOCK(pdp);
 	if (refcount_release(&pdp->pd_refcount) == 0) {
 		PWDDESC_XUNLOCK(pdp);
 		return;
 	}
 	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	pwd_set(pdp, NULL);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(pwd);
 
 	PWDDESC_LOCK_DESTROY(pdp);
 	free(pdp, M_PWDDESC);
 }
 
 /*
  * Share a filedesc structure.
  */
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
 
 	refcount_acquire(&fdp->fd_refcnt);
 	return (fdp);
 }
 
 /*
  * Share a pwddesc structure.
  */
 struct pwddesc *
 pdshare(struct pwddesc *pdp)
 {
 	refcount_acquire(&pdp->pd_refcount);
 	return (pdp);
 }
 
 /*
  * Unshare a filedesc structure, if necessary by making a copy
  */
 void
 fdunshare(struct thread *td)
 {
 	struct filedesc *tmp;
 	struct proc *p = td->td_proc;
 
 	if (refcount_load(&p->p_fd->fd_refcnt) == 1)
 		return;
 
 	tmp = fdcopy(p->p_fd);
 	fdescfree(td);
 	p->p_fd = tmp;
 }
 
 /*
  * Unshare a pwddesc structure.
  */
 void
 pdunshare(struct thread *td)
 {
 	struct pwddesc *pdp;
 	struct proc *p;
 
 	p = td->td_proc;
 	/* Not shared. */
 	if (p->p_pd->pd_refcount == 1)
 		return;
 
 	pdp = pdcopy(p->p_pd);
 	pdescfree(td);
 	p->p_pd = pdp;
 }
 
 /*
  * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
  * this is to ease callers, not catch errors.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int i, lastfile;
 
 	MPASS(fdp != NULL);
 
 	newfdp = fdinit(fdp, true, &lastfile);
 	/* copy all passable descriptors (i.e. not kqueue) */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= lastfile; ++i) {
 		ofde = &fdp->fd_ofiles[i];
 		if (ofde->fde_file == NULL ||
 		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0 ||
 		    !fhold(ofde->fde_file)) {
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
 			continue;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 		fdused_init(newfdp, i);
 	}
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	FILEDESC_SUNLOCK(fdp);
 	return (newfdp);
 }
 
 /*
  * Copy a pwddesc structure.
  */
 struct pwddesc *
 pdcopy(struct pwddesc *pdp)
 {
 	struct pwddesc *newpdp;
 
 	MPASS(pdp != NULL);
 
 	newpdp = pdinit(pdp, true);
 	newpdp->pd_cmask = pdp->pd_cmask;
 	PWDDESC_XUNLOCK(pdp);
 	return (newpdp);
 }
 
 /*
  * Clear POSIX style locks. This is only used when fdp looses a reference (i.e.
  * one of processes using it exits) and the table used to be shared.
  */
 static void
 fdclearlocks(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedesc_to_leader *fdtol;
 	struct flock lf;
 	struct file *fp;
 	struct proc *p;
 	struct vnode *vp;
 	int i, lastfile;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	fdtol = p->p_fdtol;
 	MPASS(fdtol != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	KASSERT(fdtol->fdl_refcount > 0,
 	    ("filedesc_to_refcount botch: fdl_refcount=%d",
 	    fdtol->fdl_refcount));
 	if (fdtol->fdl_refcount == 1 &&
 	    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 		lastfile = fdlastfile(fdp);
 		for (i = 0; i <= lastfile; i++) {
 			fp = fdp->fd_ofiles[i].fde_file;
 			if (fp == NULL || fp->f_type != DTYPE_VNODE ||
 			    !fhold(fp))
 				continue;
 			FILEDESC_XUNLOCK(fdp);
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			vp = fp->f_vnode;
 			(void) VOP_ADVLOCK(vp,
 			    (caddr_t)p->p_leader, F_UNLCK,
 			    &lf, F_POSIX);
 			FILEDESC_XLOCK(fdp);
 			fdrop(fp, td);
 		}
 	}
 retry:
 	if (fdtol->fdl_refcount == 1) {
 		if (fdp->fd_holdleaderscount > 0 &&
 		    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 			/*
 			 * close() or kern_dup() has cleared a reference
 			 * in a shared file descriptor table.
 			 */
 			fdp->fd_holdleaderswakeup = 1;
 			sx_sleep(&fdp->fd_holdleaderscount,
 			    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 			goto retry;
 		}
 		if (fdtol->fdl_holdcount > 0) {
 			/*
 			 * Ensure that fdtol->fdl_leader remains
 			 * valid in closef().
 			 */
 			fdtol->fdl_wakeup = 1;
 			sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
 			    "fdlhold", 0);
 			goto retry;
 		}
 	}
 	fdtol->fdl_refcount--;
 	if (fdtol->fdl_refcount == 0 &&
 	    fdtol->fdl_holdcount == 0) {
 		fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 		fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 	} else
 		fdtol = NULL;
 	p->p_fdtol = NULL;
 	FILEDESC_XUNLOCK(fdp);
 	if (fdtol != NULL)
 		free(fdtol, M_FILEDESC_TO_LEADER);
 }
 
 /*
  * Release a filedesc structure.
  */
 static void
 fdescfree_fds(struct thread *td, struct filedesc *fdp)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft, *tft;
 	struct filedescent *fde;
 	struct file *fp;
 	int i, lastfile;
 
 	KASSERT(refcount_load(&fdp->fd_refcnt) == 0,
 	    ("%s: fd table %p carries references", __func__, fdp));
 
 	/*
 	 * Serialize with threads iterating over the table, if any.
 	 */
 	if (refcount_load(&fdp->fd_holdcnt) > 1) {
 		FILEDESC_XLOCK(fdp);
 		FILEDESC_XUNLOCK(fdp);
 	}
 
 	lastfile = fdlastfile_single(fdp);
 	for (i = 0; i <= lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL) {
 			fdefree_last(fde);
 			(void) closef(fp, td);
 		}
 	}
 
 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 		free(fdp->fd_map, M_FILEDESC);
 	if (fdp->fd_nfiles > NDFILE)
 		free(fdp->fd_files, M_FILEDESC);
 
 	fdp0 = (struct filedesc0 *)fdp;
 	SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
 		free(ft->ft_table, M_FILEDESC);
 
 	fddrop(fdp);
 }
 
 void
 fdescfree(struct thread *td)
 {
 	struct proc *p;
 	struct filedesc *fdp;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	MPASS(fdp != NULL);
 
 #ifdef RACCT
 	if (RACCT_ENABLED())
 		racct_set_unlocked(p, RACCT_NOFILE, 0);
 #endif
 
 	if (p->p_fdtol != NULL)
 		fdclearlocks(td);
 
 	/*
 	 * Check fdhold for an explanation.
 	 */
 	atomic_store_ptr(&p->p_fd, NULL);
 	atomic_thread_fence_seq_cst();
 	PROC_WAIT_UNLOCKED(p);
 
 	if (refcount_release(&fdp->fd_refcnt) == 0)
 		return;
 
 	fdescfree_fds(td, fdp);
 }
 
 void
 pdescfree(struct thread *td)
 {
 	struct proc *p;
 	struct pwddesc *pdp;
 
 	p = td->td_proc;
 	pdp = p->p_pd;
 	MPASS(pdp != NULL);
 
 	/*
 	 * Check pdhold for an explanation.
 	 */
 	atomic_store_ptr(&p->p_pd, NULL);
 	atomic_thread_fence_seq_cst();
 	PROC_WAIT_UNLOCKED(p);
 
 	pddrop(pdp);
 }
 
 /*
  * For setugid programs, we don't want to people to use that setugidness
  * to generate error messages which write to a file which otherwise would
  * otherwise be off-limits to the process.  We check for filesystems where
  * the vnode can change out from under us after execve (like [lin]procfs).
  *
  * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
  * sufficient.  We also don't check for setugidness since we know we are.
  */
 static bool
 is_unsafe(struct file *fp)
 {
 	struct vnode *vp;
 
 	if (fp->f_type != DTYPE_VNODE)
 		return (false);
 
 	vp = fp->f_vnode;
 	return ((vp->v_vflag & VV_PROCDEP) != 0);
 }
 
 /*
  * Make this setguid thing safe, if at all possible.
  */
 void
 fdsetugidsafety(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int i;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
 	    ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	for (i = 0; i <= 2; i++) {
 		fp = fdp->fd_ofiles[i].fde_file;
 		if (fp != NULL && is_unsafe(fp)) {
 			FILEDESC_XLOCK(fdp);
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
 			fdfree(fdp, i);
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
 		}
 	}
 }
 
 /*
  * If a specific file object occupies a specific file descriptor, close the
  * file descriptor entry and drop a reference on the file object.  This is a
  * convenience function to handle a subsequent error in a function that calls
  * falloc() that handles the race that another thread might have closed the
  * file descriptor out from under the thread creating the file object.
  */
 void
 fdclose(struct thread *td, struct file *fp, int idx)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 
 	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_ofiles[idx].fde_file == fp) {
 		fdfree(fdp, idx);
 		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
 	} else
 		FILEDESC_XUNLOCK(fdp);
 }
 
 /*
  * Close any files on exec?
  */
 void
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedescent *fde;
 	struct file *fp;
 	int i, lastfile;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
 	    ("the fdtable should not be shared"));
 	lastfile = fdlastfile_single(fdp);
 	for (i = 0; i <= lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
 		    (fde->fde_flags & UF_EXCLOSE))) {
 			FILEDESC_XLOCK(fdp);
 			fdfree(fdp, i);
 			(void) closefp(fdp, i, fp, td, false, false);
 			FILEDESC_UNLOCK_ASSERT(fdp);
 		}
 	}
 }
 
 /*
  * It is unsafe for set[ug]id processes to be started with file
  * descriptors 0..2 closed, as these descriptors are given implicit
  * significance in the Standard C library.  fdcheckstd() will create a
  * descriptor referencing /dev/null for each of stdin, stdout, and
  * stderr that is not already open.
  */
 int
 fdcheckstd(struct thread *td)
 {
 	struct filedesc *fdp;
 	register_t save;
 	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(refcount_load(&fdp->fd_refcnt) == 1,
 	    ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	devnull = -1;
 	for (i = 0; i <= 2; i++) {
 		if (fdp->fd_ofiles[i].fde_file != NULL)
 			continue;
 
 		save = td->td_retval[0];
 		if (devnull != -1) {
 			error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
 		} else {
 			error = kern_openat(td, AT_FDCWD, "/dev/null",
 			    UIO_SYSSPACE, O_RDWR, 0);
 			if (error == 0) {
 				devnull = td->td_retval[0];
 				KASSERT(devnull == i, ("we didn't get our fd"));
 			}
 		}
 		td->td_retval[0] = save;
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Internal form of close.  Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  */
 int
 closef(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
 
 	MPASS(td != NULL);
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
 	 * locks owned by this process.  This is handled by setting
 	 * a flag in the unlock to free ONLY locks obeying POSIX
 	 * semantics, and not to free BSD-style file locks.
 	 * If the descriptor was in a message, POSIX-style locks
 	 * aren't passed with the descriptor, and the thread pointer
 	 * will be NULL.  Callers should be careful only to pass a
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
 	 */
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 			    F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
 			 * Handle special case where file descriptor table is
 			 * shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
 			FILEDESC_XLOCK(fdp);
 			for (fdtol = fdtol->fdl_next;
 			    fdtol != td->td_proc->p_fdtol;
 			    fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
 				    P_ADVLOCK) == 0)
 					continue;
 				fdtol->fdl_holdcount++;
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
 				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
 				    F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
 					fdtol->fdl_wakeup = 0;
 					wakeup(fdtol);
 				}
 			}
 			FILEDESC_XUNLOCK(fdp);
 		}
 	}
 	return (fdrop_close(fp, td));
 }
 
 /*
  * Hack for file descriptor passing code.
  */
 void
 closef_nothread(struct file *fp)
 {
 
 	fdrop(fp, NULL);
 }
 
 /*
  * Initialize the file pointer with the specified properties.
  *
  * The ops are set with release semantics to be certain that the flags, type,
  * and data are visible when ops is.  This is to prevent ops methods from being
  * called with bad data.
  */
 void
 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
 {
 	fp->f_data = data;
 	fp->f_flag = flag;
 	fp->f_type = type;
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 }
 
 void
 finit_vnode(struct file *fp, u_int flag, void *data, struct fileops *ops)
 {
 	fp->f_seqcount[UIO_READ] = 1;
 	fp->f_seqcount[UIO_WRITE] = 1;
 	finit(fp, (flag & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE,
 	    data, ops);
 }
 
 int
 fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp, struct filecaps *havecapsp)
 {
 	struct filedescent *fde;
 	int error;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	fde = fdeget_locked(fdp, fd);
 	if (fde == NULL) {
 		error = EBADF;
 		goto out;
 	}
 
 #ifdef CAPABILITIES
 	error = cap_check(cap_rights_fde_inline(fde), needrightsp);
 	if (error != 0)
 		goto out;
 #endif
 
 	if (havecapsp != NULL)
 		filecaps_copy(&fde->fde_caps, havecapsp, true);
 
 	*fpp = fde->fde_file;
 
 	error = 0;
 out:
 	return (error);
 }
 
 int
 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct file **fpp, struct filecaps *havecapsp)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 #ifndef CAPABILITIES
 	error = fget_unlocked(fdp, fd, needrightsp, fpp);
 	if (havecapsp != NULL && error == 0)
 		filecaps_fill(havecapsp);
 #else
 	struct file *fp;
 	seqc_t seq;
 
 	*fpp = NULL;
 	for (;;) {
 		error = fget_unlocked_seq(fdp, fd, needrightsp, &fp, &seq);
 		if (error != 0)
 			return (error);
 
 		if (havecapsp != NULL) {
 			if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
 			    havecapsp, false)) {
 				fdrop(fp, td);
 				goto get_locked;
 			}
 		}
 
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(fp, td);
 	}
 
 	*fpp = fp;
 	return (0);
 
 get_locked:
 	FILEDESC_SLOCK(fdp);
 	error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
 	if (error == 0 && !fhold(*fpp))
 		error = EBADF;
 	FILEDESC_SUNLOCK(fdp);
 #endif
 	return (error);
 }
 
 #ifdef CAPABILITIES
 int
 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
 {
 	const struct filedescent *fde;
 	const struct fdescenttbl *fdt;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct vnode *vp;
 	const cap_rights_t *haverights;
 	cap_rights_t rights;
 	seqc_t seq;
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	rights = *ndp->ni_rightsneeded;
 	cap_rights_set_one(&rights, CAP_LOOKUP);
 
 	fdp = curproc->p_fd;
 	fdt = fdp->fd_files;
 	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 		return (EBADF);
 	seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 	fde = &fdt->fdt_ofiles[fd];
 	haverights = cap_rights_fde_inline(fde);
 	fp = fde->fde_file;
 	if (__predict_false(fp == NULL))
 		return (EAGAIN);
 	if (__predict_false(cap_check_inline_transient(haverights, &rights)))
 		return (EAGAIN);
 	*fsearch = ((fp->f_flag & FSEARCH) != 0);
 	vp = fp->f_vnode;
 	if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
 		return (EAGAIN);
 	}
 	if (!filecaps_copy(&fde->fde_caps, &ndp->ni_filecaps, false)) {
 		return (EAGAIN);
 	}
 	/*
 	 * Use an acquire barrier to force re-reading of fdt so it is
 	 * refreshed for verification.
 	 */
 	atomic_thread_fence_acq();
 	fdt = fdp->fd_files;
 	if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
 		return (EAGAIN);
 	/*
 	 * If file descriptor doesn't have all rights,
 	 * all lookups relative to it must also be
 	 * strictly relative.
 	 *
 	 * Not yet supported by fast path.
 	 */
 	CAP_ALL(&rights);
 	if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) ||
 	    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
 	    ndp->ni_filecaps.fc_nioctls != -1) {
 #ifdef notyet
 		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 #else
 		return (EAGAIN);
 #endif
 	}
 	*vpp = vp;
 	return (0);
 }
 #else
 int
 fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch)
 {
 	const struct fdescenttbl *fdt;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct vnode *vp;
 
 	VFS_SMR_ASSERT_ENTERED();
 
 	fdp = curproc->p_fd;
 	fdt = fdp->fd_files;
 	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 		return (EBADF);
 	fp = fdt->fdt_ofiles[fd].fde_file;
 	if (__predict_false(fp == NULL))
 		return (EAGAIN);
 	*fsearch = ((fp->f_flag & FSEARCH) != 0);
 	vp = fp->f_vnode;
 	if (__predict_false(vp == NULL || vp->v_type != VDIR)) {
 		return (EAGAIN);
 	}
 	/*
 	 * Use an acquire barrier to force re-reading of fdt so it is
 	 * refreshed for verification.
 	 */
 	atomic_thread_fence_acq();
 	fdt = fdp->fd_files;
 	if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
 		return (EAGAIN);
 	filecaps_fill(&ndp->ni_filecaps);
 	*vpp = vp;
 	return (0);
 }
 #endif
 
 int
 fget_unlocked_seq(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp, seqc_t *seqp)
 {
 #ifdef CAPABILITIES
 	const struct filedescent *fde;
 #endif
 	const struct fdescenttbl *fdt;
 	struct file *fp;
 #ifdef CAPABILITIES
 	seqc_t seq;
 	cap_rights_t haverights;
 	int error;
 #endif
 
 	fdt = fdp->fd_files;
 	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 		return (EBADF);
 	/*
 	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
 	 * never raising a refcount above 0.  To accomplish this we have
 	 * to use a cmpset loop rather than an atomic_add.  The descriptor
 	 * must be re-verified once we acquire a reference to be certain
 	 * that the identity is still correct and we did not lose a race
 	 * due to preemption.
 	 */
 	for (;;) {
 #ifdef CAPABILITIES
 		seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 		fde = &fdt->fdt_ofiles[fd];
 		haverights = *cap_rights_fde_inline(fde);
 		fp = fde->fde_file;
 		if (!seqc_consistent(fd_seqc(fdt, fd), seq))
 			continue;
 #else
 		fp = fdt->fdt_ofiles[fd].fde_file;
 #endif
 		if (fp == NULL)
 			return (EBADF);
 #ifdef CAPABILITIES
 		error = cap_check_inline(&haverights, needrightsp);
 		if (error != 0)
 			return (error);
 #endif
 		if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count))) {
 			/*
 			 * Force a reload. Other thread could reallocate the
 			 * table before this fd was closed, so it is possible
 			 * that there is a stale fp pointer in cached version.
 			 */
 			fdt = atomic_load_ptr(&fdp->fd_files);
 			continue;
 		}
 		/*
 		 * Use an acquire barrier to force re-reading of fdt so it is
 		 * refreshed for verification.
 		 */
 		atomic_thread_fence_acq();
 		fdt = fdp->fd_files;
 #ifdef	CAPABILITIES
 		if (seqc_consistent_nomb(fd_seqc(fdt, fd), seq))
 #else
 		if (fp == fdt->fdt_ofiles[fd].fde_file)
 #endif
 			break;
 		fdrop(fp, curthread);
 	}
 	*fpp = fp;
 	if (seqp != NULL) {
 #ifdef CAPABILITIES
 		*seqp = seq;
 #endif
 	}
 	return (0);
 }
 
 /*
  * See the comments in fget_unlocked_seq for an explanation of how this works.
  *
  * This is a simplified variant which bails out to the aforementioned routine
  * if anything goes wrong. In practice this only happens when userspace is
  * racing with itself.
  */
 int
 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp)
 {
 #ifdef CAPABILITIES
 	const struct filedescent *fde;
 #endif
 	const struct fdescenttbl *fdt;
 	struct file *fp;
 #ifdef CAPABILITIES
 	seqc_t seq;
 	const cap_rights_t *haverights;
 #endif
 
 	fdt = fdp->fd_files;
 	if (__predict_false((u_int)fd >= fdt->fdt_nfiles))
 		return (EBADF);
 #ifdef CAPABILITIES
 	seq = seqc_read_notmodify(fd_seqc(fdt, fd));
 	fde = &fdt->fdt_ofiles[fd];
 	haverights = cap_rights_fde_inline(fde);
 	fp = fde->fde_file;
 #else
 	fp = fdt->fdt_ofiles[fd].fde_file;
 #endif
 	if (__predict_false(fp == NULL))
 		goto out_fallback;
 #ifdef CAPABILITIES
 	if (__predict_false(cap_check_inline_transient(haverights, needrightsp)))
 		goto out_fallback;
 #endif
 	if (__predict_false(!refcount_acquire_if_not_zero(&fp->f_count)))
 		goto out_fallback;
 
 	/*
 	 * Use an acquire barrier to force re-reading of fdt so it is
 	 * refreshed for verification.
 	 */
 	atomic_thread_fence_acq();
 	fdt = fdp->fd_files;
 #ifdef	CAPABILITIES
 	if (__predict_false(!seqc_consistent_nomb(fd_seqc(fdt, fd), seq)))
 #else
 	if (__predict_false(fp != fdt->fdt_ofiles[fd].fde_file))
 #endif
 		goto out_fdrop;
 	*fpp = fp;
 	return (0);
 out_fdrop:
 	fdrop(fp, curthread);
 out_fallback:
 	return (fget_unlocked_seq(fdp, fd, needrightsp, fpp, NULL));
 }
 
 /*
  * Translate fd -> file when the caller guarantees the file descriptor table
  * can't be changed by others.
  *
  * Note this does not mean the file object itself is only visible to the caller,
  * merely that it wont disappear without having to be referenced.
  *
  * Must be paired with fput_only_user.
  */
 #ifdef	CAPABILITIES
 int
 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp)
 {
 	const struct filedescent *fde;
 	const struct fdescenttbl *fdt;
 	const cap_rights_t *haverights;
 	struct file *fp;
 	int error;
 
 	MPASS(FILEDESC_IS_ONLY_USER(fdp));
 
 	if (__predict_false(fd >= fdp->fd_nfiles))
 		return (EBADF);
 
 	fdt = fdp->fd_files;
 	fde = &fdt->fdt_ofiles[fd];
 	fp = fde->fde_file;
 	if (__predict_false(fp == NULL))
 		return (EBADF);
 	MPASS(refcount_load(&fp->f_count) > 0);
 	haverights = cap_rights_fde_inline(fde);
 	error = cap_check_inline(haverights, needrightsp);
 	if (__predict_false(error != 0))
 		return (error);
 	*fpp = fp;
 	return (0);
 }
 #else
 int
 fget_only_user(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp)
 {
 	struct file *fp;
 
 	MPASS(FILEDESC_IS_ONLY_USER(fdp));
 
 	if (__predict_false(fd >= fdp->fd_nfiles))
 		return (EBADF);
 
 	fp = fdp->fd_ofiles[fd].fde_file;
 	if (__predict_false(fp == NULL))
 		return (EBADF);
 
 	MPASS(refcount_load(&fp->f_count) > 0);
 	*fpp = fp;
 	return (0);
 }
 #endif
 
 /*
  * Extract the file pointer associated with the specified descriptor for the
  * current user process.
  *
  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
  * returned.
  *
  * File's rights will be checked against the capability rights mask.
  *
  * If an error occurred the non-zero error is returned and *fpp is set to
  * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
  * responsible for fdrop().
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags,
     cap_rights_t *needrightsp)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int error;
 
 	*fpp = NULL;
 	fdp = td->td_proc->p_fd;
 	error = fget_unlocked(fdp, fd, needrightsp, &fp);
 	if (__predict_false(error != 0))
 		return (error);
 	if (__predict_false(fp->f_ops == &badfileops)) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 */
 	error = 0;
 	switch (flags) {
 	case FREAD:
 	case FWRITE:
 		if ((fp->f_flag & flags) == 0)
 			error = EBADF;
 		break;
 	case FEXEC:
 	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
 		    ((fp->f_flag & FWRITE) != 0))
 			error = EBADF;
 		break;
 	case 0:
 		break;
 	default:
 		KASSERT(0, ("wrong flags"));
 	}
 
 	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
 
 	*fpp = fp;
 	return (0);
 }
 
 int
 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, 0, rightsp));
 }
 
 int
 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp,
     struct file **fpp)
 {
 	int error;
 #ifndef CAPABILITIES
 	error = _fget(td, fd, fpp, 0, rightsp);
 	if (maxprotp != NULL)
 		*maxprotp = VM_PROT_ALL;
 	return (error);
 #else
 	cap_rights_t fdrights;
 	struct filedesc *fdp;
 	struct file *fp;
 	seqc_t seq;
 
 	*fpp = NULL;
 	fdp = td->td_proc->p_fd;
 	MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
 	for (;;) {
 		error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
 		if (__predict_false(error != 0))
 			return (error);
 		if (__predict_false(fp->f_ops == &badfileops)) {
 			fdrop(fp, td);
 			return (EBADF);
 		}
 		if (maxprotp != NULL)
 			fdrights = *cap_rights(fdp, fd);
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(fp, td);
 	}
 
 	/*
 	 * If requested, convert capability rights to access flags.
 	 */
 	if (maxprotp != NULL)
 		*maxprotp = cap_rights_to_vmprot(&fdrights);
 	*fpp = fp;
 	return (0);
 #endif
 }
 
 int
 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, FREAD, rightsp));
 }
 
 int
 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, FWRITE, rightsp));
 }
 
 int
 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
     struct file **fpp)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 #ifndef CAPABILITIES
 	return (fget_unlocked(fdp, fd, rightsp, fpp));
 #else
 	struct file *fp;
 	int error;
 	seqc_t seq;
 
 	*fpp = NULL;
 	MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
 	for (;;) {
 		error = fget_unlocked_seq(fdp, fd, rightsp, &fp, &seq);
 		if (error != 0)
 			return (error);
 		error = cap_fcntl_check(fdp, fd, needfcntl);
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(fp, td);
 	}
 	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
 	*fpp = fp;
 	return (0);
 #endif
 }
 
 /*
  * Like fget() but loads the underlying vnode, or returns an error if the
  * descriptor does not represent a vnode.  Note that pipes use vnodes but
  * never have VM objects.  The returned vnode will be vref()'d.
  *
  * XXX: what about the unused flags ?
  */
 static __inline int
 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
     struct vnode **vpp)
 {
 	struct file *fp;
 	int error;
 
 	*vpp = NULL;
 	error = _fget(td, fd, &fp, flags, needrightsp);
 	if (error != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 	} else {
 		*vpp = fp->f_vnode;
 		vref(*vpp);
 	}
 	fdrop(fp, td);
 
 	return (error);
 }
 
 int
 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, 0, rightsp, vpp));
 }
 
 int
 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct filecaps *havecaps, struct vnode **vpp)
 {
 	struct filecaps caps;
 	struct file *fp;
 	int error;
 
 	error = fget_cap(td, fd, needrightsp, &fp, &caps);
 	if (error != 0)
 		return (error);
 	if (fp->f_ops == &badfileops) {
 		error = EBADF;
 		goto out;
 	}
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	*havecaps = caps;
 	*vpp = fp->f_vnode;
 	vref(*vpp);
 	fdrop(fp, td);
 
 	return (0);
 out:
 	filecaps_free(&caps);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
 }
 
 int
 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
 }
 
 #ifdef notyet
 int
 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
 }
 #endif
 
 /*
  * Handle the last reference to a file being closed.
  *
  * Without the noinline attribute clang keeps inlining the func thorough this
  * file when fdrop is used.
  */
 int __noinline
 _fdrop(struct file *fp, struct thread *td)
 {
 	int error;
 #ifdef INVARIANTS
 	int count;
 
 	count = refcount_load(&fp->f_count);
 	if (count != 0)
 		panic("fdrop: fp %p count %d", fp, count);
 #endif
 	error = fo_close(fp, td);
 	atomic_subtract_int(&openfiles, 1);
 	crfree(fp->f_cred);
 	free(fp->f_advice, M_FADVISE);
 	uma_zfree(file_zone, fp);
 
 	return (error);
 }
 
 /*
  * Apply an advisory lock on a file descriptor.
  *
  * Just attempt to get a record lock of the requested type on the entire file
  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
 	int	fd;
 	int	how;
 };
 #endif
 /* ARGSUSED */
 int
 sys_flock(struct thread *td, struct flock_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
 	int error;
 
 	error = fget(td, uap->fd, &cap_flock_rights, &fp);
 	if (error != 0)
 		return (error);
 	error = EOPNOTSUPP;
 	if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
 		goto done;
 	}
 	if (fp->f_ops == &path_fileops) {
 		goto done;
 	}
 
 	error = 0;
 	vp = fp->f_vnode;
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	if (uap->how & LOCK_UN) {
 		lf.l_type = F_UNLCK;
 		atomic_clear_int(&fp->f_flag, FHASLOCK);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 		goto done;
 	}
 	if (uap->how & LOCK_EX)
 		lf.l_type = F_WRLCK;
 	else if (uap->how & LOCK_SH)
 		lf.l_type = F_RDLCK;
 	else {
 		error = EBADF;
 		goto done;
 	}
 	atomic_set_int(&fp->f_flag, FHASLOCK);
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done:
 	fdrop(fp, td);
 	return (error);
 }
 /*
  * Duplicate the specified descriptor to a free descriptor.
  */
 int
 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
     int openerror, int *indxp)
 {
 	struct filedescent *newfde, *oldfde;
 	struct file *fp;
 	u_long *ioctls;
 	int error, indx;
 
 	KASSERT(openerror == ENODEV || openerror == ENXIO,
 	    ("unexpected error %d in %s", openerror, __func__));
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	error = fdalloc(td, 0, &indx);
 	if (error != 0) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
 	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
 	 */
 	switch (openerror) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
 		if (!fhold(fp)) {
 			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EBADF);
 		}
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 		ioctls = filecaps_copy_prep(&oldfde->fde_caps);
 #ifdef CAPABILITIES
 		seqc_write_begin(&newfde->fde_seqc);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		filecaps_copy_finish(&oldfde->fde_caps, &newfde->fde_caps,
 		    ioctls);
 #ifdef CAPABILITIES
 		seqc_write_end(&newfde->fde_seqc);
 #endif
 		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seqc_write_begin(&newfde->fde_seqc);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		oldfde->fde_file = NULL;
 		fdunused(fdp, dfd);
 #ifdef CAPABILITIES
 		seqc_write_end(&newfde->fde_seqc);
 #endif
 		break;
 	}
 	FILEDESC_XUNLOCK(fdp);
 	*indxp = indx;
 	return (0);
 }
 
 /*
  * This sysctl determines if we will allow a process to chroot(2) if it
  * has a directory open:
  *	0: disallowed for all processes.
  *	1: allowed for processes that were not already chroot(2)'ed.
  *	2: allowed for all processes.
  */
 
 static int chroot_allow_open_directories = 1;
 
 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
 
 /*
  * Helper function for raised chroot(2) security function:  Refuse if
  * any filedescriptors are open directories.
  */
 static int
 chroot_refuse_vdir_fds(struct filedesc *fdp)
 {
 	struct vnode *vp;
 	struct file *fp;
 	int fd, lastfile;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	lastfile = fdlastfile(fdp);
 	for (fd = 0; fd <= lastfile; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL)
 			continue;
 		if (fp->f_type == DTYPE_VNODE) {
 			vp = fp->f_vnode;
 			if (vp->v_type == VDIR)
 				return (EPERM);
 		}
 	}
 	return (0);
 }
 
 static void
 pwd_fill(struct pwd *oldpwd, struct pwd *newpwd)
 {
 
 	if (newpwd->pwd_cdir == NULL && oldpwd->pwd_cdir != NULL) {
 		vrefact(oldpwd->pwd_cdir);
 		newpwd->pwd_cdir = oldpwd->pwd_cdir;
 	}
 
 	if (newpwd->pwd_rdir == NULL && oldpwd->pwd_rdir != NULL) {
 		vrefact(oldpwd->pwd_rdir);
 		newpwd->pwd_rdir = oldpwd->pwd_rdir;
 	}
 
 	if (newpwd->pwd_jdir == NULL && oldpwd->pwd_jdir != NULL) {
 		vrefact(oldpwd->pwd_jdir);
 		newpwd->pwd_jdir = oldpwd->pwd_jdir;
 	}
 }
 
 struct pwd *
 pwd_hold_pwddesc(struct pwddesc *pdp)
 {
 	struct pwd *pwd;
 
 	PWDDESC_ASSERT_XLOCKED(pdp);
 	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	if (pwd != NULL)
 		refcount_acquire(&pwd->pwd_refcount);
 	return (pwd);
 }
 
 bool
 pwd_hold_smr(struct pwd *pwd)
 {
 
 	MPASS(pwd != NULL);
 	if (__predict_true(refcount_acquire_if_not_zero(&pwd->pwd_refcount))) {
 		return (true);
 	}
 	return (false);
 }
 
 struct pwd *
 pwd_hold(struct thread *td)
 {
 	struct pwddesc *pdp;
 	struct pwd *pwd;
 
 	pdp = td->td_proc->p_pd;
 
 	vfs_smr_enter();
 	pwd = vfs_smr_entered_load(&pdp->pd_pwd);
 	if (pwd_hold_smr(pwd)) {
 		vfs_smr_exit();
 		return (pwd);
 	}
 	vfs_smr_exit();
 	PWDDESC_XLOCK(pdp);
 	pwd = pwd_hold_pwddesc(pdp);
 	MPASS(pwd != NULL);
 	PWDDESC_XUNLOCK(pdp);
 	return (pwd);
 }
 
 struct pwd *
 pwd_hold_proc(struct proc *p)
 {
 	struct pwddesc *pdp;
 	struct pwd *pwd;
 
 	PROC_ASSERT_HELD(p);
 	PROC_LOCK(p);
 	pdp = pdhold(p);
 	MPASS(pdp != NULL);
 	PROC_UNLOCK(p);
 
 	PWDDESC_XLOCK(pdp);
 	pwd = pwd_hold_pwddesc(pdp);
 	MPASS(pwd != NULL);
 	PWDDESC_XUNLOCK(pdp);
 	pddrop(pdp);
 	return (pwd);
 }
 
 static struct pwd *
 pwd_alloc(void)
 {
 	struct pwd *pwd;
 
 	pwd = uma_zalloc_smr(pwd_zone, M_WAITOK);
 	bzero(pwd, sizeof(*pwd));
 	refcount_init(&pwd->pwd_refcount, 1);
 	return (pwd);
 }
 
 void
 pwd_drop(struct pwd *pwd)
 {
 
 	if (!refcount_release(&pwd->pwd_refcount))
 		return;
 
 	if (pwd->pwd_cdir != NULL)
 		vrele(pwd->pwd_cdir);
 	if (pwd->pwd_rdir != NULL)
 		vrele(pwd->pwd_rdir);
 	if (pwd->pwd_jdir != NULL)
 		vrele(pwd->pwd_jdir);
 	uma_zfree_smr(pwd_zone, pwd);
 }
 
 /*
 * The caller is responsible for invoking priv_check() and
 * mac_vnode_check_chroot() to authorize this operation.
 */
 int
 pwd_chroot(struct thread *td, struct vnode *vp)
 {
 	struct pwddesc *pdp;
 	struct filedesc *fdp;
 	struct pwd *newpwd, *oldpwd;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	pdp = td->td_proc->p_pd;
 	newpwd = pwd_alloc();
 	FILEDESC_SLOCK(fdp);
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 &&
 	    oldpwd->pwd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
 		FILEDESC_SUNLOCK(fdp);
 		if (error != 0) {
 			PWDDESC_XUNLOCK(pdp);
 			pwd_drop(newpwd);
 			return (error);
 		}
 	} else {
 		FILEDESC_SUNLOCK(fdp);
 	}
 
 	vrefact(vp);
 	newpwd->pwd_rdir = vp;
 	if (oldpwd->pwd_jdir == NULL) {
 		vrefact(vp);
 		newpwd->pwd_jdir = vp;
 	}
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 	return (0);
 }
 
 void
 pwd_chdir(struct thread *td, struct vnode *vp)
 {
 	struct pwddesc *pdp;
 	struct pwd *newpwd, *oldpwd;
 
 	VNPASS(vp->v_usecount > 0, vp);
 
 	newpwd = pwd_alloc();
 	pdp = td->td_proc->p_pd;
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	newpwd->pwd_cdir = vp;
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 }
 
 /*
  * jail_attach(2) changes both root and working directories.
  */
 int
 pwd_chroot_chdir(struct thread *td, struct vnode *vp)
 {
 	struct pwddesc *pdp;
 	struct filedesc *fdp;
 	struct pwd *newpwd, *oldpwd;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	pdp = td->td_proc->p_pd;
 	newpwd = pwd_alloc();
 	FILEDESC_SLOCK(fdp);
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	error = chroot_refuse_vdir_fds(fdp);
 	FILEDESC_SUNLOCK(fdp);
 	if (error != 0) {
 		PWDDESC_XUNLOCK(pdp);
 		pwd_drop(newpwd);
 		return (error);
 	}
 
 	vrefact(vp);
 	newpwd->pwd_rdir = vp;
 	vrefact(vp);
 	newpwd->pwd_cdir = vp;
 	if (oldpwd->pwd_jdir == NULL) {
 		vrefact(vp);
 		newpwd->pwd_jdir = vp;
 	}
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 	return (0);
 }
 
 void
 pwd_ensure_dirs(void)
 {
 	struct pwddesc *pdp;
 	struct pwd *oldpwd, *newpwd;
 
 	pdp = curproc->p_pd;
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	if (oldpwd->pwd_cdir != NULL && oldpwd->pwd_rdir != NULL) {
 		PWDDESC_XUNLOCK(pdp);
 		return;
 	}
 	PWDDESC_XUNLOCK(pdp);
 
 	newpwd = pwd_alloc();
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	pwd_fill(oldpwd, newpwd);
 	if (newpwd->pwd_cdir == NULL) {
 		vrefact(rootvnode);
 		newpwd->pwd_cdir = rootvnode;
 	}
 	if (newpwd->pwd_rdir == NULL) {
 		vrefact(rootvnode);
 		newpwd->pwd_rdir = rootvnode;
 	}
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 }
 
 void
 pwd_set_rootvnode(void)
 {
 	struct pwddesc *pdp;
 	struct pwd *oldpwd, *newpwd;
 
 	pdp = curproc->p_pd;
 
 	newpwd = pwd_alloc();
 	PWDDESC_XLOCK(pdp);
 	oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	vrefact(rootvnode);
 	newpwd->pwd_cdir = rootvnode;
 	vrefact(rootvnode);
 	newpwd->pwd_rdir = rootvnode;
 	pwd_fill(oldpwd, newpwd);
 	pwd_set(pdp, newpwd);
 	PWDDESC_XUNLOCK(pdp);
 	pwd_drop(oldpwd);
 }
 
 /*
  * Scan all active processes and prisons to see if any of them have a current
  * or root directory of `olddp'. If so, replace them with the new mount point.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 {
 	struct pwddesc *pdp;
 	struct pwd *newpwd, *oldpwd;
 	struct prison *pr;
 	struct proc *p;
 	int nrele;
 
 	if (vrefcnt(olddp) == 1)
 		return;
 	nrele = 0;
 	newpwd = pwd_alloc();
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		pdp = pdhold(p);
 		PROC_UNLOCK(p);
 		if (pdp == NULL)
 			continue;
 		PWDDESC_XLOCK(pdp);
 		oldpwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 		if (oldpwd == NULL ||
 		    (oldpwd->pwd_cdir != olddp &&
 		    oldpwd->pwd_rdir != olddp &&
 		    oldpwd->pwd_jdir != olddp)) {
 			PWDDESC_XUNLOCK(pdp);
 			pddrop(pdp);
 			continue;
 		}
 		if (oldpwd->pwd_cdir == olddp) {
 			vrefact(newdp);
 			newpwd->pwd_cdir = newdp;
 		}
 		if (oldpwd->pwd_rdir == olddp) {
 			vrefact(newdp);
 			newpwd->pwd_rdir = newdp;
 		}
 		if (oldpwd->pwd_jdir == olddp) {
 			vrefact(newdp);
 			newpwd->pwd_jdir = newdp;
 		}
 		pwd_fill(oldpwd, newpwd);
 		pwd_set(pdp, newpwd);
 		PWDDESC_XUNLOCK(pdp);
 		pwd_drop(oldpwd);
 		pddrop(pdp);
 		newpwd = pwd_alloc();
 	}
 	sx_sunlock(&allproc_lock);
 	pwd_drop(newpwd);
 	if (rootvnode == olddp) {
 		vrefact(newdp);
 		rootvnode = newdp;
 		nrele++;
 	}
 	mtx_lock(&prison0.pr_mtx);
 	if (prison0.pr_root == olddp) {
 		vrefact(newdp);
 		prison0.pr_root = newdp;
 		nrele++;
 	}
 	mtx_unlock(&prison0.pr_mtx);
 	sx_slock(&allprison_lock);
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		mtx_lock(&pr->pr_mtx);
 		if (pr->pr_root == olddp) {
 			vrefact(newdp);
 			pr->pr_root = newdp;
 			nrele++;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 	sx_sunlock(&allprison_lock);
 	while (nrele--)
 		vrele(olddp);
 }
 
 struct filedesc_to_leader *
 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
 {
 	struct filedesc_to_leader *fdtol;
 
 	fdtol = malloc(sizeof(struct filedesc_to_leader),
 	    M_FILEDESC_TO_LEADER, M_WAITOK);
 	fdtol->fdl_refcount = 1;
 	fdtol->fdl_holdcount = 0;
 	fdtol->fdl_wakeup = 0;
 	fdtol->fdl_leader = leader;
 	if (old != NULL) {
 		FILEDESC_XLOCK(fdp);
 		fdtol->fdl_next = old->fdl_next;
 		fdtol->fdl_prev = old;
 		old->fdl_next = fdtol;
 		fdtol->fdl_next->fdl_prev = fdtol;
 		FILEDESC_XUNLOCK(fdp);
 	} else {
 		fdtol->fdl_next = fdtol;
 		fdtol->fdl_prev = fdtol;
 	}
 	return (fdtol);
 }
 
 static int
 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
 {
 	NDSLOTTYPE *map;
 	struct filedesc *fdp;
 	u_int namelen;
 	int count, off, minoff;
 
 	namelen = arg2;
 	if (namelen != 1)
 		return (EINVAL);
 
 	if (*(int *)arg1 != 0)
 		return (EINVAL);
 
 	fdp = curproc->p_fd;
 	count = 0;
 	FILEDESC_SLOCK(fdp);
 	map = fdp->fd_map;
 	off = NDSLOT(fdp->fd_nfiles - 1);
 	for (minoff = NDSLOT(0); off >= minoff; --off)
 		count += bitcountl(map[off]);
 	FILEDESC_SUNLOCK(fdp);
 
 	return (SYSCTL_OUT(req, &count, sizeof(count)));
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
     CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
     "Number of open file descriptors");
 
 /*
  * Get file structures globally.
  */
 static int
 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 {
 	struct xfile xf;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int error, n, lastfile;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	if (req->oldptr == NULL) {
 		n = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			fdp = fdhold(p);
 			PROC_UNLOCK(p);
 			if (fdp == NULL)
 				continue;
 			/* overestimates sparse tables. */
 			n += fdp->fd_nfiles;
 			fddrop(fdp);
 		}
 		sx_sunlock(&allproc_lock);
 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 	}
 	error = 0;
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		if (p_cansee(req->td, p) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		xf.xf_pid = p->p_pid;
 		xf.xf_uid = p->p_ucred->cr_uid;
 		fdp = fdhold(p);
 		PROC_UNLOCK(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_SLOCK(fdp);
 		lastfile = fdlastfile(fdp);
 		for (n = 0; refcount_load(&fdp->fd_refcnt) > 0 && n <= lastfile;
 		    n++) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = (uintptr_t)fp;
 			xf.xf_data = (uintptr_t)fp->f_data;
 			xf.xf_vnode = (uintptr_t)fp->f_vnode;
 			xf.xf_type = (uintptr_t)fp->f_type;
 			xf.xf_count = refcount_load(&fp->f_count);
 			xf.xf_msgcount = 0;
 			xf.xf_offset = foffset_get(fp);
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
 				break;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		fddrop(fdp);
 		if (error)
 			break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 #ifdef KINFO_FILE_SIZE
 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
 #endif
 
 static int
 xlate_fflags(int fflags)
 {
 	static const struct {
 		int	fflag;
 		int	kf_fflag;
 	} fflags_table[] = {
 		{ FAPPEND, KF_FLAG_APPEND },
 		{ FASYNC, KF_FLAG_ASYNC },
 		{ FFSYNC, KF_FLAG_FSYNC },
 		{ FHASLOCK, KF_FLAG_HASLOCK },
 		{ FNONBLOCK, KF_FLAG_NONBLOCK },
 		{ FREAD, KF_FLAG_READ },
 		{ FWRITE, KF_FLAG_WRITE },
 		{ O_CREAT, KF_FLAG_CREAT },
 		{ O_DIRECT, KF_FLAG_DIRECT },
 		{ O_EXCL, KF_FLAG_EXCL },
 		{ O_EXEC, KF_FLAG_EXEC },
 		{ O_EXLOCK, KF_FLAG_EXLOCK },
 		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
 		{ O_SHLOCK, KF_FLAG_SHLOCK },
 		{ O_TRUNC, KF_FLAG_TRUNC }
 	};
 	unsigned int i;
 	int kflags;
 
 	kflags = 0;
 	for (i = 0; i < nitems(fflags_table); i++)
 		if (fflags & fflags_table[i].fflag)
 			kflags |=  fflags_table[i].kf_fflag;
 	return (kflags);
 }
 
 /* Trim unused data from kf_path by truncating the structure size. */
 void
 pack_kinfo(struct kinfo_file *kif)
 {
 
 	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
 	    strlen(kif->kf_path) + 1;
 	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
 }
 
 static void
 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
     struct kinfo_file *kif, struct filedesc *fdp, int flags)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	/* Set a default type to allow for empty fill_kinfo() methods. */
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	kif->kf_flags = xlate_fflags(fp->f_flag);
 	if (rightsp != NULL)
 		kif->kf_cap_rights = *rightsp;
 	else
 		cap_rights_init_zero(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = refcount_load(&fp->f_count);
 	kif->kf_offset = foffset_get(fp);
 
 	/*
 	 * This may drop the filedesc lock, so the 'fp' cannot be
 	 * accessed after this call.
 	 */
 	error = fo_fill_kinfo(fp, kif, fdp);
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 		pack_kinfo(kif);
 	else
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 }
 
 static void
 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
     struct kinfo_file *kif, int flags)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	kif->kf_type = KF_TYPE_VNODE;
 	error = vn_fill_kinfo_vnode(vp, kif);
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	kif->kf_flags = xlate_fflags(fflags);
 	cap_rights_init_zero(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = -1;
 	kif->kf_offset = -1;
 	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 		pack_kinfo(kif);
 	else
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 	vrele(vp);
 }
 
 struct export_fd_buf {
 	struct filedesc		*fdp;
 	struct pwddesc	*pdp;
 	struct sbuf 		*sb;
 	ssize_t			remainder;
 	struct kinfo_file	kif;
 	int			flags;
 };
 
 static int
 export_kinfo_to_sb(struct export_fd_buf *efbuf)
 {
 	struct kinfo_file *kif;
 
 	kif = &efbuf->kif;
 	if (efbuf->remainder != -1) {
 		if (efbuf->remainder < kif->kf_structsize) {
 			/* Terminate export. */
 			efbuf->remainder = 0;
 			return (0);
 		}
 		efbuf->remainder -= kif->kf_structsize;
 	}
 	return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
 }
 
 static int
 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
 	    efbuf->flags);
 	FILEDESC_SUNLOCK(efbuf->fdp);
 	error = export_kinfo_to_sb(efbuf);
 	FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
 
 static int
 export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	if (efbuf->pdp != NULL)
 		PWDDESC_XUNLOCK(efbuf->pdp);
 	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
 	error = export_kinfo_to_sb(efbuf);
 	if (efbuf->pdp != NULL)
 		PWDDESC_XLOCK(efbuf->pdp);
 	return (error);
 }
 
 /*
  * Store a process file descriptor information to sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen,
     int flags)
 {
 	struct file *fp;
 	struct filedesc *fdp;
 	struct pwddesc *pdp;
 	struct export_fd_buf *efbuf;
 	struct vnode *cttyvp, *textvp, *tracevp;
 	struct pwd *pwd;
 	int error, i, lastfile;
 	cap_rights_t rights;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* ktrace vnode */
 	tracevp = ktr_get_tracevp(p, true);
 	/* text vnode */
 	textvp = p->p_textvp;
 	if (textvp != NULL)
 		vrefact(textvp);
 	/* Controlling tty. */
 	cttyvp = NULL;
 	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
 		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
 		if (cttyvp != NULL)
 			vrefact(cttyvp);
 	}
 	fdp = fdhold(p);
 	pdp = pdhold(p);
 	PROC_UNLOCK(p);
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->fdp = NULL;
 	efbuf->pdp = NULL;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 	efbuf->flags = flags;
 	if (tracevp != NULL)
 		export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
 		    efbuf);
 	if (textvp != NULL)
 		export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
 	if (cttyvp != NULL)
 		export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
 		    efbuf);
 	error = 0;
 	if (pdp == NULL || fdp == NULL)
 		goto fail;
 	efbuf->fdp = fdp;
 	efbuf->pdp = pdp;
 	PWDDESC_XLOCK(pdp);
 	pwd = pwd_hold_pwddesc(pdp);
 	if (pwd != NULL) {
 		/* working directory */
 		if (pwd->pwd_cdir != NULL) {
 			vrefact(pwd->pwd_cdir);
 			export_vnode_to_sb(pwd->pwd_cdir, KF_FD_TYPE_CWD,
 			    FREAD, efbuf);
 		}
 		/* root directory */
 		if (pwd->pwd_rdir != NULL) {
 			vrefact(pwd->pwd_rdir);
 			export_vnode_to_sb(pwd->pwd_rdir, KF_FD_TYPE_ROOT,
 			    FREAD, efbuf);
 		}
 		/* jail directory */
 		if (pwd->pwd_jdir != NULL) {
 			vrefact(pwd->pwd_jdir);
 			export_vnode_to_sb(pwd->pwd_jdir, KF_FD_TYPE_JAIL,
 			    FREAD, efbuf);
 		}
 	}
 	PWDDESC_XUNLOCK(pdp);
 	if (pwd != NULL)
 		pwd_drop(pwd);
 	FILEDESC_SLOCK(fdp);
 	lastfile = fdlastfile(fdp);
 	for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 #ifdef CAPABILITIES
 		rights = *cap_rights(fdp, i);
 #else /* !CAPABILITIES */
 		rights = cap_no_rights;
 #endif
 		/*
 		 * Create sysctl entry.  It is OK to drop the filedesc
 		 * lock inside of export_file_to_sb() as we will
 		 * re-validate and re-evaluate its properties when the
 		 * loop continues.
 		 */
 		error = export_file_to_sb(fp, i, &rights, efbuf);
 		if (error != 0 || efbuf->remainder == 0)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 fail:
 	if (fdp != NULL)
 		fddrop(fdp);
 	if (pdp != NULL)
 		pddrop(pdp);
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 #define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	u_int namelen;
 	int error, error2, *name;
 
 	namelen = arg2;
 	if (namelen != 1)
 		return (EINVAL);
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_filedesc_out(p, &sb, maxlen,
 	    KERN_FILEDESC_PACK_KINFO);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 #ifdef COMPAT_FREEBSD7
 #ifdef KINFO_OFILE_SIZE
 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
 #endif
 
 static void
 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
 {
 
 	okif->kf_structsize = sizeof(*okif);
 	okif->kf_type = kif->kf_type;
 	okif->kf_fd = kif->kf_fd;
 	okif->kf_ref_count = kif->kf_ref_count;
 	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
 	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
 	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
 	okif->kf_offset = kif->kf_offset;
 	if (kif->kf_type == KF_TYPE_VNODE)
 		okif->kf_vnode_type = kif->kf_un.kf_file.kf_file_type;
 	else
 		okif->kf_vnode_type = KF_VTYPE_VNON;
 	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
 	if (kif->kf_type == KF_TYPE_SOCKET) {
 		okif->kf_sock_domain = kif->kf_un.kf_sock.kf_sock_domain0;
 		okif->kf_sock_type = kif->kf_un.kf_sock.kf_sock_type0;
 		okif->kf_sock_protocol = kif->kf_un.kf_sock.kf_sock_protocol0;
 		okif->kf_sa_local = kif->kf_un.kf_sock.kf_sa_local;
 		okif->kf_sa_peer = kif->kf_un.kf_sock.kf_sa_peer;
 	} else {
 		okif->kf_sa_local.ss_family = AF_UNSPEC;
 		okif->kf_sa_peer.ss_family = AF_UNSPEC;
 	}
 }
 
 static int
 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
     struct kinfo_ofile *okif, struct pwddesc *pdp, struct sysctl_req *req)
 {
 	int error;
 
 	vrefact(vp);
 	PWDDESC_XUNLOCK(pdp);
 	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
 	kinfo_to_okinfo(kif, okif);
 	error = SYSCTL_OUT(req, okif, sizeof(*okif));
 	PWDDESC_XLOCK(pdp);
 	return (error);
 }
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_ofile *okif;
 	struct kinfo_file *kif;
 	struct filedesc *fdp;
 	struct pwddesc *pdp;
 	struct pwd *pwd;
 	u_int namelen;
 	int error, i, lastfile, *name;
 	struct file *fp;
 	struct proc *p;
 
 	namelen = arg2;
 	if (namelen != 1)
 		return (EINVAL);
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0)
 		return (error);
 	fdp = fdhold(p);
 	if (fdp != NULL)
 		pdp = pdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL || pdp == NULL) {
 		if (fdp != NULL)
 			fddrop(fdp);
 		return (ENOENT);
 	}
 	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
 	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
 	PWDDESC_XLOCK(pdp);
 	pwd = pwd_hold_pwddesc(pdp);
 	if (pwd != NULL) {
 		if (pwd->pwd_cdir != NULL)
 			export_vnode_for_osysctl(pwd->pwd_cdir, KF_FD_TYPE_CWD, kif,
 			    okif, pdp, req);
 		if (pwd->pwd_rdir != NULL)
 			export_vnode_for_osysctl(pwd->pwd_rdir, KF_FD_TYPE_ROOT, kif,
 			    okif, pdp, req);
 		if (pwd->pwd_jdir != NULL)
 			export_vnode_for_osysctl(pwd->pwd_jdir, KF_FD_TYPE_JAIL, kif,
 			    okif, pdp, req);
 	}
 	PWDDESC_XUNLOCK(pdp);
 	if (pwd != NULL)
 		pwd_drop(pwd);
 	FILEDESC_SLOCK(fdp);
 	lastfile = fdlastfile(fdp);
 	for (i = 0; refcount_load(&fdp->fd_refcnt) > 0 && i <= lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 		export_file_to_kinfo(fp, i, NULL, kif, fdp,
 		    KERN_FILEDESC_PACK_KINFO);
 		FILEDESC_SUNLOCK(fdp);
 		kinfo_to_okinfo(kif, okif);
 		error = SYSCTL_OUT(req, okif, sizeof(*okif));
 		FILEDESC_SLOCK(fdp);
 		if (error)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	pddrop(pdp);
 	free(kif, M_TEMP);
 	free(okif, M_TEMP);
 	return (0);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
     "Process ofiledesc entries");
 #endif	/* COMPAT_FREEBSD7 */
 
 int
 vntype_to_kinfo(int vtype)
 {
 	struct {
 		int	vtype;
 		int	kf_vtype;
 	} vtypes_table[] = {
 		{ VBAD, KF_VTYPE_VBAD },
 		{ VBLK, KF_VTYPE_VBLK },
 		{ VCHR, KF_VTYPE_VCHR },
 		{ VDIR, KF_VTYPE_VDIR },
 		{ VFIFO, KF_VTYPE_VFIFO },
 		{ VLNK, KF_VTYPE_VLNK },
 		{ VNON, KF_VTYPE_VNON },
 		{ VREG, KF_VTYPE_VREG },
 		{ VSOCK, KF_VTYPE_VSOCK }
 	};
 	unsigned int i;
 
 	/*
 	 * Perform vtype translation.
 	 */
 	for (i = 0; i < nitems(vtypes_table); i++)
 		if (vtypes_table[i].vtype == vtype)
 			return (vtypes_table[i].kf_vtype);
 
 	return (KF_VTYPE_UNKNOWN);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
     "Process filedesc entries");
 
 /*
  * Store a process current working directory information to sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_cwd_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 {
 	struct pwddesc *pdp;
 	struct pwd *pwd;
 	struct export_fd_buf *efbuf;
 	struct vnode *cdir;
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	pdp = pdhold(p);
 	PROC_UNLOCK(p);
 	if (pdp == NULL)
 		return (EINVAL);
 
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->pdp = pdp;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 
 	PWDDESC_XLOCK(pdp);
 	pwd = PWDDESC_XLOCKED_LOAD_PWD(pdp);
 	cdir = pwd->pwd_cdir;
 	if (cdir == NULL) {
 		error = EINVAL;
 	} else {
 		vrefact(cdir);
 		error = export_vnode_to_sb(cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
 	}
 	PWDDESC_XUNLOCK(pdp);
 	pddrop(pdp);
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Get per-process current working directory.
  */
 static int
 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	u_int namelen;
 	int error, error2, *name;
 
 	namelen = arg2;
 	if (namelen != 1)
 		return (EINVAL);
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_cwd_out(p, &sb, maxlen);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
     sysctl_kern_proc_cwd, "Process current working directory");
 
 #ifdef DDB
 /*
  * For the purposes of debugging, generate a human-readable string for the
  * file type.
  */
 static const char *
 file_type_to_name(short type)
 {
 
 	switch (type) {
 	case 0:
 		return ("zero");
 	case DTYPE_VNODE:
 		return ("vnode");
 	case DTYPE_SOCKET:
 		return ("socket");
 	case DTYPE_PIPE:
 		return ("pipe");
 	case DTYPE_FIFO:
 		return ("fifo");
 	case DTYPE_KQUEUE:
 		return ("kqueue");
 	case DTYPE_CRYPTO:
 		return ("crypto");
 	case DTYPE_MQUEUE:
 		return ("mqueue");
 	case DTYPE_SHM:
 		return ("shm");
 	case DTYPE_SEM:
 		return ("ksem");
 	case DTYPE_PTS:
 		return ("pts");
 	case DTYPE_DEV:
 		return ("dev");
 	case DTYPE_PROCDESC:
 		return ("proc");
 	case DTYPE_EVENTFD:
 		return ("eventfd");
 	case DTYPE_LINUXTFD:
 		return ("ltimer");
 	default:
 		return ("unkn");
 	}
 }
 
 /*
  * For the purposes of debugging, identify a process (if any, perhaps one of
  * many) that references the passed file in its file descriptor array. Return
  * NULL if none.
  */
 static struct proc *
 file_to_first_proc(struct file *fp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int n;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		fdp = p->p_fd;
 		if (fdp == NULL)
 			continue;
 		for (n = 0; n < fdp->fd_nfiles; n++) {
 			if (fp == fdp->fd_ofiles[n].fde_file)
 				return (p);
 		}
 	}
 	return (NULL);
 }
 
 static void
 db_print_file(struct file *fp, int header)
 {
 #define XPTRWIDTH ((int)howmany(sizeof(void *) * NBBY, 4))
 	struct proc *p;
 
 	if (header)
 		db_printf("%*s %6s %*s %8s %4s %5s %6s %*s %5s %s\n",
 		    XPTRWIDTH, "File", "Type", XPTRWIDTH, "Data", "Flag",
 		    "GCFl", "Count", "MCount", XPTRWIDTH, "Vnode", "FPID",
 		    "FCmd");
 	p = file_to_first_proc(fp);
 	db_printf("%*p %6s %*p %08x %04x %5d %6d %*p %5d %s\n", XPTRWIDTH,
 	    fp, file_type_to_name(fp->f_type), XPTRWIDTH, fp->f_data,
 	    fp->f_flag, 0, refcount_load(&fp->f_count), 0, XPTRWIDTH, fp->f_vnode,
 	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
 
 #undef XPTRWIDTH
 }
 
 DB_SHOW_COMMAND(file, db_show_file)
 {
 	struct file *fp;
 
 	if (!have_addr) {
 		db_printf("usage: show file <addr>\n");
 		return;
 	}
 	fp = (struct file *)addr;
 	db_print_file(fp, 1);
 }
 
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int header;
 	int n;
 
 	header = 1;
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		if ((fdp = p->p_fd) == NULL)
 			continue;
 		for (n = 0; n < fdp->fd_nfiles; ++n) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			db_print_file(fp, header);
 			header = 0;
 		}
 	}
 }
 #endif
 
 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
     &maxfilesperproc, 0, "Maximum files allowed open per process");
 
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
     &maxfiles, 0, "Maximum number of files");
 
 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
     &openfiles, 0, "System-wide number of open files");
 
 /* ARGSUSED*/
 static void
 filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
 	/*
 	 * XXXMJG this is a temporary hack due to boot ordering issues against
 	 * the vnode zone.
 	 */
 	vfs_smr = uma_zone_get_smr(pwd_zone);
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 }
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
 
 /*-------------------------------------------------------------------*/
 
 static int
 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 static int
 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (0);
 }
 
 static int
 badfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EBADF);
 }
 
 static int
-badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
-    struct thread *td)
+badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_close(struct file *fp, struct thread *td)
 {
 
 	return (0);
 }
 
 static int
 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	return (0);
 }
 
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_truncate = badfo_truncate,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = badfo_poll,
 	.fo_kqfilter = badfo_kqfilter,
 	.fo_stat = badfo_stat,
 	.fo_close = badfo_close,
 	.fo_chmod = badfo_chmod,
 	.fo_chown = badfo_chown,
 	.fo_sendfile = badfo_sendfile,
 	.fo_fill_kinfo = badfo_fill_kinfo,
 };
 
 static int
 path_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	return (POLLNVAL);
 }
 
 static int
 path_close(struct file *fp, struct thread *td)
 {
 	MPASS(fp->f_type == DTYPE_VNODE);
 	fp->f_ops = &badfileops;
 	vdrop(fp->f_vnode);
 	return (0);
 }
 
 struct fileops path_fileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_truncate = badfo_truncate,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = path_poll,
 	.fo_kqfilter = vn_kqfilter_opath,
 	.fo_stat = vn_statfile,
 	.fo_close = path_close,
 	.fo_chmod = badfo_chmod,
 	.fo_chown = badfo_chown,
 	.fo_sendfile = badfo_sendfile,
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE,
 };
 
 int
 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EOPNOTSUPP);
 }
 
 int
 invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_ioctl(struct file *fp, u_long com, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 
 	return (ENOTTY);
 }
 
 int
 invfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (poll_no_poll(events));
 }
 
 int
 invfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*-------------------------------------------------------------------*/
 
 /*
  * File Descriptor pseudo-device driver (/dev/fd/).
  *
  * Opening minor device N dup()s the file (if any) connected to file
  * descriptor N belonging to the calling process.  Note that this driver
  * consists of only the ``open()'' routine, because all subsequent
  * references to this file will be direct to the other driver.
  *
  * XXX: we could give this one a cloning event handler if necessary.
  */
 
 /* ARGSUSED */
 static int
 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 {
 
 	/*
 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
 	 * the file descriptor being sought for duplication. The error
 	 * return ensures that the vnode for this device will be released
 	 * by vn_open. Open will detect this special error and take the
 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
 	 * will simply report the error.
 	 */
 	td->td_dupfd = dev2unit(dev);
 	return (ENODEV);
 }
 
 static struct cdevsw fildesc_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	fdopen,
 	.d_name =	"FD",
 };
 
 static void
 fildesc_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
 	make_dev_alias(dev, "stdin");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
 	make_dev_alias(dev, "stdout");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
 	make_dev_alias(dev, "stderr");
 }
 
 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index c9b50912a22d..ade71a848868 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -1,2855 +1,2854 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
  * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
  * Copyright (c) 2009 Apple, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_kqueue.h"
 
 #ifdef COMPAT_FREEBSD11
 #define	_WANT_FREEBSD11_KEVENT
 #endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/unistd.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/kthread.h>
 #include <sys/selinfo.h>
 #include <sys/queue.h>
 #include <sys/event.h>
 #include <sys/eventvar.h>
 #include <sys/poll.h>
 #include <sys/protosw.h>
 #include <sys/resourcevar.h>
 #include <sys/sigio.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/syscallsubr.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <sys/user.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #include <machine/atomic.h>
 
 #include <vm/uma.h>
 
 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
 /*
  * This lock is used if multiple kq locks are required.  This possibly
  * should be made into a per proc lock.
  */
 static struct mtx	kq_global;
 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
 #define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
 	if (!haslck)				\
 		mtx_lock(lck);			\
 	haslck = 1;				\
 } while (0)
 #define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
 	if (haslck)				\
 		mtx_unlock(lck);			\
 	haslck = 0;				\
 } while (0)
 
 TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
 
 static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
 static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
 static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
 		    struct thread *td, int mflag);
 static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
 static void	kqueue_release(struct kqueue *kq, int locked);
 static void	kqueue_destroy(struct kqueue *kq);
 static void	kqueue_drain(struct kqueue *kq, struct thread *td);
 static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
 		    uintptr_t ident, int mflag);
 static void	kqueue_task(void *arg, int pending);
 static int	kqueue_scan(struct kqueue *kq, int maxevents,
 		    struct kevent_copyops *k_ops,
 		    const struct timespec *timeout,
 		    struct kevent *keva, struct thread *td);
 static void 	kqueue_wakeup(struct kqueue *kq);
 static struct filterops *kqueue_fo_find(int filt);
 static void	kqueue_fo_release(int filt);
 struct g_kevent_args;
 static int	kern_kevent_generic(struct thread *td,
 		    struct g_kevent_args *uap,
 		    struct kevent_copyops *k_ops, const char *struct_name);
 
 static fo_ioctl_t	kqueue_ioctl;
 static fo_poll_t	kqueue_poll;
 static fo_kqfilter_t	kqueue_kqfilter;
 static fo_stat_t	kqueue_stat;
 static fo_close_t	kqueue_close;
 static fo_fill_kinfo_t	kqueue_fill_kinfo;
 
 static struct fileops kqueueops = {
 	.fo_read = invfo_rdwr,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = kqueue_ioctl,
 	.fo_poll = kqueue_poll,
 	.fo_kqfilter = kqueue_kqfilter,
 	.fo_stat = kqueue_stat,
 	.fo_close = kqueue_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = kqueue_fill_kinfo,
 };
 
 static int 	knote_attach(struct knote *kn, struct kqueue *kq);
 static void 	knote_drop(struct knote *kn, struct thread *td);
 static void 	knote_drop_detached(struct knote *kn, struct thread *td);
 static void 	knote_enqueue(struct knote *kn);
 static void 	knote_dequeue(struct knote *kn);
 static void 	knote_init(void);
 static struct 	knote *knote_alloc(int mflag);
 static void 	knote_free(struct knote *kn);
 
 static void	filt_kqdetach(struct knote *kn);
 static int	filt_kqueue(struct knote *kn, long hint);
 static int	filt_procattach(struct knote *kn);
 static void	filt_procdetach(struct knote *kn);
 static int	filt_proc(struct knote *kn, long hint);
 static int	filt_fileattach(struct knote *kn);
 static void	filt_timerexpire(void *knx);
 static void	filt_timerexpire_l(struct knote *kn, bool proc_locked);
 static int	filt_timerattach(struct knote *kn);
 static void	filt_timerdetach(struct knote *kn);
 static void	filt_timerstart(struct knote *kn, sbintime_t to);
 static void	filt_timertouch(struct knote *kn, struct kevent *kev,
 		    u_long type);
 static int	filt_timervalidate(struct knote *kn, sbintime_t *to);
 static int	filt_timer(struct knote *kn, long hint);
 static int	filt_userattach(struct knote *kn);
 static void	filt_userdetach(struct knote *kn);
 static int	filt_user(struct knote *kn, long hint);
 static void	filt_usertouch(struct knote *kn, struct kevent *kev,
 		    u_long type);
 
 static struct filterops file_filtops = {
 	.f_isfd = 1,
 	.f_attach = filt_fileattach,
 };
 static struct filterops kqread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_kqdetach,
 	.f_event = filt_kqueue,
 };
 /* XXX - move to kern_proc.c?  */
 static struct filterops proc_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_procattach,
 	.f_detach = filt_procdetach,
 	.f_event = filt_proc,
 };
 static struct filterops timer_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_timerattach,
 	.f_detach = filt_timerdetach,
 	.f_event = filt_timer,
 	.f_touch = filt_timertouch,
 };
 static struct filterops user_filtops = {
 	.f_attach = filt_userattach,
 	.f_detach = filt_userdetach,
 	.f_event = filt_user,
 	.f_touch = filt_usertouch,
 };
 
 static uma_zone_t	knote_zone;
 static unsigned int __exclusive_cache_line	kq_ncallouts;
 static unsigned int 	kq_calloutmax = 4 * 1024;
 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
 
 /* XXX - ensure not influx ? */
 #define KNOTE_ACTIVATE(kn, islock) do { 				\
 	if ((islock))							\
 		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
 	else								\
 		KQ_LOCK((kn)->kn_kq);					\
 	(kn)->kn_status |= KN_ACTIVE;					\
 	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
 		knote_enqueue((kn));					\
 	if (!(islock))							\
 		KQ_UNLOCK((kn)->kn_kq);					\
 } while (0)
 #define KQ_LOCK(kq) do {						\
 	mtx_lock(&(kq)->kq_lock);					\
 } while (0)
 #define KQ_FLUX_WAKEUP(kq) do {						\
 	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
 		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
 		wakeup((kq));						\
 	}								\
 } while (0)
 #define KQ_UNLOCK_FLUX(kq) do {						\
 	KQ_FLUX_WAKEUP(kq);						\
 	mtx_unlock(&(kq)->kq_lock);					\
 } while (0)
 #define KQ_UNLOCK(kq) do {						\
 	mtx_unlock(&(kq)->kq_lock);					\
 } while (0)
 #define KQ_OWNED(kq) do {						\
 	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
 } while (0)
 #define KQ_NOTOWNED(kq) do {						\
 	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
 } while (0)
 
 static struct knlist *
 kn_list_lock(struct knote *kn)
 {
 	struct knlist *knl;
 
 	knl = kn->kn_knlist;
 	if (knl != NULL)
 		knl->kl_lock(knl->kl_lockarg);
 	return (knl);
 }
 
 static void
 kn_list_unlock(struct knlist *knl)
 {
 	bool do_free;
 
 	if (knl == NULL)
 		return;
 	do_free = knl->kl_autodestroy && knlist_empty(knl);
 	knl->kl_unlock(knl->kl_lockarg);
 	if (do_free) {
 		knlist_destroy(knl);
 		free(knl, M_KQUEUE);
 	}
 }
 
 static bool
 kn_in_flux(struct knote *kn)
 {
 
 	return (kn->kn_influx > 0);
 }
 
 static void
 kn_enter_flux(struct knote *kn)
 {
 
 	KQ_OWNED(kn->kn_kq);
 	MPASS(kn->kn_influx < INT_MAX);
 	kn->kn_influx++;
 }
 
 static bool
 kn_leave_flux(struct knote *kn)
 {
 
 	KQ_OWNED(kn->kn_kq);
 	MPASS(kn->kn_influx > 0);
 	kn->kn_influx--;
 	return (kn->kn_influx == 0);
 }
 
 #define	KNL_ASSERT_LOCK(knl, islocked) do {				\
 	if (islocked)							\
 		KNL_ASSERT_LOCKED(knl);				\
 	else								\
 		KNL_ASSERT_UNLOCKED(knl);				\
 } while (0)
 #ifdef INVARIANTS
 #define	KNL_ASSERT_LOCKED(knl) do {					\
 	knl->kl_assert_lock((knl)->kl_lockarg, LA_LOCKED);		\
 } while (0)
 #define	KNL_ASSERT_UNLOCKED(knl) do {					\
 	knl->kl_assert_lock((knl)->kl_lockarg, LA_UNLOCKED);		\
 } while (0)
 #else /* !INVARIANTS */
 #define	KNL_ASSERT_LOCKED(knl) do {} while (0)
 #define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
 #endif /* INVARIANTS */
 
 #ifndef	KN_HASHSIZE
 #define	KN_HASHSIZE		64		/* XXX should be tunable */
 #endif
 
 #define KN_HASH(val, mask)	(((val) ^ (val >> 8)) & (mask))
 
 static int
 filt_nullattach(struct knote *kn)
 {
 
 	return (ENXIO);
 };
 
 struct filterops null_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_nullattach,
 };
 
 /* XXX - make SYSINIT to add these, and move into respective modules. */
 extern struct filterops sig_filtops;
 extern struct filterops fs_filtops;
 
 /*
  * Table for for all system-defined filters.
  */
 static struct mtx	filterops_lock;
 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
 	MTX_DEF);
 static struct {
 	struct filterops *for_fop;
 	int for_nolock;
 	int for_refcnt;
 } sysfilt_ops[EVFILT_SYSCOUNT] = {
 	{ &file_filtops, 1 },			/* EVFILT_READ */
 	{ &file_filtops, 1 },			/* EVFILT_WRITE */
 	{ &null_filtops },			/* EVFILT_AIO */
 	{ &file_filtops, 1 },			/* EVFILT_VNODE */
 	{ &proc_filtops, 1 },			/* EVFILT_PROC */
 	{ &sig_filtops, 1 },			/* EVFILT_SIGNAL */
 	{ &timer_filtops, 1 },			/* EVFILT_TIMER */
 	{ &file_filtops, 1 },			/* EVFILT_PROCDESC */
 	{ &fs_filtops, 1 },			/* EVFILT_FS */
 	{ &null_filtops },			/* EVFILT_LIO */
 	{ &user_filtops, 1 },			/* EVFILT_USER */
 	{ &null_filtops },			/* EVFILT_SENDFILE */
 	{ &file_filtops, 1 },                   /* EVFILT_EMPTY */
 };
 
 /*
  * Simple redirection for all cdevsw style objects to call their fo_kqfilter
  * method.
  */
 static int
 filt_fileattach(struct knote *kn)
 {
 
 	return (fo_kqfilter(kn->kn_fp, kn));
 }
 
 /*ARGSUSED*/
 static int
 kqueue_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
 	if (kn->kn_filter != EVFILT_READ)
 		return (EINVAL);
 
 	kn->kn_status |= KN_KQUEUE;
 	kn->kn_fop = &kqread_filtops;
 	knlist_add(&kq->kq_sel.si_note, kn, 0);
 
 	return (0);
 }
 
 static void
 filt_kqdetach(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
 	knlist_remove(&kq->kq_sel.si_note, kn, 0);
 }
 
 /*ARGSUSED*/
 static int
 filt_kqueue(struct knote *kn, long hint)
 {
 	struct kqueue *kq = kn->kn_fp->f_data;
 
 	kn->kn_data = kq->kq_count;
 	return (kn->kn_data > 0);
 }
 
 /* XXX - move to kern_proc.c?  */
 static int
 filt_procattach(struct knote *kn)
 {
 	struct proc *p;
 	int error;
 	bool exiting, immediate;
 
 	exiting = immediate = false;
 	if (kn->kn_sfflags & NOTE_EXIT)
 		p = pfind_any(kn->kn_id);
 	else
 		p = pfind(kn->kn_id);
 	if (p == NULL)
 		return (ESRCH);
 	if (p->p_flag & P_WEXIT)
 		exiting = true;
 
 	if ((error = p_cansee(curthread, p))) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 
 	kn->kn_ptr.p_proc = p;
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 
 	/*
 	 * Internal flag indicating registration done by kernel for the
 	 * purposes of getting a NOTE_CHILD notification.
 	 */
 	if (kn->kn_flags & EV_FLAG2) {
 		kn->kn_flags &= ~EV_FLAG2;
 		kn->kn_data = kn->kn_sdata;		/* ppid */
 		kn->kn_fflags = NOTE_CHILD;
 		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
 		immediate = true; /* Force immediate activation of child note. */
 	}
 	/*
 	 * Internal flag indicating registration done by kernel (for other than
 	 * NOTE_CHILD).
 	 */
 	if (kn->kn_flags & EV_FLAG1) {
 		kn->kn_flags &= ~EV_FLAG1;
 	}
 
 	knlist_add(p->p_klist, kn, 1);
 
 	/*
 	 * Immediately activate any child notes or, in the case of a zombie
 	 * target process, exit notes.  The latter is necessary to handle the
 	 * case where the target process, e.g. a child, dies before the kevent
 	 * is registered.
 	 */
 	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
 		KNOTE_ACTIVATE(kn, 0);
 
 	PROC_UNLOCK(p);
 
 	return (0);
 }
 
 /*
  * The knote may be attached to a different process, which may exit,
  * leaving nothing for the knote to be attached to.  So when the process
  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
  * it will be deleted when read out.  However, as part of the knote deletion,
  * this routine is called, so a check is needed to avoid actually performing
  * a detach, because the original process does not exist any more.
  */
 /* XXX - move to kern_proc.c?  */
 static void
 filt_procdetach(struct knote *kn)
 {
 
 	knlist_remove(kn->kn_knlist, kn, 0);
 	kn->kn_ptr.p_proc = NULL;
 }
 
 /* XXX - move to kern_proc.c?  */
 static int
 filt_proc(struct knote *kn, long hint)
 {
 	struct proc *p;
 	u_int event;
 
 	p = kn->kn_ptr.p_proc;
 	if (p == NULL) /* already activated, from attach filter */
 		return (0);
 
 	/* Mask off extra data. */
 	event = (u_int)hint & NOTE_PCTRLMASK;
 
 	/* If the user is interested in this event, record it. */
 	if (kn->kn_sfflags & event)
 		kn->kn_fflags |= event;
 
 	/* Process is gone, so flag the event as finished. */
 	if (event == NOTE_EXIT) {
 		kn->kn_flags |= EV_EOF | EV_ONESHOT;
 		kn->kn_ptr.p_proc = NULL;
 		if (kn->kn_fflags & NOTE_EXIT)
 			kn->kn_data = KW_EXITCODE(p->p_xexit, p->p_xsig);
 		if (kn->kn_fflags == 0)
 			kn->kn_flags |= EV_DROP;
 		return (1);
 	}
 
 	return (kn->kn_fflags != 0);
 }
 
 /*
  * Called when the process forked. It mostly does the same as the
  * knote(), activating all knotes registered to be activated when the
  * process forked. Additionally, for each knote attached to the
  * parent, check whether user wants to track the new process. If so
  * attach a new knote to it, and immediately report an event with the
  * child's pid.
  */
 void
 knote_fork(struct knlist *list, int pid)
 {
 	struct kqueue *kq;
 	struct knote *kn;
 	struct kevent kev;
 	int error;
 
 	MPASS(list != NULL);
 	KNL_ASSERT_LOCKED(list);
 	if (SLIST_EMPTY(&list->kl_list))
 		return;
 
 	memset(&kev, 0, sizeof(kev));
 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
 			KQ_UNLOCK(kq);
 			continue;
 		}
 
 		/*
 		 * The same as knote(), activate the event.
 		 */
 		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
 			if (kn->kn_fop->f_event(kn, NOTE_FORK))
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK(kq);
 			continue;
 		}
 
 		/*
 		 * The NOTE_TRACK case. In addition to the activation
 		 * of the event, we need to register new events to
 		 * track the child. Drop the locks in preparation for
 		 * the call to kqueue_register().
 		 */
 		kn_enter_flux(kn);
 		KQ_UNLOCK(kq);
 		list->kl_unlock(list->kl_lockarg);
 
 		/*
 		 * Activate existing knote and register tracking knotes with
 		 * new process.
 		 *
 		 * First register a knote to get just the child notice. This
 		 * must be a separate note from a potential NOTE_EXIT
 		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
 		 * to use the data field (in conflicting ways).
 		 */
 		kev.ident = pid;
 		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
 		    EV_FLAG2;
 		kev.fflags = kn->kn_sfflags;
 		kev.data = kn->kn_id;		/* parent */
 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
 		if (error)
 			kn->kn_fflags |= NOTE_TRACKERR;
 
 		/*
 		 * Then register another knote to track other potential events
 		 * from the new process.
 		 */
 		kev.ident = pid;
 		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
 		kev.fflags = kn->kn_sfflags;
 		kev.data = kn->kn_id;		/* parent */
 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
 		if (error)
 			kn->kn_fflags |= NOTE_TRACKERR;
 		if (kn->kn_fop->f_event(kn, NOTE_FORK))
 			KNOTE_ACTIVATE(kn, 0);
 		list->kl_lock(list->kl_lockarg);
 		KQ_LOCK(kq);
 		kn_leave_flux(kn);
 		KQ_UNLOCK_FLUX(kq);
 	}
 }
 
 /*
  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
  * interval timer support code.
  */
 
 #define NOTE_TIMER_PRECMASK						\
     (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
 
 static sbintime_t
 timer2sbintime(int64_t data, int flags)
 {
 	int64_t secs;
 
         /*
          * Macros for converting to the fractional second portion of an
          * sbintime_t using 64bit multiplication to improve precision.
          */
 #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
 #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
 #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
 	switch (flags & NOTE_TIMER_PRECMASK) {
 	case NOTE_SECONDS:
 #ifdef __LP64__
 		if (data > (SBT_MAX / SBT_1S))
 			return (SBT_MAX);
 #endif
 		return ((sbintime_t)data << 32);
 	case NOTE_MSECONDS: /* FALLTHROUGH */
 	case 0:
 		if (data >= 1000) {
 			secs = data / 1000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | MS_TO_SBT(data % 1000));
 		}
 		return (MS_TO_SBT(data));
 	case NOTE_USECONDS:
 		if (data >= 1000000) {
 			secs = data / 1000000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | US_TO_SBT(data % 1000000));
 		}
 		return (US_TO_SBT(data));
 	case NOTE_NSECONDS:
 		if (data >= 1000000000) {
 			secs = data / 1000000000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | NS_TO_SBT(data % 1000000000));
 		}
 		return (NS_TO_SBT(data));
 	default:
 		break;
 	}
 	return (-1);
 }
 
 struct kq_timer_cb_data {
 	struct callout c;
 	struct proc *p;
 	struct knote *kn;
 	int cpuid;
 	int flags;
 	TAILQ_ENTRY(kq_timer_cb_data) link;
 	sbintime_t next;	/* next timer event fires at */
 	sbintime_t to;		/* precalculated timer period, 0 for abs */
 };
 
 #define	KQ_TIMER_CB_ENQUEUED	0x01
 
 static void
 kqtimer_sched_callout(struct kq_timer_cb_data *kc)
 {
 	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kc->kn,
 	    kc->cpuid, C_ABSOLUTE);
 }
 
 void
 kqtimer_proc_continue(struct proc *p)
 {
 	struct kq_timer_cb_data *kc, *kc1;
 	struct bintime bt;
 	sbintime_t now;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	getboottimebin(&bt);
 	now = bttosbt(bt);
 
 	TAILQ_FOREACH_SAFE(kc, &p->p_kqtim_stop, link, kc1) {
 		TAILQ_REMOVE(&p->p_kqtim_stop, kc, link);
 		kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
 		if (kc->next <= now)
 			filt_timerexpire_l(kc->kn, true);
 		else
 			kqtimer_sched_callout(kc);
 	}
 }
 
 static void
 filt_timerexpire_l(struct knote *kn, bool proc_locked)
 {
 	struct kq_timer_cb_data *kc;
 	struct proc *p;
 	uint64_t delta;
 	sbintime_t now;
 
 	kc = kn->kn_ptr.p_v;
 
 	if ((kn->kn_flags & EV_ONESHOT) != 0 || kc->to == 0) {
 		kn->kn_data++;
 		KNOTE_ACTIVATE(kn, 0);
 		return;
 	}
 
 	now = sbinuptime();
 	if (now >= kc->next) {
 		delta = (now - kc->next) / kc->to;
 		if (delta == 0)
 			delta = 1;
 		kn->kn_data += delta;
 		kc->next += (delta + 1) * kc->to;
 		if (now >= kc->next)	/* overflow */
 			kc->next = now + kc->to;
 		KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
 	}
 
 	/*
 	 * Initial check for stopped kc->p is racy.  It is fine to
 	 * miss the set of the stop flags, at worst we would schedule
 	 * one more callout.  On the other hand, it is not fine to not
 	 * schedule when we we missed clearing of the flags, we
 	 * recheck them under the lock and observe consistent state.
 	 */
 	p = kc->p;
 	if (P_SHOULDSTOP(p) || P_KILLED(p)) {
 		if (!proc_locked)
 			PROC_LOCK(p);
 		if (P_SHOULDSTOP(p) || P_KILLED(p)) {
 			if ((kc->flags & KQ_TIMER_CB_ENQUEUED) == 0) {
 				kc->flags |= KQ_TIMER_CB_ENQUEUED;
 				TAILQ_INSERT_TAIL(&p->p_kqtim_stop, kc, link);
 			}
 			if (!proc_locked)
 				PROC_UNLOCK(p);
 			return;
 		}
 		if (!proc_locked)
 			PROC_UNLOCK(p);
 	}
 	kqtimer_sched_callout(kc);
 }
 
 static void
 filt_timerexpire(void *knx)
 {
 	filt_timerexpire_l(knx, false);
 }
 
 /*
  * data contains amount of time to sleep
  */
 static int
 filt_timervalidate(struct knote *kn, sbintime_t *to)
 {
 	struct bintime bt;
 	sbintime_t sbt;
 
 	if (kn->kn_sdata < 0)
 		return (EINVAL);
 	if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
 		kn->kn_sdata = 1;
 	/*
 	 * The only fflags values supported are the timer unit
 	 * (precision) and the absolute time indicator.
 	 */
 	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
 		return (EINVAL);
 
 	*to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
 	if (*to < 0)
 		return (EINVAL);
 	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
 		getboottimebin(&bt);
 		sbt = bttosbt(bt);
 		*to = MAX(0, *to - sbt);
 	}
 	return (0);
 }
 
 static int
 filt_timerattach(struct knote *kn)
 {
 	struct kq_timer_cb_data *kc;
 	sbintime_t to;
 	int error;
 
 	to = -1;
 	error = filt_timervalidate(kn, &to);
 	if (error != 0)
 		return (error);
 	KASSERT(to > 0 || (kn->kn_flags & EV_ONESHOT) != 0 ||
 	    (kn->kn_sfflags & NOTE_ABSTIME) != 0,
 	    ("%s: periodic timer has a calculated zero timeout", __func__));
 	KASSERT(to >= 0,
 	    ("%s: timer has a calculated negative timeout", __func__));
 
 	if (atomic_fetchadd_int(&kq_ncallouts, 1) + 1 > kq_calloutmax) {
 		atomic_subtract_int(&kq_ncallouts, 1);
 		return (ENOMEM);
 	}
 
 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
 		kn->kn_flags |= EV_CLEAR;	/* automatically set */
 	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
 	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
 	kc->kn = kn;
 	kc->p = curproc;
 	kc->cpuid = PCPU_GET(cpuid);
 	kc->flags = 0;
 	callout_init(&kc->c, 1);
 	filt_timerstart(kn, to);
 
 	return (0);
 }
 
 static void
 filt_timerstart(struct knote *kn, sbintime_t to)
 {
 	struct kq_timer_cb_data *kc;
 
 	kc = kn->kn_ptr.p_v;
 	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
 		kc->next = to;
 		kc->to = 0;
 	} else {
 		kc->next = to + sbinuptime();
 		kc->to = to;
 	}
 	kqtimer_sched_callout(kc);
 }
 
 static void
 filt_timerdetach(struct knote *kn)
 {
 	struct kq_timer_cb_data *kc;
 	unsigned int old __unused;
 	bool pending;
 
 	kc = kn->kn_ptr.p_v;
 	do {
 		callout_drain(&kc->c);
 
 		/*
 		 * kqtimer_proc_continue() might have rescheduled this callout.
 		 * Double-check, using the process mutex as an interlock.
 		 */
 		PROC_LOCK(kc->p);
 		if ((kc->flags & KQ_TIMER_CB_ENQUEUED) != 0) {
 			kc->flags &= ~KQ_TIMER_CB_ENQUEUED;
 			TAILQ_REMOVE(&kc->p->p_kqtim_stop, kc, link);
 		}
 		pending = callout_pending(&kc->c);
 		PROC_UNLOCK(kc->p);
 	} while (pending);
 	free(kc, M_KQUEUE);
 	old = atomic_fetchadd_int(&kq_ncallouts, -1);
 	KASSERT(old > 0, ("Number of callouts cannot become negative"));
 	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
 }
 
 static void
 filt_timertouch(struct knote *kn, struct kevent *kev, u_long type)
 {
 	struct kq_timer_cb_data *kc;	
 	struct kqueue *kq;
 	sbintime_t to;
 	int error;
 
 	switch (type) {
 	case EVENT_REGISTER:
 		/* Handle re-added timers that update data/fflags */
 		if (kev->flags & EV_ADD) {
 			kc = kn->kn_ptr.p_v;
 
 			/* Drain any existing callout. */
 			callout_drain(&kc->c);
 
 			/* Throw away any existing undelivered record
 			 * of the timer expiration. This is done under
 			 * the presumption that if a process is
 			 * re-adding this timer with new parameters,
 			 * it is no longer interested in what may have
 			 * happened under the old parameters. If it is
 			 * interested, it can wait for the expiration,
 			 * delete the old timer definition, and then
 			 * add the new one.
 			 *
 			 * This has to be done while the kq is locked:
 			 *   - if enqueued, dequeue
 			 *   - make it no longer active
 			 *   - clear the count of expiration events
 			 */
 			kq = kn->kn_kq;
 			KQ_LOCK(kq);
 			if (kn->kn_status & KN_QUEUED)
 				knote_dequeue(kn);
 
 			kn->kn_status &= ~KN_ACTIVE;
 			kn->kn_data = 0;
 			KQ_UNLOCK(kq);
 			
 			/* Reschedule timer based on new data/fflags */
 			kn->kn_sfflags = kev->fflags;
 			kn->kn_sdata = kev->data;
 			error = filt_timervalidate(kn, &to);
 			if (error != 0) {
 			  	kn->kn_flags |= EV_ERROR;
 				kn->kn_data = error;
 			} else
 			  	filt_timerstart(kn, to);
 		}
 		break;
 
         case EVENT_PROCESS:
 		*kev = kn->kn_kevent;
 		if (kn->kn_flags & EV_CLEAR) {
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 		}
 		break;
 
 	default:
 		panic("filt_timertouch() - invalid type (%ld)", type);
 		break;
 	}
 }
 
 static int
 filt_timer(struct knote *kn, long hint)
 {
 
 	return (kn->kn_data != 0);
 }
 
 static int
 filt_userattach(struct knote *kn)
 {
 
 	/* 
 	 * EVFILT_USER knotes are not attached to anything in the kernel.
 	 */ 
 	kn->kn_hook = NULL;
 	if (kn->kn_fflags & NOTE_TRIGGER)
 		kn->kn_hookid = 1;
 	else
 		kn->kn_hookid = 0;
 	return (0);
 }
 
 static void
 filt_userdetach(__unused struct knote *kn)
 {
 
 	/*
 	 * EVFILT_USER knotes are not attached to anything in the kernel.
 	 */
 }
 
 static int
 filt_user(struct knote *kn, __unused long hint)
 {
 
 	return (kn->kn_hookid);
 }
 
 static void
 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
 {
 	u_int ffctrl;
 
 	switch (type) {
 	case EVENT_REGISTER:
 		if (kev->fflags & NOTE_TRIGGER)
 			kn->kn_hookid = 1;
 
 		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
 		kev->fflags &= NOTE_FFLAGSMASK;
 		switch (ffctrl) {
 		case NOTE_FFNOP:
 			break;
 
 		case NOTE_FFAND:
 			kn->kn_sfflags &= kev->fflags;
 			break;
 
 		case NOTE_FFOR:
 			kn->kn_sfflags |= kev->fflags;
 			break;
 
 		case NOTE_FFCOPY:
 			kn->kn_sfflags = kev->fflags;
 			break;
 
 		default:
 			/* XXX Return error? */
 			break;
 		}
 		kn->kn_sdata = kev->data;
 		if (kev->flags & EV_CLEAR) {
 			kn->kn_hookid = 0;
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 		}
 		break;
 
         case EVENT_PROCESS:
 		*kev = kn->kn_kevent;
 		kev->fflags = kn->kn_sfflags;
 		kev->data = kn->kn_sdata;
 		if (kn->kn_flags & EV_CLEAR) {
 			kn->kn_hookid = 0;
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 		}
 		break;
 
 	default:
 		panic("filt_usertouch() - invalid type (%ld)", type);
 		break;
 	}
 }
 
 int
 sys_kqueue(struct thread *td, struct kqueue_args *uap)
 {
 
 	return (kern_kqueue(td, 0, NULL));
 }
 
 static void
 kqueue_init(struct kqueue *kq)
 {
 
 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
 	TAILQ_INIT(&kq->kq_head);
 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
 }
 
 int
 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
 {
 	struct filedesc *fdp;
 	struct kqueue *kq;
 	struct file *fp;
 	struct ucred *cred;
 	int fd, error;
 
 	fdp = td->td_proc->p_fd;
 	cred = td->td_ucred;
 	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
 		return (ENOMEM);
 
 	error = falloc_caps(td, &fp, &fd, flags, fcaps);
 	if (error != 0) {
 		chgkqcnt(cred->cr_ruidinfo, -1, 0);
 		return (error);
 	}
 
 	/* An extra reference on `fp' has been held for us by falloc(). */
 	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
 	kqueue_init(kq);
 	kq->kq_fdp = fdp;
 	kq->kq_cred = crhold(cred);
 
 	FILEDESC_XLOCK(fdp);
 	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
 	FILEDESC_XUNLOCK(fdp);
 
 	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
 	fdrop(fp, td);
 
 	td->td_retval[0] = fd;
 	return (0);
 }
 
 struct g_kevent_args {
 	int	fd;
 	void	*changelist;
 	int	nchanges;
 	void	*eventlist;
 	int	nevents;
 	const struct timespec *timeout;
 };
 
 int
 sys_kevent(struct thread *td, struct kevent_args *uap)
 {
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.k_copyout = kevent_copyout,
 		.k_copyin = kevent_copyin,
 		.kevent_size = sizeof(struct kevent),
 	};
 	struct g_kevent_args gk_args = {
 		.fd = uap->fd,
 		.changelist = uap->changelist,
 		.nchanges = uap->nchanges,
 		.eventlist = uap->eventlist,
 		.nevents = uap->nevents,
 		.timeout = uap->timeout,
 	};
 
 	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
 }
 
 static int
 kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
     struct kevent_copyops *k_ops, const char *struct_name)
 {
 	struct timespec ts, *tsp;
 #ifdef KTRACE
 	struct kevent *eventlist = uap->eventlist;
 #endif
 	int error;
 
 	if (uap->timeout != NULL) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 		tsp = &ts;
 	} else
 		tsp = NULL;
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
 		    uap->nchanges, k_ops->kevent_size);
 #endif
 
 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
 	    k_ops, tsp);
 
 #ifdef KTRACE
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
 		    td->td_retval[0], k_ops->kevent_size);
 #endif
 
 	return (error);
 }
 
 /*
  * Copy 'count' items into the destination list pointed to by uap->eventlist.
  */
 static int
 kevent_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct kevent_args *uap;
 	int error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct kevent_args *)arg;
 
 	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
 	if (error == 0)
 		uap->eventlist += count;
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 static int
 kevent_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct kevent_args *uap;
 	int error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct kevent_args *)arg;
 
 	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
 	if (error == 0)
 		uap->changelist += count;
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD11
 static int
 kevent11_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_kevent_args *uap;
 	struct kevent_freebsd11 kev11;
 	int error, i;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_kevent_args *)arg;
 
 	for (i = 0; i < count; i++) {
 		kev11.ident = kevp->ident;
 		kev11.filter = kevp->filter;
 		kev11.flags = kevp->flags;
 		kev11.fflags = kevp->fflags;
 		kev11.data = kevp->data;
 		kev11.udata = kevp->udata;
 		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
 		if (error != 0)
 			break;
 		uap->eventlist++;
 		kevp++;
 	}
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 static int
 kevent11_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_kevent_args *uap;
 	struct kevent_freebsd11 kev11;
 	int error, i;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_kevent_args *)arg;
 
 	for (i = 0; i < count; i++) {
 		error = copyin(uap->changelist, &kev11, sizeof(kev11));
 		if (error != 0)
 			break;
 		kevp->ident = kev11.ident;
 		kevp->filter = kev11.filter;
 		kevp->flags = kev11.flags;
 		kevp->fflags = kev11.fflags;
 		kevp->data = (uintptr_t)kev11.data;
 		kevp->udata = kev11.udata;
 		bzero(&kevp->ext, sizeof(kevp->ext));
 		uap->changelist++;
 		kevp++;
 	}
 	return (error);
 }
 
 int
 freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
 {
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.k_copyout = kevent11_copyout,
 		.k_copyin = kevent11_copyin,
 		.kevent_size = sizeof(struct kevent_freebsd11),
 	};
 	struct g_kevent_args gk_args = {
 		.fd = uap->fd,
 		.changelist = uap->changelist,
 		.nchanges = uap->nchanges,
 		.eventlist = uap->eventlist,
 		.nevents = uap->nevents,
 		.timeout = uap->timeout,
 	};
 
 	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11"));
 }
 #endif
 
 int
 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	int error;
 
 	cap_rights_init_zero(&rights);
 	if (nchanges > 0)
 		cap_rights_set_one(&rights, CAP_KQUEUE_CHANGE);
 	if (nevents > 0)
 		cap_rights_set_one(&rights, CAP_KQUEUE_EVENT);
 	error = fget(td, fd, &rights, &fp);
 	if (error != 0)
 		return (error);
 
 	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
 	fdrop(fp, td);
 
 	return (error);
 }
 
 static int
 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	struct kevent keva[KQ_NEVENTS];
 	struct kevent *kevp, *changes;
 	int i, n, nerrors, error;
 
 	if (nchanges < 0)
 		return (EINVAL);
 
 	nerrors = 0;
 	while (nchanges > 0) {
 		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
 		error = k_ops->k_copyin(k_ops->arg, keva, n);
 		if (error)
 			return (error);
 		changes = keva;
 		for (i = 0; i < n; i++) {
 			kevp = &changes[i];
 			if (!kevp->filter)
 				continue;
 			kevp->flags &= ~EV_SYSFLAGS;
 			error = kqueue_register(kq, kevp, td, M_WAITOK);
 			if (error || (kevp->flags & EV_RECEIPT)) {
 				if (nevents == 0)
 					return (error);
 				kevp->flags = EV_ERROR;
 				kevp->data = error;
 				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
 				nevents--;
 				nerrors++;
 			}
 		}
 		nchanges -= n;
 	}
 	if (nerrors) {
 		td->td_retval[0] = nerrors;
 		return (0);
 	}
 
 	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
 }
 
 int
 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	struct kqueue *kq;
 	int error;
 
 	error = kqueue_acquire(fp, &kq);
 	if (error != 0)
 		return (error);
 	error = kqueue_kevent(kq, td, nchanges, nevents, k_ops, timeout);
 	kqueue_release(kq, 0);
 	return (error);
 }
 
 /*
  * Performs a kevent() call on a temporarily created kqueue. This can be
  * used to perform one-shot polling, similar to poll() and select().
  */
 int
 kern_kevent_anonymous(struct thread *td, int nevents,
     struct kevent_copyops *k_ops)
 {
 	struct kqueue kq = {};
 	int error;
 
 	kqueue_init(&kq);
 	kq.kq_refcnt = 1;
 	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
 	kqueue_drain(&kq, td);
 	kqueue_destroy(&kq);
 	return (error);
 }
 
 int
 kqueue_add_filteropts(int filt, struct filterops *filtops)
 {
 	int error;
 
 	error = 0;
 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
 		printf(
 "trying to add a filterop that is out of range: %d is beyond %d\n",
 		    ~filt, EVFILT_SYSCOUNT);
 		return EINVAL;
 	}
 	mtx_lock(&filterops_lock);
 	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
 	    sysfilt_ops[~filt].for_fop != NULL)
 		error = EEXIST;
 	else {
 		sysfilt_ops[~filt].for_fop = filtops;
 		sysfilt_ops[~filt].for_refcnt = 0;
 	}
 	mtx_unlock(&filterops_lock);
 
 	return (error);
 }
 
 int
 kqueue_del_filteropts(int filt)
 {
 	int error;
 
 	error = 0;
 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
 		return EINVAL;
 
 	mtx_lock(&filterops_lock);
 	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
 	    sysfilt_ops[~filt].for_fop == NULL)
 		error = EINVAL;
 	else if (sysfilt_ops[~filt].for_refcnt != 0)
 		error = EBUSY;
 	else {
 		sysfilt_ops[~filt].for_fop = &null_filtops;
 		sysfilt_ops[~filt].for_refcnt = 0;
 	}
 	mtx_unlock(&filterops_lock);
 
 	return error;
 }
 
 static struct filterops *
 kqueue_fo_find(int filt)
 {
 
 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
 		return NULL;
 
 	if (sysfilt_ops[~filt].for_nolock)
 		return sysfilt_ops[~filt].for_fop;
 
 	mtx_lock(&filterops_lock);
 	sysfilt_ops[~filt].for_refcnt++;
 	if (sysfilt_ops[~filt].for_fop == NULL)
 		sysfilt_ops[~filt].for_fop = &null_filtops;
 	mtx_unlock(&filterops_lock);
 
 	return sysfilt_ops[~filt].for_fop;
 }
 
 static void
 kqueue_fo_release(int filt)
 {
 
 	if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
 		return;
 
 	if (sysfilt_ops[~filt].for_nolock)
 		return;
 
 	mtx_lock(&filterops_lock);
 	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
 	    ("filter object refcount not valid on release"));
 	sysfilt_ops[~filt].for_refcnt--;
 	mtx_unlock(&filterops_lock);
 }
 
 /*
  * A ref to kq (obtained via kqueue_acquire) must be held.
  */
 static int
 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td,
     int mflag)
 {
 	struct filterops *fops;
 	struct file *fp;
 	struct knote *kn, *tkn;
 	struct knlist *knl;
 	int error, filt, event;
 	int haskqglobal, filedesc_unlock;
 
 	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
 		return (EINVAL);
 
 	fp = NULL;
 	kn = NULL;
 	knl = NULL;
 	error = 0;
 	haskqglobal = 0;
 	filedesc_unlock = 0;
 
 	filt = kev->filter;
 	fops = kqueue_fo_find(filt);
 	if (fops == NULL)
 		return EINVAL;
 
 	if (kev->flags & EV_ADD) {
 		/* Reject an invalid flag pair early */
 		if (kev->flags & EV_KEEPUDATA) {
 			tkn = NULL;
 			error = EINVAL;
 			goto done;
 		}
 
 		/*
 		 * Prevent waiting with locks.  Non-sleepable
 		 * allocation failures are handled in the loop, only
 		 * if the spare knote appears to be actually required.
 		 */
 		tkn = knote_alloc(mflag);
 	} else {
 		tkn = NULL;
 	}
 
 findkn:
 	if (fops->f_isfd) {
 		KASSERT(td != NULL, ("td is NULL"));
 		if (kev->ident > INT_MAX)
 			error = EBADF;
 		else
 			error = fget(td, kev->ident, &cap_event_rights, &fp);
 		if (error)
 			goto done;
 
 		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
 		    kev->ident, M_NOWAIT) != 0) {
 			/* try again */
 			fdrop(fp, td);
 			fp = NULL;
 			error = kqueue_expand(kq, fops, kev->ident, mflag);
 			if (error)
 				goto done;
 			goto findkn;
 		}
 
 		if (fp->f_type == DTYPE_KQUEUE) {
 			/*
 			 * If we add some intelligence about what we are doing,
 			 * we should be able to support events on ourselves.
 			 * We need to know when we are doing this to prevent
 			 * getting both the knlist lock and the kq lock since
 			 * they are the same thing.
 			 */
 			if (fp->f_data == kq) {
 				error = EINVAL;
 				goto done;
 			}
 
 			/*
 			 * Pre-lock the filedesc before the global
 			 * lock mutex, see the comment in
 			 * kqueue_close().
 			 */
 			FILEDESC_XLOCK(td->td_proc->p_fd);
 			filedesc_unlock = 1;
 			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 		}
 
 		KQ_LOCK(kq);
 		if (kev->ident < kq->kq_knlistsize) {
 			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
 				if (kev->filter == kn->kn_filter)
 					break;
 		}
 	} else {
 		if ((kev->flags & EV_ADD) == EV_ADD) {
 			error = kqueue_expand(kq, fops, kev->ident, mflag);
 			if (error != 0)
 				goto done;
 		}
 
 		KQ_LOCK(kq);
 
 		/*
 		 * If possible, find an existing knote to use for this kevent.
 		 */
 		if (kev->filter == EVFILT_PROC &&
 		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
 			/* This is an internal creation of a process tracking
 			 * note. Don't attempt to coalesce this with an
 			 * existing note.
 			 */
 			;			
 		} else if (kq->kq_knhashmask != 0) {
 			struct klist *list;
 
 			list = &kq->kq_knhash[
 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
 			SLIST_FOREACH(kn, list, kn_link)
 				if (kev->ident == kn->kn_id &&
 				    kev->filter == kn->kn_filter)
 					break;
 		}
 	}
 
 	/* knote is in the process of changing, wait for it to stabilize. */
 	if (kn != NULL && kn_in_flux(kn)) {
 		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 		if (filedesc_unlock) {
 			FILEDESC_XUNLOCK(td->td_proc->p_fd);
 			filedesc_unlock = 0;
 		}
 		kq->kq_state |= KQ_FLUXWAIT;
 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
 		if (fp != NULL) {
 			fdrop(fp, td);
 			fp = NULL;
 		}
 		goto findkn;
 	}
 
 	/*
 	 * kn now contains the matching knote, or NULL if no match
 	 */
 	if (kn == NULL) {
 		if (kev->flags & EV_ADD) {
 			kn = tkn;
 			tkn = NULL;
 			if (kn == NULL) {
 				KQ_UNLOCK(kq);
 				error = ENOMEM;
 				goto done;
 			}
 			kn->kn_fp = fp;
 			kn->kn_kq = kq;
 			kn->kn_fop = fops;
 			/*
 			 * apply reference counts to knote structure, and
 			 * do not release it at the end of this routine.
 			 */
 			fops = NULL;
 			fp = NULL;
 
 			kn->kn_sfflags = kev->fflags;
 			kn->kn_sdata = kev->data;
 			kev->fflags = 0;
 			kev->data = 0;
 			kn->kn_kevent = *kev;
 			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
 			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
 			kn->kn_status = KN_DETACHED;
 			if ((kev->flags & EV_DISABLE) != 0)
 				kn->kn_status |= KN_DISABLED;
 			kn_enter_flux(kn);
 
 			error = knote_attach(kn, kq);
 			KQ_UNLOCK(kq);
 			if (error != 0) {
 				tkn = kn;
 				goto done;
 			}
 
 			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
 				knote_drop_detached(kn, td);
 				goto done;
 			}
 			knl = kn_list_lock(kn);
 			goto done_ev_add;
 		} else {
 			/* No matching knote and the EV_ADD flag is not set. */
 			KQ_UNLOCK(kq);
 			error = ENOENT;
 			goto done;
 		}
 	}
 
 	if (kev->flags & EV_DELETE) {
 		kn_enter_flux(kn);
 		KQ_UNLOCK(kq);
 		knote_drop(kn, td);
 		goto done;
 	}
 
 	if (kev->flags & EV_FORCEONESHOT) {
 		kn->kn_flags |= EV_ONESHOT;
 		KNOTE_ACTIVATE(kn, 1);
 	}
 
 	if ((kev->flags & EV_ENABLE) != 0)
 		kn->kn_status &= ~KN_DISABLED;
 	else if ((kev->flags & EV_DISABLE) != 0)
 		kn->kn_status |= KN_DISABLED;
 
 	/*
 	 * The user may change some filter values after the initial EV_ADD,
 	 * but doing so will not reset any filter which has already been
 	 * triggered.
 	 */
 	kn->kn_status |= KN_SCAN;
 	kn_enter_flux(kn);
 	KQ_UNLOCK(kq);
 	knl = kn_list_lock(kn);
 	if ((kev->flags & EV_KEEPUDATA) == 0)
 		kn->kn_kevent.udata = kev->udata;
 	if (!fops->f_isfd && fops->f_touch != NULL) {
 		fops->f_touch(kn, kev, EVENT_REGISTER);
 	} else {
 		kn->kn_sfflags = kev->fflags;
 		kn->kn_sdata = kev->data;
 	}
 
 done_ev_add:
 	/*
 	 * We can get here with kn->kn_knlist == NULL.  This can happen when
 	 * the initial attach event decides that the event is "completed" 
 	 * already, e.g., filt_procattach() is called on a zombie process.  It
 	 * will call filt_proc() which will remove it from the list, and NULL
 	 * kn_knlist.
 	 *
 	 * KN_DISABLED will be stable while the knote is in flux, so the
 	 * unlocked read will not race with an update.
 	 */
 	if ((kn->kn_status & KN_DISABLED) == 0)
 		event = kn->kn_fop->f_event(kn, 0);
 	else
 		event = 0;
 
 	KQ_LOCK(kq);
 	if (event)
 		kn->kn_status |= KN_ACTIVE;
 	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
 	    KN_ACTIVE)
 		knote_enqueue(kn);
 	kn->kn_status &= ~KN_SCAN;
 	kn_leave_flux(kn);
 	kn_list_unlock(knl);
 	KQ_UNLOCK_FLUX(kq);
 
 done:
 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 	if (filedesc_unlock)
 		FILEDESC_XUNLOCK(td->td_proc->p_fd);
 	if (fp != NULL)
 		fdrop(fp, td);
 	knote_free(tkn);
 	if (fops != NULL)
 		kqueue_fo_release(filt);
 	return (error);
 }
 
 static int
 kqueue_acquire(struct file *fp, struct kqueue **kqp)
 {
 	int error;
 	struct kqueue *kq;
 
 	error = 0;
 
 	kq = fp->f_data;
 	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
 		return (EBADF);
 	*kqp = kq;
 	KQ_LOCK(kq);
 	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
 		KQ_UNLOCK(kq);
 		return (EBADF);
 	}
 	kq->kq_refcnt++;
 	KQ_UNLOCK(kq);
 
 	return error;
 }
 
 static void
 kqueue_release(struct kqueue *kq, int locked)
 {
 	if (locked)
 		KQ_OWNED(kq);
 	else
 		KQ_LOCK(kq);
 	kq->kq_refcnt--;
 	if (kq->kq_refcnt == 1)
 		wakeup(&kq->kq_refcnt);
 	if (!locked)
 		KQ_UNLOCK(kq);
 }
 
 void
 kqueue_drain_schedtask(void)
 {
 	taskqueue_quiesce(taskqueue_kqueue_ctx);
 }
 
 static void
 kqueue_schedtask(struct kqueue *kq)
 {
 	struct thread *td;
 
 	KQ_OWNED(kq);
 	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
 	    ("scheduling kqueue task while draining"));
 
 	if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
 		taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
 		kq->kq_state |= KQ_TASKSCHED;
 		td = curthread;
 		thread_lock(td);
 		td->td_flags |= TDF_ASTPENDING | TDF_KQTICKLED;
 		thread_unlock(td);
 	}
 }
 
 /*
  * Expand the kq to make sure we have storage for fops/ident pair.
  *
  * Return 0 on success (or no work necessary), return errno on failure.
  */
 static int
 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
     int mflag)
 {
 	struct klist *list, *tmp_knhash, *to_free;
 	u_long tmp_knhashmask;
 	int error, fd, size;
 
 	KQ_NOTOWNED(kq);
 
 	error = 0;
 	to_free = NULL;
 	if (fops->f_isfd) {
 		fd = ident;
 		if (kq->kq_knlistsize <= fd) {
 			size = kq->kq_knlistsize;
 			while (size <= fd)
 				size += KQEXTENT;
 			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
 			if (list == NULL)
 				return ENOMEM;
 			KQ_LOCK(kq);
 			if ((kq->kq_state & KQ_CLOSING) != 0) {
 				to_free = list;
 				error = EBADF;
 			} else if (kq->kq_knlistsize > fd) {
 				to_free = list;
 			} else {
 				if (kq->kq_knlist != NULL) {
 					bcopy(kq->kq_knlist, list,
 					    kq->kq_knlistsize * sizeof(*list));
 					to_free = kq->kq_knlist;
 					kq->kq_knlist = NULL;
 				}
 				bzero((caddr_t)list +
 				    kq->kq_knlistsize * sizeof(*list),
 				    (size - kq->kq_knlistsize) * sizeof(*list));
 				kq->kq_knlistsize = size;
 				kq->kq_knlist = list;
 			}
 			KQ_UNLOCK(kq);
 		}
 	} else {
 		if (kq->kq_knhashmask == 0) {
 			tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE,
 			    &tmp_knhashmask, (mflag & M_WAITOK) != 0 ?
 			    HASH_WAITOK : HASH_NOWAIT);
 			if (tmp_knhash == NULL)
 				return (ENOMEM);
 			KQ_LOCK(kq);
 			if ((kq->kq_state & KQ_CLOSING) != 0) {
 				to_free = tmp_knhash;
 				error = EBADF;
 			} else if (kq->kq_knhashmask == 0) {
 				kq->kq_knhash = tmp_knhash;
 				kq->kq_knhashmask = tmp_knhashmask;
 			} else {
 				to_free = tmp_knhash;
 			}
 			KQ_UNLOCK(kq);
 		}
 	}
 	free(to_free, M_KQUEUE);
 
 	KQ_NOTOWNED(kq);
 	return (error);
 }
 
 static void
 kqueue_task(void *arg, int pending)
 {
 	struct kqueue *kq;
 	int haskqglobal;
 
 	haskqglobal = 0;
 	kq = arg;
 
 	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 	KQ_LOCK(kq);
 
 	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
 
 	kq->kq_state &= ~KQ_TASKSCHED;
 	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
 		wakeup(&kq->kq_state);
 	}
 	KQ_UNLOCK(kq);
 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 }
 
 /*
  * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
  * We treat KN_MARKER knotes as if they are in flux.
  */
 static int
 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
     const struct timespec *tsp, struct kevent *keva, struct thread *td)
 {
 	struct kevent *kevp;
 	struct knote *kn, *marker;
 	struct knlist *knl;
 	sbintime_t asbt, rsbt;
 	int count, error, haskqglobal, influx, nkev, touch;
 
 	count = maxevents;
 	nkev = 0;
 	error = 0;
 	haskqglobal = 0;
 
 	if (maxevents == 0)
 		goto done_nl;
 	if (maxevents < 0) {
 		error = EINVAL;
 		goto done_nl;
 	}
 
 	rsbt = 0;
 	if (tsp != NULL) {
 		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
 		    tsp->tv_nsec >= 1000000000) {
 			error = EINVAL;
 			goto done_nl;
 		}
 		if (timespecisset(tsp)) {
 			if (tsp->tv_sec <= INT32_MAX) {
 				rsbt = tstosbt(*tsp);
 				if (TIMESEL(&asbt, rsbt))
 					asbt += tc_tick_sbt;
 				if (asbt <= SBT_MAX - rsbt)
 					asbt += rsbt;
 				else
 					asbt = 0;
 				rsbt >>= tc_precexp;
 			} else
 				asbt = 0;
 		} else
 			asbt = -1;
 	} else
 		asbt = 0;
 	marker = knote_alloc(M_WAITOK);
 	marker->kn_status = KN_MARKER;
 	KQ_LOCK(kq);
 
 retry:
 	kevp = keva;
 	if (kq->kq_count == 0) {
 		if (asbt == -1) {
 			error = EWOULDBLOCK;
 		} else {
 			kq->kq_state |= KQ_SLEEP;
 			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
 			    "kqread", asbt, rsbt, C_ABSOLUTE);
 		}
 		if (error == 0)
 			goto retry;
 		/* don't restart after signals... */
 		if (error == ERESTART)
 			error = EINTR;
 		else if (error == EWOULDBLOCK)
 			error = 0;
 		goto done;
 	}
 
 	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
 	influx = 0;
 	while (count) {
 		KQ_OWNED(kq);
 		kn = TAILQ_FIRST(&kq->kq_head);
 
 		if ((kn->kn_status == KN_MARKER && kn != marker) ||
 		    kn_in_flux(kn)) {
 			if (influx) {
 				influx = 0;
 				KQ_FLUX_WAKEUP(kq);
 			}
 			kq->kq_state |= KQ_FLUXWAIT;
 			error = msleep(kq, &kq->kq_lock, PSOCK,
 			    "kqflxwt", 0);
 			continue;
 		}
 
 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
 			kn->kn_status &= ~KN_QUEUED;
 			kq->kq_count--;
 			continue;
 		}
 		if (kn == marker) {
 			KQ_FLUX_WAKEUP(kq);
 			if (count == maxevents)
 				goto retry;
 			goto done;
 		}
 		KASSERT(!kn_in_flux(kn),
 		    ("knote %p is unexpectedly in flux", kn));
 
 		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
 			kn->kn_status &= ~KN_QUEUED;
 			kn_enter_flux(kn);
 			kq->kq_count--;
 			KQ_UNLOCK(kq);
 			/*
 			 * We don't need to lock the list since we've
 			 * marked it as in flux.
 			 */
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 			continue;
 		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
 			kn->kn_status &= ~KN_QUEUED;
 			kn_enter_flux(kn);
 			kq->kq_count--;
 			KQ_UNLOCK(kq);
 			/*
 			 * We don't need to lock the list since we've
 			 * marked the knote as being in flux.
 			 */
 			*kevp = kn->kn_kevent;
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 			kn = NULL;
 		} else {
 			kn->kn_status |= KN_SCAN;
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
 				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 			knl = kn_list_lock(kn);
 			if (kn->kn_fop->f_event(kn, 0) == 0) {
 				KQ_LOCK(kq);
 				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
 				    KN_SCAN);
 				kn_leave_flux(kn);
 				kq->kq_count--;
 				kn_list_unlock(knl);
 				influx = 1;
 				continue;
 			}
 			touch = (!kn->kn_fop->f_isfd &&
 			    kn->kn_fop->f_touch != NULL);
 			if (touch)
 				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
 			else
 				*kevp = kn->kn_kevent;
 			KQ_LOCK(kq);
 			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
 				/* 
 				 * Manually clear knotes who weren't 
 				 * 'touch'ed.
 				 */
 				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
 					kn->kn_data = 0;
 					kn->kn_fflags = 0;
 				}
 				if (kn->kn_flags & EV_DISPATCH)
 					kn->kn_status |= KN_DISABLED;
 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
 				kq->kq_count--;
 			} else
 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 			
 			kn->kn_status &= ~KN_SCAN;
 			kn_leave_flux(kn);
 			kn_list_unlock(knl);
 			influx = 1;
 		}
 
 		/* we are returning a copy to the user */
 		kevp++;
 		nkev++;
 		count--;
 
 		if (nkev == KQ_NEVENTS) {
 			influx = 0;
 			KQ_UNLOCK_FLUX(kq);
 			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
 			nkev = 0;
 			kevp = keva;
 			KQ_LOCK(kq);
 			if (error)
 				break;
 		}
 	}
 	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
 done:
 	KQ_OWNED(kq);
 	KQ_UNLOCK_FLUX(kq);
 	knote_free(marker);
 done_nl:
 	KQ_NOTOWNED(kq);
 	if (nkev != 0)
 		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
 	td->td_retval[0] = maxevents - count;
 	return (error);
 }
 
 /*ARGSUSED*/
 static int
 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
 	struct ucred *active_cred, struct thread *td)
 {
 	/*
 	 * Enabling sigio causes two major problems:
 	 * 1) infinite recursion:
 	 * Synopsys: kevent is being used to track signals and have FIOASYNC
 	 * set.  On receipt of a signal this will cause a kqueue to recurse
 	 * into itself over and over.  Sending the sigio causes the kqueue
 	 * to become ready, which in turn posts sigio again, forever.
 	 * Solution: this can be solved by setting a flag in the kqueue that
 	 * we have a SIGIO in progress.
 	 * 2) locking problems:
 	 * Synopsys: Kqueue is a leaf subsystem, but adding signalling puts
 	 * us above the proc and pgrp locks.
 	 * Solution: Post a signal using an async mechanism, being sure to
 	 * record a generation count in the delivery so that we do not deliver
 	 * a signal to the wrong process.
 	 *
 	 * Note, these two mechanisms are somewhat mutually exclusive!
 	 */
 #if 0
 	struct kqueue *kq;
 
 	kq = fp->f_data;
 	switch (cmd) {
 	case FIOASYNC:
 		if (*(int *)data) {
 			kq->kq_state |= KQ_ASYNC;
 		} else {
 			kq->kq_state &= ~KQ_ASYNC;
 		}
 		return (0);
 
 	case FIOSETOWN:
 		return (fsetown(*(int *)data, &kq->kq_sigio));
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&kq->kq_sigio);
 		return (0);
 	}
 #endif
 
 	return (ENOTTY);
 }
 
 /*ARGSUSED*/
 static int
 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct kqueue *kq;
 	int revents = 0;
 	int error;
 
 	if ((error = kqueue_acquire(fp, &kq)))
 		return POLLERR;
 
 	KQ_LOCK(kq);
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (kq->kq_count) {
 			revents |= events & (POLLIN | POLLRDNORM);
 		} else {
 			selrecord(td, &kq->kq_sel);
 			if (SEL_WAITING(&kq->kq_sel))
 				kq->kq_state |= KQ_SEL;
 		}
 	}
 	kqueue_release(kq, 1);
 	KQ_UNLOCK(kq);
 	return (revents);
 }
 
 /*ARGSUSED*/
 static int
-kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
-	struct thread *td)
+kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
 {
 
 	bzero((void *)st, sizeof *st);
 	/*
 	 * We no longer return kq_count because the unlocked value is useless.
 	 * If you spent all this time getting the count, why not spend your
 	 * syscall better by calling kevent?
 	 *
 	 * XXX - This is needed for libc_r.
 	 */
 	st->st_mode = S_IFIFO;
 	return (0);
 }
 
 static void
 kqueue_drain(struct kqueue *kq, struct thread *td)
 {
 	struct knote *kn;
 	int i;
 
 	KQ_LOCK(kq);
 
 	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
 	    ("kqueue already closing"));
 	kq->kq_state |= KQ_CLOSING;
 	if (kq->kq_refcnt > 1)
 		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
 
 	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
 
 	KASSERT(knlist_empty(&kq->kq_sel.si_note),
 	    ("kqueue's knlist not empty"));
 
 	for (i = 0; i < kq->kq_knlistsize; i++) {
 		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
 			if (kn_in_flux(kn)) {
 				kq->kq_state |= KQ_FLUXWAIT;
 				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
 				continue;
 			}
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 		}
 	}
 	if (kq->kq_knhashmask != 0) {
 		for (i = 0; i <= kq->kq_knhashmask; i++) {
 			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
 				if (kn_in_flux(kn)) {
 					kq->kq_state |= KQ_FLUXWAIT;
 					msleep(kq, &kq->kq_lock, PSOCK,
 					       "kqclo2", 0);
 					continue;
 				}
 				kn_enter_flux(kn);
 				KQ_UNLOCK(kq);
 				knote_drop(kn, td);
 				KQ_LOCK(kq);
 			}
 		}
 	}
 
 	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
 		kq->kq_state |= KQ_TASKDRAIN;
 		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
 	}
 
 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
 		selwakeuppri(&kq->kq_sel, PSOCK);
 		if (!SEL_WAITING(&kq->kq_sel))
 			kq->kq_state &= ~KQ_SEL;
 	}
 
 	KQ_UNLOCK(kq);
 }
 
 static void
 kqueue_destroy(struct kqueue *kq)
 {
 
 	KASSERT(kq->kq_fdp == NULL,
 	    ("kqueue still attached to a file descriptor"));
 	seldrain(&kq->kq_sel);
 	knlist_destroy(&kq->kq_sel.si_note);
 	mtx_destroy(&kq->kq_lock);
 
 	if (kq->kq_knhash != NULL)
 		free(kq->kq_knhash, M_KQUEUE);
 	if (kq->kq_knlist != NULL)
 		free(kq->kq_knlist, M_KQUEUE);
 
 	funsetown(&kq->kq_sigio);
 }
 
 /*ARGSUSED*/
 static int
 kqueue_close(struct file *fp, struct thread *td)
 {
 	struct kqueue *kq = fp->f_data;
 	struct filedesc *fdp;
 	int error;
 	int filedesc_unlock;
 
 	if ((error = kqueue_acquire(fp, &kq)))
 		return error;
 	kqueue_drain(kq, td);
 
 	/*
 	 * We could be called due to the knote_drop() doing fdrop(),
 	 * called from kqueue_register().  In this case the global
 	 * lock is owned, and filedesc sx is locked before, to not
 	 * take the sleepable lock after non-sleepable.
 	 */
 	fdp = kq->kq_fdp;
 	kq->kq_fdp = NULL;
 	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
 		FILEDESC_XLOCK(fdp);
 		filedesc_unlock = 1;
 	} else
 		filedesc_unlock = 0;
 	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
 	if (filedesc_unlock)
 		FILEDESC_XUNLOCK(fdp);
 
 	kqueue_destroy(kq);
 	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
 	crfree(kq->kq_cred);
 	free(kq, M_KQUEUE);
 	fp->f_data = NULL;
 
 	return (0);
 }
 
 static int
 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	kif->kf_type = KF_TYPE_KQUEUE;
 	return (0);
 }
 
 static void
 kqueue_wakeup(struct kqueue *kq)
 {
 	KQ_OWNED(kq);
 
 	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
 		kq->kq_state &= ~KQ_SLEEP;
 		wakeup(kq);
 	}
 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
 		selwakeuppri(&kq->kq_sel, PSOCK);
 		if (!SEL_WAITING(&kq->kq_sel))
 			kq->kq_state &= ~KQ_SEL;
 	}
 	if (!knlist_empty(&kq->kq_sel.si_note))
 		kqueue_schedtask(kq);
 	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
 		pgsigio(&kq->kq_sigio, SIGIO, 0);
 	}
 }
 
 /*
  * Walk down a list of knotes, activating them if their event has triggered.
  *
  * There is a possibility to optimize in the case of one kq watching another.
  * Instead of scheduling a task to wake it up, you could pass enough state
  * down the chain to make up the parent kqueue.  Make this code functional
  * first.
  */
 void
 knote(struct knlist *list, long hint, int lockflags)
 {
 	struct kqueue *kq;
 	struct knote *kn, *tkn;
 	int error;
 
 	if (list == NULL)
 		return;
 
 	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
 
 	if ((lockflags & KNF_LISTLOCKED) == 0)
 		list->kl_lock(list->kl_lockarg); 
 
 	/*
 	 * If we unlock the list lock (and enter influx), we can
 	 * eliminate the kqueue scheduling, but this will introduce
 	 * four lock/unlock's for each knote to test.  Also, marker
 	 * would be needed to keep iteration position, since filters
 	 * or other threads could remove events.
 	 */
 	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
 			/*
 			 * Do not process the influx notes, except for
 			 * the influx coming from the kq unlock in the
 			 * kqueue_scan().  In the later case, we do
 			 * not interfere with the scan, since the code
 			 * fragment in kqueue_scan() locks the knlist,
 			 * and cannot proceed until we finished.
 			 */
 			KQ_UNLOCK(kq);
 		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			error = kn->kn_fop->f_event(kn, hint);
 			KQ_LOCK(kq);
 			kn_leave_flux(kn);
 			if (error)
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK_FLUX(kq);
 		} else {
 			if (kn->kn_fop->f_event(kn, hint))
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK(kq);
 		}
 	}
 	if ((lockflags & KNF_LISTLOCKED) == 0)
 		list->kl_unlock(list->kl_lockarg); 
 }
 
 /*
  * add a knote to a knlist
  */
 void
 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
 {
 
 	KNL_ASSERT_LOCK(knl, islocked);
 	KQ_NOTOWNED(kn->kn_kq);
 	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
 	    ("knote %p was not detached", kn));
 	if (!islocked)
 		knl->kl_lock(knl->kl_lockarg);
 	SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
 	if (!islocked)
 		knl->kl_unlock(knl->kl_lockarg);
 	KQ_LOCK(kn->kn_kq);
 	kn->kn_knlist = knl;
 	kn->kn_status &= ~KN_DETACHED;
 	KQ_UNLOCK(kn->kn_kq);
 }
 
 static void
 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
     int kqislocked)
 {
 
 	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
 	KNL_ASSERT_LOCK(knl, knlislocked);
 	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
 	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
 	KASSERT((kn->kn_status & KN_DETACHED) == 0,
 	    ("knote %p was already detached", kn));
 	if (!knlislocked)
 		knl->kl_lock(knl->kl_lockarg);
 	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
 	kn->kn_knlist = NULL;
 	if (!knlislocked)
 		kn_list_unlock(knl);
 	if (!kqislocked)
 		KQ_LOCK(kn->kn_kq);
 	kn->kn_status |= KN_DETACHED;
 	if (!kqislocked)
 		KQ_UNLOCK(kn->kn_kq);
 }
 
 /*
  * remove knote from the specified knlist
  */
 void
 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
 {
 
 	knlist_remove_kq(knl, kn, islocked, 0);
 }
 
 int
 knlist_empty(struct knlist *knl)
 {
 
 	KNL_ASSERT_LOCKED(knl);
 	return (SLIST_EMPTY(&knl->kl_list));
 }
 
 static struct mtx knlist_lock;
 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
     MTX_DEF);
 static void knlist_mtx_lock(void *arg);
 static void knlist_mtx_unlock(void *arg);
 
 static void
 knlist_mtx_lock(void *arg)
 {
 
 	mtx_lock((struct mtx *)arg);
 }
 
 static void
 knlist_mtx_unlock(void *arg)
 {
 
 	mtx_unlock((struct mtx *)arg);
 }
 
 static void
 knlist_mtx_assert_lock(void *arg, int what)
 {
 
 	if (what == LA_LOCKED)
 		mtx_assert((struct mtx *)arg, MA_OWNED);
 	else
 		mtx_assert((struct mtx *)arg, MA_NOTOWNED);
 }
 
 static void
 knlist_rw_rlock(void *arg)
 {
 
 	rw_rlock((struct rwlock *)arg);
 }
 
 static void
 knlist_rw_runlock(void *arg)
 {
 
 	rw_runlock((struct rwlock *)arg);
 }
 
 static void
 knlist_rw_assert_lock(void *arg, int what)
 {
 
 	if (what == LA_LOCKED)
 		rw_assert((struct rwlock *)arg, RA_LOCKED);
 	else
 		rw_assert((struct rwlock *)arg, RA_UNLOCKED);
 }
 
 void
 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
     void (*kl_unlock)(void *),
     void (*kl_assert_lock)(void *, int))
 {
 
 	if (lock == NULL)
 		knl->kl_lockarg = &knlist_lock;
 	else
 		knl->kl_lockarg = lock;
 
 	if (kl_lock == NULL)
 		knl->kl_lock = knlist_mtx_lock;
 	else
 		knl->kl_lock = kl_lock;
 	if (kl_unlock == NULL)
 		knl->kl_unlock = knlist_mtx_unlock;
 	else
 		knl->kl_unlock = kl_unlock;
 	if (kl_assert_lock == NULL)
 		knl->kl_assert_lock = knlist_mtx_assert_lock;
 	else
 		knl->kl_assert_lock = kl_assert_lock;
 
 	knl->kl_autodestroy = 0;
 	SLIST_INIT(&knl->kl_list);
 }
 
 void
 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
 {
 
 	knlist_init(knl, lock, NULL, NULL, NULL);
 }
 
 struct knlist *
 knlist_alloc(struct mtx *lock)
 {
 	struct knlist *knl;
 
 	knl = malloc(sizeof(struct knlist), M_KQUEUE, M_WAITOK);
 	knlist_init_mtx(knl, lock);
 	return (knl);
 }
 
 void
 knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
 {
 
 	knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
 	    knlist_rw_assert_lock);
 }
 
 void
 knlist_destroy(struct knlist *knl)
 {
 
 	KASSERT(KNLIST_EMPTY(knl),
 	    ("destroying knlist %p with knotes on it", knl));
 }
 
 void
 knlist_detach(struct knlist *knl)
 {
 
 	KNL_ASSERT_LOCKED(knl);
 	knl->kl_autodestroy = 1;
 	if (knlist_empty(knl)) {
 		knlist_destroy(knl);
 		free(knl, M_KQUEUE);
 	}
 }
 
 /*
  * Even if we are locked, we may need to drop the lock to allow any influx
  * knotes time to "settle".
  */
 void
 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
 {
 	struct knote *kn, *kn2;
 	struct kqueue *kq;
 
 	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
 	if (islocked)
 		KNL_ASSERT_LOCKED(knl);
 	else {
 		KNL_ASSERT_UNLOCKED(knl);
 again:		/* need to reacquire lock since we have dropped it */
 		knl->kl_lock(knl->kl_lockarg);
 	}
 
 	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		if (kn_in_flux(kn)) {
 			KQ_UNLOCK(kq);
 			continue;
 		}
 		knlist_remove_kq(knl, kn, 1, 1);
 		if (killkn) {
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			knote_drop_detached(kn, td);
 		} else {
 			/* Make sure cleared knotes disappear soon */
 			kn->kn_flags |= EV_EOF | EV_ONESHOT;
 			KQ_UNLOCK(kq);
 		}
 		kq = NULL;
 	}
 
 	if (!SLIST_EMPTY(&knl->kl_list)) {
 		/* there are still in flux knotes remaining */
 		kn = SLIST_FIRST(&knl->kl_list);
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
 		knl->kl_unlock(knl->kl_lockarg);
 		kq->kq_state |= KQ_FLUXWAIT;
 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
 		kq = NULL;
 		goto again;
 	}
 
 	if (islocked)
 		KNL_ASSERT_LOCKED(knl);
 	else {
 		knl->kl_unlock(knl->kl_lockarg);
 		KNL_ASSERT_UNLOCKED(knl);
 	}
 }
 
 /*
  * Remove all knotes referencing a specified fd must be called with FILEDESC
  * lock.  This prevents a race where a new fd comes along and occupies the
  * entry and we attach a knote to the fd.
  */
 void
 knote_fdclose(struct thread *td, int fd)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct kqueue *kq;
 	struct knote *kn;
 	int influx;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	/*
 	 * We shouldn't have to worry about new kevents appearing on fd
 	 * since filedesc is locked.
 	 */
 	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
 		KQ_LOCK(kq);
 
 again:
 		influx = 0;
 		while (kq->kq_knlistsize > fd &&
 		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
 			if (kn_in_flux(kn)) {
 				/* someone else might be waiting on our knote */
 				if (influx)
 					wakeup(kq);
 				kq->kq_state |= KQ_FLUXWAIT;
 				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
 				goto again;
 			}
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			influx = 1;
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 		}
 		KQ_UNLOCK_FLUX(kq);
 	}
 }
 
 static int
 knote_attach(struct knote *kn, struct kqueue *kq)
 {
 	struct klist *list;
 
 	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
 	KQ_OWNED(kq);
 
 	if ((kq->kq_state & KQ_CLOSING) != 0)
 		return (EBADF);
 	if (kn->kn_fop->f_isfd) {
 		if (kn->kn_id >= kq->kq_knlistsize)
 			return (ENOMEM);
 		list = &kq->kq_knlist[kn->kn_id];
 	} else {
 		if (kq->kq_knhash == NULL)
 			return (ENOMEM);
 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 	}
 	SLIST_INSERT_HEAD(list, kn, kn_link);
 	return (0);
 }
 
 static void
 knote_drop(struct knote *kn, struct thread *td)
 {
 
 	if ((kn->kn_status & KN_DETACHED) == 0)
 		kn->kn_fop->f_detach(kn);
 	knote_drop_detached(kn, td);
 }
 
 static void
 knote_drop_detached(struct knote *kn, struct thread *td)
 {
 	struct kqueue *kq;
 	struct klist *list;
 
 	kq = kn->kn_kq;
 
 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
 	    ("knote %p still attached", kn));
 	KQ_NOTOWNED(kq);
 
 	KQ_LOCK(kq);
 	KASSERT(kn->kn_influx == 1,
 	    ("knote_drop called on %p with influx %d", kn, kn->kn_influx));
 
 	if (kn->kn_fop->f_isfd)
 		list = &kq->kq_knlist[kn->kn_id];
 	else
 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 
 	if (!SLIST_EMPTY(list))
 		SLIST_REMOVE(list, kn, knote, kn_link);
 	if (kn->kn_status & KN_QUEUED)
 		knote_dequeue(kn);
 	KQ_UNLOCK_FLUX(kq);
 
 	if (kn->kn_fop->f_isfd) {
 		fdrop(kn->kn_fp, td);
 		kn->kn_fp = NULL;
 	}
 	kqueue_fo_release(kn->kn_kevent.filter);
 	kn->kn_fop = NULL;
 	knote_free(kn);
 }
 
 static void
 knote_enqueue(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_kq;
 
 	KQ_OWNED(kn->kn_kq);
 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
 
 	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 	kn->kn_status |= KN_QUEUED;
 	kq->kq_count++;
 	kqueue_wakeup(kq);
 }
 
 static void
 knote_dequeue(struct knote *kn)
 {
 	struct kqueue *kq = kn->kn_kq;
 
 	KQ_OWNED(kn->kn_kq);
 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
 
 	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 	kn->kn_status &= ~KN_QUEUED;
 	kq->kq_count--;
 }
 
 static void
 knote_init(void)
 {
 
 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
 
 static struct knote *
 knote_alloc(int mflag)
 {
 
 	return (uma_zalloc(knote_zone, mflag | M_ZERO));
 }
 
 static void
 knote_free(struct knote *kn)
 {
 
 	uma_zfree(knote_zone, kn);
 }
 
 /*
  * Register the kev w/ the kq specified by fd.
  */
 int 
 kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
 {
 	struct kqueue *kq;
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, fd, cap_rights_init_one(&rights, CAP_KQUEUE_CHANGE),
 	    &fp);
 	if (error != 0)
 		return (error);
 	if ((error = kqueue_acquire(fp, &kq)) != 0)
 		goto noacquire;
 
 	error = kqueue_register(kq, kev, td, mflag);
 	kqueue_release(kq, 0);
 
 noacquire:
 	fdrop(fp, td);
 	return (error);
 }
diff --git a/sys/kern/sys_eventfd.c b/sys/kern/sys_eventfd.c
index 91c045e85faf..8ee8421296e5 100644
--- a/sys/kern/sys_eventfd.c
+++ b/sys/kern/sys_eventfd.c
@@ -1,347 +1,346 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2014 Dmitry Chagin <dchagin@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/types.h>
 #include <sys/user.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/stat.h>
 #include <sys/errno.h>
 #include <sys/event.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/uio.h>
 #include <sys/selinfo.h>
 #include <sys/eventfd.h>
 
 #include <security/audit/audit.h>
 
 _Static_assert(EFD_CLOEXEC == O_CLOEXEC, "Mismatched EFD_CLOEXEC");
 _Static_assert(EFD_NONBLOCK == O_NONBLOCK, "Mismatched EFD_NONBLOCK");
 
 MALLOC_DEFINE(M_EVENTFD, "eventfd", "eventfd structures");
 
 static fo_rdwr_t	eventfd_read;
 static fo_rdwr_t	eventfd_write;
 static fo_ioctl_t	eventfd_ioctl;
 static fo_poll_t	eventfd_poll;
 static fo_kqfilter_t	eventfd_kqfilter;
 static fo_stat_t	eventfd_stat;
 static fo_close_t	eventfd_close;
 static fo_fill_kinfo_t	eventfd_fill_kinfo;
 
 static struct fileops eventfdops = {
 	.fo_read = eventfd_read,
 	.fo_write = eventfd_write,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = eventfd_ioctl,
 	.fo_poll = eventfd_poll,
 	.fo_kqfilter = eventfd_kqfilter,
 	.fo_stat = eventfd_stat,
 	.fo_close = eventfd_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = eventfd_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static void	filt_eventfddetach(struct knote *kn);
 static int	filt_eventfdread(struct knote *kn, long hint);
 static int	filt_eventfdwrite(struct knote *kn, long hint);
 
 static struct filterops eventfd_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_eventfddetach,
 	.f_event = filt_eventfdread
 };
 
 static struct filterops eventfd_wfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_eventfddetach,
 	.f_event = filt_eventfdwrite
 };
 
 struct eventfd {
 	eventfd_t	efd_count;
 	uint32_t	efd_flags;
 	struct selinfo	efd_sel;
 	struct mtx	efd_lock;
 };
 
 int
 eventfd_create_file(struct thread *td, struct file *fp, uint32_t initval,
     int flags)
 {
 	struct eventfd *efd;
 	int fflags;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_VALUE(initval);
 
 	efd = malloc(sizeof(*efd), M_EVENTFD, M_WAITOK | M_ZERO);
 	efd->efd_flags = flags;
 	efd->efd_count = initval;
 	mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF);
 	knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock);
 
 	fflags = FREAD | FWRITE;
 	if ((flags & EFD_NONBLOCK) != 0)
 		fflags |= FNONBLOCK;
 	finit(fp, fflags, DTYPE_EVENTFD, efd, &eventfdops);
 
 	return (0);
 }
 
 static int
 eventfd_close(struct file *fp, struct thread *td)
 {
 	struct eventfd *efd;
 
 	efd = fp->f_data;
 	seldrain(&efd->efd_sel);
 	knlist_destroy(&efd->efd_sel.si_note);
 	mtx_destroy(&efd->efd_lock);
 	free(efd, M_EVENTFD);
 	return (0);
 }
 
 static int
 eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct eventfd *efd;
 	eventfd_t count;
 	int error;
 
 	if (uio->uio_resid < sizeof(eventfd_t))
 		return (EINVAL);
 
 	error = 0;
 	efd = fp->f_data;
 	mtx_lock(&efd->efd_lock);
 	while (error == 0 && efd->efd_count == 0) {
 		if ((fp->f_flag & FNONBLOCK) != 0) {
 			mtx_unlock(&efd->efd_lock);
 			return (EAGAIN);
 		}
 		error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH,
 		    "efdrd", 0);
 	}
 	if (error == 0) {
 		MPASS(efd->efd_count > 0);
 		if ((efd->efd_flags & EFD_SEMAPHORE) != 0) {
 			count = 1;
 			--efd->efd_count;
 		} else {
 			count = efd->efd_count;
 			efd->efd_count = 0;
 		}
 		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
 		selwakeup(&efd->efd_sel);
 		wakeup(&efd->efd_count);
 		mtx_unlock(&efd->efd_lock);
 		error = uiomove(&count, sizeof(eventfd_t), uio);
 	} else
 		mtx_unlock(&efd->efd_lock);
 
 	return (error);
 }
 
 static int
 eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct eventfd *efd;
 	eventfd_t count;
 	int error;
 
 	if (uio->uio_resid < sizeof(eventfd_t))
 		return (EINVAL);
 
 	error = uiomove(&count, sizeof(eventfd_t), uio);
 	if (error != 0)
 		return (error);
 	if (count == UINT64_MAX)
 		return (EINVAL);
 
 	efd = fp->f_data;
 	mtx_lock(&efd->efd_lock);
 retry:
 	if (UINT64_MAX - efd->efd_count <= count) {
 		if ((fp->f_flag & FNONBLOCK) != 0) {
 			mtx_unlock(&efd->efd_lock);
 			/* Do not not return the number of bytes written */
 			uio->uio_resid += sizeof(eventfd_t);
 			return (EAGAIN);
 		}
 		error = mtx_sleep(&efd->efd_count, &efd->efd_lock,
 		    PCATCH, "efdwr", 0);
 		if (error == 0)
 			goto retry;
 	}
 	if (error == 0) {
 		MPASS(UINT64_MAX - efd->efd_count > count);
 		efd->efd_count += count;
 		KNOTE_LOCKED(&efd->efd_sel.si_note, 0);
 		selwakeup(&efd->efd_sel);
 		wakeup(&efd->efd_count);
 	}
 	mtx_unlock(&efd->efd_lock);
 
 	return (error);
 }
 
 static int
 eventfd_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct eventfd *efd;
 	int revents;
 
 	efd = fp->f_data;
 	revents = 0;
 	mtx_lock(&efd->efd_lock);
 	if ((events & (POLLIN | POLLRDNORM)) != 0 && efd->efd_count > 0)
 		revents |= events & (POLLIN | POLLRDNORM);
 	if ((events & (POLLOUT | POLLWRNORM)) != 0 && UINT64_MAX - 1 >
 	    efd->efd_count)
 		revents |= events & (POLLOUT | POLLWRNORM);
 	if (revents == 0)
 		selrecord(td, &efd->efd_sel);
 	mtx_unlock(&efd->efd_lock);
 
 	return (revents);
 }
 
 static int
 eventfd_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct eventfd *efd = fp->f_data;
 
 	mtx_lock(&efd->efd_lock);
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &eventfd_rfiltops;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &eventfd_wfiltops;
 		break;
 	default:
 		mtx_unlock(&efd->efd_lock);
 		return (EINVAL);
 	}
 
 	kn->kn_hook = efd;
 	knlist_add(&efd->efd_sel.si_note, kn, 1);
 	mtx_unlock(&efd->efd_lock);
 
 	return (0);
 }
 
 static void
 filt_eventfddetach(struct knote *kn)
 {
 	struct eventfd *efd = kn->kn_hook;
 
 	mtx_lock(&efd->efd_lock);
 	knlist_remove(&efd->efd_sel.si_note, kn, 1);
 	mtx_unlock(&efd->efd_lock);
 }
 
 static int
 filt_eventfdread(struct knote *kn, long hint)
 {
 	struct eventfd *efd = kn->kn_hook;
 	int ret;
 
 	mtx_assert(&efd->efd_lock, MA_OWNED);
 	kn->kn_data = (int64_t)efd->efd_count;
 	ret = efd->efd_count > 0;
 
 	return (ret);
 }
 
 static int
 filt_eventfdwrite(struct knote *kn, long hint)
 {
 	struct eventfd *efd = kn->kn_hook;
 	int ret;
 
 	mtx_assert(&efd->efd_lock, MA_OWNED);
 	kn->kn_data = (int64_t)(UINT64_MAX - 1 - efd->efd_count);
 	ret = UINT64_MAX - 1 > efd->efd_count;
 
 	return (ret);
 }
 
 static int
 eventfd_ioctl(struct file *fp, u_long cmd, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 	switch (cmd) {
 	case FIONBIO:
 	case FIOASYNC:
 		return (0);
 	}
 
 	return (ENOTTY);
 }
 
 static int
-eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
-    struct thread *td)
+eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
 {
 	bzero((void *)st, sizeof *st);
 	st->st_mode = S_IFIFO;
 	return (0);
 }
 
 static int
 eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct eventfd *efd = fp->f_data;
 
 	kif->kf_type = KF_TYPE_EVENTFD;
 	mtx_lock(&efd->efd_lock);
 	kif->kf_un.kf_eventfd.kf_eventfd_value = efd->efd_count;
 	kif->kf_un.kf_eventfd.kf_eventfd_flags = efd->efd_flags;
 	mtx_unlock(&efd->efd_lock);
 	return (0);
 }
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
index ee6ccbbad322..7bd7fea28e76 100644
--- a/sys/kern/sys_pipe.c
+++ b/sys/kern/sys_pipe.c
@@ -1,1841 +1,1840 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1996 John S. Dyson
  * Copyright (c) 2012 Giovanni Trematerra
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice immediately at the beginning of the file, without modification,
  *    this list of conditions, and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Absolutely no warranty of function or purpose is made by the author
  *    John S. Dyson.
  * 4. Modifications may be freely made to this file if the above conditions
  *    are met.
  */
 
 /*
  * This file contains a high-performance replacement for the socket-based
  * pipes scheme originally used in FreeBSD/4.4Lite.  It does not support
  * all features of sockets, but does do everything that pipes normally
  * do.
  */
 
 /*
  * This code has two modes of operation, a small write mode and a large
  * write mode.  The small write mode acts like conventional pipes with
  * a kernel buffer.  If the buffer is less than PIPE_MINDIRECT, then the
  * "normal" pipe buffering is done.  If the buffer is between PIPE_MINDIRECT
  * and PIPE_SIZE in size, the sending process pins the underlying pages in
  * memory, and the receiving process copies directly from these pinned pages
  * in the sending process.
  *
  * If the sending process receives a signal, it is possible that it will
  * go away, and certainly its address space can change, because control
  * is returned back to the user-mode side.  In that case, the pipe code
  * arranges to copy the buffer supplied by the user process, to a pageable
  * kernel buffer, and the receiving process will grab the data from the
  * pageable kernel buffer.  Since signals don't happen all that often,
  * the copy operation is normally eliminated.
  *
  * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
  * happen for small transfers so that the system will not spend all of
  * its time context switching.
  *
  * In order to limit the resource use of pipes, two sysctls exist:
  *
  * kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
  * address space available to us in pipe_map. This value is normally
  * autotuned, but may also be loader tuned.
  *
  * kern.ipc.pipekva - This read-only sysctl tracks the current amount of
  * memory in use by pipes.
  *
  * Based on how large pipekva is relative to maxpipekva, the following
  * will happen:
  *
  * 0% - 50%:
  *     New pipes are given 16K of memory backing, pipes may dynamically
  *     grow to as large as 64K where needed.
  * 50% - 75%:
  *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
  *     existing pipes may NOT grow.
  * 75% - 100%:
  *     New pipes are given 4K (or PAGE_SIZE) of memory backing,
  *     existing pipes will be shrunk down to 4K whenever possible.
  *
  * Resizing may be disabled by setting kern.ipc.piperesizeallowed=0.  If
  * that is set,  the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
  * resize which MUST occur for reverse-direction pipes when they are
  * first used.
  *
  * Additional information about the current state of pipes may be obtained
  * from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
  * and kern.ipc.piperesizefail.
  *
  * Locking rules:  There are two locks present here:  A mutex, used via
  * PIPE_LOCK, and a flag, used via pipelock().  All locking is done via
  * the flag, as mutexes can not persist over uiomove.  The mutex
  * exists only to guard access to the flag, and is not in itself a
  * locking mechanism.  Also note that there is only a single mutex for
  * both directions of a pipe.
  *
  * As pipelock() may have to sleep before it can acquire the flag, it
  * is important to reread all data after a call to pipelock(); everything
  * in the structure may have changed.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/ttycom.h>
 #include <sys/stat.h>
 #include <sys/malloc.h>
 #include <sys/poll.h>
 #include <sys/selinfo.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/pipe.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/uio.h>
 #include <sys/user.h>
 #include <sys/event.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 /*
  * Use this define if you want to disable *fancy* VM things.  Expect an
  * approx 30% decrease in transfer rate.  This could be useful for
  * NetBSD or OpenBSD.
  */
 /* #define PIPE_NODIRECT */
 
 #define PIPE_PEER(pipe)	\
 	(((pipe)->pipe_type & PIPE_TYPE_NAMED) ? (pipe) : ((pipe)->pipe_peer))
 
 /*
  * interfaces to the outside world
  */
 static fo_rdwr_t	pipe_read;
 static fo_rdwr_t	pipe_write;
 static fo_truncate_t	pipe_truncate;
 static fo_ioctl_t	pipe_ioctl;
 static fo_poll_t	pipe_poll;
 static fo_kqfilter_t	pipe_kqfilter;
 static fo_stat_t	pipe_stat;
 static fo_close_t	pipe_close;
 static fo_chmod_t	pipe_chmod;
 static fo_chown_t	pipe_chown;
 static fo_fill_kinfo_t	pipe_fill_kinfo;
 
 struct fileops pipeops = {
 	.fo_read = pipe_read,
 	.fo_write = pipe_write,
 	.fo_truncate = pipe_truncate,
 	.fo_ioctl = pipe_ioctl,
 	.fo_poll = pipe_poll,
 	.fo_kqfilter = pipe_kqfilter,
 	.fo_stat = pipe_stat,
 	.fo_close = pipe_close,
 	.fo_chmod = pipe_chmod,
 	.fo_chown = pipe_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = pipe_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static void	filt_pipedetach(struct knote *kn);
 static void	filt_pipedetach_notsup(struct knote *kn);
 static int	filt_pipenotsup(struct knote *kn, long hint);
 static int	filt_piperead(struct knote *kn, long hint);
 static int	filt_pipewrite(struct knote *kn, long hint);
 
 static struct filterops pipe_nfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_pipedetach_notsup,
 	.f_event = filt_pipenotsup
 };
 static struct filterops pipe_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_pipedetach,
 	.f_event = filt_piperead
 };
 static struct filterops pipe_wfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_pipedetach,
 	.f_event = filt_pipewrite
 };
 
 /*
  * Default pipe buffer size(s), this can be kind-of large now because pipe
  * space is pageable.  The pipe code will try to maintain locality of
  * reference for performance reasons, so small amounts of outstanding I/O
  * will not wipe the cache.
  */
 #define MINPIPESIZE (PIPE_SIZE/3)
 #define MAXPIPESIZE (2*PIPE_SIZE/3)
 
 static long amountpipekva;
 static int pipefragretry;
 static int pipeallocfail;
 static int piperesizefail;
 static int piperesizeallowed = 1;
 static long pipe_mindirect = PIPE_MINDIRECT;
 
 SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
 	   &maxpipekva, 0, "Pipe KVA limit");
 SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
 	   &amountpipekva, 0, "Pipe KVA usage");
 SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
 	  &pipefragretry, 0, "Pipe allocation retries due to fragmentation");
 SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
 	  &pipeallocfail, 0, "Pipe allocation failures");
 SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
 	  &piperesizefail, 0, "Pipe resize failures");
 SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
 	  &piperesizeallowed, 0, "Pipe resizing allowed");
 
 static void pipeinit(void *dummy __unused);
 static void pipeclose(struct pipe *cpipe);
 static void pipe_free_kmem(struct pipe *cpipe);
 static int pipe_create(struct pipe *pipe, bool backing);
 static int pipe_paircreate(struct thread *td, struct pipepair **p_pp);
 static __inline int pipelock(struct pipe *cpipe, int catch);
 static __inline void pipeunlock(struct pipe *cpipe);
 static void pipe_timestamp(struct timespec *tsp);
 #ifndef PIPE_NODIRECT
 static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
 static void pipe_destroy_write_buffer(struct pipe *wpipe);
 static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
 static void pipe_clone_write_buffer(struct pipe *wpipe);
 #endif
 static int pipespace(struct pipe *cpipe, int size);
 static int pipespace_new(struct pipe *cpipe, int size);
 
 static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
 static int	pipe_zone_init(void *mem, int size, int flags);
 static void	pipe_zone_fini(void *mem, int size);
 
 static uma_zone_t pipe_zone;
 static struct unrhdr64 pipeino_unr;
 static dev_t pipedev_ino;
 
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
 
 static void
 pipeinit(void *dummy __unused)
 {
 
 	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
 	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
 	    UMA_ALIGN_PTR, 0);
 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
 	new_unrhdr64(&pipeino_unr, 1);
 	pipedev_ino = devfs_alloc_cdp_inode();
 	KASSERT(pipedev_ino > 0, ("pipe dev inode not initialized"));
 }
 
 static int
 sysctl_handle_pipe_mindirect(SYSCTL_HANDLER_ARGS)
 {
 	int error = 0;
 	long tmp_pipe_mindirect = pipe_mindirect;
 
 	error = sysctl_handle_long(oidp, &tmp_pipe_mindirect, arg2, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/*
 	 * Don't allow pipe_mindirect to be set so low that we violate
 	 * atomicity requirements.
 	 */
 	if (tmp_pipe_mindirect <= PIPE_BUF)
 		return (EINVAL);
 	pipe_mindirect = tmp_pipe_mindirect;
 	return (0);
 }
 SYSCTL_OID(_kern_ipc, OID_AUTO, pipe_mindirect, CTLTYPE_LONG | CTLFLAG_RW,
     &pipe_mindirect, 0, sysctl_handle_pipe_mindirect, "L",
     "Minimum write size triggering VM optimization");
 
 static int
 pipe_zone_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct pipepair *pp;
 	struct pipe *rpipe, *wpipe;
 
 	KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
 
 	pp = (struct pipepair *)mem;
 
 	/*
 	 * We zero both pipe endpoints to make sure all the kmem pointers
 	 * are NULL, flag fields are zero'd, etc.  We timestamp both
 	 * endpoints with the same time.
 	 */
 	rpipe = &pp->pp_rpipe;
 	bzero(rpipe, sizeof(*rpipe));
 	pipe_timestamp(&rpipe->pipe_ctime);
 	rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
 
 	wpipe = &pp->pp_wpipe;
 	bzero(wpipe, sizeof(*wpipe));
 	wpipe->pipe_ctime = rpipe->pipe_ctime;
 	wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
 
 	rpipe->pipe_peer = wpipe;
 	rpipe->pipe_pair = pp;
 	wpipe->pipe_peer = rpipe;
 	wpipe->pipe_pair = pp;
 
 	/*
 	 * Mark both endpoints as present; they will later get free'd
 	 * one at a time.  When both are free'd, then the whole pair
 	 * is released.
 	 */
 	rpipe->pipe_present = PIPE_ACTIVE;
 	wpipe->pipe_present = PIPE_ACTIVE;
 
 	/*
 	 * Eventually, the MAC Framework may initialize the label
 	 * in ctor or init, but for now we do it elswhere to avoid
 	 * blocking in ctor or init.
 	 */
 	pp->pp_label = NULL;
 
 	return (0);
 }
 
 static int
 pipe_zone_init(void *mem, int size, int flags)
 {
 	struct pipepair *pp;
 
 	KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
 
 	pp = (struct pipepair *)mem;
 
 	mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_NEW);
 	return (0);
 }
 
 static void
 pipe_zone_fini(void *mem, int size)
 {
 	struct pipepair *pp;
 
 	KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
 
 	pp = (struct pipepair *)mem;
 
 	mtx_destroy(&pp->pp_mtx);
 }
 
 static int
 pipe_paircreate(struct thread *td, struct pipepair **p_pp)
 {
 	struct pipepair *pp;
 	struct pipe *rpipe, *wpipe;
 	int error;
 
 	*p_pp = pp = uma_zalloc(pipe_zone, M_WAITOK);
 #ifdef MAC
 	/*
 	 * The MAC label is shared between the connected endpoints.  As a
 	 * result mac_pipe_init() and mac_pipe_create() are called once
 	 * for the pair, and not on the endpoints.
 	 */
 	mac_pipe_init(pp);
 	mac_pipe_create(td->td_ucred, pp);
 #endif
 	rpipe = &pp->pp_rpipe;
 	wpipe = &pp->pp_wpipe;
 
 	knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
 	knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
 
 	/*
 	 * Only the forward direction pipe is backed by big buffer by
 	 * default.
 	 */
 	error = pipe_create(rpipe, true);
 	if (error != 0)
 		goto fail;
 	error = pipe_create(wpipe, false);
 	if (error != 0) {
 		/*
 		 * This cleanup leaves the pipe inode number for rpipe
 		 * still allocated, but never used.  We do not free
 		 * inode numbers for opened pipes, which is required
 		 * for correctness because numbers must be unique.
 		 * But also it avoids any memory use by the unr
 		 * allocator, so stashing away the transient inode
 		 * number is reasonable.
 		 */
 		pipe_free_kmem(rpipe);
 		goto fail;
 	}
 
 	rpipe->pipe_state |= PIPE_DIRECTOK;
 	wpipe->pipe_state |= PIPE_DIRECTOK;
 	return (0);
 
 fail:
 	knlist_destroy(&rpipe->pipe_sel.si_note);
 	knlist_destroy(&wpipe->pipe_sel.si_note);
 #ifdef MAC
 	mac_pipe_destroy(pp);
 #endif
 	uma_zfree(pipe_zone, pp);
 	return (error);
 }
 
 int
 pipe_named_ctor(struct pipe **ppipe, struct thread *td)
 {
 	struct pipepair *pp;
 	int error;
 
 	error = pipe_paircreate(td, &pp);
 	if (error != 0)
 		return (error);
 	pp->pp_rpipe.pipe_type |= PIPE_TYPE_NAMED;
 	*ppipe = &pp->pp_rpipe;
 	return (0);
 }
 
 void
 pipe_dtor(struct pipe *dpipe)
 {
 	struct pipe *peer;
 
 	peer = (dpipe->pipe_type & PIPE_TYPE_NAMED) != 0 ? dpipe->pipe_peer : NULL;
 	funsetown(&dpipe->pipe_sigio);
 	pipeclose(dpipe);
 	if (peer != NULL) {
 		funsetown(&peer->pipe_sigio);
 		pipeclose(peer);
 	}
 }
 
 /*
  * Get a timestamp.
  *
  * This used to be vfs_timestamp but the higher precision is unnecessary and
  * can very negatively affect performance in virtualized environments (e.g., on
  * vms running on amd64 when using the rdtscp instruction).
  */
 static void
 pipe_timestamp(struct timespec *tsp)
 {
 
 	getnanotime(tsp);
 }
 
 /*
  * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
  * the zone pick up the pieces via pipeclose().
  */
 int
 kern_pipe(struct thread *td, int fildes[2], int flags, struct filecaps *fcaps1,
     struct filecaps *fcaps2)
 {
 	struct file *rf, *wf;
 	struct pipe *rpipe, *wpipe;
 	struct pipepair *pp;
 	int fd, fflags, error;
 
 	error = pipe_paircreate(td, &pp);
 	if (error != 0)
 		return (error);
 	rpipe = &pp->pp_rpipe;
 	wpipe = &pp->pp_wpipe;
 	error = falloc_caps(td, &rf, &fd, flags, fcaps1);
 	if (error) {
 		pipeclose(rpipe);
 		pipeclose(wpipe);
 		return (error);
 	}
 	/* An extra reference on `rf' has been held for us by falloc_caps(). */
 	fildes[0] = fd;
 
 	fflags = FREAD | FWRITE;
 	if ((flags & O_NONBLOCK) != 0)
 		fflags |= FNONBLOCK;
 
 	/*
 	 * Warning: once we've gotten past allocation of the fd for the
 	 * read-side, we can only drop the read side via fdrop() in order
 	 * to avoid races against processes which manage to dup() the read
 	 * side while we are blocked trying to allocate the write side.
 	 */
 	finit(rf, fflags, DTYPE_PIPE, rpipe, &pipeops);
 	error = falloc_caps(td, &wf, &fd, flags, fcaps2);
 	if (error) {
 		fdclose(td, rf, fildes[0]);
 		fdrop(rf, td);
 		/* rpipe has been closed by fdrop(). */
 		pipeclose(wpipe);
 		return (error);
 	}
 	/* An extra reference on `wf' has been held for us by falloc_caps(). */
 	finit(wf, fflags, DTYPE_PIPE, wpipe, &pipeops);
 	fdrop(wf, td);
 	fildes[1] = fd;
 	fdrop(rf, td);
 
 	return (0);
 }
 
 #ifdef COMPAT_FREEBSD10
 /* ARGSUSED */
 int
 freebsd10_pipe(struct thread *td, struct freebsd10_pipe_args *uap __unused)
 {
 	int error;
 	int fildes[2];
 
 	error = kern_pipe(td, fildes, 0, NULL, NULL);
 	if (error)
 		return (error);
 
 	td->td_retval[0] = fildes[0];
 	td->td_retval[1] = fildes[1];
 
 	return (0);
 }
 #endif
 
 int
 sys_pipe2(struct thread *td, struct pipe2_args *uap)
 {
 	int error, fildes[2];
 
 	if (uap->flags & ~(O_CLOEXEC | O_NONBLOCK))
 		return (EINVAL);
 	error = kern_pipe(td, fildes, uap->flags, NULL, NULL);
 	if (error)
 		return (error);
 	error = copyout(fildes, uap->fildes, 2 * sizeof(int));
 	if (error) {
 		(void)kern_close(td, fildes[0]);
 		(void)kern_close(td, fildes[1]);
 	}
 	return (error);
 }
 
 /*
  * Allocate kva for pipe circular buffer, the space is pageable
  * This routine will 'realloc' the size of a pipe safely, if it fails
  * it will retain the old buffer.
  * If it fails it will return ENOMEM.
  */
 static int
 pipespace_new(struct pipe *cpipe, int size)
 {
 	caddr_t buffer;
 	int error, cnt, firstseg;
 	static int curfail = 0;
 	static struct timeval lastfail;
 
 	KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
 	KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
 		("pipespace: resize of direct writes not allowed"));
 retry:
 	cnt = cpipe->pipe_buffer.cnt;
 	if (cnt > size)
 		size = cnt;
 
 	size = round_page(size);
 	buffer = (caddr_t) vm_map_min(pipe_map);
 
 	error = vm_map_find(pipe_map, NULL, 0, (vm_offset_t *)&buffer, size, 0,
 	    VMFS_ANY_SPACE, VM_PROT_RW, VM_PROT_RW, 0);
 	if (error != KERN_SUCCESS) {
 		if (cpipe->pipe_buffer.buffer == NULL &&
 		    size > SMALL_PIPE_SIZE) {
 			size = SMALL_PIPE_SIZE;
 			pipefragretry++;
 			goto retry;
 		}
 		if (cpipe->pipe_buffer.buffer == NULL) {
 			pipeallocfail++;
 			if (ppsratecheck(&lastfail, &curfail, 1))
 				printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
 		} else {
 			piperesizefail++;
 		}
 		return (ENOMEM);
 	}
 
 	/* copy data, then free old resources if we're resizing */
 	if (cnt > 0) {
 		if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
 			firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
 			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
 				buffer, firstseg);
 			if ((cnt - firstseg) > 0)
 				bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
 					cpipe->pipe_buffer.in);
 		} else {
 			bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
 				buffer, cnt);
 		}
 	}
 	pipe_free_kmem(cpipe);
 	cpipe->pipe_buffer.buffer = buffer;
 	cpipe->pipe_buffer.size = size;
 	cpipe->pipe_buffer.in = cnt;
 	cpipe->pipe_buffer.out = 0;
 	cpipe->pipe_buffer.cnt = cnt;
 	atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
 	return (0);
 }
 
 /*
  * Wrapper for pipespace_new() that performs locking assertions.
  */
 static int
 pipespace(struct pipe *cpipe, int size)
 {
 
 	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
 	    ("Unlocked pipe passed to pipespace"));
 	return (pipespace_new(cpipe, size));
 }
 
 /*
  * lock a pipe for I/O, blocking other access
  */
 static __inline int
 pipelock(struct pipe *cpipe, int catch)
 {
 	int error, prio;
 
 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
 
 	prio = PRIBIO;
 	if (catch)
 		prio |= PCATCH;
 	while (cpipe->pipe_state & PIPE_LOCKFL) {
 		KASSERT(cpipe->pipe_waiters >= 0,
 		    ("%s: bad waiter count %d", __func__,
 		    cpipe->pipe_waiters));
 		cpipe->pipe_waiters++;
 		error = msleep(cpipe, PIPE_MTX(cpipe),
 		    prio, "pipelk", 0);
 		cpipe->pipe_waiters--;
 		if (error != 0)
 			return (error);
 	}
 	cpipe->pipe_state |= PIPE_LOCKFL;
 	return (0);
 }
 
 /*
  * unlock a pipe I/O lock
  */
 static __inline void
 pipeunlock(struct pipe *cpipe)
 {
 
 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
 	KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
 		("Unlocked pipe passed to pipeunlock"));
 	KASSERT(cpipe->pipe_waiters >= 0,
 	    ("%s: bad waiter count %d", __func__,
 	    cpipe->pipe_waiters));
 	cpipe->pipe_state &= ~PIPE_LOCKFL;
 	if (cpipe->pipe_waiters > 0) {
 		wakeup_one(cpipe);
 	}
 }
 
 void
 pipeselwakeup(struct pipe *cpipe)
 {
 
 	PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
 	if (cpipe->pipe_state & PIPE_SEL) {
 		selwakeuppri(&cpipe->pipe_sel, PSOCK);
 		if (!SEL_WAITING(&cpipe->pipe_sel))
 			cpipe->pipe_state &= ~PIPE_SEL;
 	}
 	if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
 		pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
 	KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
 }
 
 /*
  * Initialize and allocate VM and memory for pipe.  The structure
  * will start out zero'd from the ctor, so we just manage the kmem.
  */
 static int
 pipe_create(struct pipe *pipe, bool large_backing)
 {
 	int error;
 
 	error = pipespace_new(pipe, !large_backing || amountpipekva >
 	    maxpipekva / 2 ? SMALL_PIPE_SIZE : PIPE_SIZE);
 	if (error == 0)
 		pipe->pipe_ino = alloc_unr64(&pipeino_unr);
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 pipe_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct pipe *rpipe;
 	int error;
 	int nread = 0;
 	int size;
 
 	rpipe = fp->f_data;
 	PIPE_LOCK(rpipe);
 	++rpipe->pipe_busy;
 	error = pipelock(rpipe, 1);
 	if (error)
 		goto unlocked_error;
 
 #ifdef MAC
 	error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
 	if (error)
 		goto locked_error;
 #endif
 	if (amountpipekva > (3 * maxpipekva) / 4) {
 		if ((rpipe->pipe_state & PIPE_DIRECTW) == 0 &&
 		    rpipe->pipe_buffer.size > SMALL_PIPE_SIZE &&
 		    rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE &&
 		    piperesizeallowed == 1) {
 			PIPE_UNLOCK(rpipe);
 			pipespace(rpipe, SMALL_PIPE_SIZE);
 			PIPE_LOCK(rpipe);
 		}
 	}
 
 	while (uio->uio_resid) {
 		/*
 		 * normal pipe buffer receive
 		 */
 		if (rpipe->pipe_buffer.cnt > 0) {
 			size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
 			if (size > rpipe->pipe_buffer.cnt)
 				size = rpipe->pipe_buffer.cnt;
 			if (size > uio->uio_resid)
 				size = uio->uio_resid;
 
 			PIPE_UNLOCK(rpipe);
 			error = uiomove(
 			    &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
 			    size, uio);
 			PIPE_LOCK(rpipe);
 			if (error)
 				break;
 
 			rpipe->pipe_buffer.out += size;
 			if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
 				rpipe->pipe_buffer.out = 0;
 
 			rpipe->pipe_buffer.cnt -= size;
 
 			/*
 			 * If there is no more to read in the pipe, reset
 			 * its pointers to the beginning.  This improves
 			 * cache hit stats.
 			 */
 			if (rpipe->pipe_buffer.cnt == 0) {
 				rpipe->pipe_buffer.in = 0;
 				rpipe->pipe_buffer.out = 0;
 			}
 			nread += size;
 #ifndef PIPE_NODIRECT
 		/*
 		 * Direct copy, bypassing a kernel buffer.
 		 */
 		} else if ((size = rpipe->pipe_pages.cnt) != 0) {
 			if (size > uio->uio_resid)
 				size = (u_int) uio->uio_resid;
 			PIPE_UNLOCK(rpipe);
 			error = uiomove_fromphys(rpipe->pipe_pages.ms,
 			    rpipe->pipe_pages.pos, size, uio);
 			PIPE_LOCK(rpipe);
 			if (error)
 				break;
 			nread += size;
 			rpipe->pipe_pages.pos += size;
 			rpipe->pipe_pages.cnt -= size;
 			if (rpipe->pipe_pages.cnt == 0) {
 				rpipe->pipe_state &= ~PIPE_WANTW;
 				wakeup(rpipe);
 			}
 #endif
 		} else {
 			/*
 			 * detect EOF condition
 			 * read returns 0 on EOF, no need to set error
 			 */
 			if (rpipe->pipe_state & PIPE_EOF)
 				break;
 
 			/*
 			 * If the "write-side" has been blocked, wake it up now.
 			 */
 			if (rpipe->pipe_state & PIPE_WANTW) {
 				rpipe->pipe_state &= ~PIPE_WANTW;
 				wakeup(rpipe);
 			}
 
 			/*
 			 * Break if some data was read.
 			 */
 			if (nread > 0)
 				break;
 
 			/*
 			 * Unlock the pipe buffer for our remaining processing.
 			 * We will either break out with an error or we will
 			 * sleep and relock to loop.
 			 */
 			pipeunlock(rpipe);
 
 			/*
 			 * Handle non-blocking mode operation or
 			 * wait for more data.
 			 */
 			if (fp->f_flag & FNONBLOCK) {
 				error = EAGAIN;
 			} else {
 				rpipe->pipe_state |= PIPE_WANTR;
 				if ((error = msleep(rpipe, PIPE_MTX(rpipe),
 				    PRIBIO | PCATCH,
 				    "piperd", 0)) == 0)
 					error = pipelock(rpipe, 1);
 			}
 			if (error)
 				goto unlocked_error;
 		}
 	}
 #ifdef MAC
 locked_error:
 #endif
 	pipeunlock(rpipe);
 
 	/* XXX: should probably do this before getting any locks. */
 	if (error == 0)
 		pipe_timestamp(&rpipe->pipe_atime);
 unlocked_error:
 	--rpipe->pipe_busy;
 
 	/*
 	 * PIPE_WANT processing only makes sense if pipe_busy is 0.
 	 */
 	if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
 		rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
 		wakeup(rpipe);
 	} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
 		/*
 		 * Handle write blocking hysteresis.
 		 */
 		if (rpipe->pipe_state & PIPE_WANTW) {
 			rpipe->pipe_state &= ~PIPE_WANTW;
 			wakeup(rpipe);
 		}
 	}
 
 	/*
 	 * Only wake up writers if there was actually something read.
 	 * Otherwise, when calling read(2) at EOF, a spurious wakeup occurs.
 	 */
 	if (nread > 0 &&
 	    rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt >= PIPE_BUF)
 		pipeselwakeup(rpipe);
 
 	PIPE_UNLOCK(rpipe);
 	if (nread > 0)
 		td->td_ru.ru_msgrcv++;
 	return (error);
 }
 
 #ifndef PIPE_NODIRECT
 /*
  * Map the sending processes' buffer into kernel space and wire it.
  * This is similar to a physical write operation.
  */
 static int
 pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio)
 {
 	u_int size;
 	int i;
 
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0,
 	    ("%s: PIPE_DIRECTW set on %p", __func__, wpipe));
 	KASSERT(wpipe->pipe_pages.cnt == 0,
 	    ("%s: pipe map for %p contains residual data", __func__, wpipe));
 
 	if (uio->uio_iov->iov_len > wpipe->pipe_buffer.size)
                 size = wpipe->pipe_buffer.size;
 	else
                 size = uio->uio_iov->iov_len;
 
 	wpipe->pipe_state |= PIPE_DIRECTW;
 	PIPE_UNLOCK(wpipe);
 	i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
 	    wpipe->pipe_pages.ms, PIPENPAGES);
 	PIPE_LOCK(wpipe);
 	if (i < 0) {
 		wpipe->pipe_state &= ~PIPE_DIRECTW;
 		return (EFAULT);
 	}
 
 	wpipe->pipe_pages.npages = i;
 	wpipe->pipe_pages.pos =
 	    ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
 	wpipe->pipe_pages.cnt = size;
 
 	uio->uio_iov->iov_len -= size;
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
 	if (uio->uio_iov->iov_len == 0)
 		uio->uio_iov++;
 	uio->uio_resid -= size;
 	uio->uio_offset += size;
 	return (0);
 }
 
 /*
  * Unwire the process buffer.
  */
 static void
 pipe_destroy_write_buffer(struct pipe *wpipe)
 {
 
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0,
 	    ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe));
 	KASSERT(wpipe->pipe_pages.cnt == 0,
 	    ("%s: pipe map for %p contains residual data", __func__, wpipe));
 
 	wpipe->pipe_state &= ~PIPE_DIRECTW;
 	vm_page_unhold_pages(wpipe->pipe_pages.ms, wpipe->pipe_pages.npages);
 	wpipe->pipe_pages.npages = 0;
 }
 
 /*
  * In the case of a signal, the writing process might go away.  This
  * code copies the data into the circular buffer so that the source
  * pages can be freed without loss of data.
  */
 static void
 pipe_clone_write_buffer(struct pipe *wpipe)
 {
 	struct uio uio;
 	struct iovec iov;
 	int size;
 	int pos;
 
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) != 0,
 	    ("%s: PIPE_DIRECTW not set on %p", __func__, wpipe));
 
 	size = wpipe->pipe_pages.cnt;
 	pos = wpipe->pipe_pages.pos;
 	wpipe->pipe_pages.cnt = 0;
 
 	wpipe->pipe_buffer.in = size;
 	wpipe->pipe_buffer.out = 0;
 	wpipe->pipe_buffer.cnt = size;
 
 	PIPE_UNLOCK(wpipe);
 	iov.iov_base = wpipe->pipe_buffer.buffer;
 	iov.iov_len = size;
 	uio.uio_iov = &iov;
 	uio.uio_iovcnt = 1;
 	uio.uio_offset = 0;
 	uio.uio_resid = size;
 	uio.uio_segflg = UIO_SYSSPACE;
 	uio.uio_rw = UIO_READ;
 	uio.uio_td = curthread;
 	uiomove_fromphys(wpipe->pipe_pages.ms, pos, size, &uio);
 	PIPE_LOCK(wpipe);
 	pipe_destroy_write_buffer(wpipe);
 }
 
 /*
  * This implements the pipe buffer write mechanism.  Note that only
  * a direct write OR a normal pipe write can be pending at any given time.
  * If there are any characters in the pipe buffer, the direct write will
  * be deferred until the receiving process grabs all of the bytes from
  * the pipe buffer.  Then the direct mapping write is set-up.
  */
 static int
 pipe_direct_write(struct pipe *wpipe, struct uio *uio)
 {
 	int error;
 
 retry:
 	PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 	if ((wpipe->pipe_state & PIPE_EOF) != 0) {
 		error = EPIPE;
 		goto error1;
 	}
 	if (wpipe->pipe_state & PIPE_DIRECTW) {
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 		pipeselwakeup(wpipe);
 		wpipe->pipe_state |= PIPE_WANTW;
 		pipeunlock(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe),
 		    PRIBIO | PCATCH, "pipdww", 0);
 		pipelock(wpipe, 0);
 		if (error != 0)
 			goto error1;
 		goto retry;
 	}
 	if (wpipe->pipe_buffer.cnt > 0) {
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 		pipeselwakeup(wpipe);
 		wpipe->pipe_state |= PIPE_WANTW;
 		pipeunlock(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe),
 		    PRIBIO | PCATCH, "pipdwc", 0);
 		pipelock(wpipe, 0);
 		if (error != 0)
 			goto error1;
 		goto retry;
 	}
 
 	error = pipe_build_write_buffer(wpipe, uio);
 	if (error) {
 		goto error1;
 	}
 
 	while (wpipe->pipe_pages.cnt != 0 &&
 	    (wpipe->pipe_state & PIPE_EOF) == 0) {
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 		pipeselwakeup(wpipe);
 		wpipe->pipe_state |= PIPE_WANTW;
 		pipeunlock(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
 		    "pipdwt", 0);
 		pipelock(wpipe, 0);
 		if (error != 0)
 			break;
 	}
 
 	if ((wpipe->pipe_state & PIPE_EOF) != 0) {
 		wpipe->pipe_pages.cnt = 0;
 		pipe_destroy_write_buffer(wpipe);
 		pipeselwakeup(wpipe);
 		error = EPIPE;
 	} else if (error == EINTR || error == ERESTART) {
 		pipe_clone_write_buffer(wpipe);
 	} else {
 		pipe_destroy_write_buffer(wpipe);
 	}
 	KASSERT((wpipe->pipe_state & PIPE_DIRECTW) == 0,
 	    ("pipe %p leaked PIPE_DIRECTW", wpipe));
 	return (error);
 
 error1:
 	wakeup(wpipe);
 	return (error);
 }
 #endif
 
 static int
 pipe_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct pipe *wpipe, *rpipe;
 	ssize_t orig_resid;
 	int desiredsize, error;
 
 	rpipe = fp->f_data;
 	wpipe = PIPE_PEER(rpipe);
 	PIPE_LOCK(rpipe);
 	error = pipelock(wpipe, 1);
 	if (error) {
 		PIPE_UNLOCK(rpipe);
 		return (error);
 	}
 	/*
 	 * detect loss of pipe read side, issue SIGPIPE if lost.
 	 */
 	if (wpipe->pipe_present != PIPE_ACTIVE ||
 	    (wpipe->pipe_state & PIPE_EOF)) {
 		pipeunlock(wpipe);
 		PIPE_UNLOCK(rpipe);
 		return (EPIPE);
 	}
 #ifdef MAC
 	error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
 	if (error) {
 		pipeunlock(wpipe);
 		PIPE_UNLOCK(rpipe);
 		return (error);
 	}
 #endif
 	++wpipe->pipe_busy;
 
 	/* Choose a larger size if it's advantageous */
 	desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
 	while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
 		if (piperesizeallowed != 1)
 			break;
 		if (amountpipekva > maxpipekva / 2)
 			break;
 		if (desiredsize == BIG_PIPE_SIZE)
 			break;
 		desiredsize = desiredsize * 2;
 	}
 
 	/* Choose a smaller size if we're in a OOM situation */
 	if (amountpipekva > (3 * maxpipekva) / 4 &&
 	    wpipe->pipe_buffer.size > SMALL_PIPE_SIZE &&
 	    wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE &&
 	    piperesizeallowed == 1)
 		desiredsize = SMALL_PIPE_SIZE;
 
 	/* Resize if the above determined that a new size was necessary */
 	if (desiredsize != wpipe->pipe_buffer.size &&
 	    (wpipe->pipe_state & PIPE_DIRECTW) == 0) {
 		PIPE_UNLOCK(wpipe);
 		pipespace(wpipe, desiredsize);
 		PIPE_LOCK(wpipe);
 	}
 	MPASS(wpipe->pipe_buffer.size != 0);
 
 	orig_resid = uio->uio_resid;
 
 	while (uio->uio_resid) {
 		int space;
 
 		if (wpipe->pipe_state & PIPE_EOF) {
 			error = EPIPE;
 			break;
 		}
 #ifndef PIPE_NODIRECT
 		/*
 		 * If the transfer is large, we can gain performance if
 		 * we do process-to-process copies directly.
 		 * If the write is non-blocking, we don't use the
 		 * direct write mechanism.
 		 *
 		 * The direct write mechanism will detect the reader going
 		 * away on us.
 		 */
 		if (uio->uio_segflg == UIO_USERSPACE &&
 		    uio->uio_iov->iov_len >= pipe_mindirect &&
 		    wpipe->pipe_buffer.size >= pipe_mindirect &&
 		    (fp->f_flag & FNONBLOCK) == 0) {
 			error = pipe_direct_write(wpipe, uio);
 			if (error != 0)
 				break;
 			continue;
 		}
 #endif
 
 		/*
 		 * Pipe buffered writes cannot be coincidental with
 		 * direct writes.  We wait until the currently executing
 		 * direct write is completed before we start filling the
 		 * pipe buffer.  We break out if a signal occurs or the
 		 * reader goes away.
 		 */
 		if (wpipe->pipe_pages.cnt != 0) {
 			if (wpipe->pipe_state & PIPE_WANTR) {
 				wpipe->pipe_state &= ~PIPE_WANTR;
 				wakeup(wpipe);
 			}
 			pipeselwakeup(wpipe);
 			wpipe->pipe_state |= PIPE_WANTW;
 			pipeunlock(wpipe);
 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
 			    "pipbww", 0);
 			pipelock(wpipe, 0);
 			if (error != 0)
 				break;
 			continue;
 		}
 
 		space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
 
 		/* Writes of size <= PIPE_BUF must be atomic. */
 		if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
 			space = 0;
 
 		if (space > 0) {
 			int size;	/* Transfer size */
 			int segsize;	/* first segment to transfer */
 
 			/*
 			 * Transfer size is minimum of uio transfer
 			 * and free space in pipe buffer.
 			 */
 			if (space > uio->uio_resid)
 				size = uio->uio_resid;
 			else
 				size = space;
 			/*
 			 * First segment to transfer is minimum of
 			 * transfer size and contiguous space in
 			 * pipe buffer.  If first segment to transfer
 			 * is less than the transfer size, we've got
 			 * a wraparound in the buffer.
 			 */
 			segsize = wpipe->pipe_buffer.size -
 				wpipe->pipe_buffer.in;
 			if (segsize > size)
 				segsize = size;
 
 			/* Transfer first segment */
 
 			PIPE_UNLOCK(rpipe);
 			error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
 					segsize, uio);
 			PIPE_LOCK(rpipe);
 
 			if (error == 0 && segsize < size) {
 				KASSERT(wpipe->pipe_buffer.in + segsize ==
 					wpipe->pipe_buffer.size,
 					("Pipe buffer wraparound disappeared"));
 				/*
 				 * Transfer remaining part now, to
 				 * support atomic writes.  Wraparound
 				 * happened.
 				 */
 
 				PIPE_UNLOCK(rpipe);
 				error = uiomove(
 				    &wpipe->pipe_buffer.buffer[0],
 				    size - segsize, uio);
 				PIPE_LOCK(rpipe);
 			}
 			if (error == 0) {
 				wpipe->pipe_buffer.in += size;
 				if (wpipe->pipe_buffer.in >=
 				    wpipe->pipe_buffer.size) {
 					KASSERT(wpipe->pipe_buffer.in ==
 						size - segsize +
 						wpipe->pipe_buffer.size,
 						("Expected wraparound bad"));
 					wpipe->pipe_buffer.in = size - segsize;
 				}
 
 				wpipe->pipe_buffer.cnt += size;
 				KASSERT(wpipe->pipe_buffer.cnt <=
 					wpipe->pipe_buffer.size,
 					("Pipe buffer overflow"));
 			}
 			if (error != 0)
 				break;
 			continue;
 		} else {
 			/*
 			 * If the "read-side" has been blocked, wake it up now.
 			 */
 			if (wpipe->pipe_state & PIPE_WANTR) {
 				wpipe->pipe_state &= ~PIPE_WANTR;
 				wakeup(wpipe);
 			}
 
 			/*
 			 * don't block on non-blocking I/O
 			 */
 			if (fp->f_flag & FNONBLOCK) {
 				error = EAGAIN;
 				break;
 			}
 
 			/*
 			 * We have no more space and have something to offer,
 			 * wake up select/poll.
 			 */
 			pipeselwakeup(wpipe);
 
 			wpipe->pipe_state |= PIPE_WANTW;
 			pipeunlock(wpipe);
 			error = msleep(wpipe, PIPE_MTX(rpipe),
 			    PRIBIO | PCATCH, "pipewr", 0);
 			pipelock(wpipe, 0);
 			if (error != 0)
 				break;
 			continue;
 		}
 	}
 
 	--wpipe->pipe_busy;
 
 	if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
 		wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
 		wakeup(wpipe);
 	} else if (wpipe->pipe_buffer.cnt > 0) {
 		/*
 		 * If we have put any characters in the buffer, we wake up
 		 * the reader.
 		 */
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
 			wakeup(wpipe);
 		}
 	}
 
 	/*
 	 * Don't return EPIPE if any byte was written.
 	 * EINTR and other interrupts are handled by generic I/O layer.
 	 * Do not pretend that I/O succeeded for obvious user error
 	 * like EFAULT.
 	 */
 	if (uio->uio_resid != orig_resid && error == EPIPE)
 		error = 0;
 
 	if (error == 0)
 		pipe_timestamp(&wpipe->pipe_mtime);
 
 	/*
 	 * We have something to offer,
 	 * wake up select/poll.
 	 */
 	if (wpipe->pipe_buffer.cnt)
 		pipeselwakeup(wpipe);
 
 	pipeunlock(wpipe);
 	PIPE_UNLOCK(rpipe);
 	if (uio->uio_resid != orig_resid)
 		td->td_ru.ru_msgsnd++;
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 pipe_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct pipe *cpipe;
 	int error;
 
 	cpipe = fp->f_data;
 	if (cpipe->pipe_type & PIPE_TYPE_NAMED)
 		error = vnops.fo_truncate(fp, length, active_cred, td);
 	else
 		error = invfo_truncate(fp, length, active_cred, td);
 	return (error);
 }
 
 /*
  * we implement a very minimal set of ioctls for compatibility with sockets.
  */
 static int
 pipe_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 	struct pipe *mpipe = fp->f_data;
 	int error;
 
 	PIPE_LOCK(mpipe);
 
 #ifdef MAC
 	error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
 	if (error) {
 		PIPE_UNLOCK(mpipe);
 		return (error);
 	}
 #endif
 
 	error = 0;
 	switch (cmd) {
 	case FIONBIO:
 		break;
 
 	case FIOASYNC:
 		if (*(int *)data) {
 			mpipe->pipe_state |= PIPE_ASYNC;
 		} else {
 			mpipe->pipe_state &= ~PIPE_ASYNC;
 		}
 		break;
 
 	case FIONREAD:
 		if (!(fp->f_flag & FREAD)) {
 			*(int *)data = 0;
 			PIPE_UNLOCK(mpipe);
 			return (0);
 		}
 		if (mpipe->pipe_pages.cnt != 0)
 			*(int *)data = mpipe->pipe_pages.cnt;
 		else
 			*(int *)data = mpipe->pipe_buffer.cnt;
 		break;
 
 	case FIOSETOWN:
 		PIPE_UNLOCK(mpipe);
 		error = fsetown(*(int *)data, &mpipe->pipe_sigio);
 		goto out_unlocked;
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&mpipe->pipe_sigio);
 		break;
 
 	/* This is deprecated, FIOSETOWN should be used instead. */
 	case TIOCSPGRP:
 		PIPE_UNLOCK(mpipe);
 		error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
 		goto out_unlocked;
 
 	/* This is deprecated, FIOGETOWN should be used instead. */
 	case TIOCGPGRP:
 		*(int *)data = -fgetown(&mpipe->pipe_sigio);
 		break;
 
 	default:
 		error = ENOTTY;
 		break;
 	}
 	PIPE_UNLOCK(mpipe);
 out_unlocked:
 	return (error);
 }
 
 static int
 pipe_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct pipe *rpipe;
 	struct pipe *wpipe;
 	int levents, revents;
 #ifdef MAC
 	int error;
 #endif
 
 	revents = 0;
 	rpipe = fp->f_data;
 	wpipe = PIPE_PEER(rpipe);
 	PIPE_LOCK(rpipe);
 #ifdef MAC
 	error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
 	if (error)
 		goto locked_error;
 #endif
 	if (fp->f_flag & FREAD && events & (POLLIN | POLLRDNORM))
 		if (rpipe->pipe_pages.cnt > 0 || rpipe->pipe_buffer.cnt > 0)
 			revents |= events & (POLLIN | POLLRDNORM);
 
 	if (fp->f_flag & FWRITE && events & (POLLOUT | POLLWRNORM))
 		if (wpipe->pipe_present != PIPE_ACTIVE ||
 		    (wpipe->pipe_state & PIPE_EOF) ||
 		    ((wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
 		     ((wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF ||
 			 wpipe->pipe_buffer.size == 0)))
 			revents |= events & (POLLOUT | POLLWRNORM);
 
 	levents = events &
 	    (POLLIN | POLLINIGNEOF | POLLPRI | POLLRDNORM | POLLRDBAND);
 	if (rpipe->pipe_type & PIPE_TYPE_NAMED && fp->f_flag & FREAD && levents &&
 	    fp->f_pipegen == rpipe->pipe_wgen)
 		events |= POLLINIGNEOF;
 
 	if ((events & POLLINIGNEOF) == 0) {
 		if (rpipe->pipe_state & PIPE_EOF) {
 			if (fp->f_flag & FREAD)
 				revents |= (events & (POLLIN | POLLRDNORM));
 			if (wpipe->pipe_present != PIPE_ACTIVE ||
 			    (wpipe->pipe_state & PIPE_EOF))
 				revents |= POLLHUP;
 		}
 	}
 
 	if (revents == 0) {
 		/*
 		 * Add ourselves regardless of eventmask as we have to return
 		 * POLLHUP even if it was not asked for.
 		 */
 		if ((fp->f_flag & FREAD) != 0) {
 			selrecord(td, &rpipe->pipe_sel);
 			if (SEL_WAITING(&rpipe->pipe_sel))
 				rpipe->pipe_state |= PIPE_SEL;
 		}
 
 		if ((fp->f_flag & FWRITE) != 0 &&
 		    wpipe->pipe_present == PIPE_ACTIVE) {
 			selrecord(td, &wpipe->pipe_sel);
 			if (SEL_WAITING(&wpipe->pipe_sel))
 				wpipe->pipe_state |= PIPE_SEL;
 		}
 	}
 #ifdef MAC
 locked_error:
 #endif
 	PIPE_UNLOCK(rpipe);
 
 	return (revents);
 }
 
 /*
  * We shouldn't need locks here as we're doing a read and this should
  * be a natural race.
  */
 static int
-pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
-    struct thread *td)
+pipe_stat(struct file *fp, struct stat *ub, struct ucred *active_cred)
 {
 	struct pipe *pipe;
 #ifdef MAC
 	int error;
 #endif
 
 	pipe = fp->f_data;
 #ifdef MAC
 	if (mac_pipe_check_stat_enabled()) {
 		PIPE_LOCK(pipe);
 		error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
 		PIPE_UNLOCK(pipe);
 		if (error) {
 			return (error);
 		}
 	}
 #endif
 
 	/* For named pipes ask the underlying filesystem. */
 	if (pipe->pipe_type & PIPE_TYPE_NAMED) {
-		return (vnops.fo_stat(fp, ub, active_cred, td));
+		return (vnops.fo_stat(fp, ub, active_cred));
 	}
 
 	bzero(ub, sizeof(*ub));
 	ub->st_mode = S_IFIFO;
 	ub->st_blksize = PAGE_SIZE;
 	if (pipe->pipe_pages.cnt != 0)
 		ub->st_size = pipe->pipe_pages.cnt;
 	else
 		ub->st_size = pipe->pipe_buffer.cnt;
 	ub->st_blocks = howmany(ub->st_size, ub->st_blksize);
 	ub->st_atim = pipe->pipe_atime;
 	ub->st_mtim = pipe->pipe_mtime;
 	ub->st_ctim = pipe->pipe_ctime;
 	ub->st_uid = fp->f_cred->cr_uid;
 	ub->st_gid = fp->f_cred->cr_gid;
 	ub->st_dev = pipedev_ino;
 	ub->st_ino = pipe->pipe_ino;
 	/*
 	 * Left as 0: st_nlink, st_rdev, st_flags, st_gen.
 	 */
 	return (0);
 }
 
 /* ARGSUSED */
 static int
 pipe_close(struct file *fp, struct thread *td)
 {
 
 	if (fp->f_vnode != NULL) 
 		return vnops.fo_close(fp, td);
 	fp->f_ops = &badfileops;
 	pipe_dtor(fp->f_data);
 	fp->f_data = NULL;
 	return (0);
 }
 
 static int
 pipe_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td)
 {
 	struct pipe *cpipe;
 	int error;
 
 	cpipe = fp->f_data;
 	if (cpipe->pipe_type & PIPE_TYPE_NAMED)
 		error = vn_chmod(fp, mode, active_cred, td);
 	else
 		error = invfo_chmod(fp, mode, active_cred, td);
 	return (error);
 }
 
 static int
 pipe_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct pipe *cpipe;
 	int error;
 
 	cpipe = fp->f_data;
 	if (cpipe->pipe_type & PIPE_TYPE_NAMED)
 		error = vn_chown(fp, uid, gid, active_cred, td);
 	else
 		error = invfo_chown(fp, uid, gid, active_cred, td);
 	return (error);
 }
 
 static int
 pipe_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct pipe *pi;
 
 	if (fp->f_type == DTYPE_FIFO)
 		return (vn_fill_kinfo(fp, kif, fdp));
 	kif->kf_type = KF_TYPE_PIPE;
 	pi = fp->f_data;
 	kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
 	kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
 	kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
 	return (0);
 }
 
 static void
 pipe_free_kmem(struct pipe *cpipe)
 {
 
 	KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
 	    ("pipe_free_kmem: pipe mutex locked"));
 
 	if (cpipe->pipe_buffer.buffer != NULL) {
 		atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
 		vm_map_remove(pipe_map,
 		    (vm_offset_t)cpipe->pipe_buffer.buffer,
 		    (vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
 		cpipe->pipe_buffer.buffer = NULL;
 	}
 #ifndef PIPE_NODIRECT
 	{
 		cpipe->pipe_pages.cnt = 0;
 		cpipe->pipe_pages.pos = 0;
 		cpipe->pipe_pages.npages = 0;
 	}
 #endif
 }
 
 /*
  * shutdown the pipe
  */
 static void
 pipeclose(struct pipe *cpipe)
 {
 	struct pipepair *pp;
 	struct pipe *ppipe;
 
 	KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
 
 	PIPE_LOCK(cpipe);
 	pipelock(cpipe, 0);
 	pp = cpipe->pipe_pair;
 
 	/*
 	 * If the other side is blocked, wake it up saying that
 	 * we want to close it down.
 	 */
 	cpipe->pipe_state |= PIPE_EOF;
 	while (cpipe->pipe_busy) {
 		wakeup(cpipe);
 		cpipe->pipe_state |= PIPE_WANT;
 		pipeunlock(cpipe);
 		msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
 		pipelock(cpipe, 0);
 	}
 
 	pipeselwakeup(cpipe);
 
 	/*
 	 * Disconnect from peer, if any.
 	 */
 	ppipe = cpipe->pipe_peer;
 	if (ppipe->pipe_present == PIPE_ACTIVE) {
 		ppipe->pipe_state |= PIPE_EOF;
 		wakeup(ppipe);
 		pipeselwakeup(ppipe);
 	}
 
 	/*
 	 * Mark this endpoint as free.  Release kmem resources.  We
 	 * don't mark this endpoint as unused until we've finished
 	 * doing that, or the pipe might disappear out from under
 	 * us.
 	 */
 	PIPE_UNLOCK(cpipe);
 	pipe_free_kmem(cpipe);
 	PIPE_LOCK(cpipe);
 	cpipe->pipe_present = PIPE_CLOSING;
 	pipeunlock(cpipe);
 
 	/*
 	 * knlist_clear() may sleep dropping the PIPE_MTX. Set the
 	 * PIPE_FINALIZED, that allows other end to free the
 	 * pipe_pair, only after the knotes are completely dismantled.
 	 */
 	knlist_clear(&cpipe->pipe_sel.si_note, 1);
 	cpipe->pipe_present = PIPE_FINALIZED;
 	seldrain(&cpipe->pipe_sel);
 	knlist_destroy(&cpipe->pipe_sel.si_note);
 
 	/*
 	 * If both endpoints are now closed, release the memory for the
 	 * pipe pair.  If not, unlock.
 	 */
 	if (ppipe->pipe_present == PIPE_FINALIZED) {
 		PIPE_UNLOCK(cpipe);
 #ifdef MAC
 		mac_pipe_destroy(pp);
 #endif
 		uma_zfree(pipe_zone, cpipe->pipe_pair);
 	} else
 		PIPE_UNLOCK(cpipe);
 }
 
 /*ARGSUSED*/
 static int
 pipe_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct pipe *cpipe;
 
 	/*
 	 * If a filter is requested that is not supported by this file
 	 * descriptor, don't return an error, but also don't ever generate an
 	 * event.
 	 */
 	if ((kn->kn_filter == EVFILT_READ) && !(fp->f_flag & FREAD)) {
 		kn->kn_fop = &pipe_nfiltops;
 		return (0);
 	}
 	if ((kn->kn_filter == EVFILT_WRITE) && !(fp->f_flag & FWRITE)) {
 		kn->kn_fop = &pipe_nfiltops;
 		return (0);
 	}
 	cpipe = fp->f_data;
 	PIPE_LOCK(cpipe);
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &pipe_rfiltops;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &pipe_wfiltops;
 		if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
 			/* other end of pipe has been closed */
 			PIPE_UNLOCK(cpipe);
 			return (EPIPE);
 		}
 		cpipe = PIPE_PEER(cpipe);
 		break;
 	default:
 		PIPE_UNLOCK(cpipe);
 		return (EINVAL);
 	}
 
 	kn->kn_hook = cpipe; 
 	knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
 	PIPE_UNLOCK(cpipe);
 	return (0);
 }
 
 static void
 filt_pipedetach(struct knote *kn)
 {
 	struct pipe *cpipe = kn->kn_hook;
 
 	PIPE_LOCK(cpipe);
 	knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
 	PIPE_UNLOCK(cpipe);
 }
 
 /*ARGSUSED*/
 static int
 filt_piperead(struct knote *kn, long hint)
 {
 	struct file *fp = kn->kn_fp;
 	struct pipe *rpipe = kn->kn_hook;
 
 	PIPE_LOCK_ASSERT(rpipe, MA_OWNED);
 	kn->kn_data = rpipe->pipe_buffer.cnt;
 	if (kn->kn_data == 0)
 		kn->kn_data = rpipe->pipe_pages.cnt;
 
 	if ((rpipe->pipe_state & PIPE_EOF) != 0 &&
 	    ((rpipe->pipe_type & PIPE_TYPE_NAMED) == 0 ||
 	    fp->f_pipegen != rpipe->pipe_wgen)) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	}
 	kn->kn_flags &= ~EV_EOF;
 	return (kn->kn_data > 0);
 }
 
 /*ARGSUSED*/
 static int
 filt_pipewrite(struct knote *kn, long hint)
 {
 	struct pipe *wpipe = kn->kn_hook;
 
 	/*
 	 * If this end of the pipe is closed, the knote was removed from the
 	 * knlist and the list lock (i.e., the pipe lock) is therefore not held.
 	 */
 	if (wpipe->pipe_present == PIPE_ACTIVE ||
 	    (wpipe->pipe_type & PIPE_TYPE_NAMED) != 0) {
 		PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
 
 		if (wpipe->pipe_state & PIPE_DIRECTW) {
 			kn->kn_data = 0;
 		} else if (wpipe->pipe_buffer.size > 0) {
 			kn->kn_data = wpipe->pipe_buffer.size -
 			    wpipe->pipe_buffer.cnt;
 		} else {
 			kn->kn_data = PIPE_BUF;
 		}
 	}
 
 	if (wpipe->pipe_present != PIPE_ACTIVE ||
 	    (wpipe->pipe_state & PIPE_EOF)) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	}
 	kn->kn_flags &= ~EV_EOF;
 	return (kn->kn_data >= PIPE_BUF);
 }
 
 static void
 filt_pipedetach_notsup(struct knote *kn)
 {
 
 }
 
 static int
 filt_pipenotsup(struct knote *kn, long hint)
 {
 
 	return (0);
 }
diff --git a/sys/kern/sys_procdesc.c b/sys/kern/sys_procdesc.c
index 95a389c582b0..ff1a1502aa71 100644
--- a/sys/kern/sys_procdesc.c
+++ b/sys/kern/sys_procdesc.c
@@ -1,560 +1,559 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009, 2016 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*-
  * FreeBSD process descriptor facility.
  *
  * Some processes are represented by a file descriptor, which will be used in
  * preference to signaling and pids for the purposes of process management,
  * and is, in effect, a form of capability.  When a process descriptor is
  * used with a process, it ceases to be visible to certain traditional UNIX
  * process facilities, such as waitpid(2).
  *
  * Some semantics:
  *
  * - At most one process descriptor will exist for any process, although
  *   references to that descriptor may be held from many processes (or even
  *   be in flight between processes over a local domain socket).
  * - Last close on the process descriptor will terminate the process using
  *   SIGKILL and reparent it to init so that there's a process to reap it
  *   when it's done exiting.
  * - If the process exits before the descriptor is closed, it will not
  *   generate SIGCHLD on termination, or be picked up by waitpid().
  * - The pdkill(2) system call may be used to deliver a signal to the process
  *   using its process descriptor.
  * - The pdwait4(2) system call may be used to block (or not) on a process
  *   descriptor to collect termination information.
  *
  * Open questions:
  *
  * - Will we want to add a pidtoprocdesc(2) system call to allow process
  *   descriptors to be created for processes without pdfork(2)?
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/procdesc.h>
 #include <sys/resourcevar.h>
 #include <sys/stat.h>
 #include <sys/sysproto.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/ucred.h>
 #include <sys/user.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 
 FEATURE(process_descriptors, "Process Descriptors");
 
 MALLOC_DEFINE(M_PROCDESC, "procdesc", "process descriptors");
 
 static fo_poll_t	procdesc_poll;
 static fo_kqfilter_t	procdesc_kqfilter;
 static fo_stat_t	procdesc_stat;
 static fo_close_t	procdesc_close;
 static fo_fill_kinfo_t	procdesc_fill_kinfo;
 
 static struct fileops procdesc_ops = {
 	.fo_read = invfo_rdwr,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = invfo_ioctl,
 	.fo_poll = procdesc_poll,
 	.fo_kqfilter = procdesc_kqfilter,
 	.fo_stat = procdesc_stat,
 	.fo_close = procdesc_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = procdesc_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE,
 };
 
 /*
  * Return a locked process given a process descriptor, or ESRCH if it has
  * died.
  */
 int
 procdesc_find(struct thread *td, int fd, cap_rights_t *rightsp,
     struct proc **p)
 {
 	struct procdesc *pd;
 	struct file *fp;
 	int error;
 
 	error = fget(td, fd, rightsp, &fp);
 	if (error)
 		return (error);
 	if (fp->f_type != DTYPE_PROCDESC) {
 		error = EBADF;
 		goto out;
 	}
 	pd = fp->f_data;
 	sx_slock(&proctree_lock);
 	if (pd->pd_proc != NULL) {
 		*p = pd->pd_proc;
 		PROC_LOCK(*p);
 	} else
 		error = ESRCH;
 	sx_sunlock(&proctree_lock);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Function to be used by procstat(1) sysctls when returning procdesc
  * information.
  */
 pid_t
 procdesc_pid(struct file *fp_procdesc)
 {
 	struct procdesc *pd;
 
 	KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC,
 	   ("procdesc_pid: !procdesc"));
 
 	pd = fp_procdesc->f_data;
 	return (pd->pd_pid);
 }
 
 /*
  * Retrieve the PID associated with a process descriptor.
  */
 int
 kern_pdgetpid(struct thread *td, int fd, cap_rights_t *rightsp, pid_t *pidp)
 {
 	struct file *fp;
 	int error;
 
 	error = fget(td, fd, rightsp, &fp);
 	if (error)
 		return (error);
 	if (fp->f_type != DTYPE_PROCDESC) {
 		error = EBADF;
 		goto out;
 	}
 	*pidp = procdesc_pid(fp);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * System call to return the pid of a process given its process descriptor.
  */
 int
 sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
 {
 	pid_t pid;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	error = kern_pdgetpid(td, uap->fd, &cap_pdgetpid_rights, &pid);
 	if (error == 0)
 		error = copyout(&pid, uap->pidp, sizeof(pid));
 	return (error);
 }
 
 /*
  * When a new process is forked by pdfork(), a file descriptor is allocated
  * by the fork code first, then the process is forked, and then we get a
  * chance to set up the process descriptor.  Failure is not permitted at this
  * point, so procdesc_new() must succeed.
  */
 void
 procdesc_new(struct proc *p, int flags)
 {
 	struct procdesc *pd;
 
 	pd = malloc(sizeof(*pd), M_PROCDESC, M_WAITOK | M_ZERO);
 	pd->pd_proc = p;
 	pd->pd_pid = p->p_pid;
 	p->p_procdesc = pd;
 	pd->pd_flags = 0;
 	if (flags & PD_DAEMON)
 		pd->pd_flags |= PDF_DAEMON;
 	PROCDESC_LOCK_INIT(pd);
 	knlist_init_mtx(&pd->pd_selinfo.si_note, &pd->pd_lock);
 
 	/*
 	 * Process descriptors start out with two references: one from their
 	 * struct file, and the other from their struct proc.
 	 */
 	refcount_init(&pd->pd_refcount, 2);
 }
 
 /*
  * Create a new process decriptor for the process that refers to it.
  */
 int
 procdesc_falloc(struct thread *td, struct file **resultfp, int *resultfd,
     int flags, struct filecaps *fcaps)
 {
 	int fflags;
 
 	fflags = 0;
 	if (flags & PD_CLOEXEC)
 		fflags = O_CLOEXEC;
 
 	return (falloc_caps(td, resultfp, resultfd, fflags, fcaps));
 }
 
 /*
  * Initialize a file with a process descriptor.
  */
 void
 procdesc_finit(struct procdesc *pdp, struct file *fp)
 {
 
 	finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops);
 }
 
 static void
 procdesc_free(struct procdesc *pd)
 {
 
 	/*
 	 * When the last reference is released, we assert that the descriptor
 	 * has been closed, but not that the process has exited, as we will
 	 * detach the descriptor before the process dies if the descript is
 	 * closed, as we can't wait synchronously.
 	 */
 	if (refcount_release(&pd->pd_refcount)) {
 		KASSERT(pd->pd_proc == NULL,
 		    ("procdesc_free: pd_proc != NULL"));
 		KASSERT((pd->pd_flags & PDF_CLOSED),
 		    ("procdesc_free: !PDF_CLOSED"));
 
 		knlist_destroy(&pd->pd_selinfo.si_note);
 		PROCDESC_LOCK_DESTROY(pd);
 		free(pd, M_PROCDESC);
 	}
 }
 
 /*
  * procdesc_exit() - notify a process descriptor that its process is exiting.
  * We use the proctree_lock to ensure that process exit either happens
  * strictly before or strictly after a concurrent call to procdesc_close().
  */
 int
 procdesc_exit(struct proc *p)
 {
 	struct procdesc *pd;
 
 	sx_assert(&proctree_lock, SA_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL"));
 
 	pd = p->p_procdesc;
 
 	PROCDESC_LOCK(pd);
 	KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == p->p_reaper,
 	    ("procdesc_exit: closed && parent not reaper"));
 
 	pd->pd_flags |= PDF_EXITED;
 	pd->pd_xstat = KW_EXITCODE(p->p_xexit, p->p_xsig);
 
 	/*
 	 * If the process descriptor has been closed, then we have nothing
 	 * to do; return 1 so that init will get SIGCHLD and do the reaping.
 	 * Clean up the procdesc now rather than letting it happen during
 	 * that reap.
 	 */
 	if (pd->pd_flags & PDF_CLOSED) {
 		PROCDESC_UNLOCK(pd);
 		pd->pd_proc = NULL;
 		p->p_procdesc = NULL;
 		procdesc_free(pd);
 		return (1);
 	}
 	if (pd->pd_flags & PDF_SELECTED) {
 		pd->pd_flags &= ~PDF_SELECTED;
 		selwakeup(&pd->pd_selinfo);
 	}
 	KNOTE_LOCKED(&pd->pd_selinfo.si_note, NOTE_EXIT);
 	PROCDESC_UNLOCK(pd);
 	return (0);
 }
 
 /*
  * When a process descriptor is reaped, perhaps as a result of close() or
  * pdwait4(), release the process's reference on the process descriptor.
  */
 void
 procdesc_reap(struct proc *p)
 {
 	struct procdesc *pd;
 
 	sx_assert(&proctree_lock, SA_XLOCKED);
 	KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL"));
 
 	pd = p->p_procdesc;
 	pd->pd_proc = NULL;
 	p->p_procdesc = NULL;
 	procdesc_free(pd);
 }
 
 /*
  * procdesc_close() - last close on a process descriptor.  If the process is
  * still running, terminate with SIGKILL (unless PDF_DAEMON is set) and let
  * its reaper clean up the mess; if not, we have to clean up the zombie
  * ourselves.
  */
 static int
 procdesc_close(struct file *fp, struct thread *td)
 {
 	struct procdesc *pd;
 	struct proc *p;
 
 	KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc"));
 
 	pd = fp->f_data;
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 
 	sx_xlock(&proctree_lock);
 	PROCDESC_LOCK(pd);
 	pd->pd_flags |= PDF_CLOSED;
 	PROCDESC_UNLOCK(pd);
 	p = pd->pd_proc;
 	if (p == NULL) {
 		/*
 		 * This is the case where process' exit status was already
 		 * collected and procdesc_reap() was already called.
 		 */
 		sx_xunlock(&proctree_lock);
 	} else {
 		PROC_LOCK(p);
 		AUDIT_ARG_PROCESS(p);
 		if (p->p_state == PRS_ZOMBIE) {
 			/*
 			 * If the process is already dead and just awaiting
 			 * reaping, do that now.  This will release the
 			 * process's reference to the process descriptor when it
 			 * calls back into procdesc_reap().
 			 */
 			proc_reap(curthread, p, NULL, 0);
 		} else {
 			/*
 			 * If the process is not yet dead, we need to kill it,
 			 * but we can't wait around synchronously for it to go
 			 * away, as that path leads to madness (and deadlocks).
 			 * First, detach the process from its descriptor so that
 			 * its exit status will be reported normally.
 			 */
 			pd->pd_proc = NULL;
 			p->p_procdesc = NULL;
 			procdesc_free(pd);
 
 			/*
 			 * Next, reparent it to its reaper (usually init(8)) so
 			 * that there's someone to pick up the pieces; finally,
 			 * terminate with prejudice.
 			 */
 			p->p_sigparent = SIGCHLD;
 			if ((p->p_flag & P_TRACED) == 0) {
 				proc_reparent(p, p->p_reaper, true);
 			} else {
 				proc_clear_orphan(p);
 				p->p_oppid = p->p_reaper->p_pid;
 				proc_add_orphan(p, p->p_reaper);
 			}
 			if ((pd->pd_flags & PDF_DAEMON) == 0)
 				kern_psignal(p, SIGKILL);
 			PROC_UNLOCK(p);
 			sx_xunlock(&proctree_lock);
 		}
 	}
 
 	/*
 	 * Release the file descriptor's reference on the process descriptor.
 	 */
 	procdesc_free(pd);
 	return (0);
 }
 
 static int
 procdesc_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct procdesc *pd;
 	int revents;
 
 	revents = 0;
 	pd = fp->f_data;
 	PROCDESC_LOCK(pd);
 	if (pd->pd_flags & PDF_EXITED)
 		revents |= POLLHUP;
 	if (revents == 0) {
 		selrecord(td, &pd->pd_selinfo);
 		pd->pd_flags |= PDF_SELECTED;
 	}
 	PROCDESC_UNLOCK(pd);
 	return (revents);
 }
 
 static void
 procdesc_kqops_detach(struct knote *kn)
 {
 	struct procdesc *pd;
 
 	pd = kn->kn_fp->f_data;
 	knlist_remove(&pd->pd_selinfo.si_note, kn, 0);
 }
 
 static int
 procdesc_kqops_event(struct knote *kn, long hint)
 {
 	struct procdesc *pd;
 	u_int event;
 
 	pd = kn->kn_fp->f_data;
 	if (hint == 0) {
 		/*
 		 * Initial test after registration. Generate a NOTE_EXIT in
 		 * case the process already terminated before registration.
 		 */
 		event = pd->pd_flags & PDF_EXITED ? NOTE_EXIT : 0;
 	} else {
 		/* Mask off extra data. */
 		event = (u_int)hint & NOTE_PCTRLMASK;
 	}
 
 	/* If the user is interested in this event, record it. */
 	if (kn->kn_sfflags & event)
 		kn->kn_fflags |= event;
 
 	/* Process is gone, so flag the event as finished. */
 	if (event == NOTE_EXIT) {
 		kn->kn_flags |= EV_EOF | EV_ONESHOT;
 		if (kn->kn_fflags & NOTE_EXIT)
 			kn->kn_data = pd->pd_xstat;
 		if (kn->kn_fflags == 0)
 			kn->kn_flags |= EV_DROP;
 		return (1);
 	}
 
 	return (kn->kn_fflags != 0);
 }
 
 static struct filterops procdesc_kqops = {
 	.f_isfd = 1,
 	.f_detach = procdesc_kqops_detach,
 	.f_event = procdesc_kqops_event,
 };
 
 static int
 procdesc_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct procdesc *pd;
 
 	pd = fp->f_data;
 	switch (kn->kn_filter) {
 	case EVFILT_PROCDESC:
 		kn->kn_fop = &procdesc_kqops;
 		kn->kn_flags |= EV_CLEAR;
 		knlist_add(&pd->pd_selinfo.si_note, kn, 0);
 		return (0);
 	default:
 		return (EINVAL);
 	}
 }
 
 static int
-procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
-    struct thread *td)
+procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
 	struct procdesc *pd;
 	struct timeval pstart, boottime;
 
 	/*
 	 * XXXRW: Perhaps we should cache some more information from the
 	 * process so that we can return it reliably here even after it has
 	 * died.  For example, caching its credential data.
 	 */
 	bzero(sb, sizeof(*sb));
 	pd = fp->f_data;
 	sx_slock(&proctree_lock);
 	if (pd->pd_proc != NULL) {
 		PROC_LOCK(pd->pd_proc);
 		AUDIT_ARG_PROCESS(pd->pd_proc);
 
 		/* Set birth and [acm] times to process start time. */
 		pstart = pd->pd_proc->p_stats->p_start;
 		getboottime(&boottime);
 		timevaladd(&pstart, &boottime);
 		TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim);
 		sb->st_atim = sb->st_birthtim;
 		sb->st_ctim = sb->st_birthtim;
 		sb->st_mtim = sb->st_birthtim;
 		if (pd->pd_proc->p_state != PRS_ZOMBIE)
 			sb->st_mode = S_IFREG | S_IRWXU;
 		else
 			sb->st_mode = S_IFREG;
 		sb->st_uid = pd->pd_proc->p_ucred->cr_ruid;
 		sb->st_gid = pd->pd_proc->p_ucred->cr_rgid;
 		PROC_UNLOCK(pd->pd_proc);
 	} else
 		sb->st_mode = S_IFREG;
 	sx_sunlock(&proctree_lock);
 	return (0);
 }
 
 static int
 procdesc_fill_kinfo(struct file *fp, struct kinfo_file *kif,
     struct filedesc *fdp)
 {
 	struct procdesc *pdp;
 
 	kif->kf_type = KF_TYPE_PROCDESC;
 	pdp = fp->f_data;
 	kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
 	return (0);
 }
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
index 8cf703ab8ebd..d4200e5618d2 100644
--- a/sys/kern/sys_socket.c
+++ b/sys/kern/sys_socket.c
@@ -1,845 +1,844 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)sys_socket.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/aio.h>
 #include <sys/domain.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sigio.h>
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/filio.h>			/* XXX */
 #include <sys/sockio.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <sys/ucred.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #include <sys/user.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 
 static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
     "socket AIO stats");
 
 static int empty_results;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results,
     0, "socket operation returned EAGAIN");
 
 static int empty_retries;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries,
     0, "socket operation retries");
 
 static fo_rdwr_t soo_read;
 static fo_rdwr_t soo_write;
 static fo_ioctl_t soo_ioctl;
 static fo_poll_t soo_poll;
 extern fo_kqfilter_t soo_kqfilter;
 static fo_stat_t soo_stat;
 static fo_close_t soo_close;
 static fo_fill_kinfo_t soo_fill_kinfo;
 static fo_aio_queue_t soo_aio_queue;
 
 static void	soo_aio_cancel(struct kaiocb *job);
 
 struct fileops	socketops = {
 	.fo_read = soo_read,
 	.fo_write = soo_write,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = soo_ioctl,
 	.fo_poll = soo_poll,
 	.fo_kqfilter = soo_kqfilter,
 	.fo_stat = soo_stat,
 	.fo_close = soo_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = soo_fill_kinfo,
 	.fo_aio_queue = soo_aio_queue,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static int
 soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error;
 
 #ifdef MAC
 	error = mac_socket_check_receive(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	error = soreceive(so, 0, uio, 0, 0, 0);
 	return (error);
 }
 
 static int
 soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error;
 
 #ifdef MAC
 	error = mac_socket_check_send(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td);
 	if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
 		PROC_LOCK(uio->uio_td->td_proc);
 		tdsignal(uio->uio_td, SIGPIPE);
 		PROC_UNLOCK(uio->uio_td->td_proc);
 	}
 	return (error);
 }
 
 static int
 soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error = 0;
 
 	switch (cmd) {
 	case FIONBIO:
 		SOCK_LOCK(so);
 		if (*(int *)data)
 			so->so_state |= SS_NBIO;
 		else
 			so->so_state &= ~SS_NBIO;
 		SOCK_UNLOCK(so);
 		break;
 
 	case FIOASYNC:
 		if (*(int *)data) {
 			SOCK_LOCK(so);
 			so->so_state |= SS_ASYNC;
 			if (SOLISTENING(so)) {
 				so->sol_sbrcv_flags |= SB_ASYNC;
 				so->sol_sbsnd_flags |= SB_ASYNC;
 			} else {
 				SOCKBUF_LOCK(&so->so_rcv);
 				so->so_rcv.sb_flags |= SB_ASYNC;
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				SOCKBUF_LOCK(&so->so_snd);
 				so->so_snd.sb_flags |= SB_ASYNC;
 				SOCKBUF_UNLOCK(&so->so_snd);
 			}
 			SOCK_UNLOCK(so);
 		} else {
 			SOCK_LOCK(so);
 			so->so_state &= ~SS_ASYNC;
 			if (SOLISTENING(so)) {
 				so->sol_sbrcv_flags &= ~SB_ASYNC;
 				so->sol_sbsnd_flags &= ~SB_ASYNC;
 			} else {
 				SOCKBUF_LOCK(&so->so_rcv);
 				so->so_rcv.sb_flags &= ~SB_ASYNC;
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				SOCKBUF_LOCK(&so->so_snd);
 				so->so_snd.sb_flags &= ~SB_ASYNC;
 				SOCKBUF_UNLOCK(&so->so_snd);
 			}
 			SOCK_UNLOCK(so);
 		}
 		break;
 
 	case FIONREAD:
 		SOCK_RECVBUF_LOCK(so);
 		if (SOLISTENING(so)) {
 			error = EINVAL;
 		} else {
 			*(int *)data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
 		}
 		SOCK_RECVBUF_UNLOCK(so);
 		break;
 
 	case FIONWRITE:
 		/* Unlocked read. */
 		if (SOLISTENING(so)) {
 			error = EINVAL;
 		} else {
 			*(int *)data = sbavail(&so->so_snd);
 		}
 		break;
 
 	case FIONSPACE:
 		/* Unlocked read. */
 		if (SOLISTENING(so)) {
 			error = EINVAL;
 		} else {
 			if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) ||
 			    (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) {
 				*(int *)data = 0;
 			} else {
 				*(int *)data = sbspace(&so->so_snd);
 			}
 		}
 		break;
 
 	case FIOSETOWN:
 		error = fsetown(*(int *)data, &so->so_sigio);
 		break;
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&so->so_sigio);
 		break;
 
 	case SIOCSPGRP:
 		error = fsetown(-(*(int *)data), &so->so_sigio);
 		break;
 
 	case SIOCGPGRP:
 		*(int *)data = -fgetown(&so->so_sigio);
 		break;
 
 	case SIOCATMARK:
 		/* Unlocked read. */
 		if (SOLISTENING(so)) {
 			error = EINVAL;
 		} else {
 			*(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0;
 		}
 		break;
 	default:
 		/*
 		 * Interface/routing/protocol specific ioctls: interface and
 		 * routing ioctls should have a different entry since a
 		 * socket is unnecessary.
 		 */
 		if (IOCGROUP(cmd) == 'i')
 			error = ifioctl(so, cmd, data, td);
 		else if (IOCGROUP(cmd) == 'r') {
 			CURVNET_SET(so->so_vnet);
 			error = rtioctl_fib(cmd, data, so->so_fibnum);
 			CURVNET_RESTORE();
 		} else {
 			CURVNET_SET(so->so_vnet);
 			error = ((*so->so_proto->pr_usrreqs->pru_control)
 			    (so, cmd, data, 0, td));
 			CURVNET_RESTORE();
 		}
 		break;
 	}
 	return (error);
 }
 
 static int
 soo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct socket *so = fp->f_data;
 #ifdef MAC
 	int error;
 
 	error = mac_socket_check_poll(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	return (sopoll(so, events, fp->f_cred, td));
 }
 
 static int
-soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
-    struct thread *td)
+soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred)
 {
 	struct socket *so = fp->f_data;
 	int error;
 
 	bzero((caddr_t)ub, sizeof (*ub));
 	ub->st_mode = S_IFSOCK;
 #ifdef MAC
 	error = mac_socket_check_stat(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	SOCK_LOCK(so);
 	if (!SOLISTENING(so)) {
 		struct sockbuf *sb;
 
 		/*
 		 * If SBS_CANTRCVMORE is set, but there's still data left
 		 * in the receive buffer, the socket is still readable.
 		 */
 		sb = &so->so_rcv;
 		SOCKBUF_LOCK(sb);
 		if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb))
 			ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
 		ub->st_size = sbavail(sb) - sb->sb_ctl;
 		SOCKBUF_UNLOCK(sb);
 
 		sb = &so->so_snd;
 		SOCKBUF_LOCK(sb);
 		if ((sb->sb_state & SBS_CANTSENDMORE) == 0)
 			ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
 		SOCKBUF_UNLOCK(sb);
 	}
 	ub->st_uid = so->so_cred->cr_uid;
 	ub->st_gid = so->so_cred->cr_gid;
 	error = so->so_proto->pr_usrreqs->pru_sense(so, ub);
 	SOCK_UNLOCK(so);
 	return (error);
 }
 
 /*
  * API socket close on file pointer.  We call soclose() to close the socket
  * (including initiating closing protocols).  soclose() will sorele() the
  * file reference but the actual socket will not go away until the socket's
  * ref count hits 0.
  */
 static int
 soo_close(struct file *fp, struct thread *td)
 {
 	int error = 0;
 	struct socket *so;
 
 	so = fp->f_data;
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 
 	if (so)
 		error = soclose(so);
 	return (error);
 }
 
 static int
 soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct sockaddr *sa;
 	struct inpcb *inpcb;
 	struct unpcb *unpcb;
 	struct socket *so;
 	int error;
 
 	kif->kf_type = KF_TYPE_SOCKET;
 	so = fp->f_data;
 	CURVNET_SET(so->so_vnet);
 	kif->kf_un.kf_sock.kf_sock_domain0 =
 	    so->so_proto->pr_domain->dom_family;
 	kif->kf_un.kf_sock.kf_sock_type0 = so->so_type;
 	kif->kf_un.kf_sock.kf_sock_protocol0 = so->so_proto->pr_protocol;
 	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
 	switch (kif->kf_un.kf_sock.kf_sock_domain0) {
 	case AF_INET:
 	case AF_INET6:
 		if (kif->kf_un.kf_sock.kf_sock_protocol0 == IPPROTO_TCP) {
 			if (so->so_pcb != NULL) {
 				inpcb = (struct inpcb *)(so->so_pcb);
 				kif->kf_un.kf_sock.kf_sock_inpcb =
 				    (uintptr_t)inpcb->inp_ppcb;
 				kif->kf_un.kf_sock.kf_sock_sendq =
 				    sbused(&so->so_snd);
 				kif->kf_un.kf_sock.kf_sock_recvq =
 				    sbused(&so->so_rcv);
 			}
 		}
 		break;
 	case AF_UNIX:
 		if (so->so_pcb != NULL) {
 			unpcb = (struct unpcb *)(so->so_pcb);
 			if (unpcb->unp_conn) {
 				kif->kf_un.kf_sock.kf_sock_unpconn =
 				    (uintptr_t)unpcb->unp_conn;
 				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
 				    so->so_rcv.sb_state;
 				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
 				    so->so_snd.sb_state;
 				kif->kf_un.kf_sock.kf_sock_sendq =
 				    sbused(&so->so_snd);
 				kif->kf_un.kf_sock.kf_sock_recvq =
 				    sbused(&so->so_rcv);
 			}
 		}
 		break;
 	}
 	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
 	if (error == 0 &&
 	    sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) {
 		bcopy(sa, &kif->kf_un.kf_sock.kf_sa_local, sa->sa_len);
 		free(sa, M_SONAME);
 	}
 	error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
 	if (error == 0 &&
 	    sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) {
 		bcopy(sa, &kif->kf_un.kf_sock.kf_sa_peer, sa->sa_len);
 		free(sa, M_SONAME);
 	}
 	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
 	    sizeof(kif->kf_path));
 	CURVNET_RESTORE();
 	return (0);	
 }
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * completed by the AIO job so far.
  */
 #define	aio_done	backend3
 
 static STAILQ_HEAD(, task) soaio_jobs;
 static struct mtx soaio_jobs_lock;
 static struct task soaio_kproc_task;
 static int soaio_starting, soaio_idle, soaio_queued;
 static struct unrhdr *soaio_kproc_unr;
 
 static int soaio_max_procs = MAX_AIO_PROCS;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0,
     "Maximum number of kernel processes to use for async socket IO");
 
 static int soaio_num_procs;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0,
     "Number of active kernel processes for async socket IO");
 
 static int soaio_target_procs = TARGET_AIO_PROCS;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD,
     &soaio_target_procs, 0,
     "Preferred number of ready kernel processes for async socket IO");
 
 static int soaio_lifetime;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0,
     "Maximum lifetime for idle aiod");
 
 static void
 soaio_kproc_loop(void *arg)
 {
 	struct proc *p;
 	struct vmspace *myvm;
 	struct task *task;
 	int error, id, pending;
 
 	id = (intptr_t)arg;
 
 	/*
 	 * Grab an extra reference on the daemon's vmspace so that it
 	 * doesn't get freed by jobs that switch to a different
 	 * vmspace.
 	 */
 	p = curproc;
 	myvm = vmspace_acquire_ref(p);
 
 	mtx_lock(&soaio_jobs_lock);
 	MPASS(soaio_starting > 0);
 	soaio_starting--;
 	for (;;) {
 		while (!STAILQ_EMPTY(&soaio_jobs)) {
 			task = STAILQ_FIRST(&soaio_jobs);
 			STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link);
 			soaio_queued--;
 			pending = task->ta_pending;
 			task->ta_pending = 0;
 			mtx_unlock(&soaio_jobs_lock);
 
 			task->ta_func(task->ta_context, pending);
 
 			mtx_lock(&soaio_jobs_lock);
 		}
 		MPASS(soaio_queued == 0);
 
 		if (p->p_vmspace != myvm) {
 			mtx_unlock(&soaio_jobs_lock);
 			vmspace_switch_aio(myvm);
 			mtx_lock(&soaio_jobs_lock);
 			continue;
 		}
 
 		soaio_idle++;
 		error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-",
 		    soaio_lifetime);
 		soaio_idle--;
 		if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) &&
 		    soaio_num_procs > soaio_target_procs)
 			break;
 	}
 	soaio_num_procs--;
 	mtx_unlock(&soaio_jobs_lock);
 	free_unr(soaio_kproc_unr, id);
 	kproc_exit(0);
 }
 
 static void
 soaio_kproc_create(void *context, int pending)
 {
 	struct proc *p;
 	int error, id;
 
 	mtx_lock(&soaio_jobs_lock);
 	for (;;) {
 		if (soaio_num_procs < soaio_target_procs) {
 			/* Must create */
 		} else if (soaio_num_procs >= soaio_max_procs) {
 			/*
 			 * Hit the limit on kernel processes, don't
 			 * create another one.
 			 */
 			break;
 		} else if (soaio_queued <= soaio_idle + soaio_starting) {
 			/*
 			 * No more AIO jobs waiting for a process to be
 			 * created, so stop.
 			 */
 			break;
 		}
 		soaio_starting++;
 		mtx_unlock(&soaio_jobs_lock);
 
 		id = alloc_unr(soaio_kproc_unr);
 		error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id,
 		    &p, 0, 0, "soaiod%d", id);
 		if (error != 0) {
 			free_unr(soaio_kproc_unr, id);
 			mtx_lock(&soaio_jobs_lock);
 			soaio_starting--;
 			break;
 		}
 
 		mtx_lock(&soaio_jobs_lock);
 		soaio_num_procs++;
 	}
 	mtx_unlock(&soaio_jobs_lock);
 }
 
 void
 soaio_enqueue(struct task *task)
 {
 
 	mtx_lock(&soaio_jobs_lock);
 	MPASS(task->ta_pending == 0);
 	task->ta_pending++;
 	STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link);
 	soaio_queued++;
 	if (soaio_queued <= soaio_idle)
 		wakeup_one(&soaio_idle);
 	else if (soaio_num_procs < soaio_max_procs)
 		taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task);
 	mtx_unlock(&soaio_jobs_lock);
 }
 
 static void
 soaio_init(void)
 {
 
 	soaio_lifetime = AIOD_LIFETIME_DEFAULT;
 	STAILQ_INIT(&soaio_jobs);
 	mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF);
 	soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL);
 	TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL);
 	if (soaio_target_procs > 0)
 		taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task);
 }
 SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL);
 
 static __inline int
 soaio_ready(struct socket *so, struct sockbuf *sb)
 {
 	return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so));
 }
 
 static void
 soaio_process_job(struct socket *so, struct sockbuf *sb, struct kaiocb *job)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
 	struct file *fp;
 	size_t cnt, done, job_total_nbytes;
 	long ru_before;
 	int error, flags;
 
 	SOCKBUF_UNLOCK(sb);
 	aio_switch_vmspace(job);
 	td = curthread;
 	fp = job->fd_file;
 retry:
 	td_savedcred = td->td_ucred;
 	td->td_ucred = job->cred;
 
 	job_total_nbytes = job->uiop->uio_resid + job->aio_done;
 	done = job->aio_done;
 	cnt = job->uiop->uio_resid;
 	job->uiop->uio_offset = 0;
 	job->uiop->uio_td = td;
 	flags = MSG_NBIO;
 
 	/*
 	 * For resource usage accounting, only count a completed request
 	 * as a single message to avoid counting multiple calls to
 	 * sosend/soreceive on a blocking socket.
 	 */
 
 	if (sb == &so->so_rcv) {
 		ru_before = td->td_ru.ru_msgrcv;
 #ifdef MAC
 		error = mac_socket_check_receive(fp->f_cred, so);
 		if (error == 0)
 
 #endif
 			error = soreceive(so, NULL, job->uiop, NULL, NULL,
 			    &flags);
 		if (td->td_ru.ru_msgrcv != ru_before)
 			job->msgrcv = 1;
 	} else {
 		if (!TAILQ_EMPTY(&sb->sb_aiojobq))
 			flags |= MSG_MORETOCOME;
 		ru_before = td->td_ru.ru_msgsnd;
 #ifdef MAC
 		error = mac_socket_check_send(fp->f_cred, so);
 		if (error == 0)
 #endif
 			error = sosend(so, NULL, job->uiop, NULL, NULL, flags,
 			    td);
 		if (td->td_ru.ru_msgsnd != ru_before)
 			job->msgsnd = 1;
 		if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 	}
 
 	done += cnt - job->uiop->uio_resid;
 	job->aio_done = done;
 	td->td_ucred = td_savedcred;
 
 	if (error == EWOULDBLOCK) {
 		/*
 		 * The request was either partially completed or not
 		 * completed at all due to racing with a read() or
 		 * write() on the socket.  If the socket is
 		 * non-blocking, return with any partial completion.
 		 * If the socket is blocking or if no progress has
 		 * been made, requeue this request at the head of the
 		 * queue to try again when the socket is ready.
 		 */
 		MPASS(done != job_total_nbytes);
 		SOCKBUF_LOCK(sb);
 		if (done == 0 || !(so->so_state & SS_NBIO)) {
 			empty_results++;
 			if (soaio_ready(so, sb)) {
 				empty_retries++;
 				SOCKBUF_UNLOCK(sb);
 				goto retry;
 			}
 			
 			if (!aio_set_cancel_function(job, soo_aio_cancel)) {
 				SOCKBUF_UNLOCK(sb);
 				if (done != 0)
 					aio_complete(job, done, 0);
 				else
 					aio_cancel(job);
 				SOCKBUF_LOCK(sb);
 			} else {
 				TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list);
 			}
 			return;
 		}
 		SOCKBUF_UNLOCK(sb);
 	}		
 	if (done != 0 && (error == ERESTART || error == EINTR ||
 	    error == EWOULDBLOCK))
 		error = 0;
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, done, 0);
 	SOCKBUF_LOCK(sb);
 }
 
 static void
 soaio_process_sb(struct socket *so, struct sockbuf *sb)
 {
 	struct kaiocb *job;
 
 	CURVNET_SET(so->so_vnet);
 	SOCKBUF_LOCK(sb);
 	while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) {
 		job = TAILQ_FIRST(&sb->sb_aiojobq);
 		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
 		if (!aio_clear_cancel_function(job))
 			continue;
 
 		soaio_process_job(so, sb, job);
 	}
 
 	/*
 	 * If there are still pending requests, the socket must not be
 	 * ready so set SB_AIO to request a wakeup when the socket
 	 * becomes ready.
 	 */
 	if (!TAILQ_EMPTY(&sb->sb_aiojobq))
 		sb->sb_flags |= SB_AIO;
 	sb->sb_flags &= ~SB_AIO_RUNNING;
 	SOCKBUF_UNLOCK(sb);
 
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 void
 soaio_rcv(void *context, int pending)
 {
 	struct socket *so;
 
 	so = context;
 	soaio_process_sb(so, &so->so_rcv);
 }
 
 void
 soaio_snd(void *context, int pending)
 {
 	struct socket *so;
 
 	so = context;
 	soaio_process_sb(so, &so->so_snd);
 }
 
 void
 sowakeup_aio(struct socket *so, struct sockbuf *sb)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	sb->sb_flags &= ~SB_AIO;
 	if (sb->sb_flags & SB_AIO_RUNNING)
 		return;
 	sb->sb_flags |= SB_AIO_RUNNING;
 	soref(so);
 	soaio_enqueue(&sb->sb_aiotask);
 }
 
 static void
 soo_aio_cancel(struct kaiocb *job)
 {
 	struct socket *so;
 	struct sockbuf *sb;
 	long done;
 	int opcode;
 
 	so = job->fd_file->f_data;
 	opcode = job->uaiocb.aio_lio_opcode;
 	if (opcode & LIO_READ)
 		sb = &so->so_rcv;
 	else {
 		MPASS(opcode & LIO_WRITE);
 		sb = &so->so_snd;
 	}
 
 	SOCKBUF_LOCK(sb);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
 	if (TAILQ_EMPTY(&sb->sb_aiojobq))
 		sb->sb_flags &= ~SB_AIO;
 	SOCKBUF_UNLOCK(sb);
 
 	done = job->aio_done;
 	if (done != 0)
 		aio_complete(job, done, 0);
 	else
 		aio_cancel(job);
 }
 
 static int
 soo_aio_queue(struct file *fp, struct kaiocb *job)
 {
 	struct socket *so;
 	struct sockbuf *sb;
 	int error;
 
 	so = fp->f_data;
 	error = (*so->so_proto->pr_usrreqs->pru_aio_queue)(so, job);
 	if (error == 0)
 		return (0);
 
 	/* Lock through the socket, since this may be a listening socket. */
 	switch (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) {
 	case LIO_READ:
 		sb = &so->so_rcv;
 		SOCK_RECVBUF_LOCK(so);
 		break;
 	case LIO_WRITE:
 		sb = &so->so_snd;
 		SOCK_SENDBUF_LOCK(so);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	if (SOLISTENING(so)) {
 		if (sb == &so->so_rcv)
 			SOCK_RECVBUF_UNLOCK(so);
 		else
 			SOCK_SENDBUF_UNLOCK(so);
 		return (EINVAL);
 	}
 
 	if (!aio_set_cancel_function(job, soo_aio_cancel))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list);
 	if (!(sb->sb_flags & SB_AIO_RUNNING)) {
 		if (soaio_ready(so, sb))
 			sowakeup_aio(so, sb);
 		else
 			sb->sb_flags |= SB_AIO;
 	}
 	SOCKBUF_UNLOCK(sb);
 	return (0);
 }
diff --git a/sys/kern/tty_pts.c b/sys/kern/tty_pts.c
index dbbf35cf1a12..a912634393f6 100644
--- a/sys/kern/tty_pts.c
+++ b/sys/kern/tty_pts.c
@@ -1,877 +1,876 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
  * All rights reserved.
  *
  * Portions of this software were developed under sponsorship from Snow
  * B.V., the Netherlands.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /* Add compatibility bits for FreeBSD. */
 #define PTS_COMPAT
 /* Add pty(4) compat bits. */
 #define PTS_EXTERNAL
 /* Add bits to make Linux binaries work. */
 #define PTS_LINUX
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/condvar.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/serial.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/tty.h>
 #include <sys/ttycom.h>
 #include <sys/uio.h>
 #include <sys/user.h>
 
 #include <machine/stdarg.h>
 
 /*
  * Our utmp(5) format is limited to 8-byte TTY line names.  This means
  * we can at most allocate 1000 pseudo-terminals ("pts/999").  Allow
  * users to increase this number, assuming they have manually increased
  * UT_LINESIZE.
  */
 static struct unrhdr *pts_pool;
 
 static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device");
 
 /*
  * Per-PTS structure.
  *
  * List of locks
  * (t)	locked by tty_lock()
  * (c)	const until freeing
  */
 struct pts_softc {
 	int		pts_unit;	/* (c) Device unit number. */
 	unsigned int	pts_flags;	/* (t) Device flags. */
 #define	PTS_PKT		0x1	/* Packet mode. */
 #define	PTS_FINISHED	0x2	/* Return errors on read()/write(). */
 	char		pts_pkt;	/* (t) Unread packet mode data. */
 
 	struct cv	pts_inwait;	/* (t) Blocking write() on master. */
 	struct selinfo	pts_inpoll;	/* (t) Select queue for write(). */
 	struct cv	pts_outwait;	/* (t) Blocking read() on master. */
 	struct selinfo	pts_outpoll;	/* (t) Select queue for read(). */
 
 #ifdef PTS_EXTERNAL
 	struct cdev	*pts_cdev;	/* (c) Master device node. */
 #endif /* PTS_EXTERNAL */
 
 	struct ucred	*pts_cred;	/* (c) Resource limit. */
 };
 
 /*
  * Controller-side file operations.
  */
 
 static int
 ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	int error = 0;
 	char pkt;
 
 	if (uio->uio_resid == 0)
 		return (0);
 
 	tty_lock(tp);
 
 	for (;;) {
 		/*
 		 * Implement packet mode. When packet mode is turned on,
 		 * the first byte contains a bitmask of events that
 		 * occurred (start, stop, flush, window size, etc).
 		 */
 		if (psc->pts_flags & PTS_PKT && psc->pts_pkt) {
 			pkt = psc->pts_pkt;
 			psc->pts_pkt = 0;
 			tty_unlock(tp);
 
 			error = ureadc(pkt, uio);
 			return (error);
 		}
 
 		/*
 		 * Transmit regular data.
 		 *
 		 * XXX: We shouldn't use ttydisc_getc_poll()! Even
 		 * though in this implementation, there is likely going
 		 * to be data, we should just call ttydisc_getc_uio()
 		 * and use its return value to sleep.
 		 */
 		if (ttydisc_getc_poll(tp)) {
 			if (psc->pts_flags & PTS_PKT) {
 				/*
 				 * XXX: Small race. Fortunately PTY
 				 * consumers aren't multithreaded.
 				 */
 
 				tty_unlock(tp);
 				error = ureadc(TIOCPKT_DATA, uio);
 				if (error)
 					return (error);
 				tty_lock(tp);
 			}
 
 			error = ttydisc_getc_uio(tp, uio);
 			break;
 		}
 
 		/* Maybe the device isn't used anyway. */
 		if (psc->pts_flags & PTS_FINISHED)
 			break;
 
 		/* Wait for more data. */
 		if (fp->f_flag & O_NONBLOCK) {
 			error = EWOULDBLOCK;
 			break;
 		}
 		error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx);
 		if (error != 0)
 			break;
 	}
 
 	tty_unlock(tp);
 
 	return (error);
 }
 
 static int
 ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	char ib[256], *ibstart;
 	size_t iblen, rintlen;
 	int error = 0;
 
 	if (uio->uio_resid == 0)
 		return (0);
 
 	for (;;) {
 		ibstart = ib;
 		iblen = MIN(uio->uio_resid, sizeof ib);
 		error = uiomove(ib, iblen, uio);
 
 		tty_lock(tp);
 		if (error != 0) {
 			iblen = 0;
 			goto done;
 		}
 
 		/*
 		 * When possible, avoid the slow path. rint_bypass()
 		 * copies all input to the input queue at once.
 		 */
 		MPASS(iblen > 0);
 		do {
 			rintlen = ttydisc_rint_simple(tp, ibstart, iblen);
 			ibstart += rintlen;
 			iblen -= rintlen;
 			if (iblen == 0) {
 				/* All data written. */
 				break;
 			}
 
 			/* Maybe the device isn't used anyway. */
 			if (psc->pts_flags & PTS_FINISHED) {
 				error = EIO;
 				goto done;
 			}
 
 			/* Wait for more data. */
 			if (fp->f_flag & O_NONBLOCK) {
 				error = EWOULDBLOCK;
 				goto done;
 			}
 
 			/* Wake up users on the slave side. */
 			ttydisc_rint_done(tp);
 			error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx);
 			if (error != 0)
 				goto done;
 		} while (iblen > 0);
 
 		if (uio->uio_resid == 0)
 			break;
 		tty_unlock(tp);
 	}
 
 done:	ttydisc_rint_done(tp);
 	tty_unlock(tp);
 
 	/*
 	 * Don't account for the part of the buffer that we couldn't
 	 * pass to the TTY.
 	 */
 	uio->uio_resid += iblen;
 	return (error);
 }
 
 static int
 ptsdev_ioctl(struct file *fp, u_long cmd, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	int error = 0, sig;
 
 	switch (cmd) {
 	case FIODTYPE:
 		*(int *)data = D_TTY;
 		return (0);
 	case FIONBIO:
 		/* This device supports non-blocking operation. */
 		return (0);
 	case FIONREAD:
 		tty_lock(tp);
 		if (psc->pts_flags & PTS_FINISHED) {
 			/* Force read() to be called. */
 			*(int *)data = 1;
 		} else {
 			*(int *)data = ttydisc_getc_poll(tp);
 		}
 		tty_unlock(tp);
 		return (0);
 	case FIODGNAME:
 #ifdef COMPAT_FREEBSD32
 	case FIODGNAME_32:
 #endif
 	{
 		struct fiodgname_arg *fgn;
 		const char *p;
 		int i;
 
 		/* Reverse device name lookups, for ptsname() and ttyname(). */
 		fgn = data;
 		p = tty_devname(tp);
 		i = strlen(p) + 1;
 		if (i > fgn->len)
 			return (EINVAL);
 		return (copyout(p, fiodgname_buf_get_ptr(fgn, cmd), i));
 	}
 
 	/*
 	 * We need to implement TIOCGPGRP and TIOCGSID here again. When
 	 * called on the pseudo-terminal master, it should not check if
 	 * the terminal is the foreground terminal of the calling
 	 * process.
 	 *
 	 * TIOCGETA is also implemented here. Various Linux PTY routines
 	 * often call isatty(), which is implemented by tcgetattr().
 	 */
 #ifdef PTS_LINUX
 	case TIOCGETA:
 		/* Obtain terminal flags through tcgetattr(). */
 		tty_lock(tp);
 		*(struct termios*)data = tp->t_termios;
 		tty_unlock(tp);
 		return (0);
 #endif /* PTS_LINUX */
 	case TIOCSETAF:
 	case TIOCSETAW:
 		/*
 		 * We must make sure we turn tcsetattr() calls of TCSAFLUSH and
 		 * TCSADRAIN into something different. If an application would
 		 * call TCSAFLUSH or TCSADRAIN on the master descriptor, it may
 		 * deadlock waiting for all data to be read.
 		 */
 		cmd = TIOCSETA;
 		break;
 #if defined(PTS_COMPAT) || defined(PTS_LINUX)
 	case TIOCGPTN:
 		/*
 		 * Get the device unit number.
 		 */
 		if (psc->pts_unit < 0)
 			return (ENOTTY);
 		*(unsigned int *)data = psc->pts_unit;
 		return (0);
 #endif /* PTS_COMPAT || PTS_LINUX */
 	case TIOCGPGRP:
 		/* Get the foreground process group ID. */
 		tty_lock(tp);
 		if (tp->t_pgrp != NULL)
 			*(int *)data = tp->t_pgrp->pg_id;
 		else
 			*(int *)data = NO_PID;
 		tty_unlock(tp);
 		return (0);
 	case TIOCGSID:
 		/* Get the session leader process ID. */
 		tty_lock(tp);
 		if (tp->t_session == NULL)
 			error = ENOTTY;
 		else
 			*(int *)data = tp->t_session->s_sid;
 		tty_unlock(tp);
 		return (error);
 	case TIOCPTMASTER:
 		/* Yes, we are a pseudo-terminal master. */
 		return (0);
 	case TIOCSIG:
 		/* Signal the foreground process group. */
 		sig = *(int *)data;
 		if (sig < 1 || sig >= NSIG)
 			return (EINVAL);
 
 		tty_lock(tp);
 		tty_signal_pgrp(tp, sig);
 		tty_unlock(tp);
 		return (0);
 	case TIOCPKT:
 		/* Enable/disable packet mode. */
 		tty_lock(tp);
 		if (*(int *)data)
 			psc->pts_flags |= PTS_PKT;
 		else
 			psc->pts_flags &= ~PTS_PKT;
 		tty_unlock(tp);
 		return (0);
 	}
 
 	/* Just redirect this ioctl to the slave device. */
 	tty_lock(tp);
 	error = tty_ioctl(tp, cmd, data, fp->f_flag, td);
 	tty_unlock(tp);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 
 	return (error);
 }
 
 static int
 ptsdev_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	int revents = 0;
 
 	tty_lock(tp);
 
 	if (psc->pts_flags & PTS_FINISHED) {
 		/* Slave device is not opened. */
 		tty_unlock(tp);
 		return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
 	}
 
 	if (events & (POLLIN|POLLRDNORM)) {
 		/* See if we can getc something. */
 		if (ttydisc_getc_poll(tp) ||
 		    (psc->pts_flags & PTS_PKT && psc->pts_pkt))
 			revents |= events & (POLLIN|POLLRDNORM);
 	}
 	if (events & (POLLOUT|POLLWRNORM)) {
 		/* See if we can rint something. */
 		if (ttydisc_rint_poll(tp))
 			revents |= events & (POLLOUT|POLLWRNORM);
 	}
 
 	/*
 	 * No need to check for POLLHUP here. This device cannot be used
 	 * as a callout device, which means we always have a carrier,
 	 * because the master is.
 	 */
 
 	if (revents == 0) {
 		/*
 		 * This code might look misleading, but the naming of
 		 * poll events on this side is the opposite of the slave
 		 * device.
 		 */
 		if (events & (POLLIN|POLLRDNORM))
 			selrecord(td, &psc->pts_outpoll);
 		if (events & (POLLOUT|POLLWRNORM))
 			selrecord(td, &psc->pts_inpoll);
 	}
 
 	tty_unlock(tp);
 
 	return (revents);
 }
 
 /*
  * kqueue support.
  */
 
 static void
 pts_kqops_read_detach(struct knote *kn)
 {
 	struct file *fp = kn->kn_fp;
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 
 	knlist_remove(&psc->pts_outpoll.si_note, kn, 0);
 }
 
 static int
 pts_kqops_read_event(struct knote *kn, long hint)
 {
 	struct file *fp = kn->kn_fp;
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 
 	if (psc->pts_flags & PTS_FINISHED) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	} else {
 		kn->kn_data = ttydisc_getc_poll(tp);
 		return (kn->kn_data > 0);
 	}
 }
 
 static void
 pts_kqops_write_detach(struct knote *kn)
 {
 	struct file *fp = kn->kn_fp;
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 
 	knlist_remove(&psc->pts_inpoll.si_note, kn, 0);
 }
 
 static int
 pts_kqops_write_event(struct knote *kn, long hint)
 {
 	struct file *fp = kn->kn_fp;
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 
 	if (psc->pts_flags & PTS_FINISHED) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	} else {
 		kn->kn_data = ttydisc_rint_poll(tp);
 		return (kn->kn_data > 0);
 	}
 }
 
 static struct filterops pts_kqops_read = {
 	.f_isfd = 1,
 	.f_detach = pts_kqops_read_detach,
 	.f_event = pts_kqops_read_event,
 };
 static struct filterops pts_kqops_write = {
 	.f_isfd = 1,
 	.f_detach = pts_kqops_write_detach,
 	.f_event = pts_kqops_write_event,
 };
 
 static int
 ptsdev_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct tty *tp = fp->f_data;
 	struct pts_softc *psc = tty_softc(tp);
 	int error = 0;
 
 	tty_lock(tp);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &pts_kqops_read;
 		knlist_add(&psc->pts_outpoll.si_note, kn, 1);
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &pts_kqops_write;
 		knlist_add(&psc->pts_inpoll.si_note, kn, 1);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	tty_unlock(tp);
 	return (error);
 }
 
 static int
-ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
-    struct thread *td)
+ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
 	struct tty *tp = fp->f_data;
 #ifdef PTS_EXTERNAL
 	struct pts_softc *psc = tty_softc(tp);
 #endif /* PTS_EXTERNAL */
 	struct cdev *dev = tp->t_dev;
 
 	/*
 	 * According to POSIX, we must implement an fstat(). This also
 	 * makes this implementation compatible with Linux binaries,
 	 * because Linux calls fstat() on the pseudo-terminal master to
 	 * obtain st_rdev.
 	 *
 	 * XXX: POSIX also mentions we must fill in st_dev, but how?
 	 */
 
 	bzero(sb, sizeof *sb);
 #ifdef PTS_EXTERNAL
 	if (psc->pts_cdev != NULL)
 		sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev);
 	else
 #endif /* PTS_EXTERNAL */
 		sb->st_ino = sb->st_rdev = tty_udev(tp);
 
 	sb->st_atim = dev->si_atime;
 	sb->st_ctim = dev->si_ctime;
 	sb->st_mtim = dev->si_mtime;
 	sb->st_uid = dev->si_uid;
 	sb->st_gid = dev->si_gid;
 	sb->st_mode = dev->si_mode | S_IFCHR;
 
 	return (0);
 }
 
 static int
 ptsdev_close(struct file *fp, struct thread *td)
 {
 	struct tty *tp = fp->f_data;
 
 	/* Deallocate TTY device. */
 	tty_lock(tp);
 	tty_rel_gone(tp);
 
 	/*
 	 * Open of /dev/ptmx or /dev/ptyXX changes the type of file
 	 * from DTYPE_VNODE to DTYPE_PTS. vn_open() increases vnode
 	 * use count, we need to decrement it, and possibly do other
 	 * required cleanup.
 	 */
 	if (fp->f_vnode != NULL)
 		return (vnops.fo_close(fp, td));
 
 	return (0);
 }
 
 static int
 ptsdev_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct tty *tp;
 
 	kif->kf_type = KF_TYPE_PTS;
 	tp = fp->f_data;
 	kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
 	kif->kf_un.kf_pts.kf_pts_dev_freebsd11 =
 	    kif->kf_un.kf_pts.kf_pts_dev; /* truncate */
 	strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
 	return (0);
 }
 
 static struct fileops ptsdev_ops = {
 	.fo_read	= ptsdev_read,
 	.fo_write	= ptsdev_write,
 	.fo_truncate	= invfo_truncate,
 	.fo_ioctl	= ptsdev_ioctl,
 	.fo_poll	= ptsdev_poll,
 	.fo_kqfilter	= ptsdev_kqfilter,
 	.fo_stat	= ptsdev_stat,
 	.fo_close	= ptsdev_close,
 	.fo_chmod	= invfo_chmod,
 	.fo_chown	= invfo_chown,
 	.fo_sendfile	= invfo_sendfile,
 	.fo_fill_kinfo	= ptsdev_fill_kinfo,
 	.fo_flags	= DFLAG_PASSABLE,
 };
 
 /*
  * Driver-side hooks.
  */
 
 static void
 ptsdrv_outwakeup(struct tty *tp)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	cv_broadcast(&psc->pts_outwait);
 	selwakeup(&psc->pts_outpoll);
 	KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0);
 }
 
 static void
 ptsdrv_inwakeup(struct tty *tp)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	cv_broadcast(&psc->pts_inwait);
 	selwakeup(&psc->pts_inpoll);
 	KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0);
 }
 
 static int
 ptsdrv_open(struct tty *tp)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	psc->pts_flags &= ~PTS_FINISHED;
 
 	return (0);
 }
 
 static void
 ptsdrv_close(struct tty *tp)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	/* Wake up any blocked readers/writers. */
 	psc->pts_flags |= PTS_FINISHED;
 	ptsdrv_outwakeup(tp);
 	ptsdrv_inwakeup(tp);
 }
 
 static void
 ptsdrv_pktnotify(struct tty *tp, char event)
 {
 	struct pts_softc *psc = tty_softc(tp);
 
 	/*
 	 * Clear conflicting flags.
 	 */
 
 	switch (event) {
 	case TIOCPKT_STOP:
 		psc->pts_pkt &= ~TIOCPKT_START;
 		break;
 	case TIOCPKT_START:
 		psc->pts_pkt &= ~TIOCPKT_STOP;
 		break;
 	case TIOCPKT_NOSTOP:
 		psc->pts_pkt &= ~TIOCPKT_DOSTOP;
 		break;
 	case TIOCPKT_DOSTOP:
 		psc->pts_pkt &= ~TIOCPKT_NOSTOP;
 		break;
 	}
 
 	psc->pts_pkt |= event;
 	ptsdrv_outwakeup(tp);
 }
 
 static void
 ptsdrv_free(void *softc)
 {
 	struct pts_softc *psc = softc;
 
 	/* Make device number available again. */
 	if (psc->pts_unit >= 0)
 		free_unr(pts_pool, psc->pts_unit);
 
 	chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0);
 	racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1);
 	crfree(psc->pts_cred);
 
 	seldrain(&psc->pts_inpoll);
 	seldrain(&psc->pts_outpoll);
 	knlist_destroy(&psc->pts_inpoll.si_note);
 	knlist_destroy(&psc->pts_outpoll.si_note);
 
 #ifdef PTS_EXTERNAL
 	/* Destroy master device as well. */
 	if (psc->pts_cdev != NULL)
 		destroy_dev_sched(psc->pts_cdev);
 #endif /* PTS_EXTERNAL */
 
 	free(psc, M_PTS);
 }
 
 static struct ttydevsw pts_class = {
 	.tsw_flags	= TF_NOPREFIX,
 	.tsw_outwakeup	= ptsdrv_outwakeup,
 	.tsw_inwakeup	= ptsdrv_inwakeup,
 	.tsw_open	= ptsdrv_open,
 	.tsw_close	= ptsdrv_close,
 	.tsw_pktnotify	= ptsdrv_pktnotify,
 	.tsw_free	= ptsdrv_free,
 };
 
 #ifndef PTS_EXTERNAL
 static
 #endif /* !PTS_EXTERNAL */
 int
 pts_alloc(int fflags, struct thread *td, struct file *fp)
 {
 	int unit, ok, error;
 	struct tty *tp;
 	struct pts_softc *psc;
 	struct proc *p = td->td_proc;
 	struct ucred *cred = td->td_ucred;
 
 	/* Resource limiting. */
 	PROC_LOCK(p);
 	error = racct_add(p, RACCT_NPTS, 1);
 	if (error != 0) {
 		PROC_UNLOCK(p);
 		return (EAGAIN);
 	}
 	ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS));
 	if (!ok) {
 		racct_sub(p, RACCT_NPTS, 1);
 		PROC_UNLOCK(p);
 		return (EAGAIN);
 	}
 	PROC_UNLOCK(p);
 
 	/* Try to allocate a new pts unit number. */
 	unit = alloc_unr(pts_pool);
 	if (unit < 0) {
 		racct_sub(p, RACCT_NPTS, 1);
 		chgptscnt(cred->cr_ruidinfo, -1, 0);
 		return (EAGAIN);
 	}
 
 	/* Allocate TTY and softc. */
 	psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
 	cv_init(&psc->pts_inwait, "ptsin");
 	cv_init(&psc->pts_outwait, "ptsout");
 
 	psc->pts_unit = unit;
 	psc->pts_cred = crhold(cred);
 
 	tp = tty_alloc(&pts_class, psc);
 	knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
 	knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
 
 	/* Expose the slave device as well. */
 	tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit);
 
 	finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
 
 	return (0);
 }
 
 #ifdef PTS_EXTERNAL
 int
 pts_alloc_external(int fflags, struct thread *td, struct file *fp,
     struct cdev *dev, const char *name)
 {
 	int ok, error;
 	struct tty *tp;
 	struct pts_softc *psc;
 	struct proc *p = td->td_proc;
 	struct ucred *cred = td->td_ucred;
 
 	/* Resource limiting. */
 	PROC_LOCK(p);
 	error = racct_add(p, RACCT_NPTS, 1);
 	if (error != 0) {
 		PROC_UNLOCK(p);
 		return (EAGAIN);
 	}
 	ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_NPTS));
 	if (!ok) {
 		racct_sub(p, RACCT_NPTS, 1);
 		PROC_UNLOCK(p);
 		return (EAGAIN);
 	}
 	PROC_UNLOCK(p);
 
 	/* Allocate TTY and softc. */
 	psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
 	cv_init(&psc->pts_inwait, "ptsin");
 	cv_init(&psc->pts_outwait, "ptsout");
 
 	psc->pts_unit = -1;
 	psc->pts_cdev = dev;
 	psc->pts_cred = crhold(cred);
 
 	tp = tty_alloc(&pts_class, psc);
 	knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
 	knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
 
 	/* Expose the slave device as well. */
 	tty_makedev(tp, td->td_ucred, "%s", name);
 
 	finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
 
 	return (0);
 }
 #endif /* PTS_EXTERNAL */
 
 int
 sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap)
 {
 	int error, fd;
 	struct file *fp;
 
 	/*
 	 * POSIX states it's unspecified when other flags are passed. We
 	 * don't allow this.
 	 */
 	if (uap->flags & ~(O_RDWR|O_NOCTTY|O_CLOEXEC))
 		return (EINVAL);
 
 	error = falloc(td, &fp, &fd, uap->flags);
 	if (error)
 		return (error);
 
 	/* Allocate the actual pseudo-TTY. */
 	error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp);
 	if (error != 0) {
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	/* Pass it back to userspace. */
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 
 	return (0);
 }
 
 static void
 pts_init(void *unused)
 {
 
 	pts_pool = new_unrhdr(0, INT_MAX, NULL);
 }
 
 SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL);
diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c
index a9fb40848fbd..24ed1be6bf65 100644
--- a/sys/kern/uipc_mqueue.c
+++ b/sys/kern/uipc_mqueue.c
@@ -1,2945 +1,2944 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * Copyright (c) 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 /*
  * POSIX message queue implementation.
  *
  * 1) A mqueue filesystem can be mounted, each message queue appears
  *    in mounted directory, user can change queue's permission and
  *    ownership, or remove a queue. Manually creating a file in the
  *    directory causes a message queue to be created in the kernel with
  *    default message queue attributes applied and same name used, this
  *    method is not advocated since mq_open syscall allows user to specify
  *    different attributes. Also the file system can be mounted multiple
  *    times at different mount points but shows same contents.
  *
  * 2) Standard POSIX message queue API. The syscalls do not use vfs layer,
  *    but directly operate on internal data structure, this allows user to
  *    use the IPC facility without having to mount mqueue file system.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/dirent.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/mqueue.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/posix4.h>
 #include <sys/poll.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sysproto.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #include <machine/atomic.h>
 
 #include <security/audit/audit.h>
 
 FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support");
 
 /*
  * Limits and constants
  */
 #define	MQFS_NAMELEN		NAME_MAX
 #define MQFS_DELEN		(8 + MQFS_NAMELEN)
 
 /* node types */
 typedef enum {
 	mqfstype_none = 0,
 	mqfstype_root,
 	mqfstype_dir,
 	mqfstype_this,
 	mqfstype_parent,
 	mqfstype_file,
 	mqfstype_symlink,
 } mqfs_type_t;
 
 struct mqfs_node;
 
 /*
  * mqfs_info: describes a mqfs instance
  */
 struct mqfs_info {
 	struct sx		mi_lock;
 	struct mqfs_node	*mi_root;
 	struct unrhdr		*mi_unrhdr;
 };
 
 struct mqfs_vdata {
 	LIST_ENTRY(mqfs_vdata)	mv_link;
 	struct mqfs_node	*mv_node;
 	struct vnode		*mv_vnode;
 	struct task		mv_task;
 };
 
 /*
  * mqfs_node: describes a node (file or directory) within a mqfs
  */
 struct mqfs_node {
 	char			mn_name[MQFS_NAMELEN+1];
 	struct mqfs_info	*mn_info;
 	struct mqfs_node	*mn_parent;
 	LIST_HEAD(,mqfs_node)	mn_children;
 	LIST_ENTRY(mqfs_node)	mn_sibling;
 	LIST_HEAD(,mqfs_vdata)	mn_vnodes;
 	const void		*mn_pr_root;
 	int			mn_refcount;
 	mqfs_type_t		mn_type;
 	int			mn_deleted;
 	uint32_t		mn_fileno;
 	void			*mn_data;
 	struct timespec		mn_birth;
 	struct timespec		mn_ctime;
 	struct timespec		mn_atime;
 	struct timespec		mn_mtime;
 	uid_t			mn_uid;
 	gid_t			mn_gid;
 	int			mn_mode;
 };
 
 #define	VTON(vp)	(((struct mqfs_vdata *)((vp)->v_data))->mv_node)
 #define VTOMQ(vp) 	((struct mqueue *)(VTON(vp)->mn_data))
 #define	VFSTOMQFS(m)	((struct mqfs_info *)((m)->mnt_data))
 #define	FPTOMQ(fp)	((struct mqueue *)(((struct mqfs_node *) \
 				(fp)->f_data)->mn_data))
 
 TAILQ_HEAD(msgq, mqueue_msg);
 
 struct mqueue;
 
 struct mqueue_notifier {
 	LIST_ENTRY(mqueue_notifier)	nt_link;
 	struct sigevent			nt_sigev;
 	ksiginfo_t			nt_ksi;
 	struct proc			*nt_proc;
 };
 
 struct mqueue {
 	struct mtx	mq_mutex;
 	int		mq_flags;
 	long		mq_maxmsg;
 	long		mq_msgsize;
 	long		mq_curmsgs;
 	long		mq_totalbytes;
 	struct msgq	mq_msgq;
 	int		mq_receivers;
 	int		mq_senders;
 	struct selinfo	mq_rsel;
 	struct selinfo	mq_wsel;
 	struct mqueue_notifier	*mq_notifier;
 };
 
 #define	MQ_RSEL		0x01
 #define	MQ_WSEL		0x02
 
 struct mqueue_msg {
 	TAILQ_ENTRY(mqueue_msg)	msg_link;
 	unsigned int	msg_prio;
 	unsigned int	msg_size;
 	/* following real data... */
 };
 
 static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
 	"POSIX real time message queue");
 
 static int	default_maxmsg  = 10;
 static int	default_msgsize = 1024;
 
 static int	maxmsg = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
     &maxmsg, 0, "Default maximum messages in queue");
 static int	maxmsgsize = 16384;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
     &maxmsgsize, 0, "Default maximum message size");
 static int	maxmq = 100;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
     &maxmq, 0, "maximum message queues");
 static int	curmq = 0;
 SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
     &curmq, 0, "current message queue number");
 static int	unloadable = 0;
 static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
 
 static eventhandler_tag exit_tag;
 
 /* Only one instance per-system */
 static struct mqfs_info		mqfs_data;
 static uma_zone_t		mqnode_zone;
 static uma_zone_t		mqueue_zone;
 static uma_zone_t		mvdata_zone;
 static uma_zone_t		mqnoti_zone;
 static struct vop_vector	mqfs_vnodeops;
 static struct fileops		mqueueops;
 static unsigned			mqfs_osd_jail_slot;
 
 /*
  * Directory structure construction and manipulation
  */
 #ifdef notyet
 static struct mqfs_node	*mqfs_create_dir(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static struct mqfs_node	*mqfs_create_link(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 #endif
 
 static struct mqfs_node	*mqfs_create_file(struct mqfs_node *parent,
 	const char *name, int namelen, struct ucred *cred, int mode);
 static int	mqfs_destroy(struct mqfs_node *mn);
 static void	mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
 static void	mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
 static int	mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
 static int	mqfs_prison_remove(void *obj, void *data);
 
 /*
  * Message queue construction and maniplation
  */
 static struct mqueue	*mqueue_alloc(const struct mq_attr *attr);
 static void	mqueue_free(struct mqueue *mq);
 static int	mqueue_send(struct mqueue *mq, const char *msg_ptr,
 			size_t msg_len, unsigned msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	mqueue_receive(struct mqueue *mq, char *msg_ptr,
 			size_t msg_len, unsigned *msg_prio, int waitok,
 			const struct timespec *abs_timeout);
 static int	_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
 			int timo);
 static int	_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
 			int timo);
 static void	mqueue_send_notification(struct mqueue *mq);
 static void	mqueue_fdclose(struct thread *td, int fd, struct file *fp);
 static void	mq_proc_exit(void *arg, struct proc *p);
 
 /*
  * kqueue filters
  */
 static void	filt_mqdetach(struct knote *kn);
 static int	filt_mqread(struct knote *kn, long hint);
 static int	filt_mqwrite(struct knote *kn, long hint);
 
 struct filterops mq_rfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_mqdetach,
 	.f_event = filt_mqread,
 };
 struct filterops mq_wfiltops = {
 	.f_isfd = 1,
 	.f_detach = filt_mqdetach,
 	.f_event = filt_mqwrite,
 };
 
 /*
  * Initialize fileno bitmap
  */
 static void
 mqfs_fileno_init(struct mqfs_info *mi)
 {
 	struct unrhdr *up;
 
 	up = new_unrhdr(1, INT_MAX, NULL);
 	mi->mi_unrhdr = up;
 }
 
 /*
  * Tear down fileno bitmap
  */
 static void
 mqfs_fileno_uninit(struct mqfs_info *mi)
 {
 	struct unrhdr *up;
 
 	up = mi->mi_unrhdr;
 	mi->mi_unrhdr = NULL;
 	delete_unrhdr(up);
 }
 
 /*
  * Allocate a file number
  */
 static void
 mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	/* make sure our parent has a file number */
 	if (mn->mn_parent && !mn->mn_parent->mn_fileno)
 		mqfs_fileno_alloc(mi, mn->mn_parent);
 
 	switch (mn->mn_type) {
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
 		break;
 	case mqfstype_this:
 		KASSERT(mn->mn_parent != NULL,
 		    ("mqfstype_this node has no parent"));
 		mn->mn_fileno = mn->mn_parent->mn_fileno;
 		break;
 	case mqfstype_parent:
 		KASSERT(mn->mn_parent != NULL,
 		    ("mqfstype_parent node has no parent"));
 		if (mn->mn_parent == mi->mi_root) {
 			mn->mn_fileno = mn->mn_parent->mn_fileno;
 			break;
 		}
 		KASSERT(mn->mn_parent->mn_parent != NULL,
 		    ("mqfstype_parent node has no grandparent"));
 		mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_alloc() called for unknown type node: %d",
 			mn->mn_type));
 		break;
 	}
 }
 
 /*
  * Release a file number
  */
 static void
 mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
 {
 	switch (mn->mn_type) {
 	case mqfstype_root:
 	case mqfstype_dir:
 	case mqfstype_file:
 	case mqfstype_symlink:
 		free_unr(mi->mi_unrhdr, mn->mn_fileno);
 		break;
 	case mqfstype_this:
 	case mqfstype_parent:
 		/* ignore these, as they don't "own" their file number */
 		break;
 	default:
 		KASSERT(0,
 		    ("mqfs_fileno_free() called for unknown type node: %d", 
 			mn->mn_type));
 		break;
 	}
 }
 
 static __inline struct mqfs_node *
 mqnode_alloc(void)
 {
 	return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
 }
 
 static __inline void
 mqnode_free(struct mqfs_node *node)
 {
 	uma_zfree(mqnode_zone, node);
 }
 
 static __inline void
 mqnode_addref(struct mqfs_node *node)
 {
 	atomic_add_int(&node->mn_refcount, 1);
 }
 
 static __inline void
 mqnode_release(struct mqfs_node *node)
 {
 	struct mqfs_info *mqfs;
 	int old, exp;
 
 	mqfs = node->mn_info;
 	old = atomic_fetchadd_int(&node->mn_refcount, -1);
 	if (node->mn_type == mqfstype_dir ||
 	    node->mn_type == mqfstype_root)
 		exp = 3; /* include . and .. */
 	else
 		exp = 1;
 	if (old == exp) {
 		int locked = sx_xlocked(&mqfs->mi_lock);
 		if (!locked)
 			sx_xlock(&mqfs->mi_lock);
 		mqfs_destroy(node);
 		if (!locked)
 			sx_xunlock(&mqfs->mi_lock);
 	}
 }
 
 /*
  * Add a node to a directory
  */
 static int
 mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
 {
 	KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
 	KASSERT(parent->mn_info != NULL,
 	    ("%s(): parent has no mn_info", __func__));
 	KASSERT(parent->mn_type == mqfstype_dir ||
 	    parent->mn_type == mqfstype_root,
 	    ("%s(): parent is not a directory", __func__));
 
 	node->mn_info = parent->mn_info;
 	node->mn_parent = parent;
 	LIST_INIT(&node->mn_children);
 	LIST_INIT(&node->mn_vnodes);
 	LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
 	mqnode_addref(parent);
 	return (0);
 }
 
 static struct mqfs_node *
 mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
 	int nodetype)
 {
 	struct mqfs_node *node;
 
 	node = mqnode_alloc();
 	strncpy(node->mn_name, name, namelen);
 	node->mn_pr_root = cred->cr_prison->pr_root;
 	node->mn_type = nodetype;
 	node->mn_refcount = 1;
 	vfs_timestamp(&node->mn_birth);
 	node->mn_ctime = node->mn_atime = node->mn_mtime
 		= node->mn_birth;
 	node->mn_uid = cred->cr_uid;
 	node->mn_gid = cred->cr_gid;
 	node->mn_mode = mode;
 	return (node);
 }
 
 /*
  * Create a file
  */
 static struct mqfs_node *
 mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *node;
 
 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
 	if (mqfs_add_node(parent, node) != 0) {
 		mqnode_free(node);
 		return (NULL);
 	}
 	return (node);
 }
 
 /*
  * Add . and .. to a directory
  */
 static int
 mqfs_fixup_dir(struct mqfs_node *parent)
 {
 	struct mqfs_node *dir;
 
 	dir = mqnode_alloc();
 	dir->mn_name[0] = '.';
 	dir->mn_type = mqfstype_this;
 	dir->mn_refcount = 1;
 	if (mqfs_add_node(parent, dir) != 0) {
 		mqnode_free(dir);
 		return (-1);
 	}
 
 	dir = mqnode_alloc();
 	dir->mn_name[0] = dir->mn_name[1] = '.';
 	dir->mn_type = mqfstype_parent;
 	dir->mn_refcount = 1;
 
 	if (mqfs_add_node(parent, dir) != 0) {
 		mqnode_free(dir);
 		return (-1);
 	}
 
 	return (0);
 }
 
 #ifdef notyet
 
 /*
  * Create a directory
  */
 static struct mqfs_node *
 mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *node;
 
 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
 	if (mqfs_add_node(parent, node) != 0) {
 		mqnode_free(node);
 		return (NULL);
 	}
 
 	if (mqfs_fixup_dir(node) != 0) {
 		mqfs_destroy(node);
 		return (NULL);
 	}
 	return (node);
 }
 
 /*
  * Create a symlink
  */
 static struct mqfs_node *
 mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
 	struct ucred *cred, int mode)
 {
 	struct mqfs_node *node;
 
 	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
 	if (mqfs_add_node(parent, node) != 0) {
 		mqnode_free(node);
 		return (NULL);
 	}
 	return (node);
 }
 
 #endif
 
 /*
  * Destroy a node or a tree of nodes
  */
 static int
 mqfs_destroy(struct mqfs_node *node)
 {
 	struct mqfs_node *parent;
 
 	KASSERT(node != NULL,
 	    ("%s(): node is NULL", __func__));
 	KASSERT(node->mn_info != NULL,
 	    ("%s(): node has no mn_info", __func__));
 
 	/* destroy children */
 	if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
 		while (! LIST_EMPTY(&node->mn_children))
 			mqfs_destroy(LIST_FIRST(&node->mn_children));
 
 	/* unlink from parent */
 	if ((parent = node->mn_parent) != NULL) {
 		KASSERT(parent->mn_info == node->mn_info,
 		    ("%s(): parent has different mn_info", __func__));
 		LIST_REMOVE(node, mn_sibling);
 	}
 
 	if (node->mn_fileno != 0)
 		mqfs_fileno_free(node->mn_info, node);
 	if (node->mn_data != NULL)
 		mqueue_free(node->mn_data);
 	mqnode_free(node);
 	return (0);
 }
 
 /*
  * Mount a mqfs instance
  */
 static int
 mqfs_mount(struct mount *mp)
 {
 	struct statfs *sbp;
 
 	if (mp->mnt_flag & MNT_UPDATE)
 		return (EOPNOTSUPP);
 
 	mp->mnt_data = &mqfs_data;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
 
 	sbp = &mp->mnt_stat;
 	vfs_mountedfrom(mp, "mqueue");
 	sbp->f_bsize = PAGE_SIZE;
 	sbp->f_iosize = PAGE_SIZE;
 	sbp->f_blocks = 1;
 	sbp->f_bfree = 0;
 	sbp->f_bavail = 0;
 	sbp->f_files = 1;
 	sbp->f_ffree = 0;
 	return (0);
 }
 
 /*
  * Unmount a mqfs instance
  */
 static int
 mqfs_unmount(struct mount *mp, int mntflags)
 {
 	int error;
 
 	error = vflush(mp, 0, (mntflags & MNT_FORCE) ?  FORCECLOSE : 0,
 	    curthread);
 	return (error);
 }
 
 /*
  * Return a root vnode
  */
 static int
 mqfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct mqfs_info *mqfs;
 	int ret;
 
 	mqfs = VFSTOMQFS(mp);
 	ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
 	return (ret);
 }
 
 /*
  * Return filesystem stats
  */
 static int
 mqfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	/* XXX update statistics */
 	return (0);
 }
 
 /*
  * Initialize a mqfs instance
  */
 static int
 mqfs_init(struct vfsconf *vfc)
 {
 	struct mqfs_node *root;
 	struct mqfs_info *mi;
 	osd_method_t methods[PR_MAXMETHOD] = {
 	    [PR_METHOD_REMOVE] = mqfs_prison_remove,
 	};
 
 	mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mvdata_zone = uma_zcreate("mvdata",
 		sizeof(struct mqfs_vdata), NULL, NULL, NULL,
 		NULL, UMA_ALIGN_PTR, 0);
 	mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mi = &mqfs_data;
 	sx_init(&mi->mi_lock, "mqfs lock");
 	/* set up the root diretory */
 	root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
 		mqfstype_root);
 	root->mn_info = mi;
 	LIST_INIT(&root->mn_children);
 	LIST_INIT(&root->mn_vnodes);
 	mi->mi_root = root;
 	mqfs_fileno_init(mi);
 	mqfs_fileno_alloc(mi, root);
 	mqfs_fixup_dir(root);
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	mq_fdclose = mqueue_fdclose;
 	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
 	mqfs_osd_jail_slot = osd_jail_register(NULL, methods);
 	return (0);
 }
 
 /*
  * Destroy a mqfs instance
  */
 static int
 mqfs_uninit(struct vfsconf *vfc)
 {
 	struct mqfs_info *mi;
 
 	if (!unloadable)
 		return (EOPNOTSUPP);
 	osd_jail_deregister(mqfs_osd_jail_slot);
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 	mi = &mqfs_data;
 	mqfs_destroy(mi->mi_root);
 	mi->mi_root = NULL;
 	mqfs_fileno_uninit(mi);
 	sx_destroy(&mi->mi_lock);
 	uma_zdestroy(mqnode_zone);
 	uma_zdestroy(mqueue_zone);
 	uma_zdestroy(mvdata_zone);
 	uma_zdestroy(mqnoti_zone);
 	return (0);
 }
 
 /*
  * task routine
  */
 static void
 do_recycle(void *context, int pending __unused)
 {
 	struct vnode *vp = (struct vnode *)context;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	vrecycle(vp);
 	VOP_UNLOCK(vp);
 	vdrop(vp);
 }
 
 /*
  * Allocate a vnode
  */
 static int
 mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
 {
 	struct mqfs_vdata *vd;
 	struct mqfs_info  *mqfs;
 	struct vnode *newvpp;
 	int error;
 
 	mqfs = pn->mn_info;
 	*vpp = NULL;
 	sx_xlock(&mqfs->mi_lock);
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp) {
 			vhold(vd->mv_vnode);
 			break;
 		}
 	}
 
 	if (vd != NULL) {
 found:
 		*vpp = vd->mv_vnode;
 		sx_xunlock(&mqfs->mi_lock);
 		error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE);
 		vdrop(*vpp);
 		return (error);
 	}
 	sx_xunlock(&mqfs->mi_lock);
 
 	error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp);
 	if (error)
 		return (error);
 	vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY);
 	error = insmntque(newvpp, mp);
 	if (error != 0)
 		return (error);
 
 	sx_xlock(&mqfs->mi_lock);
 	/*
 	 * Check if it has already been allocated
 	 * while we were blocked.
 	 */
 	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 		if (vd->mv_vnode->v_mount == mp) {
 			vhold(vd->mv_vnode);
 			sx_xunlock(&mqfs->mi_lock);
 
 			vgone(newvpp);
 			vput(newvpp);
 			goto found;
 		}
 	}
 
 	*vpp = newvpp;
 
 	vd = uma_zalloc(mvdata_zone, M_WAITOK);
 	(*vpp)->v_data = vd;
 	vd->mv_vnode = *vpp;
 	vd->mv_node = pn;
 	TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
 	LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
 	mqnode_addref(pn);
 	switch (pn->mn_type) {
 	case mqfstype_root:
 		(*vpp)->v_vflag = VV_ROOT;
 		/* fall through */
 	case mqfstype_dir:
 	case mqfstype_this:
 	case mqfstype_parent:
 		(*vpp)->v_type = VDIR;
 		break;
 	case mqfstype_file:
 		(*vpp)->v_type = VREG;
 		break;
 	case mqfstype_symlink:
 		(*vpp)->v_type = VLNK;
 		break;
 	case mqfstype_none:
 		KASSERT(0, ("mqfs_allocf called for null node\n"));
 	default:
 		panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
 	}
 	sx_xunlock(&mqfs->mi_lock);
 	return (0);
 }
 
 /* 
  * Search a directory entry
  */
 static struct mqfs_node *
 mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred)
 {
 	struct mqfs_node *pn;
 	const void *pr_root;
 
 	sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
 	pr_root = cred->cr_prison->pr_root;
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		/* Only match names within the same prison root directory */
 		if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) &&
 		    strncmp(pn->mn_name, name, len) == 0 &&
 		    pn->mn_name[len] == '\0')
 			return (pn);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a file or directory.
  */
 static int
 mqfs_lookupx(struct vop_cachedlookup_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqfs_info *mqfs;
 	int nameiop, flags, error, namelen;
 	char *pname;
 	struct thread *td;
 
 	td = curthread;
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	namelen = cnp->cn_namelen;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	pd = VTON(dvp);
 	pn = NULL;
 	mqfs = pd->mn_info;
 	*vpp = NULLVP;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td);
 	if (error)
 		return (error);
 
 	/* shortcut: check if the name is too long */
 	if (cnp->cn_namelen >= MQFS_NAMELEN)
 		return (ENOENT);
 
 	/* self */
 	if (namelen == 1 && pname[0] == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		pn = pd;
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	/* parent */
 	if (cnp->cn_flags & ISDOTDOT) {
 		if (dvp->v_vflag & VV_ROOT)
 			return (EIO);
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		VOP_UNLOCK(dvp);
 		KASSERT(pd->mn_parent, ("non-root directory has no parent"));
 		pn = pd->mn_parent;
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		return (error);
 	}
 
 	/* named node */
 	sx_xlock(&mqfs->mi_lock);
 	pn = mqfs_search(pd, pname, namelen, cnp->cn_cred);
 	if (pn != NULL)
 		mqnode_addref(pn);
 	sx_xunlock(&mqfs->mi_lock);
 
 	/* found */
 	if (pn != NULL) {
 		/* DELETE */
 		if (nameiop == DELETE && (flags & ISLASTCN)) {
 			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 			if (error) {
 				mqnode_release(pn);
 				return (error);
 			}
 			if (*vpp == dvp) {
 				VREF(dvp);
 				*vpp = dvp;
 				mqnode_release(pn);
 				return (0);
 			}
 		}
 
 		/* allocate vnode */
 		error = mqfs_allocv(dvp->v_mount, vpp, pn);
 		mqnode_release(pn);
 		if (error == 0 && cnp->cn_flags & MAKEENTRY)
 			cache_enter(dvp, *vpp, cnp);
 		return (error);
 	}
 
 	/* not found */
 
 	/* will create a new entry in the directory ? */
 	if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
 	    && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 	return (ENOENT);
 }
 
 #if 0
 struct vop_lookup_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode lookup operation
  */
 static int
 mqfs_lookup(struct vop_cachedlookup_args *ap)
 {
 	int rc;
 
 	rc = mqfs_lookupx(ap);
 	return (rc);
 }
 
 #if 0
 struct vop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * vnode creation operation
  */
 static int
 mqfs_create(struct vop_create_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct mqueue *mq;
 	int error;
 
 	pd = VTON(ap->a_dvp);
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	mq = mqueue_alloc(NULL);
 	if (mq == NULL)
 		return (EAGAIN);
 	sx_xlock(&mqfs->mi_lock);
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 	pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		cnp->cn_cred, ap->a_vap->va_mode);
 	if (pn == NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		error = ENOSPC;
 	} else {
 		mqnode_addref(pn);
 		sx_xunlock(&mqfs->mi_lock);
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 		mqnode_release(pn);
 		if (error)
 			mqfs_destroy(pn);
 		else
 			pn->mn_data = mq;
 	}
 	if (error)
 		mqueue_free(mq);
 	return (error);
 }
 
 /*
  * Remove an entry
  */
 static
 int do_unlink(struct mqfs_node *pn, struct ucred *ucred)
 {
 	struct mqfs_node *parent;
 	struct mqfs_vdata *vd;
 	int error = 0;
 
 	sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
 
 	if (ucred->cr_uid != pn->mn_uid &&
 	    (error = priv_check_cred(ucred, PRIV_MQ_ADMIN)) != 0)
 		error = EACCES;
 	else if (!pn->mn_deleted) {
 		parent = pn->mn_parent;
 		pn->mn_parent = NULL;
 		pn->mn_deleted = 1;
 		LIST_REMOVE(pn, mn_sibling);
 		LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
 			cache_purge(vd->mv_vnode);
 			vhold(vd->mv_vnode);
 			taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
 		}
 		mqnode_release(pn);
 		mqnode_release(parent);
 	} else
 		error = ENOENT;
 	return (error);
 }
 
 #if 0
 struct vop_remove_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * vnode removal operation
  */
 static int
 mqfs_remove(struct vop_remove_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct mqfs_node *pn;
 	int error;
 
 	if (ap->a_vp->v_type == VDIR)
                 return (EPERM);
 	pn = VTON(ap->a_vp);
 	sx_xlock(&mqfs->mi_lock);
 	error = do_unlink(pn, ap->a_cnp->cn_cred);
 	sx_xunlock(&mqfs->mi_lock);
 	return (error);
 }
 
 #if 0
 struct vop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 static int
 mqfs_inactive(struct vop_inactive_args *ap)
 {
 	struct mqfs_node *pn = VTON(ap->a_vp);
 
 	if (pn->mn_deleted)
 		vrecycle(ap->a_vp);
 	return (0);
 }
 
 #if 0
 struct vop_reclaim_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 };
 #endif
 
 static int
 mqfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
 	struct vnode *vp = ap->a_vp;
 	struct mqfs_node *pn;
 	struct mqfs_vdata *vd;
 
 	vd = vp->v_data;
 	pn = vd->mv_node;
 	sx_xlock(&mqfs->mi_lock);
 	vp->v_data = NULL;
 	LIST_REMOVE(vd, mv_link);
 	uma_zfree(mvdata_zone, vd);
 	mqnode_release(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	return (0);
 }
 
 #if 0
 struct vop_open_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 	struct file *a_fp;
 };
 #endif
 
 static int
 mqfs_open(struct vop_open_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_close_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 static int
 mqfs_close(struct vop_close_args *ap)
 {
 	return (0);
 }
 
 #if 0
 struct vop_access_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	accmode_t a_accmode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Verify permissions
  */
 static int
 mqfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr vattr;
 	int error;
 
 	error = VOP_GETATTR(vp, &vattr, ap->a_cred);
 	if (error)
 		return (error);
 	error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid,
 	    ap->a_accmode, ap->a_cred);
 	return (error);
 }
 
 #if 0
 struct vop_getattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Get file attributes
  */
 static int
 mqfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct mqfs_node *pn = VTON(vp);
 	struct vattr *vap = ap->a_vap;
 	int error = 0;
 
 	vap->va_type = vp->v_type;
 	vap->va_mode = pn->mn_mode;
 	vap->va_nlink = 1;
 	vap->va_uid = pn->mn_uid;
 	vap->va_gid = pn->mn_gid;
 	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
 	vap->va_fileid = pn->mn_fileno;
 	vap->va_size = 0;
 	vap->va_blocksize = PAGE_SIZE;
 	vap->va_bytes = vap->va_size = 0;
 	vap->va_atime = pn->mn_atime;
 	vap->va_mtime = pn->mn_mtime;
 	vap->va_ctime = pn->mn_ctime;
 	vap->va_birthtime = pn->mn_birth;
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_rdev = NODEV;
 	vap->va_bytes = 0;
 	vap->va_filerev = 0;
 	return (error);
 }
 
 #if 0
 struct vop_setattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 /*
  * Set attributes
  */
 static int
 mqfs_setattr(struct vop_setattr_args *ap)
 {
 	struct mqfs_node *pn;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct thread *td;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	td = curthread;
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	pn = VTON(vp);
 
 	error = c = 0;
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = pn->mn_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = pn->mn_gid;
 	else
 		gid = vap->va_gid;
 
 	if (uid != pn->mn_uid || gid != pn->mn_gid) {
 		/*
 		 * To modify the ownership of a file, must possess VADMIN
 		 * for that file.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)))
 			return (error);
 
 		/*
 		 * XXXRW: Why is there a privilege check here: shouldn't the
 		 * check in VOP_ACCESS() be enough?  Also, are the group bits
 		 * below definitely right?
 		 */
 		if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
 		    (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
 		    (error = priv_check(td, PRIV_MQ_ADMIN)) != 0)
 			return (error);
 		pn->mn_uid = uid;
 		pn->mn_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if ((ap->a_cred->cr_uid != pn->mn_uid) &&
 		    (error = priv_check(td, PRIV_MQ_ADMIN)))
 			return (error);
 		pn->mn_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		/* See the comment in ufs_vnops::ufs_setattr(). */
 		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td))))
 			return (error);
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			pn->mn_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			pn->mn_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 	if (c) {
 		vfs_timestamp(&pn->mn_ctime);
 	}
 	return (0);
 }
 
 #if 0
 struct vop_read_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int a_ioflag;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Read from a file
  */
 static int
 mqfs_read(struct vop_read_args *ap)
 {
 	char buf[80];
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct mqueue *mq;
 	int len, error;
 
 	if (vp->v_type != VREG)
 		return (EINVAL);
 
 	mq = VTOMQ(vp);
 	snprintf(buf, sizeof(buf),
 		"QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
 		mq->mq_totalbytes,
 		mq->mq_maxmsg,
 		mq->mq_curmsgs,
 		mq->mq_msgsize);
 	buf[sizeof(buf)-1] = '\0';
 	len = strlen(buf);
 	error = uiomove_frombuf(buf, len, uio);
 	return (error);
 }
 
 #if 0
 struct vop_readdir_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	int *a_eofflag;
 	int *a_ncookies;
 	u_long **a_cookies;
 };
 #endif
 
 /*
  * Return directory entries.
  */
 static int
 mqfs_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp;
 	struct mqfs_info *mi;
 	struct mqfs_node *pd;
 	struct mqfs_node *pn;
 	struct dirent entry;
 	struct uio *uio;
 	const void *pr_root;
 	int *tmp_ncookies = NULL;
 	off_t offset;
 	int error, i;
 
 	vp = ap->a_vp;
 	mi = VFSTOMQFS(vp->v_mount);
 	pd = VTON(vp);
 	uio = ap->a_uio;
 
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
         }
 
 	error = 0;
 	offset = 0;
 
 	pr_root = ap->a_cred->cr_prison->pr_root;
 	sx_xlock(&mi->mi_lock);
 
 	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
 		entry.d_reclen = sizeof(entry);
 
 		/*
 		 * Only show names within the same prison root directory
 		 * (or not associated with a prison, e.g. "." and "..").
 		 */
 		if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root)
 			continue;
 		if (!pn->mn_fileno)
 			mqfs_fileno_alloc(mi, pn);
 		entry.d_fileno = pn->mn_fileno;
 		entry.d_off = offset + entry.d_reclen;
 		for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
 			entry.d_name[i] = pn->mn_name[i];
 		entry.d_namlen = i;
 		switch (pn->mn_type) {
 		case mqfstype_root:
 		case mqfstype_dir:
 		case mqfstype_this:
 		case mqfstype_parent:
 			entry.d_type = DT_DIR;
 			break;
 		case mqfstype_file:
 			entry.d_type = DT_REG;
 			break;
 		case mqfstype_symlink:
 			entry.d_type = DT_LNK;
 			break;
 		default:
 			panic("%s has unexpected node type: %d", pn->mn_name,
 				pn->mn_type);
 		}
 		dirent_terminate(&entry);
 		if (entry.d_reclen > uio->uio_resid)
                         break;
 		if (offset >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, &entry, offset);
                         if (error)
                                 break;
                 }
                 offset += entry.d_reclen;
 	}
 	sx_xunlock(&mi->mi_lock);
 
 	uio->uio_offset = offset;
 
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 #ifdef notyet
 
 #if 0
 struct vop_mkdir_args {
 	struct vnode *a_dvp;
 	struvt vnode **a_vpp;
 	struvt componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 /*
  * Create a directory.
  */
 static int
 mqfs_mkdir(struct vop_mkdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct componentname *cnp = ap->a_cnp;
 	struct mqfs_node *pd = VTON(ap->a_dvp);
 	struct mqfs_node *pn;
 	int error;
 
 	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 	sx_xlock(&mqfs->mi_lock);
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", __func__);
 	pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
 		ap->a_vap->cn_cred, ap->a_vap->va_mode);
 	if (pn != NULL)
 		mqnode_addref(pn);
 	sx_xunlock(&mqfs->mi_lock);
 	if (pn == NULL) {
 		error = ENOSPC;
 	} else {
 		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
 		mqnode_release(pn);
 	}
 	return (error);
 }
 
 #if 0
 struct vop_rmdir_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Remove a directory.
  */
 static int
 mqfs_rmdir(struct vop_rmdir_args *ap)
 {
 	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
 	struct mqfs_node *pn = VTON(ap->a_vp);
 	struct mqfs_node *pt;
 
 	if (pn->mn_type != mqfstype_dir)
 		return (ENOTDIR);
 
 	sx_xlock(&mqfs->mi_lock);
 	if (pn->mn_deleted) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOENT);
 	}
 
 	pt = LIST_FIRST(&pn->mn_children);
 	pt = LIST_NEXT(pt, mn_sibling);
 	pt = LIST_NEXT(pt, mn_sibling);
 	if (pt != NULL) {
 		sx_xunlock(&mqfs->mi_lock);
 		return (ENOTEMPTY);
 	}
 	pt = pn->mn_parent;
 	pn->mn_parent = NULL;
 	pn->mn_deleted = 1;
 	LIST_REMOVE(pn, mn_sibling);
 	mqnode_release(pn);
 	mqnode_release(pt);
 	sx_xunlock(&mqfs->mi_lock);
 	cache_purge(ap->a_vp);
 	return (0);
 }
 
 #endif /* notyet */
 
 /*
  * See if this prison root is obsolete, and clean up associated queues if it is.
  */
 static int
 mqfs_prison_remove(void *obj, void *data __unused)
 {
 	const struct prison *pr = obj;
 	struct prison *tpr;
 	struct mqfs_node *pn, *tpn;
 	struct vnode *pr_root;
 
 	pr_root = pr->pr_root;
 	if (pr->pr_parent->pr_root == pr_root)
 		return (0);
 	TAILQ_FOREACH(tpr, &allprison, pr_list) {
 		if (tpr != pr && tpr->pr_root == pr_root)
 			return (0);
 	}
 	/*
 	 * No jails are rooted in this directory anymore,
 	 * so no queues should be either.
 	 */
 	sx_xlock(&mqfs_data.mi_lock);
 	LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children,
 	    mn_sibling, tpn) {
 		if (pn->mn_pr_root == pr_root)
 			(void)do_unlink(pn, curthread->td_ucred);
 	}
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 /*
  * Allocate a message queue
  */
 static struct mqueue *
 mqueue_alloc(const struct mq_attr *attr)
 {
 	struct mqueue *mq;
 
 	if (curmq >= maxmq)
 		return (NULL);
 	mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
 	TAILQ_INIT(&mq->mq_msgq);
 	if (attr != NULL) {
 		mq->mq_maxmsg = attr->mq_maxmsg;
 		mq->mq_msgsize = attr->mq_msgsize;
 	} else {
 		mq->mq_maxmsg = default_maxmsg;
 		mq->mq_msgsize = default_msgsize;
 	}
 	mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF);
 	knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex);
 	knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex);
 	atomic_add_int(&curmq, 1);
 	return (mq);
 }
 
 /*
  * Destroy a message queue
  */
 static void
 mqueue_free(struct mqueue *mq)
 {
 	struct mqueue_msg *msg;
 
 	while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
 		TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
 		free(msg, M_MQUEUEDATA);
 	}
 
 	mtx_destroy(&mq->mq_mutex);
 	seldrain(&mq->mq_rsel);
 	seldrain(&mq->mq_wsel);
 	knlist_destroy(&mq->mq_rsel.si_note);
 	knlist_destroy(&mq->mq_wsel.si_note);
 	uma_zfree(mqueue_zone, mq);
 	atomic_add_int(&curmq, -1);
 }
 
 /*
  * Load a message from user space
  */
 static struct mqueue_msg *
 mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
 {
 	struct mqueue_msg *msg;
 	size_t len;
 	int error;
 
 	len = sizeof(struct mqueue_msg) + msg_size;
 	msg = malloc(len, M_MQUEUEDATA, M_WAITOK);
 	error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
 	    msg_size);
 	if (error) {
 		free(msg, M_MQUEUEDATA);
 		msg = NULL;
 	} else {
 		msg->msg_size = msg_size;
 		msg->msg_prio = msg_prio;
 	}
 	return (msg);
 }
 
 /*
  * Save a message to user space
  */
 static int
 mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
 {
 	int error;
 
 	error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
 		msg->msg_size);
 	if (error == 0 && msg_prio != NULL)
 		error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
 	return (error);
 }
 
 /*
  * Free a message's memory
  */
 static __inline void
 mqueue_freemsg(struct mqueue_msg *msg)
 {
 	free(msg, M_MQUEUEDATA);
 }
 
 /*
  * Send a message. if waitok is false, thread will not be
  * blocked if there is no data in queue, otherwise, absolute
  * time will be checked.
  */
 int
 mqueue_send(struct mqueue *mq, const char *msg_ptr,
 	size_t msg_len, unsigned msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_prio >= MQ_PRIO_MAX)
 		return (EINVAL);
 	if (msg_len > mq->mq_msgsize)
 		return (EMSGSIZE);
 	msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
 	if (msg == NULL)
 		return (EFAULT);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		error = _mqueue_send(mq, msg, -1);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* we allow a null timeout (wait forever) */
 	if (abs_timeout == NULL) {
 		error = _mqueue_send(mq, msg, 0);
 		if (error)
 			goto bad;
 		return (0);
 	}
 
 	/* send it before checking time */
 	error = _mqueue_send(mq, msg, -1);
 	if (error == 0)
 		return (0);
 
 	if (error != EAGAIN)
 		goto bad;
 
 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
 		error = EINVAL;
 		goto bad;
 	}
 	for (;;) {
 		getnanotime(&ts);
 		timespecsub(abs_timeout, &ts, &ts2);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			break;
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_send(mq, msg, tvtohz(&tv));
 		if (error != ETIMEDOUT)
 			break;
 	}
 	if (error == 0)
 		return (0);
 bad:
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to send a message
  */
 static int
 _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
 {	
 	struct mqueue_msg *msg2;
 	int error = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		mq->mq_senders++;
 		error = msleep(&mq->mq_senders, &mq->mq_mutex,
 			    PCATCH, "mqsend", timo);
 		mq->mq_senders--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	if (mq->mq_curmsgs >= mq->mq_maxmsg) {
 		mtx_unlock(&mq->mq_mutex);
 		return (error);
 	}
 	error = 0;
 	if (TAILQ_EMPTY(&mq->mq_msgq)) {
 		TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
 	} else {
 		if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
 			TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
 		} else {
 			TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
 				if (msg2->msg_prio < msg->msg_prio)
 					break;
 			}
 			TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
 		}
 	}
 	mq->mq_curmsgs++;
 	mq->mq_totalbytes += msg->msg_size;
 	if (mq->mq_receivers)
 		wakeup_one(&mq->mq_receivers);
 	else if (mq->mq_notifier != NULL)
 		mqueue_send_notification(mq);
 	if (mq->mq_flags & MQ_RSEL) {
 		mq->mq_flags &= ~MQ_RSEL;
 		selwakeup(&mq->mq_rsel);
 	}
 	KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
 	mtx_unlock(&mq->mq_mutex);
 	return (0);
 }
 
 /*
  * Send realtime a signal to process which registered itself
  * successfully by mq_notify.
  */
 static void
 mqueue_send_notification(struct mqueue *mq)
 {
 	struct mqueue_notifier *nt;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	nt = mq->mq_notifier;
 	if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
 		p = nt->nt_proc;
 		error = sigev_findtd(p, &nt->nt_sigev, &td);
 		if (error) {
 			mq->mq_notifier = NULL;
 			return;
 		}
 		if (!KSI_ONQ(&nt->nt_ksi)) {
 			ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev);
 			tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi);
 		}
 		PROC_UNLOCK(p);
 	}
 	mq->mq_notifier = NULL;
 }
 
 /*
  * Get a message. if waitok is false, thread will not be
  * blocked if there is no data in queue, otherwise, absolute
  * time will be checked.
  */
 int
 mqueue_receive(struct mqueue *mq, char *msg_ptr,
 	size_t msg_len, unsigned *msg_prio, int waitok,
 	const struct timespec *abs_timeout)
 {
 	struct mqueue_msg *msg;
 	struct timespec ts, ts2;
 	struct timeval tv;
 	int error;
 
 	if (msg_len < mq->mq_msgsize)
 		return (EMSGSIZE);
 
 	/* O_NONBLOCK case */
 	if (!waitok) {
 		error = _mqueue_recv(mq, &msg, -1);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* we allow a null timeout (wait forever). */
 	if (abs_timeout == NULL) {
 		error = _mqueue_recv(mq, &msg, 0);
 		if (error)
 			return (error);
 		goto received;
 	}
 
 	/* try to get a message before checking time */
 	error = _mqueue_recv(mq, &msg, -1);
 	if (error == 0)
 		goto received;
 
 	if (error != EAGAIN)
 		return (error);
 
 	if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
 		error = EINVAL;
 		return (error);
 	}
 
 	for (;;) {
 		getnanotime(&ts);
 		timespecsub(abs_timeout, &ts, &ts2);
 		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
 			error = ETIMEDOUT;
 			return (error);
 		}
 		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
 		error = _mqueue_recv(mq, &msg, tvtohz(&tv));
 		if (error == 0)
 			break;
 		if (error != ETIMEDOUT)
 			return (error);
 	}
 
 received:
 	error = mqueue_savemsg(msg, msg_ptr, msg_prio);
 	if (error == 0) {
 		curthread->td_retval[0] = msg->msg_size;
 		curthread->td_retval[1] = 0;
 	}
 	mqueue_freemsg(msg);
 	return (error);
 }
 
 /*
  * Common routine to receive a message
  */
 static int
 _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
 {	
 	int error = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
 		if (timo < 0) {
 			mtx_unlock(&mq->mq_mutex);
 			return (EAGAIN);
 		}
 		mq->mq_receivers++;
 		error = msleep(&mq->mq_receivers, &mq->mq_mutex,
 			    PCATCH, "mqrecv", timo);
 		mq->mq_receivers--;
 		if (error == EAGAIN)
 			error = ETIMEDOUT;
 	}
 	if (*msg != NULL) {
 		error = 0;
 		TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
 		mq->mq_curmsgs--;
 		mq->mq_totalbytes -= (*msg)->msg_size;
 		if (mq->mq_senders)
 			wakeup_one(&mq->mq_senders);
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
 	}
 	if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
 	    !TAILQ_EMPTY(&mq->mq_msgq)) {
 		mqueue_send_notification(mq);
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (error);
 }
 
 static __inline struct mqueue_notifier *
 notifier_alloc(void)
 {
 	return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
 }
 
 static __inline void
 notifier_free(struct mqueue_notifier *p)
 {
 	uma_zfree(mqnoti_zone, p);
 }
 
 static struct mqueue_notifier *
 notifier_search(struct proc *p, int fd)
 {
 	struct mqueue_notifier *nt;
 
 	LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
 		if (nt->nt_ksi.ksi_mqd == fd)
 			break;
 	}
 	return (nt);
 }
 
 static __inline void
 notifier_insert(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
 }
 
 static __inline void
 notifier_delete(struct proc *p, struct mqueue_notifier *nt)
 {
 	LIST_REMOVE(nt, nt_link);
 	notifier_free(nt);
 }
 
 static void
 notifier_remove(struct proc *p, struct mqueue *mq, int fd)
 {
 	struct mqueue_notifier *nt;
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	PROC_LOCK(p);
 	nt = notifier_search(p, fd);
 	if (nt != NULL) {
 		if (mq->mq_notifier == nt)
 			mq->mq_notifier = NULL;
 		sigqueue_take(&nt->nt_ksi);
 		notifier_delete(p, nt);
 	}
 	PROC_UNLOCK(p);
 }
 
 static int
 kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode,
     const struct mq_attr *attr)
 {
 	char path[MQFS_NAMELEN + 1];
 	struct mqfs_node *pn;
 	struct pwddesc *pdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int fd, error, len, cmode;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 
 	pdp = td->td_proc->p_pd;
 	cmode = (((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT);
 	mq = NULL;
 	if ((flags & O_CREAT) != 0 && attr != NULL) {
 		if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg)
 			return (EINVAL);
 		if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize)
 			return (EINVAL);
 	}
 
 	error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	/*
 	 * The first character of name must be a slash  (/) character
 	 * and the remaining characters of name cannot include any slash
 	 * characters. 
 	 */
 	len = strlen(path);
 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 	/*
 	 * "." and ".." are magic directories, populated on the fly, and cannot
 	 * be opened as queues.
 	 */
 	if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0)
 		return (EINVAL);
 	AUDIT_ARG_UPATH1_CANON(path);
 
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error)
 		return (error);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn == NULL) {
 		if (!(flags & O_CREAT)) {
 			error = ENOENT;
 		} else {
 			mq = mqueue_alloc(attr);
 			if (mq == NULL) {
 				error = ENFILE;
 			} else {
 				pn = mqfs_create_file(mqfs_data.mi_root,
 				         path + 1, len - 1, td->td_ucred,
 					 cmode);
 				if (pn == NULL) {
 					error = ENOSPC;
 					mqueue_free(mq);
 				}
 			}
 		}
 
 		if (error == 0) {
 			pn->mn_data = mq;
 		}
 	} else {
 		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
 			error = EEXIST;
 		} else {
 			accmode_t accmode = 0;
 
 			if (flags & FREAD)
 				accmode |= VREAD;
 			if (flags & FWRITE)
 				accmode |= VWRITE;
 			error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
 			    pn->mn_gid, accmode, td->td_ucred);
 		}
 	}
 
 	if (error) {
 		sx_xunlock(&mqfs_data.mi_lock);
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	mqnode_addref(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 
 	finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
 	    &mqueueops);
 
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 	return (0);
 }
 
 /*
  * Syscall to open a message queue.
  */
 int
 sys_kmq_open(struct thread *td, struct kmq_open_args *uap)
 {
 	struct mq_attr attr;
 	int flags, error;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
 		return (EINVAL);
 	flags = FFLAGS(uap->flags);
 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error)
 			return (error);
 	}
 	return (kern_kmq_open(td, uap->path, flags, uap->mode,
 	    uap->attr != NULL ? &attr : NULL));
 }
 
 /*
  * Syscall to unlink a message queue.
  */
 int
 sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
 {
 	char path[MQFS_NAMELEN+1];
 	struct mqfs_node *pn;
 	int error, len;
 
 	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
         if (error)
 		return (error);
 
 	len = strlen(path);
 	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
 		return (EINVAL);
 	if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0)
 		return (EINVAL);
 	AUDIT_ARG_UPATH1_CANON(path);
 
 	sx_xlock(&mqfs_data.mi_lock);
 	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
 	if (pn != NULL)
 		error = do_unlink(pn, td->td_ucred);
 	else
 		error = ENOENT;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **);
 
 /*
  * Get message queue by giving file slot
  */
 static int
 _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func,
        struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
 {
 	struct mqfs_node *pn;
 	int error;
 
 	error = func(td, fd, rightsp, fpp);
 	if (error)
 		return (error);
 	if (&mqueueops != (*fpp)->f_ops) {
 		fdrop(*fpp, td);
 		return (EBADF);
 	}
 	pn = (*fpp)->f_data;
 	if (ppn)
 		*ppn = pn;
 	if (pmq)
 		*pmq = pn->mn_data;
 	return (0);
 }
 
 static __inline int
 getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
 	struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_event_rights, fget,
 	    fpp, ppn, pmq);
 }
 
 static __inline int
 getmq_read(struct thread *td, int fd, struct file **fpp,
 	 struct mqfs_node **ppn, struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_read_rights, fget_read,
 	    fpp, ppn, pmq);
 }
 
 static __inline int
 getmq_write(struct thread *td, int fd, struct file **fpp,
 	struct mqfs_node **ppn, struct mqueue **pmq)
 {
 
 	return _getmq(td, fd, &cap_write_rights, fget_write,
 	    fpp, ppn, pmq);
 }
 
 static int
 kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
     struct mq_attr *oattr)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	u_int oflag, flag;
 	int error;
 
 	AUDIT_ARG_FD(mqd);
 	if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
 		return (EINVAL);
 	error = getmq(td, mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	oattr->mq_maxmsg  = mq->mq_maxmsg;
 	oattr->mq_msgsize = mq->mq_msgsize;
 	oattr->mq_curmsgs = mq->mq_curmsgs;
 	if (attr != NULL) {
 		do {
 			oflag = flag = fp->f_flag;
 			flag &= ~O_NONBLOCK;
 			flag |= (attr->mq_flags & O_NONBLOCK);
 		} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
 	} else
 		oflag = fp->f_flag;
 	oattr->mq_flags = (O_NONBLOCK & oflag);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
 {
 	struct mq_attr attr, oattr;
 	int error;
 
 	if (uap->attr != NULL) {
 		error = copyin(uap->attr, &attr, sizeof(attr));
 		if (error != 0)
 			return (error);
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
 	if (error == 0 && uap->oattr != NULL) {
 		bzero(oattr.__reserved, sizeof(oattr.__reserved));
 		error = copyout(&oattr, uap->oattr, sizeof(oattr));
 	}
 	return (error);
 }
 
 int
 sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec *abs_timeout, ets;
 	int error;
 	int waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
 		if (error != 0)
 			goto out;
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec *abs_timeout, ets;
 	int error, waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
 		if (error != 0)
 			goto out;
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 static int
 kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	struct mqueue *mq;
 	struct file *fp, *fp2;
 	struct mqueue_notifier *nt, *newnt = NULL;
 	int error;
 
 	AUDIT_ARG_FD(mqd);
 	if (sigev != NULL) {
 		if (sigev->sigev_notify != SIGEV_SIGNAL &&
 		    sigev->sigev_notify != SIGEV_THREAD_ID &&
 		    sigev->sigev_notify != SIGEV_NONE)
 			return (EINVAL);
 		if ((sigev->sigev_notify == SIGEV_SIGNAL ||
 		    sigev->sigev_notify == SIGEV_THREAD_ID) &&
 		    !_SIG_VALID(sigev->sigev_signo))
 			return (EINVAL);
 	}
 	p = td->td_proc;
 	fdp = td->td_proc->p_fd;
 	error = getmq(td, mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 again:
 	FILEDESC_SLOCK(fdp);
 	fp2 = fget_locked(fdp, mqd);
 	if (fp2 == NULL) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
 #ifdef CAPABILITIES
 	error = cap_check(cap_rights(fdp, mqd), &cap_event_rights);
 	if (error) {
 		FILEDESC_SUNLOCK(fdp);
 		goto out;
 	}
 #endif
 	if (fp2 != fp) {
 		FILEDESC_SUNLOCK(fdp);
 		error = EBADF;
 		goto out;
 	}
 	mtx_lock(&mq->mq_mutex);
 	FILEDESC_SUNLOCK(fdp);
 	if (sigev != NULL) {
 		if (mq->mq_notifier != NULL) {
 			error = EBUSY;
 		} else {
 			PROC_LOCK(p);
 			nt = notifier_search(p, mqd);
 			if (nt == NULL) {
 				if (newnt == NULL) {
 					PROC_UNLOCK(p);
 					mtx_unlock(&mq->mq_mutex);
 					newnt = notifier_alloc();
 					goto again;
 				}
 			}
 
 			if (nt != NULL) {
 				sigqueue_take(&nt->nt_ksi);
 				if (newnt != NULL) {
 					notifier_free(newnt);
 					newnt = NULL;
 				}
 			} else {
 				nt = newnt;
 				newnt = NULL;
 				ksiginfo_init(&nt->nt_ksi);
 				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
 				nt->nt_ksi.ksi_code = SI_MESGQ;
 				nt->nt_proc = p;
 				nt->nt_ksi.ksi_mqd = mqd;
 				notifier_insert(p, nt);
 			}
 			nt->nt_sigev = *sigev;
 			mq->mq_notifier = nt;
 			PROC_UNLOCK(p);
 			/*
 			 * if there is no receivers and message queue
 			 * is not empty, we should send notification
 			 * as soon as possible.
 			 */
 			if (mq->mq_receivers == 0 &&
 			    !TAILQ_EMPTY(&mq->mq_msgq))
 				mqueue_send_notification(mq);
 		}
 	} else {
 		notifier_remove(p, mq, mqd);
 	}
 	mtx_unlock(&mq->mq_mutex);
 
 out:
 	fdrop(fp, td);
 	if (newnt != NULL)
 		notifier_free(newnt);
 	return (error);
 }
 
 int
 sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
 {
 	struct sigevent ev, *evp;
 	int error;
 
 	if (uap->sigev == NULL) {
 		evp = NULL;
 	} else {
 		error = copyin(uap->sigev, &ev, sizeof(ev));
 		if (error != 0)
 			return (error);
 		evp = &ev;
 	}
 	return (kern_kmq_notify(td, uap->mqd, evp));
 }
 
 static void
 mqueue_fdclose(struct thread *td, int fd, struct file *fp)
 {
 	struct mqueue *mq;
 #ifdef INVARIANTS
 	struct filedesc *fdp;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_LOCK_ASSERT(fdp);
 #endif
 
 	if (fp->f_ops == &mqueueops) {
 		mq = FPTOMQ(fp);
 		mtx_lock(&mq->mq_mutex);
 		notifier_remove(td->td_proc, mq, fd);
 
 		/* have to wakeup thread in same process */
 		if (mq->mq_flags & MQ_RSEL) {
 			mq->mq_flags &= ~MQ_RSEL;
 			selwakeup(&mq->mq_rsel);
 		}
 		if (mq->mq_flags & MQ_WSEL) {
 			mq->mq_flags &= ~MQ_WSEL;
 			selwakeup(&mq->mq_wsel);
 		}
 		mtx_unlock(&mq->mq_mutex);
 	}
 }
 
 static void
 mq_proc_exit(void *arg __unused, struct proc *p)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct mqueue *mq;
 	int i;
 
 	fdp = p->p_fd;
 	FILEDESC_SLOCK(fdp);
 	for (i = 0; i < fdp->fd_nfiles; ++i) {
 		fp = fget_locked(fdp, i);
 		if (fp != NULL && fp->f_ops == &mqueueops) {
 			mq = FPTOMQ(fp);
 			mtx_lock(&mq->mq_mutex);
 			notifier_remove(p, FPTOMQ(fp), i);
 			mtx_unlock(&mq->mq_mutex);
 		}
 	}
 	FILEDESC_SUNLOCK(fdp);
 	KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
 }
 
 static int
 mqf_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 	int revents = 0;
 
 	mtx_lock(&mq->mq_mutex);
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (mq->mq_curmsgs) {
 			revents |= events & (POLLIN | POLLRDNORM);
 		} else {
 			mq->mq_flags |= MQ_RSEL;
 			selrecord(td, &mq->mq_rsel);
  		}
 	}
 	if (events & POLLOUT) {
 		if (mq->mq_curmsgs < mq->mq_maxmsg)
 			revents |= POLLOUT;
 		else {
 			mq->mq_flags |= MQ_WSEL;
 			selrecord(td, &mq->mq_wsel);
 		}
 	}
 	mtx_unlock(&mq->mq_mutex);
 	return (revents);
 }
 
 static int
 mqf_close(struct file *fp, struct thread *td)
 {
 	struct mqfs_node *pn;
 
 	fp->f_ops = &badfileops;
 	pn = fp->f_data;
 	fp->f_data = NULL;
 	sx_xlock(&mqfs_data.mi_lock);
 	mqnode_release(pn);
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 static int
-mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
-	struct thread *td)
+mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
 {
 	struct mqfs_node *pn = fp->f_data;
 
 	bzero(st, sizeof *st);
 	sx_xlock(&mqfs_data.mi_lock);
 	st->st_atim = pn->mn_atime;
 	st->st_mtim = pn->mn_mtime;
 	st->st_ctim = pn->mn_ctime;
 	st->st_birthtim = pn->mn_birth;
 	st->st_uid = pn->mn_uid;
 	st->st_gid = pn->mn_gid;
 	st->st_mode = S_IFIFO | pn->mn_mode;
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (0);
 }
 
 static int
 mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mqfs_node *pn;
 	int error;
 
 	error = 0;
 	pn = fp->f_data;
 	sx_xlock(&mqfs_data.mi_lock);
 	error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN,
 	    active_cred);
 	if (error != 0)
 		goto out;
 	pn->mn_mode = mode & ACCESSPERMS;
 out:
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 static int
 mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mqfs_node *pn;
 	int error;
 
 	error = 0;
 	pn = fp->f_data;
 	sx_xlock(&mqfs_data.mi_lock);
 	if (uid == (uid_t)-1)
 		uid = pn->mn_uid;
 	if (gid == (gid_t)-1)
 		gid = pn->mn_gid;
 	if (((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
 	    (gid != pn->mn_gid && !groupmember(gid, active_cred))) &&
 	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN)))
 		goto out;
 	pn->mn_uid = uid;
 	pn->mn_gid = gid;
 out:
 	sx_xunlock(&mqfs_data.mi_lock);
 	return (error);
 }
 
 static int
 mqf_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(fp);
 	int error = 0;
 
 	if (kn->kn_filter == EVFILT_READ) {
 		kn->kn_fop = &mq_rfiltops;
 		knlist_add(&mq->mq_rsel.si_note, kn, 0);
 	} else if (kn->kn_filter == EVFILT_WRITE) {
 		kn->kn_fop = &mq_wfiltops;
 		knlist_add(&mq->mq_wsel.si_note, kn, 0);
 	} else
 		error = EINVAL;
 	return (error);
 }
 
 static void
 filt_mqdetach(struct knote *kn)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	if (kn->kn_filter == EVFILT_READ)
 		knlist_remove(&mq->mq_rsel.si_note, kn, 0);
 	else if (kn->kn_filter == EVFILT_WRITE)
 		knlist_remove(&mq->mq_wsel.si_note, kn, 0);
 	else
 		panic("filt_mqdetach");
 }
 
 static int
 filt_mqread(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs != 0);
 }
 
 static int
 filt_mqwrite(struct knote *kn, long hint)
 {
 	struct mqueue *mq = FPTOMQ(kn->kn_fp);
 
 	mtx_assert(&mq->mq_mutex, MA_OWNED);
 	return (mq->mq_curmsgs < mq->mq_maxmsg);
 }
 
 static int
 mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	kif->kf_type = KF_TYPE_MQUEUE;
 	return (0);
 }
 
 static struct fileops mqueueops = {
 	.fo_read		= invfo_rdwr,
 	.fo_write		= invfo_rdwr,
 	.fo_truncate		= invfo_truncate,
 	.fo_ioctl		= invfo_ioctl,
 	.fo_poll		= mqf_poll,
 	.fo_kqfilter		= mqf_kqfilter,
 	.fo_stat		= mqf_stat,
 	.fo_close		= mqf_close,
 	.fo_chmod		= mqf_chmod,
 	.fo_chown		= mqf_chown,
 	.fo_sendfile		= invfo_sendfile,
 	.fo_fill_kinfo		= mqf_fill_kinfo,
 	.fo_flags		= DFLAG_PASSABLE,
 };
 
 static struct vop_vector mqfs_vnodeops = {
 	.vop_default 		= &default_vnodeops,
 	.vop_access		= mqfs_access,
 	.vop_cachedlookup	= mqfs_lookup,
 	.vop_lookup		= vfs_cache_lookup,
 	.vop_reclaim		= mqfs_reclaim,
 	.vop_create		= mqfs_create,
 	.vop_remove		= mqfs_remove,
 	.vop_inactive		= mqfs_inactive,
 	.vop_open		= mqfs_open,
 	.vop_close		= mqfs_close,
 	.vop_getattr		= mqfs_getattr,
 	.vop_setattr		= mqfs_setattr,
 	.vop_read		= mqfs_read,
 	.vop_write		= VOP_EOPNOTSUPP,
 	.vop_readdir		= mqfs_readdir,
 	.vop_mkdir		= VOP_EOPNOTSUPP,
 	.vop_rmdir		= VOP_EOPNOTSUPP
 };
 VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops);
 
 static struct vfsops mqfs_vfsops = {
 	.vfs_init 		= mqfs_init,
 	.vfs_uninit		= mqfs_uninit,
 	.vfs_mount		= mqfs_mount,
 	.vfs_unmount		= mqfs_unmount,
 	.vfs_root		= mqfs_root,
 	.vfs_statfs		= mqfs_statfs,
 };
 
 static struct vfsconf mqueuefs_vfsconf = {
 	.vfc_version = VFS_VERSION,
 	.vfc_name = "mqueuefs",
 	.vfc_vfsops = &mqfs_vfsops,
 	.vfc_typenum = -1,
 	.vfc_flags = VFCF_SYNTHETIC
 };
 
 static struct syscall_helper_data mq_syscalls[] = {
 	SYSCALL_INIT_HELPER(kmq_open),
 	SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED),
 	SYSCALL_INIT_HELPER(kmq_unlink),
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 static void
 mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
 {
 
 	to->mq_flags = from->mq_flags;
 	to->mq_maxmsg = from->mq_maxmsg;
 	to->mq_msgsize = from->mq_msgsize;
 	to->mq_curmsgs = from->mq_curmsgs;
 }
 
 static void
 mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
 {
 
 	to->mq_flags = from->mq_flags;
 	to->mq_maxmsg = from->mq_maxmsg;
 	to->mq_msgsize = from->mq_msgsize;
 	to->mq_curmsgs = from->mq_curmsgs;
 }
 
 int
 freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
 {
 	struct mq_attr attr;
 	struct mq_attr32 attr32;
 	int flags, error;
 
 	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
 		return (EINVAL);
 	flags = FFLAGS(uap->flags);
 	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
 		error = copyin(uap->attr, &attr32, sizeof(attr32));
 		if (error)
 			return (error);
 		mq_attr_from32(&attr32, &attr);
 	}
 	return (kern_kmq_open(td, uap->path, flags, uap->mode,
 	    uap->attr != NULL ? &attr : NULL));
 }
 
 int
 freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap)
 {
 	struct mq_attr attr, oattr;
 	struct mq_attr32 attr32, oattr32;
 	int error;
 
 	if (uap->attr != NULL) {
 		error = copyin(uap->attr, &attr32, sizeof(attr32));
 		if (error != 0)
 			return (error);
 		mq_attr_from32(&attr32, &attr);
 	}
 	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
 	    &oattr);
 	if (error == 0 && uap->oattr != NULL) {
 		mq_attr_to32(&oattr, &oattr32);
 		bzero(oattr32.__reserved, sizeof(oattr32.__reserved));
 		error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
 	}
 	return (error);
 }
 
 int
 freebsd32_kmq_timedsend(struct thread *td,
     struct freebsd32_kmq_timedsend_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec32 ets32;
 	struct timespec *abs_timeout, ets;
 	int error;
 	int waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
 		if (error != 0)
 			goto out;
 		CP(ets32, ets, tv_sec);
 		CP(ets32, ets, tv_nsec);
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 freebsd32_kmq_timedreceive(struct thread *td,
     struct freebsd32_kmq_timedreceive_args *uap)
 {
 	struct mqueue *mq;
 	struct file *fp;
 	struct timespec32 ets32;
 	struct timespec *abs_timeout, ets;
 	int error, waitok;
 
 	AUDIT_ARG_FD(uap->mqd);
 	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
 	if (error)
 		return (error);
 	if (uap->abs_timeout != NULL) {
 		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
 		if (error != 0)
 			goto out;
 		CP(ets32, ets, tv_sec);
 		CP(ets32, ets, tv_nsec);
 		abs_timeout = &ets;
 	} else
 		abs_timeout = NULL;
 	waitok = !(fp->f_flag & O_NONBLOCK);
 	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
 		uap->msg_prio, waitok, abs_timeout);
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap)
 {
 	struct sigevent ev, *evp;
 	struct sigevent32 ev32;
 	int error;
 
 	if (uap->sigev == NULL) {
 		evp = NULL;
 	} else {
 		error = copyin(uap->sigev, &ev32, sizeof(ev32));
 		if (error != 0)
 			return (error);
 		error = convert_sigevent32(&ev32, &ev);
 		if (error != 0)
 			return (error);
 		evp = &ev;
 	}
 	return (kern_kmq_notify(td, uap->mqd, evp));
 }
 
 static struct syscall_helper_data mq32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED),
 	SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
 	SYSCALL_INIT_LAST
 };
 #endif
 
 static int
 mqinit(void)
 {
 	int error;
 
 	error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #endif
 	return (0);
 }
 
 static int
 mqunload(void)
 {
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(mq32_syscalls);
 #endif
 	syscall_helper_unregister(mq_syscalls);
 	return (0);
 }
 
 static int
 mq_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	error = vfs_modevent(module, cmd, arg);
 	if (error != 0)
 		return (error);
 
 	switch (cmd) {
 	case MOD_LOAD:
 		error = mqinit();
 		if (error != 0)
 			mqunload();
 		break;
 	case MOD_UNLOAD:
 		error = mqunload();
 		break;
 	default:
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t mqueuefs_mod = {
 	"mqueuefs",
 	mq_modload,
 	&mqueuefs_vfsconf
 };
 DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
 MODULE_VERSION(mqueuefs, 1);
diff --git a/sys/kern/uipc_sem.c b/sys/kern/uipc_sem.c
index db1d84696df0..e7c7a04f5c54 100644
--- a/sys/kern/uipc_sem.c
+++ b/sys/kern/uipc_sem.c
@@ -1,1111 +1,1110 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org>
  * Copyright (c) 2003-2005 SPARTA, Inc.
  * Copyright (c) 2005, 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by Network
  * Associates Laboratories, the Security Research Division of Network
  * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
  * as part of the DARPA CHATS research program.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_posix.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/ksem.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/_semaphore.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support");
 /*
  * TODO
  *
  * - Resource limits?
  * - Replace global sem_lock with mtx_pool locks?
  * - Add a MAC check_create() hook for creating new named semaphores.
  */
 
 #ifndef SEM_MAX
 #define	SEM_MAX	30
 #endif
 
 #ifdef SEM_DEBUG
 #define	DP(x)	printf x
 #else
 #define	DP(x)
 #endif
 
 struct ksem_mapping {
 	char		*km_path;
 	Fnv32_t		km_fnv;
 	struct ksem	*km_ksem;
 	LIST_ENTRY(ksem_mapping) km_link;
 };
 
 static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor");
 static LIST_HEAD(, ksem_mapping) *ksem_dictionary;
 static struct sx ksem_dict_lock;
 static struct mtx ksem_count_lock;
 static struct mtx sem_lock;
 static u_long ksem_hash;
 static int ksem_dead;
 
 #define	KSEM_HASH(fnv)	(&ksem_dictionary[(fnv) & ksem_hash])
 
 static int nsems = 0;
 SYSCTL_DECL(_p1003_1b);
 SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0,
     "Number of active kernel POSIX semaphores");
 
 static int	kern_sem_wait(struct thread *td, semid_t id, int tryflag,
 		    struct timespec *abstime);
 static int	ksem_access(struct ksem *ks, struct ucred *ucred);
 static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode,
 		    unsigned int value);
 static int	ksem_create(struct thread *td, const char *path,
 		    semid_t *semidp, mode_t mode, unsigned int value,
 		    int flags, int compat32);
 static void	ksem_drop(struct ksem *ks);
 static int	ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
     struct file **fpp);
 static struct ksem *ksem_hold(struct ksem *ks);
 static void	ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks);
 static struct ksem *ksem_lookup(char *path, Fnv32_t fnv);
 static void	ksem_module_destroy(void);
 static int	ksem_module_init(void);
 static int	ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
 static int	sem_modload(struct module *module, int cmd, void *arg);
 
 static fo_stat_t	ksem_stat;
 static fo_close_t	ksem_closef;
 static fo_chmod_t	ksem_chmod;
 static fo_chown_t	ksem_chown;
 static fo_fill_kinfo_t	ksem_fill_kinfo;
 
 /* File descriptor operations. */
 static struct fileops ksem_ops = {
 	.fo_read = invfo_rdwr,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = invfo_ioctl,
 	.fo_poll = invfo_poll,
 	.fo_kqfilter = invfo_kqfilter,
 	.fo_stat = ksem_stat,
 	.fo_close = ksem_closef,
 	.fo_chmod = ksem_chmod,
 	.fo_chown = ksem_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = ksem_fill_kinfo,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 FEATURE(posix_sem, "POSIX semaphores");
 
 static int
-ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
-    struct thread *td)
+ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
 	struct ksem *ks;
 #ifdef MAC
 	int error;
 #endif
 
 	ks = fp->f_data;
 
 #ifdef MAC
 	error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks);
 	if (error)
 		return (error);
 #endif
 
 	/*
 	 * Attempt to return sanish values for fstat() on a semaphore
 	 * file descriptor.
 	 */
 	bzero(sb, sizeof(*sb));
 
 	mtx_lock(&sem_lock);
 	sb->st_atim = ks->ks_atime;
 	sb->st_ctim = ks->ks_ctime;
 	sb->st_mtim = ks->ks_mtime;
 	sb->st_birthtim = ks->ks_birthtime;
 	sb->st_uid = ks->ks_uid;
 	sb->st_gid = ks->ks_gid;
 	sb->st_mode = S_IFREG | ks->ks_mode;		/* XXX */
 	mtx_unlock(&sem_lock);
 
 	return (0);
 }
 
 static int
 ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks;
 	int error;
 
 	error = 0;
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_setmode(active_cred, ks, mode);
 	if (error != 0)
 		goto out;
 #endif
 	error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN,
 	    active_cred);
 	if (error != 0)
 		goto out;
 	ks->ks_mode = mode & ACCESSPERMS;
 out:
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 static int
 ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct ksem *ks;
 	int error;
 
 	error = 0;
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_setowner(active_cred, ks, uid, gid);
 	if (error != 0)
 		goto out;
 #endif
 	if (uid == (uid_t)-1)
 		uid = ks->ks_uid;
 	if (gid == (gid_t)-1)
                  gid = ks->ks_gid;
 	if (((uid != ks->ks_uid && uid != active_cred->cr_uid) ||
 	    (gid != ks->ks_gid && !groupmember(gid, active_cred))) &&
 	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN)))
 		goto out;
 	ks->ks_uid = uid;
 	ks->ks_gid = gid;
 out:
 	mtx_unlock(&sem_lock);
 	return (error);
 }
 
 static int
 ksem_closef(struct file *fp, struct thread *td)
 {
 	struct ksem *ks;
 
 	ks = fp->f_data;
 	fp->f_data = NULL;
 	ksem_drop(ks);
 
 	return (0);
 }
 
 static int
 ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	const char *path, *pr_path;
 	struct ksem *ks;
 	size_t pr_pathlen;
 
 	kif->kf_type = KF_TYPE_SEM;
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 	kif->kf_un.kf_sem.kf_sem_value = ks->ks_value;
 	kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode;	/* XXX */
 	mtx_unlock(&sem_lock);
 	if (ks->ks_path != NULL) {
 		sx_slock(&ksem_dict_lock);
 		if (ks->ks_path != NULL) {
 			path = ks->ks_path;
 			pr_path = curthread->td_ucred->cr_prison->pr_path;
 			if (strcmp(pr_path, "/") != 0) {
 				/* Return the jail-rooted pathname. */
 				pr_pathlen = strlen(pr_path);
 				if (strncmp(path, pr_path, pr_pathlen) == 0 &&
 				    path[pr_pathlen] == '/')
 					path += pr_pathlen;
 			}
 			strlcpy(kif->kf_path, path, sizeof(kif->kf_path));
 		}
 		sx_sunlock(&ksem_dict_lock);
 	}
 	return (0);
 }
 
 /*
  * ksem object management including creation and reference counting
  * routines.
  */
 static struct ksem *
 ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value)
 {
 	struct ksem *ks;
 
 	mtx_lock(&ksem_count_lock);
 	if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) {
 		mtx_unlock(&ksem_count_lock);
 		return (NULL);
 	}
 	nsems++;
 	mtx_unlock(&ksem_count_lock);
 	ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO);
 	ks->ks_uid = ucred->cr_uid;
 	ks->ks_gid = ucred->cr_gid;
 	ks->ks_mode = mode;
 	ks->ks_value = value;
 	cv_init(&ks->ks_cv, "ksem");
 	vfs_timestamp(&ks->ks_birthtime);
 	ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime;
 	refcount_init(&ks->ks_ref, 1);
 #ifdef MAC
 	mac_posixsem_init(ks);
 	mac_posixsem_create(ucred, ks);
 #endif
 
 	return (ks);
 }
 
 static struct ksem *
 ksem_hold(struct ksem *ks)
 {
 
 	refcount_acquire(&ks->ks_ref);
 	return (ks);
 }
 
 static void
 ksem_drop(struct ksem *ks)
 {
 
 	if (refcount_release(&ks->ks_ref)) {
 #ifdef MAC
 		mac_posixsem_destroy(ks);
 #endif
 		cv_destroy(&ks->ks_cv);
 		free(ks, M_KSEM);
 		mtx_lock(&ksem_count_lock);
 		nsems--;
 		mtx_unlock(&ksem_count_lock);
 	}
 }
 
 /*
  * Determine if the credentials have sufficient permissions for read
  * and write access.
  */
 static int
 ksem_access(struct ksem *ks, struct ucred *ucred)
 {
 	int error;
 
 	error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
 	    VREAD | VWRITE, ucred);
 	if (error)
 		error = priv_check_cred(ucred, PRIV_SEM_WRITE);
 	return (error);
 }
 
 /*
  * Dictionary management.  We maintain an in-kernel dictionary to map
  * paths to semaphore objects.  We use the FNV hash on the path to
  * store the mappings in a hash table.
  */
 static struct ksem *
 ksem_lookup(char *path, Fnv32_t fnv)
 {
 	struct ksem_mapping *map;
 
 	LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
 		if (map->km_fnv != fnv)
 			continue;
 		if (strcmp(map->km_path, path) == 0)
 			return (map->km_ksem);
 	}
 
 	return (NULL);
 }
 
 static void
 ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks)
 {
 	struct ksem_mapping *map;
 
 	map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK);
 	map->km_path = path;
 	map->km_fnv = fnv;
 	map->km_ksem = ksem_hold(ks);
 	ks->ks_path = path;
 	LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link);
 }
 
 static int
 ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 {
 	struct ksem_mapping *map;
 	int error;
 
 	LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
 		if (map->km_fnv != fnv)
 			continue;
 		if (strcmp(map->km_path, path) == 0) {
 #ifdef MAC
 			error = mac_posixsem_check_unlink(ucred, map->km_ksem);
 			if (error)
 				return (error);
 #endif
 			error = ksem_access(map->km_ksem, ucred);
 			if (error)
 				return (error);
 			map->km_ksem->ks_path = NULL;
 			LIST_REMOVE(map, km_link);
 			ksem_drop(map->km_ksem);
 			free(map->km_path, M_KSEM);
 			free(map, M_KSEM);
 			return (0);
 		}
 	}
 
 	return (ENOENT);
 }
 
 static int
 ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd,
     int compat32)
 {
 	semid_t semid;
 #ifdef COMPAT_FREEBSD32
 	int32_t semid32;
 #endif
 	void *ptr;
 	size_t ptrs;
 
 #ifdef COMPAT_FREEBSD32
 	if (compat32) {
 		semid32 = fd;
 		ptr = &semid32;
 		ptrs = sizeof(semid32);
 	} else {
 #endif
 		semid = fd;
 		ptr = &semid;
 		ptrs = sizeof(semid);
 		compat32 = 0; /* silence gcc */
 #ifdef COMPAT_FREEBSD32
 	}
 #endif
 
 	return (copyout(ptr, semidp, ptrs));
 }
 
 /* Other helper routines. */
 static int
 ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
     unsigned int value, int flags, int compat32)
 {
 	struct pwddesc *pdp;
 	struct ksem *ks;
 	struct file *fp;
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error, fd;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 	AUDIT_ARG_VALUE(value);
 
 	if (value > SEM_VALUE_MAX)
 		return (EINVAL);
 
 	pdp = td->td_proc->p_pd;
 	mode = (mode & ~pdp->pd_cmask) & ACCESSPERMS;
 	error = falloc(td, &fp, &fd, O_CLOEXEC);
 	if (error) {
 		if (name == NULL)
 			error = ENOSPC;
 		return (error);
 	}
 
 	/*
 	 * Go ahead and copyout the file descriptor now.  This is a bit
 	 * premature, but it is a lot easier to handle errors as opposed
 	 * to later when we've possibly created a new semaphore, etc.
 	 */
 	error = ksem_create_copyout_semid(td, semidp, fd, compat32);
 	if (error) {
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 
 	if (name == NULL) {
 		/* Create an anonymous semaphore. */
 		ks = ksem_alloc(td->td_ucred, mode, value);
 		if (ks == NULL)
 			error = ENOSPC;
 		else
 			ks->ks_flags |= KS_ANONYMOUS;
 	} else {
 		path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK);
 		pr_path = td->td_ucred->cr_prison->pr_path;
 
 		/* Construct a full pathname for jailed callers. */
 		pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
 		    : strlcpy(path, pr_path, MAXPATHLEN);
 		error = copyinstr(name, path + pr_pathlen,
 		    MAXPATHLEN - pr_pathlen, NULL);
 
 		/* Require paths to start with a '/' character. */
 		if (error == 0 && path[pr_pathlen] != '/')
 			error = EINVAL;
 		if (error) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			free(path, M_KSEM);
 			return (error);
 		}
 
 		AUDIT_ARG_UPATH1_CANON(path);
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&ksem_dict_lock);
 		ks = ksem_lookup(path, fnv);
 		if (ks == NULL) {
 			/* Object does not exist, create it if requested. */
 			if (flags & O_CREAT) {
 				ks = ksem_alloc(td->td_ucred, mode, value);
 				if (ks == NULL)
 					error = ENFILE;
 				else {
 					ksem_insert(path, fnv, ks);
 					path = NULL;
 				}
 			} else
 				error = ENOENT;
 		} else {
 			/*
 			 * Object already exists, obtain a new
 			 * reference if requested and permitted.
 			 */
 			if ((flags & (O_CREAT | O_EXCL)) ==
 			    (O_CREAT | O_EXCL))
 				error = EEXIST;
 			else {
 #ifdef MAC
 				error = mac_posixsem_check_open(td->td_ucred,
 				    ks);
 				if (error == 0)
 #endif
 				error = ksem_access(ks, td->td_ucred);
 			}
 			if (error == 0)
 				ksem_hold(ks);
 #ifdef INVARIANTS
 			else
 				ks = NULL;
 #endif
 		}
 		sx_xunlock(&ksem_dict_lock);
 		if (path)
 			free(path, M_KSEM);
 	}
 
 	if (error) {
 		KASSERT(ks == NULL, ("ksem_create error with a ksem"));
 		fdclose(td, fp, fd);
 		fdrop(fp, td);
 		return (error);
 	}
 	KASSERT(ks != NULL, ("ksem_create w/o a ksem"));
 
 	finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops);
 
 	fdrop(fp, td);
 
 	return (0);
 }
 
 static int
 ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp,
     struct file **fpp)
 {
 	struct ksem *ks;
 	struct file *fp;
 	int error;
 
 	error = fget(td, id, rightsp, &fp);
 	if (error)
 		return (EINVAL);
 	if (fp->f_type != DTYPE_SEM) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	ks = fp->f_data;
 	if (ks->ks_flags & KS_DEAD) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	*fpp = fp;
 	return (0);
 }
 
 /* System calls. */
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_init_args {
 	unsigned int	value;
 	semid_t		*idp;
 };
 #endif
 int
 sys_ksem_init(struct thread *td, struct ksem_init_args *uap)
 {
 
 	return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
 	    0, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_open_args {
 	char		*name;
 	int		oflag;
 	mode_t		mode;
 	unsigned int	value;
 	semid_t		*idp;	
 };
 #endif
 int
 sys_ksem_open(struct thread *td, struct ksem_open_args *uap)
 {
 
 	DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid));
 
 	if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
 		return (EINVAL);
 	return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
 	    uap->oflag, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_unlink_args {
 	char		*name;
 };
 #endif
 int
 sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
 {
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error;
 
 	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	pr_path = td->td_ucred->cr_prison->pr_path;
 	pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
 	    : strlcpy(path, pr_path, MAXPATHLEN);
 	error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen,
 	    NULL);
 	if (error) {
 		free(path, M_TEMP);
 		return (error);
 	}
 
 	AUDIT_ARG_UPATH1_CANON(path);
 	fnv = fnv_32_str(path, FNV1_32_INIT);
 	sx_xlock(&ksem_dict_lock);
 	error = ksem_remove(path, fnv, td->td_ucred);
 	sx_xunlock(&ksem_dict_lock);
 	free(path, M_TEMP);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_close_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_close(struct thread *td, struct ksem_close_args *uap)
 {
 	struct ksem *ks;
 	struct file *fp;
 	int error;
 
 	/* No capability rights required to close a semaphore. */
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id, &cap_no_rights, &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	if (ks->ks_flags & KS_ANONYMOUS) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	error = kern_close(td, uap->id);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_post_args {
 	semid_t	id;
 };
 #endif
 int
 sys_ksem_post(struct thread *td, struct ksem_post_args *uap)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id,
 	    cap_rights_init_one(&rights, CAP_SEM_POST), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks);
 	if (error)
 		goto err;
 #endif
 	if (ks->ks_value == SEM_VALUE_MAX) {
 		error = EOVERFLOW;
 		goto err;
 	}
 	++ks->ks_value;
 	if (ks->ks_waiters > 0)
 		cv_signal(&ks->ks_cv);
 	error = 0;
 	vfs_timestamp(&ks->ks_ctime);
 err:
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_wait_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 0, NULL));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_timedwait_args {
 	semid_t		id;
 	const struct timespec *abstime;
 };
 #endif
 int
 sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
 {
 	struct timespec abstime;
 	struct timespec *ts;
 	int error;
 
 	/*
 	 * We allow a null timespec (wait forever).
 	 */
 	if (uap->abstime == NULL)
 		ts = NULL;
 	else {
 		error = copyin(uap->abstime, &abstime, sizeof(abstime));
 		if (error != 0)
 			return (error);
 		if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
 			return (EINVAL);
 		ts = &abstime;
 	}
 	return (kern_sem_wait(td, uap->id, 0, ts));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_trywait_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
 {
 
 	return (kern_sem_wait(td, uap->id, 1, NULL));
 }
 
 static int
 kern_sem_wait(struct thread *td, semid_t id, int tryflag,
     struct timespec *abstime)
 {
 	struct timespec ts1, ts2;
 	struct timeval tv;
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid));
 	AUDIT_ARG_FD(id);
 	error = ksem_get(td, id, cap_rights_init_one(&rights, CAP_SEM_WAIT),
 	    &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	mtx_lock(&sem_lock);
 	DP((">>> kern_sem_wait critical section entered! pid=%d\n",
 	    (int)td->td_proc->p_pid));
 #ifdef MAC
 	error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks);
 	if (error) {
 		DP(("kern_sem_wait mac failed\n"));
 		goto err;
 	}
 #endif
 	DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag));
 	vfs_timestamp(&ks->ks_atime);
 	while (ks->ks_value == 0) {
 		ks->ks_waiters++;
 		if (tryflag != 0)
 			error = EAGAIN;
 		else if (abstime == NULL)
 			error = cv_wait_sig(&ks->ks_cv, &sem_lock);
 		else {
 			for (;;) {
 				ts1 = *abstime;
 				getnanotime(&ts2);
 				timespecsub(&ts1, &ts2, &ts1);
 				TIMESPEC_TO_TIMEVAL(&tv, &ts1);
 				if (tv.tv_sec < 0) {
 					error = ETIMEDOUT;
 					break;
 				}
 				error = cv_timedwait_sig(&ks->ks_cv,
 				    &sem_lock, tvtohz(&tv));
 				if (error != EWOULDBLOCK)
 					break;
 			}
 		}
 		ks->ks_waiters--;
 		if (error)
 			goto err;
 	}
 	ks->ks_value--;
 	DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value));
 	error = 0;
 err:
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n",
 	    (int)td->td_proc->p_pid, error));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_getvalue_args {
 	semid_t		id;
 	int		*val;
 };
 #endif
 int
 sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	struct ksem *ks;
 	int error, val;
 
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id,
 	    cap_rights_init_one(&rights, CAP_SEM_GETVALUE), &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 
 	mtx_lock(&sem_lock);
 #ifdef MAC
 	error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks);
 	if (error) {
 		mtx_unlock(&sem_lock);
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 	val = ks->ks_value;
 	vfs_timestamp(&ks->ks_atime);
 	mtx_unlock(&sem_lock);
 	fdrop(fp, td);
 	error = copyout(&val, uap->val, sizeof(val));
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_destroy_args {
 	semid_t		id;
 };
 #endif
 int
 sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
 {
 	struct file *fp;
 	struct ksem *ks;
 	int error;
 
 	/* No capability rights required to close a semaphore. */
 	AUDIT_ARG_FD(uap->id);
 	error = ksem_get(td, uap->id, &cap_no_rights, &fp);
 	if (error)
 		return (error);
 	ks = fp->f_data;
 	if (!(ks->ks_flags & KS_ANONYMOUS)) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 	mtx_lock(&sem_lock);
 	if (ks->ks_waiters != 0) {
 		mtx_unlock(&sem_lock);
 		error = EBUSY;
 		goto err;
 	}
 	ks->ks_flags |= KS_DEAD;
 	mtx_unlock(&sem_lock);
 
 	error = kern_close(td, uap->id);
 err:
 	fdrop(fp, td);
 	return (error);
 }
 
 static struct syscall_helper_data ksem_syscalls[] = {
 	SYSCALL_INIT_HELPER(ksem_init),
 	SYSCALL_INIT_HELPER(ksem_open),
 	SYSCALL_INIT_HELPER(ksem_unlink),
 	SYSCALL_INIT_HELPER(ksem_close),
 	SYSCALL_INIT_HELPER(ksem_post),
 	SYSCALL_INIT_HELPER(ksem_wait),
 	SYSCALL_INIT_HELPER(ksem_timedwait),
 	SYSCALL_INIT_HELPER(ksem_trywait),
 	SYSCALL_INIT_HELPER(ksem_getvalue),
 	SYSCALL_INIT_HELPER(ksem_destroy),
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 int
 freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap)
 {
 
 	return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
 	    0, 1));
 }
 
 int
 freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap)
 {
 
 	if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
 		return (EINVAL);
 	return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
 	    uap->oflag, 1));
 }
 
 int
 freebsd32_ksem_timedwait(struct thread *td,
     struct freebsd32_ksem_timedwait_args *uap)
 {
 	struct timespec32 abstime32;
 	struct timespec *ts, abstime;
 	int error;
 
 	/*
 	 * We allow a null timespec (wait forever).
 	 */
 	if (uap->abstime == NULL)
 		ts = NULL;
 	else {
 		error = copyin(uap->abstime, &abstime32, sizeof(abstime32));
 		if (error != 0)
 			return (error);
 		CP(abstime32, abstime, tv_sec);
 		CP(abstime32, abstime, tv_nsec);
 		if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
 			return (EINVAL);
 		ts = &abstime;
 	}
 	return (kern_sem_wait(td, uap->id, 0, ts));
 }
 
 static struct syscall_helper_data ksem32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_init),
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_open),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_close),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_post),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_wait),
 	SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue),
 	SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy),
 	SYSCALL_INIT_LAST
 };
 #endif
 
 static int
 ksem_module_init(void)
 {
 	int error;
 
 	mtx_init(&sem_lock, "sem", NULL, MTX_DEF);
 	mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF);
 	sx_init(&ksem_dict_lock, "ksem dictionary");
 	ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash);
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L);
 	p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
 	p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
 
 	error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD);
 	if (error)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	error = syscall32_helper_register(ksem32_syscalls, SY_THR_STATIC_KLD);
 	if (error)
 		return (error);
 #endif
 	return (0);
 }
 
 static void
 ksem_module_destroy(void)
 {
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(ksem32_syscalls);
 #endif
 	syscall_helper_unregister(ksem_syscalls);
 
 	p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0);
 	hashdestroy(ksem_dictionary, M_KSEM, ksem_hash);
 	sx_destroy(&ksem_dict_lock);
 	mtx_destroy(&ksem_count_lock);
 	mtx_destroy(&sem_lock);
 	p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX);
 	p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX);
 }
 
 static int
 sem_modload(struct module *module, int cmd, void *arg)
 {
         int error = 0;
 
         switch (cmd) {
         case MOD_LOAD:
 		error = ksem_module_init();
 		if (error)
 			ksem_module_destroy();
                 break;
 
         case MOD_UNLOAD:
 		mtx_lock(&ksem_count_lock);
 		if (nsems != 0) {
 			error = EOPNOTSUPP;
 			mtx_unlock(&ksem_count_lock);
 			break;
 		}
 		ksem_dead = 1;
 		mtx_unlock(&ksem_count_lock);
 		ksem_module_destroy();
                 break;
 
         case MOD_SHUTDOWN:
                 break;
         default:
                 error = EINVAL;
                 break;
         }
         return (error);
 }
 
 static moduledata_t sem_mod = {
         "sem",
         &sem_modload,
         NULL
 };
 
 DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
 MODULE_VERSION(sem, 1);
diff --git a/sys/kern/uipc_shm.c b/sys/kern/uipc_shm.c
index 63c4250f640f..14d808bfc166 100644
--- a/sys/kern/uipc_shm.c
+++ b/sys/kern/uipc_shm.c
@@ -1,2100 +1,2099 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
  * Copyright 2020 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Support for shared swap-backed anonymous memory objects via
  * shm_open(2), shm_rename(2), and shm_unlink(2).
  * While most of the implementation is here, vm_mmap.c contains
  * mapping logic changes.
  *
  * posixshmcontrol(1) allows users to inspect the state of the memory
  * objects.  Per-uid swap resource limit controls total amount of
  * memory that user can consume for anonymous objects, including
  * shared.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/uio.h>
 #include <sys/signal.h>
 #include <sys/jail.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/time.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 struct shm_mapping {
 	char		*sm_path;
 	Fnv32_t		sm_fnv;
 	struct shmfd	*sm_shmfd;
 	LIST_ENTRY(shm_mapping) sm_link;
 };
 
 static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
 static LIST_HEAD(, shm_mapping) *shm_dictionary;
 static struct sx shm_dict_lock;
 static struct mtx shm_timestamp_lock;
 static u_long shm_hash;
 static struct unrhdr64 shm_ino_unr;
 static dev_t shm_dev_ino;
 
 #define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])
 
 static void	shm_init(void *arg);
 static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
 static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
 static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
 static int	shm_dotruncate_cookie(struct shmfd *shmfd, off_t length,
     void *rl_cookie);
 static int	shm_dotruncate_locked(struct shmfd *shmfd, off_t length,
     void *rl_cookie);
 static int	shm_copyin_path(struct thread *td, const char *userpath_in,
     char **path_out);
 static int	shm_deallocate(struct shmfd *shmfd, off_t *offset,
     off_t *length, int flags);
 
 static fo_rdwr_t	shm_read;
 static fo_rdwr_t	shm_write;
 static fo_truncate_t	shm_truncate;
 static fo_ioctl_t	shm_ioctl;
 static fo_stat_t	shm_stat;
 static fo_close_t	shm_close;
 static fo_chmod_t	shm_chmod;
 static fo_chown_t	shm_chown;
 static fo_seek_t	shm_seek;
 static fo_fill_kinfo_t	shm_fill_kinfo;
 static fo_mmap_t	shm_mmap;
 static fo_get_seals_t	shm_get_seals;
 static fo_add_seals_t	shm_add_seals;
 static fo_fallocate_t	shm_fallocate;
 static fo_fspacectl_t	shm_fspacectl;
 
 /* File descriptor operations. */
 struct fileops shm_ops = {
 	.fo_read = shm_read,
 	.fo_write = shm_write,
 	.fo_truncate = shm_truncate,
 	.fo_ioctl = shm_ioctl,
 	.fo_poll = invfo_poll,
 	.fo_kqfilter = invfo_kqfilter,
 	.fo_stat = shm_stat,
 	.fo_close = shm_close,
 	.fo_chmod = shm_chmod,
 	.fo_chown = shm_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = shm_seek,
 	.fo_fill_kinfo = shm_fill_kinfo,
 	.fo_mmap = shm_mmap,
 	.fo_get_seals = shm_get_seals,
 	.fo_add_seals = shm_add_seals,
 	.fo_fallocate = shm_fallocate,
 	.fo_fspacectl = shm_fspacectl,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE,
 };
 
 FEATURE(posix_shm, "POSIX shared memory");
 
 static SYSCTL_NODE(_vm, OID_AUTO, largepages, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "");
 
 static int largepage_reclaim_tries = 1;
 SYSCTL_INT(_vm_largepages, OID_AUTO, reclaim_tries,
     CTLFLAG_RWTUN, &largepage_reclaim_tries, 0,
     "Number of contig reclaims before giving up for default alloc policy");
 
 static int
 uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
 {
 	vm_page_t m;
 	vm_pindex_t idx;
 	size_t tlen;
 	int error, offset, rv;
 
 	idx = OFF_TO_IDX(uio->uio_offset);
 	offset = uio->uio_offset & PAGE_MASK;
 	tlen = MIN(PAGE_SIZE - offset, len);
 
 	rv = vm_page_grab_valid_unlocked(&m, obj, idx,
 	    VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOCREAT);
 	if (rv == VM_PAGER_OK)
 		goto found;
 
 	/*
 	 * Read I/O without either a corresponding resident page or swap
 	 * page: use zero_region.  This is intended to avoid instantiating
 	 * pages on read from a sparse region.
 	 */
 	VM_OBJECT_WLOCK(obj);
 	m = vm_page_lookup(obj, idx);
 	if (uio->uio_rw == UIO_READ && m == NULL &&
 	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
 		VM_OBJECT_WUNLOCK(obj);
 		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
 	}
 
 	/*
 	 * Although the tmpfs vnode lock is held here, it is
 	 * nonetheless safe to sleep waiting for a free page.  The
 	 * pageout daemon does not need to acquire the tmpfs vnode
 	 * lock to page out tobj's pages because tobj is a OBJT_SWAP
 	 * type object.
 	 */
 	rv = vm_page_grab_valid(&m, obj, idx,
 	    VM_ALLOC_NORMAL | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY);
 	if (rv != VM_PAGER_OK) {
 		VM_OBJECT_WUNLOCK(obj);
 		printf("uiomove_object: vm_obj %p idx %jd pager error %d\n",
 		    obj, idx, rv);
 		return (EIO);
 	}
 	VM_OBJECT_WUNLOCK(obj);
 
 found:
 	error = uiomove_fromphys(&m, offset, tlen, uio);
 	if (uio->uio_rw == UIO_WRITE && error == 0)
 		vm_page_set_dirty(m);
 	vm_page_activate(m);
 	vm_page_sunbusy(m);
 
 	return (error);
 }
 
 int
 uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
 {
 	ssize_t resid;
 	size_t len;
 	int error;
 
 	error = 0;
 	while ((resid = uio->uio_resid) > 0) {
 		if (obj_size <= uio->uio_offset)
 			break;
 		len = MIN(obj_size - uio->uio_offset, resid);
 		if (len == 0)
 			break;
 		error = uiomove_object_page(obj, len, uio);
 		if (error != 0 || resid == uio->uio_resid)
 			break;
 	}
 	return (error);
 }
 
 static u_long count_largepages[MAXPAGESIZES];
 
 static int
 shm_largepage_phys_populate(vm_object_t object, vm_pindex_t pidx,
     int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last)
 {
 	vm_page_t m;
 	int psind;
 
 	psind = object->un_pager.phys.data_val;
 	if (psind == 0 || pidx >= object->size)
 		return (VM_PAGER_FAIL);
 	*first = rounddown2(pidx, pagesizes[psind] / PAGE_SIZE);
 
 	/*
 	 * We only busy the first page in the superpage run.  It is
 	 * useless to busy whole run since we only remove full
 	 * superpage, and it takes too long to busy e.g. 512 * 512 ==
 	 * 262144 pages constituing 1G amd64 superage.
 	 */
 	m = vm_page_grab(object, *first, VM_ALLOC_NORMAL | VM_ALLOC_NOCREAT);
 	MPASS(m != NULL);
 
 	*last = *first + atop(pagesizes[psind]) - 1;
 	return (VM_PAGER_OK);
 }
 
 static boolean_t
 shm_largepage_phys_haspage(vm_object_t object, vm_pindex_t pindex,
     int *before, int *after)
 {
 	int psind;
 
 	psind = object->un_pager.phys.data_val;
 	if (psind == 0 || pindex >= object->size)
 		return (FALSE);
 	if (before != NULL) {
 		*before = pindex - rounddown2(pindex, pagesizes[psind] /
 		    PAGE_SIZE);
 	}
 	if (after != NULL) {
 		*after = roundup2(pindex, pagesizes[psind] / PAGE_SIZE) -
 		    pindex;
 	}
 	return (TRUE);
 }
 
 static void
 shm_largepage_phys_ctor(vm_object_t object, vm_prot_t prot,
     vm_ooffset_t foff, struct ucred *cred)
 {
 }
 
 static void
 shm_largepage_phys_dtor(vm_object_t object)
 {
 	int psind;
 
 	psind = object->un_pager.phys.data_val;
 	if (psind != 0) {
 		atomic_subtract_long(&count_largepages[psind],
 		    object->size / (pagesizes[psind] / PAGE_SIZE));
 		vm_wire_sub(object->size);
 	} else {
 		KASSERT(object->size == 0,
 		    ("largepage phys obj %p not initialized bit size %#jx > 0",
 		    object, (uintmax_t)object->size));
 	}
 }
 
 static const struct phys_pager_ops shm_largepage_phys_ops = {
 	.phys_pg_populate =	shm_largepage_phys_populate,
 	.phys_pg_haspage =	shm_largepage_phys_haspage,
 	.phys_pg_ctor =		shm_largepage_phys_ctor,
 	.phys_pg_dtor =		shm_largepage_phys_dtor,
 };
 
 bool
 shm_largepage(struct shmfd *shmfd)
 {
 	return (shmfd->shm_object->type == OBJT_PHYS);
 }
 
 static int
 shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct shmfd *shmfd;
 	off_t foffset;
 	int error;
 
 	shmfd = fp->f_data;
 	foffset = foffset_lock(fp, 0);
 	error = 0;
 	switch (whence) {
 	case L_INCR:
 		if (foffset < 0 ||
 		    (offset > 0 && foffset > OFF_MAX - offset)) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += foffset;
 		break;
 	case L_XTND:
 		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += shmfd->shm_size;
 		break;
 	case L_SET:
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error == 0) {
 		if (offset < 0 || offset > shmfd->shm_size)
 			error = EINVAL;
 		else
 			td->td_uretoff.tdu_off = offset;
 	}
 	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
 
 static int
 shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	foffset_lock_uio(fp, uio, flags);
 	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
 	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
 	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 static int
 shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 	off_t size;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	if (shm_largepage(shmfd) && shmfd->shm_lp_psind == 0)
 		return (EINVAL);
 	foffset_lock_uio(fp, uio, flags);
 	if (uio->uio_resid > OFF_MAX - uio->uio_offset) {
 		/*
 		 * Overflow is only an error if we're supposed to expand on
 		 * write.  Otherwise, we'll just truncate the write to the
 		 * size of the file, which can only grow up to OFF_MAX.
 		 */
 		if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0) {
 			foffset_unlock_uio(fp, uio, flags);
 			return (EFBIG);
 		}
 
 		size = shmfd->shm_size;
 	} else {
 		size = uio->uio_offset + uio->uio_resid;
 	}
 	if ((flags & FOF_OFFSET) == 0) {
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 		    &shmfd->shm_mtx);
 	} else {
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
 		    size, &shmfd->shm_mtx);
 	}
 	if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) {
 		error = EPERM;
 	} else {
 		error = 0;
 		if ((shmfd->shm_flags & SHM_GROW_ON_WRITE) != 0 &&
 		    size > shmfd->shm_size) {
 			error = shm_dotruncate_cookie(shmfd, size, rl_cookie);
 		}
 		if (error == 0)
 			error = uiomove_object(shmfd->shm_object,
 			    shmfd->shm_size, uio);
 	}
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 static int
 shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 #ifdef MAC
 	int error;
 #endif
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	return (shm_dotruncate(shmfd, length));
 }
 
 int
 shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	struct shm_largepage_conf *conf;
 	void *rl_cookie;
 
 	shmfd = fp->f_data;
 	switch (com) {
 	case FIONBIO:
 	case FIOASYNC:
 		/*
 		 * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work,
 		 * just like it would on an unlinked regular file
 		 */
 		return (0);
 	case FIOSSHMLPGCNF:
 		if (!shm_largepage(shmfd))
 			return (ENOTTY);
 		conf = data;
 		if (shmfd->shm_lp_psind != 0 &&
 		    conf->psind != shmfd->shm_lp_psind)
 			return (EINVAL);
 		if (conf->psind <= 0 || conf->psind >= MAXPAGESIZES ||
 		    pagesizes[conf->psind] == 0)
 			return (EINVAL);
 		if (conf->alloc_policy != SHM_LARGEPAGE_ALLOC_DEFAULT &&
 		    conf->alloc_policy != SHM_LARGEPAGE_ALLOC_NOWAIT &&
 		    conf->alloc_policy != SHM_LARGEPAGE_ALLOC_HARD)
 			return (EINVAL);
 
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 		    &shmfd->shm_mtx);
 		shmfd->shm_lp_psind = conf->psind;
 		shmfd->shm_lp_alloc_policy = conf->alloc_policy;
 		shmfd->shm_object->un_pager.phys.data_val = conf->psind;
 		rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 		return (0);
 	case FIOGSHMLPGCNF:
 		if (!shm_largepage(shmfd))
 			return (ENOTTY);
 		conf = data;
 		rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, OFF_MAX,
 		    &shmfd->shm_mtx);
 		conf->psind = shmfd->shm_lp_psind;
 		conf->alloc_policy = shmfd->shm_lp_alloc_policy;
 		rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 		return (0);
 	default:
 		return (ENOTTY);
 	}
 }
 
 static int
-shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
-    struct thread *td)
+shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
 	struct shmfd *shmfd;
 #ifdef MAC
 	int error;
 #endif
 
 	shmfd = fp->f_data;
 
 #ifdef MAC
 	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 
 	/*
 	 * Attempt to return sanish values for fstat() on a memory file
 	 * descriptor.
 	 */
 	bzero(sb, sizeof(*sb));
 	sb->st_blksize = PAGE_SIZE;
 	sb->st_size = shmfd->shm_size;
 	sb->st_blocks = howmany(sb->st_size, sb->st_blksize);
 	mtx_lock(&shm_timestamp_lock);
 	sb->st_atim = shmfd->shm_atime;
 	sb->st_ctim = shmfd->shm_ctime;
 	sb->st_mtim = shmfd->shm_mtime;
 	sb->st_birthtim = shmfd->shm_birthtime;
 	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
 	sb->st_uid = shmfd->shm_uid;
 	sb->st_gid = shmfd->shm_gid;
 	mtx_unlock(&shm_timestamp_lock);
 	sb->st_dev = shm_dev_ino;
 	sb->st_ino = shmfd->shm_ino;
 	sb->st_nlink = shmfd->shm_object->ref_count;
 	sb->st_blocks = shmfd->shm_object->size /
 	    (pagesizes[shmfd->shm_lp_psind] >> PAGE_SHIFT);
 
 	return (0);
 }
 
 static int
 shm_close(struct file *fp, struct thread *td)
 {
 	struct shmfd *shmfd;
 
 	shmfd = fp->f_data;
 	fp->f_data = NULL;
 	shm_drop(shmfd);
 
 	return (0);
 }
 
 static int
 shm_copyin_path(struct thread *td, const char *userpath_in, char **path_out) {
 	int error;
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 
 	path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
 	pr_path = td->td_ucred->cr_prison->pr_path;
 
 	/* Construct a full pathname for jailed callers. */
 	pr_pathlen = strcmp(pr_path, "/") ==
 	    0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN);
 	error = copyinstr(userpath_in, path + pr_pathlen,
 	    MAXPATHLEN - pr_pathlen, NULL);
 	if (error != 0)
 		goto out;
 
 #ifdef KTRACE
 	if (KTRPOINT(curthread, KTR_NAMEI))
 		ktrnamei(path);
 #endif
 
 	/* Require paths to start with a '/' character. */
 	if (path[pr_pathlen] != '/') {
 		error = EINVAL;
 		goto out;
 	}
 
 	*path_out = path;
 
 out:
 	if (error != 0)
 		free(path, M_SHMFD);
 
 	return (error);
 }
 
 static int
 shm_partial_page_invalidate(vm_object_t object, vm_pindex_t idx, int base,
     int end)
 {
 	vm_page_t m;
 	int rv;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(base >= 0, ("%s: base %d", __func__, base));
 	KASSERT(end - base <= PAGE_SIZE, ("%s: base %d end %d", __func__, base,
 	    end));
 
 retry:
 	m = vm_page_grab(object, idx, VM_ALLOC_NOCREAT);
 	if (m != NULL) {
 		MPASS(vm_page_all_valid(m));
 	} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
 		m = vm_page_alloc(object, idx,
 		    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
 		if (m == NULL)
 			goto retry;
 		vm_object_pip_add(object, 1);
 		VM_OBJECT_WUNLOCK(object);
 		rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
 		VM_OBJECT_WLOCK(object);
 		vm_object_pip_wakeup(object);
 		if (rv == VM_PAGER_OK) {
 			/*
 			 * Since the page was not resident, and therefore not
 			 * recently accessed, immediately enqueue it for
 			 * asynchronous laundering.  The current operation is
 			 * not regarded as an access.
 			 */
 			vm_page_launder(m);
 		} else {
 			vm_page_free(m);
 			VM_OBJECT_WUNLOCK(object);
 			return (EIO);
 		}
 	}
 	if (m != NULL) {
 		pmap_zero_page_area(m, base, end - base);
 		KASSERT(vm_page_all_valid(m), ("%s: page %p is invalid",
 		    __func__, m));
 		vm_page_set_dirty(m);
 		vm_page_xunbusy(m);
 	}
 
 	return (0);
 }
 
 static int
 shm_dotruncate_locked(struct shmfd *shmfd, off_t length, void *rl_cookie)
 {
 	vm_object_t object;
 	vm_pindex_t nobjsize;
 	vm_ooffset_t delta;
 	int base, error;
 
 	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
 	object = shmfd->shm_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
 	if (length == shmfd->shm_size)
 		return (0);
 	nobjsize = OFF_TO_IDX(length + PAGE_MASK);
 
 	/* Are we shrinking?  If so, trim the end. */
 	if (length < shmfd->shm_size) {
 		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
 			return (EPERM);
 
 		/*
 		 * Disallow any requests to shrink the size if this
 		 * object is mapped into the kernel.
 		 */
 		if (shmfd->shm_kmappings > 0)
 			return (EBUSY);
 
 		/*
 		 * Zero the truncated part of the last page.
 		 */
 		base = length & PAGE_MASK;
 		if (base != 0) {
 			error = shm_partial_page_invalidate(object,
 			    OFF_TO_IDX(length), base, PAGE_SIZE);
 			if (error)
 				return (error);
 		}
 		delta = IDX_TO_OFF(object->size - nobjsize);
 
 		if (nobjsize < object->size)
 			vm_object_page_remove(object, nobjsize, object->size,
 			    0);
 
 		/* Free the swap accounted for shm */
 		swap_release_by_cred(delta, object->cred);
 		object->charge -= delta;
 	} else {
 		if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
 			return (EPERM);
 
 		/* Try to reserve additional swap space. */
 		delta = IDX_TO_OFF(nobjsize - object->size);
 		if (!swap_reserve_by_cred(delta, object->cred))
 			return (ENOMEM);
 		object->charge += delta;
 	}
 	shmfd->shm_size = length;
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_ctime);
 	shmfd->shm_mtime = shmfd->shm_ctime;
 	mtx_unlock(&shm_timestamp_lock);
 	object->size = nobjsize;
 	return (0);
 }
 
 static int
 shm_dotruncate_largepage(struct shmfd *shmfd, off_t length, void *rl_cookie)
 {
 	vm_object_t object;
 	vm_page_t m;
 	vm_pindex_t newobjsz, oldobjsz;
 	int aflags, error, i, psind, try;
 
 	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
 	object = shmfd->shm_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	rangelock_cookie_assert(rl_cookie, RA_WLOCKED);
 
 	oldobjsz = object->size;
 	newobjsz = OFF_TO_IDX(length);
 	if (length == shmfd->shm_size)
 		return (0);
 	psind = shmfd->shm_lp_psind;
 	if (psind == 0 && length != 0)
 		return (EINVAL);
 	if ((length & (pagesizes[psind] - 1)) != 0)
 		return (EINVAL);
 
 	if (length < shmfd->shm_size) {
 		if ((shmfd->shm_seals & F_SEAL_SHRINK) != 0)
 			return (EPERM);
 		if (shmfd->shm_kmappings > 0)
 			return (EBUSY);
 		return (ENOTSUP);	/* Pages are unmanaged. */
 #if 0
 		vm_object_page_remove(object, newobjsz, oldobjsz, 0);
 		object->size = newobjsz;
 		shmfd->shm_size = length;
 		return (0);
 #endif
 	}
 
 	if ((shmfd->shm_seals & F_SEAL_GROW) != 0)
 		return (EPERM);
 
 	aflags = VM_ALLOC_NORMAL | VM_ALLOC_ZERO;
 	if (shmfd->shm_lp_alloc_policy == SHM_LARGEPAGE_ALLOC_NOWAIT)
 		aflags |= VM_ALLOC_WAITFAIL;
 	try = 0;
 
 	/*
 	 * Extend shmfd and object, keeping all already fully
 	 * allocated large pages intact even on error, because dropped
 	 * object lock might allowed mapping of them.
 	 */
 	while (object->size < newobjsz) {
 		m = vm_page_alloc_contig(object, object->size, aflags,
 		    pagesizes[psind] / PAGE_SIZE, 0, ~0,
 		    pagesizes[psind], 0,
 		    VM_MEMATTR_DEFAULT);
 		if (m == NULL) {
 			VM_OBJECT_WUNLOCK(object);
 			if (shmfd->shm_lp_alloc_policy ==
 			    SHM_LARGEPAGE_ALLOC_NOWAIT ||
 			    (shmfd->shm_lp_alloc_policy ==
 			    SHM_LARGEPAGE_ALLOC_DEFAULT &&
 			    try >= largepage_reclaim_tries)) {
 				VM_OBJECT_WLOCK(object);
 				return (ENOMEM);
 			}
 			error = vm_page_reclaim_contig(aflags,
 			    pagesizes[psind] / PAGE_SIZE, 0, ~0,
 			    pagesizes[psind], 0) ? 0 :
 			    vm_wait_intr(object);
 			if (error != 0) {
 				VM_OBJECT_WLOCK(object);
 				return (error);
 			}
 			try++;
 			VM_OBJECT_WLOCK(object);
 			continue;
 		}
 		try = 0;
 		for (i = 0; i < pagesizes[psind] / PAGE_SIZE; i++) {
 			if ((m[i].flags & PG_ZERO) == 0)
 				pmap_zero_page(&m[i]);
 			vm_page_valid(&m[i]);
 			vm_page_xunbusy(&m[i]);
 		}
 		object->size += OFF_TO_IDX(pagesizes[psind]);
 		shmfd->shm_size += pagesizes[psind];
 		atomic_add_long(&count_largepages[psind], 1);
 		vm_wire_add(atop(pagesizes[psind]));
 	}
 	return (0);
 }
 
 static int
 shm_dotruncate_cookie(struct shmfd *shmfd, off_t length, void *rl_cookie)
 {
 	int error;
 
 	VM_OBJECT_WLOCK(shmfd->shm_object);
 	error = shm_largepage(shmfd) ? shm_dotruncate_largepage(shmfd,
 	    length, rl_cookie) : shm_dotruncate_locked(shmfd, length,
 	    rl_cookie);
 	VM_OBJECT_WUNLOCK(shmfd->shm_object);
 	return (error);
 }
 
 int
 shm_dotruncate(struct shmfd *shmfd, off_t length)
 {
 	void *rl_cookie;
 	int error;
 
 	rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 	    &shmfd->shm_mtx);
 	error = shm_dotruncate_cookie(shmfd, length, rl_cookie);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	return (error);
 }
 
 /*
  * shmfd object management including creation and reference counting
  * routines.
  */
 struct shmfd *
 shm_alloc(struct ucred *ucred, mode_t mode, bool largepage)
 {
 	struct shmfd *shmfd;
 
 	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
 	shmfd->shm_size = 0;
 	shmfd->shm_uid = ucred->cr_uid;
 	shmfd->shm_gid = ucred->cr_gid;
 	shmfd->shm_mode = mode;
 	if (largepage) {
 		shmfd->shm_object = phys_pager_allocate(NULL,
 		    &shm_largepage_phys_ops, NULL, shmfd->shm_size,
 		    VM_PROT_DEFAULT, 0, ucred);
 		shmfd->shm_lp_alloc_policy = SHM_LARGEPAGE_ALLOC_DEFAULT;
 	} else {
 		shmfd->shm_object = vm_pager_allocate(OBJT_SWAP, NULL,
 		    shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
 	}
 	KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
 	vfs_timestamp(&shmfd->shm_birthtime);
 	shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
 	    shmfd->shm_birthtime;
 	shmfd->shm_ino = alloc_unr64(&shm_ino_unr);
 	refcount_init(&shmfd->shm_refs, 1);
 	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
 	rangelock_init(&shmfd->shm_rl);
 #ifdef MAC
 	mac_posixshm_init(shmfd);
 	mac_posixshm_create(ucred, shmfd);
 #endif
 
 	return (shmfd);
 }
 
 struct shmfd *
 shm_hold(struct shmfd *shmfd)
 {
 
 	refcount_acquire(&shmfd->shm_refs);
 	return (shmfd);
 }
 
 void
 shm_drop(struct shmfd *shmfd)
 {
 
 	if (refcount_release(&shmfd->shm_refs)) {
 #ifdef MAC
 		mac_posixshm_destroy(shmfd);
 #endif
 		rangelock_destroy(&shmfd->shm_rl);
 		mtx_destroy(&shmfd->shm_mtx);
 		vm_object_deallocate(shmfd->shm_object);
 		free(shmfd, M_SHMFD);
 	}
 }
 
 /*
  * Determine if the credentials have sufficient permissions for a
  * specified combination of FREAD and FWRITE.
  */
 int
 shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
 {
 	accmode_t accmode;
 	int error;
 
 	accmode = 0;
 	if (flags & FREAD)
 		accmode |= VREAD;
 	if (flags & FWRITE)
 		accmode |= VWRITE;
 	mtx_lock(&shm_timestamp_lock);
 	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
 	    accmode, ucred);
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 static void
 shm_init(void *arg)
 {
 	char name[32];
 	int i;
 
 	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
 	sx_init(&shm_dict_lock, "shm dictionary");
 	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
 	new_unrhdr64(&shm_ino_unr, 1);
 	shm_dev_ino = devfs_alloc_cdp_inode();
 	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
 
 	for (i = 1; i < MAXPAGESIZES; i++) {
 		if (pagesizes[i] == 0)
 			break;
 #define	M	(1024 * 1024)
 #define	G	(1024 * M)
 		if (pagesizes[i] >= G)
 			snprintf(name, sizeof(name), "%luG", pagesizes[i] / G);
 		else if (pagesizes[i] >= M)
 			snprintf(name, sizeof(name), "%luM", pagesizes[i] / M);
 		else
 			snprintf(name, sizeof(name), "%lu", pagesizes[i]);
 #undef G
 #undef M
 		SYSCTL_ADD_ULONG(NULL, SYSCTL_STATIC_CHILDREN(_vm_largepages),
 		    OID_AUTO, name, CTLFLAG_RD, &count_largepages[i],
 		    "number of non-transient largepages allocated");
 	}
 }
 SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);
 
 /*
  * Dictionary management.  We maintain an in-kernel dictionary to map
  * paths to shmfd objects.  We use the FNV hash on the path to store
  * the mappings in a hash table.
  */
 static struct shmfd *
 shm_lookup(char *path, Fnv32_t fnv)
 {
 	struct shm_mapping *map;
 
 	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
 		if (map->sm_fnv != fnv)
 			continue;
 		if (strcmp(map->sm_path, path) == 0)
 			return (map->sm_shmfd);
 	}
 
 	return (NULL);
 }
 
 static void
 shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
 {
 	struct shm_mapping *map;
 
 	map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
 	map->sm_path = path;
 	map->sm_fnv = fnv;
 	map->sm_shmfd = shm_hold(shmfd);
 	shmfd->shm_path = path;
 	LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
 }
 
 static int
 shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 {
 	struct shm_mapping *map;
 	int error;
 
 	LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
 		if (map->sm_fnv != fnv)
 			continue;
 		if (strcmp(map->sm_path, path) == 0) {
 #ifdef MAC
 			error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
 			if (error)
 				return (error);
 #endif
 			error = shm_access(map->sm_shmfd, ucred,
 			    FREAD | FWRITE);
 			if (error)
 				return (error);
 			map->sm_shmfd->shm_path = NULL;
 			LIST_REMOVE(map, sm_link);
 			shm_drop(map->sm_shmfd);
 			free(map->sm_path, M_SHMFD);
 			free(map, M_SHMFD);
 			return (0);
 		}
 	}
 
 	return (ENOENT);
 }
 
 int
 kern_shm_open2(struct thread *td, const char *userpath, int flags, mode_t mode,
     int shmflags, struct filecaps *fcaps, const char *name __unused)
 {
 	struct pwddesc *pdp;
 	struct shmfd *shmfd;
 	struct file *fp;
 	char *path;
 	void *rl_cookie;
 	Fnv32_t fnv;
 	mode_t cmode;
 	int error, fd, initial_seals;
 	bool largepage;
 
 	if ((shmflags & ~(SHM_ALLOW_SEALING | SHM_GROW_ON_WRITE |
 	    SHM_LARGEPAGE)) != 0)
 		return (EINVAL);
 
 	initial_seals = F_SEAL_SEAL;
 	if ((shmflags & SHM_ALLOW_SEALING) != 0)
 		initial_seals &= ~F_SEAL_SEAL;
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * shm_open(2) is only allowed for anonymous objects.
 	 */
 	if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON))
 		return (ECAPMODE);
 #endif
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 
 	if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
 		return (EINVAL);
 
 	if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	largepage = (shmflags & SHM_LARGEPAGE) != 0;
 	if (largepage && !PMAP_HAS_LARGEPAGES)
 		return (ENOTTY);
 
 	/*
 	 * Currently only F_SEAL_SEAL may be set when creating or opening shmfd.
 	 * If the decision is made later to allow additional seals, care must be
 	 * taken below to ensure that the seals are properly set if the shmfd
 	 * already existed -- this currently assumes that only F_SEAL_SEAL can
 	 * be set and doesn't take further precautions to ensure the validity of
 	 * the seals being added with respect to current mappings.
 	 */
 	if ((initial_seals & ~F_SEAL_SEAL) != 0)
 		return (EINVAL);
 
 	pdp = td->td_proc->p_pd;
 	cmode = (mode & ~pdp->pd_cmask) & ACCESSPERMS;
 
 	/*
 	 * shm_open(2) created shm should always have O_CLOEXEC set, as mandated
 	 * by POSIX.  We allow it to be unset here so that an in-kernel
 	 * interface may be written as a thin layer around shm, optionally not
 	 * setting CLOEXEC.  For shm_open(2), O_CLOEXEC is set unconditionally
 	 * in sys_shm_open() to keep this implementation compliant.
 	 */
 	error = falloc_caps(td, &fp, &fd, flags & O_CLOEXEC, fcaps);
 	if (error)
 		return (error);
 
 	/* A SHM_ANON path pointer creates an anonymous object. */
 	if (userpath == SHM_ANON) {
 		/* A read-only anonymous object is pointless. */
 		if ((flags & O_ACCMODE) == O_RDONLY) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 		shmfd = shm_alloc(td->td_ucred, cmode, largepage);
 		shmfd->shm_seals = initial_seals;
 		shmfd->shm_flags = shmflags;
 	} else {
 		error = shm_copyin_path(td, userpath, &path);
 		if (error != 0) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (error);
 		}
 
 		AUDIT_ARG_UPATH1_CANON(path);
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&shm_dict_lock);
 		shmfd = shm_lookup(path, fnv);
 		if (shmfd == NULL) {
 			/* Object does not yet exist, create it if requested. */
 			if (flags & O_CREAT) {
 #ifdef MAC
 				error = mac_posixshm_check_create(td->td_ucred,
 				    path);
 				if (error == 0) {
 #endif
 					shmfd = shm_alloc(td->td_ucred, cmode,
 					    largepage);
 					shmfd->shm_seals = initial_seals;
 					shmfd->shm_flags = shmflags;
 					shm_insert(path, fnv, shmfd);
 #ifdef MAC
 				}
 #endif
 			} else {
 				free(path, M_SHMFD);
 				error = ENOENT;
 			}
 		} else {
 			rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 			    &shmfd->shm_mtx);
 
 			/*
 			 * kern_shm_open() likely shouldn't ever error out on
 			 * trying to set a seal that already exists, unlike
 			 * F_ADD_SEALS.  This would break terribly as
 			 * shm_open(2) actually sets F_SEAL_SEAL to maintain
 			 * historical behavior where the underlying file could
 			 * not be sealed.
 			 */
 			initial_seals &= ~shmfd->shm_seals;
 
 			/*
 			 * Object already exists, obtain a new
 			 * reference if requested and permitted.
 			 */
 			free(path, M_SHMFD);
 
 			/*
 			 * initial_seals can't set additional seals if we've
 			 * already been set F_SEAL_SEAL.  If F_SEAL_SEAL is set,
 			 * then we've already removed that one from
 			 * initial_seals.  This is currently redundant as we
 			 * only allow setting F_SEAL_SEAL at creation time, but
 			 * it's cheap to check and decreases the effort required
 			 * to allow additional seals.
 			 */
 			if ((shmfd->shm_seals & F_SEAL_SEAL) != 0 &&
 			    initial_seals != 0)
 				error = EPERM;
 			else if ((flags & (O_CREAT | O_EXCL)) ==
 			    (O_CREAT | O_EXCL))
 				error = EEXIST;
 			else if (shmflags != 0 && shmflags != shmfd->shm_flags)
 				error = EINVAL;
 			else {
 #ifdef MAC
 				error = mac_posixshm_check_open(td->td_ucred,
 				    shmfd, FFLAGS(flags & O_ACCMODE));
 				if (error == 0)
 #endif
 				error = shm_access(shmfd, td->td_ucred,
 				    FFLAGS(flags & O_ACCMODE));
 			}
 
 			/*
 			 * Truncate the file back to zero length if
 			 * O_TRUNC was specified and the object was
 			 * opened with read/write.
 			 */
 			if (error == 0 &&
 			    (flags & (O_ACCMODE | O_TRUNC)) ==
 			    (O_RDWR | O_TRUNC)) {
 				VM_OBJECT_WLOCK(shmfd->shm_object);
 #ifdef MAC
 				error = mac_posixshm_check_truncate(
 					td->td_ucred, fp->f_cred, shmfd);
 				if (error == 0)
 #endif
 					error = shm_dotruncate_locked(shmfd, 0,
 					    rl_cookie);
 				VM_OBJECT_WUNLOCK(shmfd->shm_object);
 			}
 			if (error == 0) {
 				/*
 				 * Currently we only allow F_SEAL_SEAL to be
 				 * set initially.  As noted above, this would
 				 * need to be reworked should that change.
 				 */
 				shmfd->shm_seals |= initial_seals;
 				shm_hold(shmfd);
 			}
 			rangelock_unlock(&shmfd->shm_rl, rl_cookie,
 			    &shmfd->shm_mtx);
 		}
 		sx_xunlock(&shm_dict_lock);
 
 		if (error) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (error);
 		}
 	}
 
 	finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
 
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 
 	return (0);
 }
 
 /* System calls. */
 #ifdef COMPAT_FREEBSD12
 int
 freebsd12_shm_open(struct thread *td, struct freebsd12_shm_open_args *uap)
 {
 
 	return (kern_shm_open(td, uap->path, uap->flags | O_CLOEXEC,
 	    uap->mode, NULL));
 }
 #endif
 
 int
 sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
 {
 	char *path;
 	Fnv32_t fnv;
 	int error;
 
 	error = shm_copyin_path(td, uap->path, &path);
 	if (error != 0)
 		return (error);
 
 	AUDIT_ARG_UPATH1_CANON(path);
 	fnv = fnv_32_str(path, FNV1_32_INIT);
 	sx_xlock(&shm_dict_lock);
 	error = shm_remove(path, fnv, td->td_ucred);
 	sx_xunlock(&shm_dict_lock);
 	free(path, M_SHMFD);
 
 	return (error);
 }
 
 int
 sys_shm_rename(struct thread *td, struct shm_rename_args *uap)
 {
 	char *path_from = NULL, *path_to = NULL;
 	Fnv32_t fnv_from, fnv_to;
 	struct shmfd *fd_from;
 	struct shmfd *fd_to;
 	int error;
 	int flags;
 
 	flags = uap->flags;
 	AUDIT_ARG_FFLAGS(flags);
 
 	/*
 	 * Make sure the user passed only valid flags.
 	 * If you add a new flag, please add a new term here.
 	 */
 	if ((flags & ~(
 	    SHM_RENAME_NOREPLACE |
 	    SHM_RENAME_EXCHANGE
 	    )) != 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/*
 	 * EXCHANGE and NOREPLACE don't quite make sense together. Let's
 	 * force the user to choose one or the other.
 	 */
 	if ((flags & SHM_RENAME_NOREPLACE) != 0 &&
 	    (flags & SHM_RENAME_EXCHANGE) != 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Renaming to or from anonymous makes no sense */
 	if (uap->path_from == SHM_ANON || uap->path_to == SHM_ANON) {
 		error = EINVAL;
 		goto out;
 	}
 
 	error = shm_copyin_path(td, uap->path_from, &path_from);
 	if (error != 0)
 		goto out;
 
 	error = shm_copyin_path(td, uap->path_to, &path_to);
 	if (error != 0)
 		goto out;
 
 	AUDIT_ARG_UPATH1_CANON(path_from);
 	AUDIT_ARG_UPATH2_CANON(path_to);
 
 	/* Rename with from/to equal is a no-op */
 	if (strcmp(path_from, path_to) == 0)
 		goto out;
 
 	fnv_from = fnv_32_str(path_from, FNV1_32_INIT);
 	fnv_to = fnv_32_str(path_to, FNV1_32_INIT);
 
 	sx_xlock(&shm_dict_lock);
 
 	fd_from = shm_lookup(path_from, fnv_from);
 	if (fd_from == NULL) {
 		error = ENOENT;
 		goto out_locked;
 	}
 
 	fd_to = shm_lookup(path_to, fnv_to);
 	if ((flags & SHM_RENAME_NOREPLACE) != 0 && fd_to != NULL) {
 		error = EEXIST;
 		goto out_locked;
 	}
 
 	/*
 	 * Unconditionally prevents shm_remove from invalidating the 'from'
 	 * shm's state.
 	 */
 	shm_hold(fd_from);
 	error = shm_remove(path_from, fnv_from, td->td_ucred);
 
 	/*
 	 * One of my assumptions failed if ENOENT (e.g. locking didn't
 	 * protect us)
 	 */
 	KASSERT(error != ENOENT, ("Our shm disappeared during shm_rename: %s",
 	    path_from));
 	if (error != 0) {
 		shm_drop(fd_from);
 		goto out_locked;
 	}
 
 	/*
 	 * If we are exchanging, we need to ensure the shm_remove below
 	 * doesn't invalidate the dest shm's state.
 	 */
 	if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL)
 		shm_hold(fd_to);
 
 	/*
 	 * NOTE: if path_to is not already in the hash, c'est la vie;
 	 * it simply means we have nothing already at path_to to unlink.
 	 * That is the ENOENT case.
 	 *
 	 * If we somehow don't have access to unlink this guy, but
 	 * did for the shm at path_from, then relink the shm to path_from
 	 * and abort with EACCES.
 	 *
 	 * All other errors: that is weird; let's relink and abort the
 	 * operation.
 	 */
 	error = shm_remove(path_to, fnv_to, td->td_ucred);
 	if (error != 0 && error != ENOENT) {
 		shm_insert(path_from, fnv_from, fd_from);
 		shm_drop(fd_from);
 		/* Don't free path_from now, since the hash references it */
 		path_from = NULL;
 		goto out_locked;
 	}
 
 	error = 0;
 
 	shm_insert(path_to, fnv_to, fd_from);
 
 	/* Don't free path_to now, since the hash references it */
 	path_to = NULL;
 
 	/* We kept a ref when we removed, and incremented again in insert */
 	shm_drop(fd_from);
 	KASSERT(fd_from->shm_refs > 0, ("Expected >0 refs; got: %d\n",
 	    fd_from->shm_refs));
 
 	if ((flags & SHM_RENAME_EXCHANGE) != 0 && fd_to != NULL) {
 		shm_insert(path_from, fnv_from, fd_to);
 		path_from = NULL;
 		shm_drop(fd_to);
 		KASSERT(fd_to->shm_refs > 0, ("Expected >0 refs; got: %d\n",
 		    fd_to->shm_refs));
 	}
 
 out_locked:
 	sx_xunlock(&shm_dict_lock);
 
 out:
 	free(path_from, M_SHMFD);
 	free(path_to, M_SHMFD);
 	return (error);
 }
 
 static int
 shm_mmap_large(struct shmfd *shmfd, vm_map_t map, vm_offset_t *addr,
     vm_size_t size, vm_prot_t prot, vm_prot_t max_prot, int flags,
     vm_ooffset_t foff, struct thread *td)
 {
 	struct vmspace *vms;
 	vm_map_entry_t next_entry, prev_entry;
 	vm_offset_t align, mask, maxaddr;
 	int docow, error, rv, try;
 	bool curmap;
 
 	if (shmfd->shm_lp_psind == 0)
 		return (EINVAL);
 
 	/* MAP_PRIVATE is disabled */
 	if ((flags & ~(MAP_SHARED | MAP_FIXED | MAP_EXCL |
 	    MAP_NOCORE |
 #ifdef MAP_32BIT
 	    MAP_32BIT |
 #endif
 	    MAP_ALIGNMENT_MASK)) != 0)
 		return (EINVAL);
 
 	vms = td->td_proc->p_vmspace;
 	curmap = map == &vms->vm_map;
 	if (curmap) {
 		error = kern_mmap_racct_check(td, map, size);
 		if (error != 0)
 			return (error);
 	}
 
 	docow = shmfd->shm_lp_psind << MAP_SPLIT_BOUNDARY_SHIFT;
 	docow |= MAP_INHERIT_SHARE;
 	if ((flags & MAP_NOCORE) != 0)
 		docow |= MAP_DISABLE_COREDUMP;
 
 	mask = pagesizes[shmfd->shm_lp_psind] - 1;
 	if ((foff & mask) != 0)
 		return (EINVAL);
 	maxaddr = vm_map_max(map);
 #ifdef MAP_32BIT
 	if ((flags & MAP_32BIT) != 0 && maxaddr > MAP_32BIT_MAX_ADDR)
 		maxaddr = MAP_32BIT_MAX_ADDR;
 #endif
 	if (size == 0 || (size & mask) != 0 ||
 	    (*addr != 0 && ((*addr & mask) != 0 ||
 	    *addr + size < *addr || *addr + size > maxaddr)))
 		return (EINVAL);
 
 	align = flags & MAP_ALIGNMENT_MASK;
 	if (align == 0) {
 		align = pagesizes[shmfd->shm_lp_psind];
 	} else if (align == MAP_ALIGNED_SUPER) {
 		if (shmfd->shm_lp_psind != 1)
 			return (EINVAL);
 		align = pagesizes[1];
 	} else {
 		align >>= MAP_ALIGNMENT_SHIFT;
 		align = 1ULL << align;
 		/* Also handles overflow. */
 		if (align < pagesizes[shmfd->shm_lp_psind])
 			return (EINVAL);
 	}
 
 	vm_map_lock(map);
 	if ((flags & MAP_FIXED) == 0) {
 		try = 1;
 		if (curmap && (*addr == 0 ||
 		    (*addr >= round_page((vm_offset_t)vms->vm_taddr) &&
 		    *addr < round_page((vm_offset_t)vms->vm_daddr +
 		    lim_max(td, RLIMIT_DATA))))) {
 			*addr = roundup2((vm_offset_t)vms->vm_daddr +
 			    lim_max(td, RLIMIT_DATA),
 			    pagesizes[shmfd->shm_lp_psind]);
 		}
 again:
 		rv = vm_map_find_aligned(map, addr, size, maxaddr, align);
 		if (rv != KERN_SUCCESS) {
 			if (try == 1) {
 				try = 2;
 				*addr = vm_map_min(map);
 				if ((*addr & mask) != 0)
 					*addr = (*addr + mask) & mask;
 				goto again;
 			}
 			goto fail1;
 		}
 	} else if ((flags & MAP_EXCL) == 0) {
 		rv = vm_map_delete(map, *addr, *addr + size);
 		if (rv != KERN_SUCCESS)
 			goto fail1;
 	} else {
 		error = ENOSPC;
 		if (vm_map_lookup_entry(map, *addr, &prev_entry))
 			goto fail;
 		next_entry = vm_map_entry_succ(prev_entry);
 		if (next_entry->start < *addr + size)
 			goto fail;
 	}
 
 	rv = vm_map_insert(map, shmfd->shm_object, foff, *addr, *addr + size,
 	    prot, max_prot, docow);
 fail1:
 	error = vm_mmap_to_errno(rv);
 fail:
 	vm_map_unlock(map);
 	return (error);
 }
 
 static int
 shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
     vm_ooffset_t foff, struct thread *td)
 {
 	struct shmfd *shmfd;
 	vm_prot_t maxprot;
 	int error;
 	bool writecnt;
 	void *rl_cookie;
 
 	shmfd = fp->f_data;
 	maxprot = VM_PROT_NONE;
 
 	rl_cookie = rangelock_rlock(&shmfd->shm_rl, 0, objsize,
 	    &shmfd->shm_mtx);
 	/* FREAD should always be set. */
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
 
 	/*
 	 * If FWRITE's set, we can allow VM_PROT_WRITE unless it's a shared
 	 * mapping with a write seal applied.  Private mappings are always
 	 * writeable.
 	 */
 	if ((flags & MAP_SHARED) == 0) {
 		cap_maxprot |= VM_PROT_WRITE;
 		maxprot |= VM_PROT_WRITE;
 		writecnt = false;
 	} else {
 		if ((fp->f_flag & FWRITE) != 0 &&
 		    (shmfd->shm_seals & F_SEAL_WRITE) == 0)
 			maxprot |= VM_PROT_WRITE;
 
 		/*
 		 * Any mappings from a writable descriptor may be upgraded to
 		 * VM_PROT_WRITE with mprotect(2), unless a write-seal was
 		 * applied between the open and subsequent mmap(2).  We want to
 		 * reject application of a write seal as long as any such
 		 * mapping exists so that the seal cannot be trivially bypassed.
 		 */
 		writecnt = (maxprot & VM_PROT_WRITE) != 0;
 		if (!writecnt && (prot & VM_PROT_WRITE) != 0) {
 			error = EACCES;
 			goto out;
 		}
 	}
 	maxprot &= cap_maxprot;
 
 	/* See comment in vn_mmap(). */
 	if (
 #ifdef _LP64
 	    objsize > OFF_MAX ||
 #endif
 	    foff > OFF_MAX - objsize) {
 		error = EINVAL;
 		goto out;
 	}
 
 #ifdef MAC
 	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags);
 	if (error != 0)
 		goto out;
 #endif
 
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_atime);
 	mtx_unlock(&shm_timestamp_lock);
 	vm_object_reference(shmfd->shm_object);
 
 	if (shm_largepage(shmfd)) {
 		writecnt = false;
 		error = shm_mmap_large(shmfd, map, addr, objsize, prot,
 		    maxprot, flags, foff, td);
 	} else {
 		if (writecnt) {
 			vm_pager_update_writecount(shmfd->shm_object, 0,
 			    objsize);
 		}
 		error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
 		    shmfd->shm_object, foff, writecnt, td);
 	}
 	if (error != 0) {
 		if (writecnt)
 			vm_pager_release_writecount(shmfd->shm_object, 0,
 			    objsize);
 		vm_object_deallocate(shmfd->shm_object);
 	}
 out:
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	return (error);
 }
 
 static int
 shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	shmfd = fp->f_data;
 	mtx_lock(&shm_timestamp_lock);
 	/*
 	 * SUSv4 says that x bits of permission need not be affected.
 	 * Be consistent with our shm_open there.
 	 */
 #ifdef MAC
 	error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
 	if (error != 0)
 		goto out;
 #endif
 	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
 	    VADMIN, active_cred);
 	if (error != 0)
 		goto out;
 	shmfd->shm_mode = mode & ACCESSPERMS;
 out:
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 static int
 shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	shmfd = fp->f_data;
 	mtx_lock(&shm_timestamp_lock);
 #ifdef MAC
 	error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
 	if (error != 0)
 		goto out;
 #endif
 	if (uid == (uid_t)-1)
 		uid = shmfd->shm_uid;
 	if (gid == (gid_t)-1)
                  gid = shmfd->shm_gid;
 	if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
 	    (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
 	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN)))
 		goto out;
 	shmfd->shm_uid = uid;
 	shmfd->shm_gid = gid;
 out:
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 /*
  * Helper routines to allow the backing object of a shared memory file
  * descriptor to be mapped in the kernel.
  */
 int
 shm_map(struct file *fp, size_t size, off_t offset, void **memp)
 {
 	struct shmfd *shmfd;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	int rv;
 
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	obj = shmfd->shm_object;
 	VM_OBJECT_WLOCK(obj);
 	/*
 	 * XXXRW: This validation is probably insufficient, and subject to
 	 * sign errors.  It should be fixed.
 	 */
 	if (offset >= shmfd->shm_size ||
 	    offset + size > round_page(shmfd->shm_size)) {
 		VM_OBJECT_WUNLOCK(obj);
 		return (EINVAL);
 	}
 
 	shmfd->shm_kmappings++;
 	vm_object_reference_locked(obj);
 	VM_OBJECT_WUNLOCK(obj);
 
 	/* Map the object into the kernel_map and wire it. */
 	kva = vm_map_min(kernel_map);
 	ofs = offset & PAGE_MASK;
 	offset = trunc_page(offset);
 	size = round_page(size + ofs);
 	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
 	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 	    VM_PROT_READ | VM_PROT_WRITE, 0);
 	if (rv == KERN_SUCCESS) {
 		rv = vm_map_wire(kernel_map, kva, kva + size,
 		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 		if (rv == KERN_SUCCESS) {
 			*memp = (void *)(kva + ofs);
 			return (0);
 		}
 		vm_map_remove(kernel_map, kva, kva + size);
 	} else
 		vm_object_deallocate(obj);
 
 	/* On failure, drop our mapping reference. */
 	VM_OBJECT_WLOCK(obj);
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 
 	return (vm_mmap_to_errno(rv));
 }
 
 /*
  * We require the caller to unmap the entire entry.  This allows us to
  * safely decrement shm_kmappings when a mapping is removed.
  */
 int
 shm_unmap(struct file *fp, void *mem, size_t size)
 {
 	struct shmfd *shmfd;
 	vm_map_entry_t entry;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 	vm_map_t map;
 	int rv;
 
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	kva = (vm_offset_t)mem;
 	ofs = kva & PAGE_MASK;
 	kva = trunc_page(kva);
 	size = round_page(size + ofs);
 	map = kernel_map;
 	rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
 	    &obj, &pindex, &prot, &wired);
 	if (rv != KERN_SUCCESS)
 		return (EINVAL);
 	if (entry->start != kva || entry->end != kva + size) {
 		vm_map_lookup_done(map, entry);
 		return (EINVAL);
 	}
 	vm_map_lookup_done(map, entry);
 	if (obj != shmfd->shm_object)
 		return (EINVAL);
 	vm_map_remove(map, kva, kva + size);
 	VM_OBJECT_WLOCK(obj);
 	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 	return (0);
 }
 
 static int
 shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list)
 {
 	const char *path, *pr_path;
 	size_t pr_pathlen;
 	bool visible;
 
 	sx_assert(&shm_dict_lock, SA_LOCKED);
 	kif->kf_type = KF_TYPE_SHM;
 	kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode;
 	kif->kf_un.kf_file.kf_file_size = shmfd->shm_size;
 	if (shmfd->shm_path != NULL) {
 		if (shmfd->shm_path != NULL) {
 			path = shmfd->shm_path;
 			pr_path = curthread->td_ucred->cr_prison->pr_path;
 			if (strcmp(pr_path, "/") != 0) {
 				/* Return the jail-rooted pathname. */
 				pr_pathlen = strlen(pr_path);
 				visible = strncmp(path, pr_path, pr_pathlen)
 				    == 0 && path[pr_pathlen] == '/';
 				if (list && !visible)
 					return (EPERM);
 				if (visible)
 					path += pr_pathlen;
 			}
 			strlcpy(kif->kf_path, path, sizeof(kif->kf_path));
 		}
 	}
 	return (0);
 }
 
 static int
 shm_fill_kinfo(struct file *fp, struct kinfo_file *kif,
     struct filedesc *fdp __unused)
 {
 	int res;
 
 	sx_slock(&shm_dict_lock);
 	res = shm_fill_kinfo_locked(fp->f_data, kif, false);
 	sx_sunlock(&shm_dict_lock);
 	return (res);
 }
 
 static int
 shm_add_seals(struct file *fp, int seals)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	vm_ooffset_t writemappings;
 	int error, nseals;
 
 	error = 0;
 	shmfd = fp->f_data;
 	rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 	    &shmfd->shm_mtx);
 
 	/* Even already-set seals should result in EPERM. */
 	if ((shmfd->shm_seals & F_SEAL_SEAL) != 0) {
 		error = EPERM;
 		goto out;
 	}
 	nseals = seals & ~shmfd->shm_seals;
 	if ((nseals & F_SEAL_WRITE) != 0) {
 		if (shm_largepage(shmfd)) {
 			error = ENOTSUP;
 			goto out;
 		}
 
 		/*
 		 * The rangelock above prevents writable mappings from being
 		 * added after we've started applying seals.  The RLOCK here
 		 * is to avoid torn reads on ILP32 arches as unmapping/reducing
 		 * writemappings will be done without a rangelock.
 		 */
 		VM_OBJECT_RLOCK(shmfd->shm_object);
 		writemappings = shmfd->shm_object->un_pager.swp.writemappings;
 		VM_OBJECT_RUNLOCK(shmfd->shm_object);
 		/* kmappings are also writable */
 		if (writemappings > 0) {
 			error = EBUSY;
 			goto out;
 		}
 	}
 	shmfd->shm_seals |= nseals;
 out:
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	return (error);
 }
 
 static int
 shm_get_seals(struct file *fp, int *seals)
 {
 	struct shmfd *shmfd;
 
 	shmfd = fp->f_data;
 	*seals = shmfd->shm_seals;
 	return (0);
 }
 
 static int
 shm_deallocate(struct shmfd *shmfd, off_t *offset, off_t *length, int flags)
 {
 	vm_object_t object;
 	vm_pindex_t pistart, pi, piend;
 	vm_ooffset_t off, len;
 	int startofs, endofs, end;
 	int error;
 
 	off = *offset;
 	len = *length;
 	KASSERT(off + len <= (vm_ooffset_t)OFF_MAX, ("off + len overflows"));
 	if (off + len > shmfd->shm_size)
 		len = shmfd->shm_size - off;
 	object = shmfd->shm_object;
 	startofs = off & PAGE_MASK;
 	endofs = (off + len) & PAGE_MASK;
 	pistart = OFF_TO_IDX(off);
 	piend = OFF_TO_IDX(off + len);
 	pi = OFF_TO_IDX(off + PAGE_MASK);
 	error = 0;
 
 	/* Handle the case when offset is on or beyond shm size. */
 	if ((off_t)len <= 0) {
 		*length = 0;
 		return (0);
 	}
 
 	VM_OBJECT_WLOCK(object);
 
 	if (startofs != 0) {
 		end = pistart != piend ? PAGE_SIZE : endofs;
 		error = shm_partial_page_invalidate(object, pistart, startofs,
 		    end);
 		if (error)
 			goto out;
 		off += end - startofs;
 		len -= end - startofs;
 	}
 
 	if (pi < piend) {
 		vm_object_page_remove(object, pi, piend, 0);
 		off += IDX_TO_OFF(piend - pi);
 		len -= IDX_TO_OFF(piend - pi);
 	}
 
 	if (endofs != 0 && pistart != piend) {
 		error = shm_partial_page_invalidate(object, piend, 0, endofs);
 		if (error)
 			goto out;
 		off += endofs;
 		len -= endofs;
 	}
 
 out:
 	VM_OBJECT_WUNLOCK(shmfd->shm_object);
 	*offset = off;
 	*length = len;
 	return (error);
 }
 
 static int
 shm_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags,
     struct ucred *active_cred, struct thread *td)
 {
 	void *rl_cookie;
 	struct shmfd *shmfd;
 	off_t off, len;
 	int error;
 
 	/* This assumes that the caller already checked for overflow. */
 	error = EINVAL;
 	shmfd = fp->f_data;
 	off = *offset;
 	len = *length;
 
 	if (cmd != SPACECTL_DEALLOC || off < 0 || len <= 0 ||
 	    len > OFF_MAX - off || flags != 0)
 		return (EINVAL);
 
 	rl_cookie = rangelock_wlock(&shmfd->shm_rl, off, off + len,
 	    &shmfd->shm_mtx);
 	switch (cmd) {
 	case SPACECTL_DEALLOC:
 		if ((shmfd->shm_seals & F_SEAL_WRITE) != 0) {
 			error = EPERM;
 			break;
 		}
 		error = shm_deallocate(shmfd, &off, &len, flags);
 		*offset = off;
 		*length = len;
 		break;
 	default:
 		__assert_unreachable();
 	}
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	return (error);
 }
 
 
 static int
 shm_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
 {
 	void *rl_cookie;
 	struct shmfd *shmfd;
 	size_t size;
 	int error;
 
 	/* This assumes that the caller already checked for overflow. */
 	error = 0;
 	shmfd = fp->f_data;
 	size = offset + len;
 
 	/*
 	 * Just grab the rangelock for the range that we may be attempting to
 	 * grow, rather than blocking read/write for regions we won't be
 	 * touching while this (potential) resize is in progress.  Other
 	 * attempts to resize the shmfd will have to take a write lock from 0 to
 	 * OFF_MAX, so this being potentially beyond the current usable range of
 	 * the shmfd is not necessarily a concern.  If other mechanisms are
 	 * added to grow a shmfd, this may need to be re-evaluated.
 	 */
 	rl_cookie = rangelock_wlock(&shmfd->shm_rl, offset, size,
 	    &shmfd->shm_mtx);
 	if (size > shmfd->shm_size)
 		error = shm_dotruncate_cookie(shmfd, size, rl_cookie);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	/* Translate to posix_fallocate(2) return value as needed. */
 	if (error == ENOMEM)
 		error = ENOSPC;
 	return (error);
 }
 
 static int
 sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS)
 {
 	struct shm_mapping *shmm;
 	struct sbuf sb;
 	struct kinfo_file kif;
 	u_long i;
 	ssize_t curlen;
 	int error, error2;
 
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	curlen = 0;
 	error = 0;
 	sx_slock(&shm_dict_lock);
 	for (i = 0; i < shm_hash + 1; i++) {
 		LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) {
 			error = shm_fill_kinfo_locked(shmm->sm_shmfd,
 			    &kif, true);
 			if (error == EPERM) {
 				error = 0;
 				continue;
 			}
 			if (error != 0)
 				break;
 			pack_kinfo(&kif);
 			if (req->oldptr != NULL &&
 			    kif.kf_structsize + curlen > req->oldlen)
 				break;
 			error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ?
 			    0 : ENOMEM;
 			if (error != 0)
 				break;
 			curlen += kif.kf_structsize;
 		}
 	}
 	sx_sunlock(&shm_dict_lock);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list,
     CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE,
     NULL, 0, sysctl_posix_shm_list, "",
     "POSIX SHM list");
 
 int
 kern_shm_open(struct thread *td, const char *path, int flags, mode_t mode,
     struct filecaps *caps)
 {
 
 	return (kern_shm_open2(td, path, flags, mode, 0, caps, NULL));
 }
 
 /*
  * This version of the shm_open() interface leaves CLOEXEC behavior up to the
  * caller, and libc will enforce it for the traditional shm_open() call.  This
  * allows other consumers, like memfd_create(), to opt-in for CLOEXEC.  This
  * interface also includes a 'name' argument that is currently unused, but could
  * potentially be exported later via some interface for debugging purposes.
  * From the kernel's perspective, it is optional.  Individual consumers like
  * memfd_create() may require it in order to be compatible with other systems
  * implementing the same function.
  */
 int
 sys_shm_open2(struct thread *td, struct shm_open2_args *uap)
 {
 
 	return (kern_shm_open2(td, uap->path, uap->flags, uap->mode,
 	    uap->shmflags, NULL, uap->name));
 }
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 82a8125ece95..29a6b34d083a 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -1,5012 +1,5012 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_syscalls.c	8.13 (Berkeley) 4/15/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/capsicum.h>
 #include <sys/disk.h>
 #include <sys/sysent.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/namei.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/dirent.h>
 #include <sys/jail.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <machine/stdarg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 
 #include <fs/devfs/devfs.h>
 
 MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
 
 static int kern_chflagsat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, u_long flags, int atflag);
 static int setfflags(struct thread *td, struct vnode *, u_long);
 static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
 static int getutimens(const struct timespec *, enum uio_seg,
     struct timespec *, int *);
 static int setutimes(struct thread *td, struct vnode *,
     const struct timespec *, int, int);
 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
     struct thread *td);
 static int kern_fhlinkat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, fhandle_t *fhp);
 static int kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg,
     size_t count, struct thread *td);
 static int kern_linkat_vp(struct thread *td, struct vnode *vp, int fd,
     const char *path, enum uio_seg segflag);
 
 static uint64_t
 at2cnpflags(u_int at_flags, u_int mask)
 {
 	u_int64_t res;
 
 	MPASS((at_flags & (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW)) !=
 	    (AT_SYMLINK_FOLLOW | AT_SYMLINK_NOFOLLOW));
 
 	res = 0;
 	at_flags &= mask;
 	if ((at_flags & AT_RESOLVE_BENEATH) != 0)
 		res |= RBENEATH;
 	if ((at_flags & AT_SYMLINK_FOLLOW) != 0)
 		res |= FOLLOW;
 	/* NOFOLLOW is pseudo flag */
 	if ((mask & AT_SYMLINK_NOFOLLOW) != 0) {
 		res |= (at_flags & AT_SYMLINK_NOFOLLOW) != 0 ? NOFOLLOW :
 		    FOLLOW;
 	}
 	if ((mask & AT_EMPTY_PATH) != 0 && (at_flags & AT_EMPTY_PATH) != 0)
 		res |= EMPTYPATH;
 	return (res);
 }
 
 int
 kern_sync(struct thread *td)
 {
 	struct mount *mp, *nmp;
 	int save;
 
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 			save = curthread_pflags_set(TDP_SYNCIO);
 			vfs_periodic(mp, MNT_NOWAIT);
 			VFS_SYNC(mp, MNT_NOWAIT);
 			curthread_pflags_restore(save);
 			vn_finished_write(mp);
 		}
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
 	return (0);
 }
 
 /*
  * Sync each mounted filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sync_args {
 	int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_sync(struct thread *td, struct sync_args *uap)
 {
 
 	return (kern_sync(td));
 }
 
 /*
  * Change filesystem quotas.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct quotactl_args {
 	char *path;
 	int cmd;
 	int uid;
 	caddr_t arg;
 };
 #endif
 int
 sys_quotactl(struct thread *td, struct quotactl_args *uap)
 {
 	struct mount *mp;
 	struct nameidata nd;
 	int error;
 	bool mp_busy;
 
 	AUDIT_ARG_CMD(uap->cmd);
 	AUDIT_ARG_UID(uap->uid);
 	if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
 		return (EPERM);
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	mp = nd.ni_vp->v_mount;
 	vfs_ref(mp);
 	vput(nd.ni_vp);
 	error = vfs_busy(mp, 0);
 	if (error != 0) {
 		vfs_rel(mp);
 		return (error);
 	}
 	mp_busy = true;
 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, &mp_busy);
 
 	/*
 	 * Since quota on/off operations typically need to open quota
 	 * files, the implementation may need to unbusy the mount point
 	 * before calling into namei.  Otherwise, unmount might be
 	 * started between two vfs_busy() invocations (first is ours,
 	 * second is from mount point cross-walk code in lookup()),
 	 * causing deadlock.
 	 *
 	 * Avoid unbusying mp if the implementation indicates it has
 	 * already done so.
 	 */
 	if (mp_busy)
 		vfs_unbusy(mp);
 	vfs_rel(mp);
 	return (error);
 }
 
 /*
  * Used by statfs conversion routines to scale the block size up if
  * necessary so that all of the block counts are <= 'max_size'.  Note
  * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
  * value of 'n'.
  */
 void
 statfs_scale_blocks(struct statfs *sf, long max_size)
 {
 	uint64_t count;
 	int shift;
 
 	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
 
 	/*
 	 * Attempt to scale the block counts to give a more accurate
 	 * overview to userland of the ratio of free space to used
 	 * space.  To do this, find the largest block count and compute
 	 * a divisor that lets it fit into a signed integer <= max_size.
 	 */
 	if (sf->f_bavail < 0)
 		count = -sf->f_bavail;
 	else
 		count = sf->f_bavail;
 	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
 	if (count <= max_size)
 		return;
 
 	count >>= flsl(max_size);
 	shift = 0;
 	while (count > 0) {
 		shift++;
 		count >>=1;
 	}
 
 	sf->f_bsize <<= shift;
 	sf->f_blocks >>= shift;
 	sf->f_bfree >>= shift;
 	sf->f_bavail >>= shift;
 }
 
 static int
 kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
 {
 	int error;
 
 	if (mp == NULL)
 		return (EBADF);
 	error = vfs_busy(mp, 0);
 	vfs_rel(mp);
 	if (error != 0)
 		return (error);
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
 	if (error != 0)
 		goto out;
 #endif
 	error = VFS_STATFS(mp, buf);
 	if (error != 0)
 		goto out;
 	if (priv_check_cred_vfs_generation(td->td_ucred)) {
 		buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, buf);
 	}
 out:
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct statfs_args {
 	char *path;
 	struct statfs *buf;
 };
 #endif
 int
 sys_statfs(struct thread *td, struct statfs_args *uap)
 {
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
 	if (error == 0)
 		error = copyout(sfp, uap->buf, sizeof(struct statfs));
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 int
 kern_statfs(struct thread *td, const char *path, enum uio_seg pathseg,
     struct statfs *buf)
 {
 	struct mount *mp;
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	mp = vfs_ref_from_vp(nd.ni_vp);
 	NDFREE_NOTHING(&nd);
 	vrele(nd.ni_vp);
 	return (kern_do_statfs(td, mp, buf));
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstatfs_args {
 	int fd;
 	struct statfs *buf;
 };
 #endif
 int
 sys_fstatfs(struct thread *td, struct fstatfs_args *uap)
 {
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fstatfs(td, uap->fd, sfp);
 	if (error == 0)
 		error = copyout(sfp, uap->buf, sizeof(struct statfs));
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 int
 kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
 {
 	struct file *fp;
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getvnode_path(td, fd, &cap_fstatfs_rights, &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	if (AUDITING_TD(td)) {
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		AUDIT_ARG_VNODE1(vp);
 		VOP_UNLOCK(vp);
 	}
 #endif
 	mp = vfs_ref_from_vp(vp);
 	fdrop(fp, td);
 	return (kern_do_statfs(td, mp, buf));
 }
 
 /*
  * Get statistics on all filesystems.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getfsstat_args {
 	struct statfs *buf;
 	long bufsize;
 	int mode;
 };
 #endif
 int
 sys_getfsstat(struct thread *td, struct getfsstat_args *uap)
 {
 	size_t count;
 	int error;
 
 	if (uap->bufsize < 0 || uap->bufsize > SIZE_MAX)
 		return (EINVAL);
 	error = kern_getfsstat(td, &uap->buf, uap->bufsize, &count,
 	    UIO_USERSPACE, uap->mode);
 	if (error == 0)
 		td->td_retval[0] = count;
 	return (error);
 }
 
 /*
  * If (bufsize > 0 && bufseg == UIO_SYSSPACE)
  *	The caller is responsible for freeing memory which will be allocated
  *	in '*buf'.
  */
 int
 kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
     size_t *countp, enum uio_seg bufseg, int mode)
 {
 	struct mount *mp, *nmp;
 	struct statfs *sfsp, *sp, *sptmp, *tofree;
 	size_t count, maxcount;
 	int error;
 
 	switch (mode) {
 	case MNT_WAIT:
 	case MNT_NOWAIT:
 		break;
 	default:
 		if (bufseg == UIO_SYSSPACE)
 			*buf = NULL;
 		return (EINVAL);
 	}
 restart:
 	maxcount = bufsize / sizeof(struct statfs);
 	if (bufsize == 0) {
 		sfsp = NULL;
 		tofree = NULL;
 	} else if (bufseg == UIO_USERSPACE) {
 		sfsp = *buf;
 		tofree = NULL;
 	} else /* if (bufseg == UIO_SYSSPACE) */ {
 		count = 0;
 		mtx_lock(&mountlist_mtx);
 		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 			count++;
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (maxcount > count)
 			maxcount = count;
 		tofree = sfsp = *buf = malloc(maxcount * sizeof(struct statfs),
 		    M_STATFS, M_WAITOK);
 	}
 
 	count = 0;
 
 	/*
 	 * If there is no target buffer they only want the count.
 	 *
 	 * This could be TAILQ_FOREACH but it is open-coded to match the original
 	 * code below.
 	 */
 	if (sfsp == NULL) {
 		mtx_lock(&mountlist_mtx);
 		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 			if (prison_canseemount(td->td_ucred, mp) != 0) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 #ifdef MAC
 			if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 #endif
 			count++;
 			nmp = TAILQ_NEXT(mp, mnt_list);
 		}
 		mtx_unlock(&mountlist_mtx);
 		*countp = count;
 		return (0);
 	}
 
 	/*
 	 * They want the entire thing.
 	 *
 	 * Short-circuit the corner case of no room for anything, avoids
 	 * relocking below.
 	 */
 	if (maxcount < 1) {
 		goto out;
 	}
 
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (prison_canseemount(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #ifdef MAC
 		if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
 #endif
 		if (mode == MNT_WAIT) {
 			if (vfs_busy(mp, MBF_MNTLSTLOCK) != 0) {
 				/*
 				 * If vfs_busy() failed, and MBF_NOWAIT
 				 * wasn't passed, then the mp is gone.
 				 * Furthermore, because of MBF_MNTLSTLOCK,
 				 * the mountlist_mtx was dropped.  We have
 				 * no other choice than to start over.
 				 */
 				mtx_unlock(&mountlist_mtx);
 				free(tofree, M_STATFS);
 				goto restart;
 			}
 		} else {
 			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				continue;
 			}
 		}
 		sp = &mp->mnt_stat;
 		/*
 		 * If MNT_NOWAIT is specified, do not refresh
 		 * the fsstat cache.
 		 */
 		if (mode != MNT_NOWAIT) {
 			error = VFS_STATFS(mp, sp);
 			if (error != 0) {
 				mtx_lock(&mountlist_mtx);
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				vfs_unbusy(mp);
 				continue;
 			}
 		}
 		if (priv_check_cred_vfs_generation(td->td_ucred)) {
 			sptmp = malloc(sizeof(struct statfs), M_STATFS,
 			    M_WAITOK);
 			*sptmp = *sp;
 			sptmp->f_fsid.val[0] = sptmp->f_fsid.val[1] = 0;
 			prison_enforce_statfs(td->td_ucred, mp, sptmp);
 			sp = sptmp;
 		} else
 			sptmp = NULL;
 		if (bufseg == UIO_SYSSPACE) {
 			bcopy(sp, sfsp, sizeof(*sp));
 			free(sptmp, M_STATFS);
 		} else /* if (bufseg == UIO_USERSPACE) */ {
 			error = copyout(sp, sfsp, sizeof(*sp));
 			free(sptmp, M_STATFS);
 			if (error != 0) {
 				vfs_unbusy(mp);
 				return (error);
 			}
 		}
 		sfsp++;
 		count++;
 
 		if (count == maxcount) {
 			vfs_unbusy(mp);
 			goto out;
 		}
 
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
 out:
 	*countp = count;
 	return (0);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Get old format filesystem statistics.
  */
 static void freebsd4_cvtstatfs(struct statfs *, struct ostatfs *);
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_statfs_args {
 	char *path;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_statfs(struct thread *td, struct freebsd4_statfs_args *uap)
 {
 	struct ostatfs osb;
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
 	if (error == 0) {
 		freebsd4_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fstatfs_args {
 	int fd;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fstatfs(struct thread *td, struct freebsd4_fstatfs_args *uap)
 {
 	struct ostatfs osb;
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fstatfs(td, uap->fd, sfp);
 	if (error == 0) {
 		freebsd4_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Get statistics on all filesystems.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_getfsstat_args {
 	struct ostatfs *buf;
 	long bufsize;
 	int mode;
 };
 #endif
 int
 freebsd4_getfsstat(struct thread *td, struct freebsd4_getfsstat_args *uap)
 {
 	struct statfs *buf, *sp;
 	struct ostatfs osb;
 	size_t count, size;
 	int error;
 
 	if (uap->bufsize < 0)
 		return (EINVAL);
 	count = uap->bufsize / sizeof(struct ostatfs);
 	if (count > SIZE_MAX / sizeof(struct statfs))
 		return (EINVAL);
 	size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
 	    uap->mode);
 	if (error == 0)
 		td->td_retval[0] = count;
 	if (size != 0) {
 		sp = buf;
 		while (count != 0 && error == 0) {
 			freebsd4_cvtstatfs(sp, &osb);
 			error = copyout(&osb, uap->buf, sizeof(osb));
 			sp++;
 			uap->buf++;
 			count--;
 		}
 		free(buf, M_STATFS);
 	}
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct ostatfs *buf;
 };
 #endif
 int
 freebsd4_fhstatfs(struct thread *td, struct freebsd4_fhstatfs_args *uap)
 {
 	struct ostatfs osb;
 	struct statfs *sfp;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error != 0)
 		return (error);
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fhstatfs(td, fh, sfp);
 	if (error == 0) {
 		freebsd4_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Convert a new format statfs structure to an old format statfs structure.
  */
 static void
 freebsd4_cvtstatfs(struct statfs *nsp, struct ostatfs *osp)
 {
 
 	statfs_scale_blocks(nsp, LONG_MAX);
 	bzero(osp, sizeof(*osp));
 	osp->f_bsize = nsp->f_bsize;
 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
 	osp->f_blocks = nsp->f_blocks;
 	osp->f_bfree = nsp->f_bfree;
 	osp->f_bavail = nsp->f_bavail;
 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
 	osp->f_owner = nsp->f_owner;
 	osp->f_type = nsp->f_type;
 	osp->f_flags = nsp->f_flags;
 	osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
 	osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
 	osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
 	osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
 	    MIN(MFSNAMELEN, OMFSNAMELEN));
 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 	    MIN(MNAMELEN, OMNAMELEN));
 	osp->f_fsid = nsp->f_fsid;
 }
 #endif /* COMPAT_FREEBSD4 */
 
 #if defined(COMPAT_FREEBSD11)
 /*
  * Get old format filesystem statistics.
  */
 static void freebsd11_cvtstatfs(struct statfs *, struct freebsd11_statfs *);
 
 int
 freebsd11_statfs(struct thread *td, struct freebsd11_statfs_args *uap)
 {
 	struct freebsd11_statfs osb;
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_statfs(td, uap->path, UIO_USERSPACE, sfp);
 	if (error == 0) {
 		freebsd11_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Get filesystem statistics.
  */
 int
 freebsd11_fstatfs(struct thread *td, struct freebsd11_fstatfs_args *uap)
 {
 	struct freebsd11_statfs osb;
 	struct statfs *sfp;
 	int error;
 
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fstatfs(td, uap->fd, sfp);
 	if (error == 0) {
 		freebsd11_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Get statistics on all filesystems.
  */
 int
 freebsd11_getfsstat(struct thread *td, struct freebsd11_getfsstat_args *uap)
 {
 	struct freebsd11_statfs osb;
 	struct statfs *buf, *sp;
 	size_t count, size;
 	int error;
 
 	count = uap->bufsize / sizeof(struct ostatfs);
 	size = count * sizeof(struct statfs);
 	error = kern_getfsstat(td, &buf, size, &count, UIO_SYSSPACE,
 	    uap->mode);
 	if (error == 0)
 		td->td_retval[0] = count;
 	if (size > 0) {
 		sp = buf;
 		while (count > 0 && error == 0) {
 			freebsd11_cvtstatfs(sp, &osb);
 			error = copyout(&osb, uap->buf, sizeof(osb));
 			sp++;
 			uap->buf++;
 			count--;
 		}
 		free(buf, M_STATFS);
 	}
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 int
 freebsd11_fhstatfs(struct thread *td, struct freebsd11_fhstatfs_args *uap)
 {
 	struct freebsd11_statfs osb;
 	struct statfs *sfp;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error)
 		return (error);
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fhstatfs(td, fh, sfp);
 	if (error == 0) {
 		freebsd11_cvtstatfs(sfp, &osb);
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	}
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 /*
  * Convert a new format statfs structure to an old format statfs structure.
  */
 static void
 freebsd11_cvtstatfs(struct statfs *nsp, struct freebsd11_statfs *osp)
 {
 
 	bzero(osp, sizeof(*osp));
 	osp->f_version = FREEBSD11_STATFS_VERSION;
 	osp->f_type = nsp->f_type;
 	osp->f_flags = nsp->f_flags;
 	osp->f_bsize = nsp->f_bsize;
 	osp->f_iosize = nsp->f_iosize;
 	osp->f_blocks = nsp->f_blocks;
 	osp->f_bfree = nsp->f_bfree;
 	osp->f_bavail = nsp->f_bavail;
 	osp->f_files = nsp->f_files;
 	osp->f_ffree = nsp->f_ffree;
 	osp->f_syncwrites = nsp->f_syncwrites;
 	osp->f_asyncwrites = nsp->f_asyncwrites;
 	osp->f_syncreads = nsp->f_syncreads;
 	osp->f_asyncreads = nsp->f_asyncreads;
 	osp->f_namemax = nsp->f_namemax;
 	osp->f_owner = nsp->f_owner;
 	osp->f_fsid = nsp->f_fsid;
 	strlcpy(osp->f_fstypename, nsp->f_fstypename,
 	    MIN(MFSNAMELEN, sizeof(osp->f_fstypename)));
 	strlcpy(osp->f_mntonname, nsp->f_mntonname,
 	    MIN(MNAMELEN, sizeof(osp->f_mntonname)));
 	strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
 	    MIN(MNAMELEN, sizeof(osp->f_mntfromname)));
 }
 #endif /* COMPAT_FREEBSD11 */
 
 /*
  * Change current working directory to a given file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchdir_args {
 	int	fd;
 };
 #endif
 int
 sys_fchdir(struct thread *td, struct fchdir_args *uap)
 {
 	struct vnode *vp, *tdp;
 	struct mount *mp;
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	error = getvnode_path(td, uap->fd, &cap_fchdir_rights,
 	    &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 	vref(vp);
 	fdrop(fp, td);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	error = change_dir(vp, td);
 	while (!error && (mp = vp->v_mountedhere) != NULL) {
 		if (vfs_busy(mp, 0))
 			continue;
 		error = VFS_ROOT(mp, LK_SHARED, &tdp);
 		vfs_unbusy(mp);
 		if (error != 0)
 			break;
 		vput(vp);
 		vp = tdp;
 	}
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	VOP_UNLOCK(vp);
 	pwd_chdir(td, vp);
 	return (0);
 }
 
 /*
  * Change current working directory (``.'').
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chdir_args {
 	char	*path;
 };
 #endif
 int
 sys_chdir(struct thread *td, struct chdir_args *uap)
 {
 
 	return (kern_chdir(td, uap->path, UIO_USERSPACE));
 }
 
 int
 kern_chdir(struct thread *td, const char *path, enum uio_seg pathseg)
 {
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if ((error = change_dir(nd.ni_vp, td)) != 0) {
 		vput(nd.ni_vp);
 		NDFREE_NOTHING(&nd);
 		return (error);
 	}
 	VOP_UNLOCK(nd.ni_vp);
 	NDFREE_NOTHING(&nd);
 	pwd_chdir(td, nd.ni_vp);
 	return (0);
 }
 
 static int unprivileged_chroot = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_chroot, CTLFLAG_RW,
     &unprivileged_chroot, 0,
     "Unprivileged processes can use chroot(2)");
 /*
  * Change notion of root (``/'') directory.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chroot_args {
 	char	*path;
 };
 #endif
 int
 sys_chroot(struct thread *td, struct chroot_args *uap)
 {
 	struct nameidata nd;
 	struct proc *p;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_CHROOT);
 	if (error != 0) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		if (unprivileged_chroot == 0 ||
 		    (p->p_flag2 & P2_NO_NEW_PRIVS) == 0) {
 			PROC_UNLOCK(p);
 			return (error);
 		}
 		PROC_UNLOCK(p);
 	}
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error != 0)
 		goto error;
 	error = change_dir(nd.ni_vp, td);
 	if (error != 0)
 		goto e_vunlock;
 #ifdef MAC
 	error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
 	if (error != 0)
 		goto e_vunlock;
 #endif
 	VOP_UNLOCK(nd.ni_vp);
 	error = pwd_chroot(td, nd.ni_vp);
 	vrele(nd.ni_vp);
 	NDFREE_NOTHING(&nd);
 	return (error);
 e_vunlock:
 	vput(nd.ni_vp);
 error:
 	NDFREE_NOTHING(&nd);
 	return (error);
 }
 
 /*
  * Common routine for chroot and chdir.  Callers must provide a locked vnode
  * instance.
  */
 int
 change_dir(struct vnode *vp, struct thread *td)
 {
 #ifdef MAC
 	int error;
 #endif
 
 	ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
 	if (vp->v_type != VDIR)
 		return (ENOTDIR);
 #ifdef MAC
 	error = mac_vnode_check_chdir(td->td_ucred, vp);
 	if (error != 0)
 		return (error);
 #endif
 	return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
 }
 
 static __inline void
 flags_to_rights(int flags, cap_rights_t *rightsp)
 {
 	if (flags & O_EXEC) {
 		cap_rights_set_one(rightsp, CAP_FEXECVE);
 		if (flags & O_PATH)
 			return;
 	} else {
 		switch ((flags & O_ACCMODE)) {
 		case O_RDONLY:
 			cap_rights_set_one(rightsp, CAP_READ);
 			break;
 		case O_RDWR:
 			cap_rights_set_one(rightsp, CAP_READ);
 			/* FALLTHROUGH */
 		case O_WRONLY:
 			cap_rights_set_one(rightsp, CAP_WRITE);
 			if (!(flags & (O_APPEND | O_TRUNC)))
 				cap_rights_set_one(rightsp, CAP_SEEK);
 			break;
 		}
 	}
 
 	if (flags & O_CREAT)
 		cap_rights_set_one(rightsp, CAP_CREATE);
 
 	if (flags & O_TRUNC)
 		cap_rights_set_one(rightsp, CAP_FTRUNCATE);
 
 	if (flags & (O_SYNC | O_FSYNC))
 		cap_rights_set_one(rightsp, CAP_FSYNC);
 
 	if (flags & (O_EXLOCK | O_SHLOCK))
 		cap_rights_set_one(rightsp, CAP_FLOCK);
 }
 
 /*
  * Check permissions, allocate an open file structure, and call the device
  * open routine if any.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct open_args {
 	char	*path;
 	int	flags;
 	int	mode;
 };
 #endif
 int
 sys_open(struct thread *td, struct open_args *uap)
 {
 
 	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->flags, uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct openat_args {
 	int	fd;
 	char	*path;
 	int	flag;
 	int	mode;
 };
 #endif
 int
 sys_openat(struct thread *td, struct openat_args *uap)
 {
 
 	AUDIT_ARG_FD(uap->fd);
 	return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
 	    uap->mode));
 }
 
 int
 kern_openat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
     int flags, int mode)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp;
 	struct pwddesc *pdp;
 	struct file *fp;
 	struct vnode *vp;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int cmode, error, indx;
 
 	indx = -1;
 	fdp = p->p_fd;
 	pdp = p->p_pd;
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 	cap_rights_init_one(&rights, CAP_LOOKUP);
 	flags_to_rights(flags, &rights);
 
 	/*
 	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
 	 * may be specified.  On the other hand, for O_PATH any mode
 	 * except O_EXEC is ignored.
 	 */
 	if ((flags & O_PATH) != 0) {
 		flags &= ~(O_CREAT | O_ACCMODE);
 	} else if ((flags & O_EXEC) != 0) {
 		if (flags & O_ACCMODE)
 			return (EINVAL);
 	} else if ((flags & O_ACCMODE) == O_ACCMODE) {
 		return (EINVAL);
 	} else {
 		flags = FFLAGS(flags);
 	}
 
 	/*
 	 * Allocate a file structure. The descriptor to reference it
 	 * is allocated and used by finstall_refed() below.
 	 */
 	error = falloc_noinstall(td, &fp);
 	if (error != 0)
 		return (error);
 	/* Set the flags early so the finit in devfs can pick them up. */
 	fp->f_flag = flags & FMASK;
 	cmode = ((mode & ~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT;
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
 	    &rights, td);
 	td->td_dupfd = -1;		/* XXX check for fdopen */
 	error = vn_open(&nd, &flags, cmode, fp);
 	if (error != 0) {
 		/*
 		 * If the vn_open replaced the method vector, something
 		 * wonderous happened deep below and we just pass it up
 		 * pretending we know what we do.
 		 */
 		if (error == ENXIO && fp->f_ops != &badfileops) {
 			MPASS((flags & O_PATH) == 0);
 			goto success;
 		}
 
 		/*
 		 * Handle special fdopen() case. bleh.
 		 *
 		 * Don't do this for relative (capability) lookups; we don't
 		 * understand exactly what would happen, and we don't think
 		 * that it ever should.
 		 */
 		if ((nd.ni_resflags & NIRES_STRICTREL) == 0 &&
 		    (error == ENODEV || error == ENXIO) &&
 		    td->td_dupfd >= 0) {
 			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
 			    &indx);
 			if (error == 0)
 				goto success;
 		}
 
 		goto bad;
 	}
 	td->td_dupfd = 0;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/*
 	 * Store the vnode, for any f_type. Typically, the vnode use
 	 * count is decremented by direct call to vn_closefile() for
 	 * files that switched type in the cdevsw fdopen() method.
 	 */
 	fp->f_vnode = vp;
 
 	/*
 	 * If the file wasn't claimed by devfs bind it to the normal
 	 * vnode operations here.
 	 */
 	if (fp->f_ops == &badfileops) {
 		KASSERT(vp->v_type != VFIFO || (flags & O_PATH) != 0,
 		    ("Unexpected fifo fp %p vp %p", fp, vp));
 		if ((flags & O_PATH) != 0) {
 			finit(fp, (flags & FMASK) | (fp->f_flag & FKQALLOWED),
 			    DTYPE_VNODE, NULL, &path_fileops);
 			vhold(vp);
 			vunref(vp);
 		} else {
 			finit_vnode(fp, flags, NULL, &vnops);
 		}
 	}
 
 	VOP_UNLOCK(vp);
 	if (flags & O_TRUNC) {
 		error = fo_truncate(fp, 0, td->td_ucred, td);
 		if (error != 0)
 			goto bad;
 	}
 success:
 	/*
 	 * If we haven't already installed the FD (for dupfdopen), do so now.
 	 */
 	if (indx == -1) {
 		struct filecaps *fcaps;
 
 #ifdef CAPABILITIES
 		if ((nd.ni_resflags & NIRES_STRICTREL) != 0)
 			fcaps = &nd.ni_filecaps;
 		else
 #endif
 			fcaps = NULL;
 		error = finstall_refed(td, fp, &indx, flags, fcaps);
 		/* On success finstall_refed() consumes fcaps. */
 		if (error != 0) {
 			filecaps_free(&nd.ni_filecaps);
 			goto bad;
 		}
 	} else {
 		filecaps_free(&nd.ni_filecaps);
 		falloc_abort(td, fp);
 	}
 
 	td->td_retval[0] = indx;
 	return (0);
 bad:
 	KASSERT(indx == -1, ("indx=%d, should be -1", indx));
 	falloc_abort(td, fp);
 	return (error);
 }
 
 #ifdef COMPAT_43
 /*
  * Create a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ocreat_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 ocreat(struct thread *td, struct ocreat_args *uap)
 {
 
 	return (kern_openat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Create a special file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mknodat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 	dev_t	dev;
 };
 #endif
 int
 sys_mknodat(struct thread *td, struct mknodat_args *uap)
 {
 
 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
 	    uap->dev));
 }
 
 #if defined(COMPAT_FREEBSD11)
 int
 freebsd11_mknod(struct thread *td,
     struct freebsd11_mknod_args *uap)
 {
 
 	return (kern_mknodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode, uap->dev));
 }
 
 int
 freebsd11_mknodat(struct thread *td,
     struct freebsd11_mknodat_args *uap)
 {
 
 	return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
 	    uap->dev));
 }
 #endif /* COMPAT_FREEBSD11 */
 
 int
 kern_mknodat(struct thread *td, int fd, const char *path, enum uio_seg pathseg,
     int mode, dev_t dev)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error, whiteout = 0;
 
 	AUDIT_ARG_MODE(mode);
 	AUDIT_ARG_DEV(dev);
 	switch (mode & S_IFMT) {
 	case S_IFCHR:
 	case S_IFBLK:
 		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
 		if (error == 0 && dev == VNOVAL)
 			error = EINVAL;
 		break;
 	case S_IFWHT:
 		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
 		break;
 	case S_IFIFO:
 		if (dev == 0)
 			return (kern_mkfifoat(td, fd, path, pathseg, mode));
 		/* FALLTHROUGH */
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (error != 0)
 		return (error);
 	NDPREINIT(&nd);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 	    NOCACHE, pathseg, path, fd, &cap_mknodat_rights,
 	    td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	if (vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(vp);
 		return (EEXIST);
 	} else {
 		VATTR_NULL(&vattr);
 		vattr.va_mode = (mode & ALLPERMS) &
 		    ~td->td_proc->p_pd->pd_cmask;
 		vattr.va_rdev = dev;
 		whiteout = 0;
 
 		switch (mode & S_IFMT) {
 		case S_IFCHR:
 			vattr.va_type = VCHR;
 			break;
 		case S_IFBLK:
 			vattr.va_type = VBLK;
 			break;
 		case S_IFWHT:
 			whiteout = 1;
 			break;
 		default:
 			panic("kern_mknod: invalid mode");
 		}
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 #ifdef MAC
 	if (error == 0 && !whiteout)
 		error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
 		    &nd.ni_cnd, &vattr);
 #endif
 	if (error == 0) {
 		if (whiteout)
 			error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
 		else {
 			error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
 						&nd.ni_cnd, &vattr);
 		}
 	}
 	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 && !whiteout ? &nd.ni_vp : NULL,
 	    true);
 	vn_finished_write(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error == ERELOOKUP)
 		goto restart;
 	return (error);
 }
 
 /*
  * Create a named pipe.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkfifo_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_mkfifo(struct thread *td, struct mkfifo_args *uap)
 {
 
 	return (kern_mkfifoat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mkfifoat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 };
 #endif
 int
 sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
 {
 
 	return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->mode));
 }
 
 int
 kern_mkfifoat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, int mode)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_MODE(mode);
 	NDPREINIT(&nd);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 	    NOCACHE, pathseg, path, fd, &cap_mkfifoat_rights,
 	    td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (nd.ni_vp != NULL) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VFIFO;
 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_pd->pd_cmask;
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 #ifdef MAC
 out:
 #endif
 	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
 	vn_finished_write(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error == ERELOOKUP)
 		goto restart;
 	return (error);
 }
 
 /*
  * Make a hard file link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct link_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 sys_link(struct thread *td, struct link_args *uap)
 {
 
 	return (kern_linkat(td, AT_FDCWD, AT_FDCWD, uap->path, uap->link,
 	    UIO_USERSPACE, AT_SYMLINK_FOLLOW));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct linkat_args {
 	int	fd1;
 	char	*path1;
 	int	fd2;
 	char	*path2;
 	int	flag;
 };
 #endif
 int
 sys_linkat(struct thread *td, struct linkat_args *uap)
 {
 	int flag;
 
 	flag = uap->flag;
 	if ((flag & ~(AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH |
 	    AT_EMPTY_PATH)) != 0)
 		return (EINVAL);
 
 	return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
 	    UIO_USERSPACE, flag));
 }
 
 int hardlink_check_uid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
     &hardlink_check_uid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "users");
 static int hardlink_check_gid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
     &hardlink_check_gid, 0,
     "Unprivileged processes cannot create hard links to files owned by other "
     "groups");
 
 static int
 can_hardlink(struct vnode *vp, struct ucred *cred)
 {
 	struct vattr va;
 	int error;
 
 	if (!hardlink_check_uid && !hardlink_check_gid)
 		return (0);
 
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error != 0)
 		return (error);
 
 	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK);
 		if (error != 0)
 			return (error);
 	}
 
 	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
 		error = priv_check_cred(cred, PRIV_VFS_LINK);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 int
 kern_linkat(struct thread *td, int fd1, int fd2, const char *path1,
     const char *path2, enum uio_seg segflag, int flag)
 {
 	struct nameidata nd;
 	int error;
 
 	NDPREINIT(&nd);
 	do {
 		bwillwrite();
 		NDINIT_ATRIGHTS(&nd, LOOKUP, AUDITVNODE1 | at2cnpflags(flag,
 		    AT_SYMLINK_FOLLOW | AT_RESOLVE_BENEATH | AT_EMPTY_PATH),
 		    segflag, path1, fd1, &cap_linkat_source_rights, td);
 		if ((error = namei(&nd)) != 0)
 			return (error);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if ((nd.ni_resflags & NIRES_EMPTYPATH) != 0) {
 			error = priv_check(td, PRIV_VFS_FHOPEN);
 			if (error != 0) {
 				vrele(nd.ni_vp);
 				return (error);
 			}
 		}
 		error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag);
 	} while (error ==  EAGAIN || error == ERELOOKUP);
 	return (error);
 }
 
 static int
 kern_linkat_vp(struct thread *td, struct vnode *vp, int fd, const char *path,
     enum uio_seg segflag)
 {
 	struct nameidata nd;
 	struct mount *mp;
 	int error;
 
 	if (vp->v_type == VDIR) {
 		vrele(vp);
 		return (EPERM);		/* POSIX */
 	}
 	NDINIT_ATRIGHTS(&nd, CREATE,
 	    LOCKPARENT | SAVENAME | AUDITVNODE2 | NOCACHE, segflag, path, fd,
 	    &cap_linkat_target_rights, td);
 	if ((error = namei(&nd)) == 0) {
 		if (nd.ni_vp != NULL) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			if (nd.ni_dvp == nd.ni_vp)
 				vrele(nd.ni_dvp);
 			else
 				vput(nd.ni_dvp);
 			vrele(nd.ni_vp);
 			vrele(vp);
 			return (EEXIST);
 		} else if (nd.ni_dvp->v_mount != vp->v_mount) {
 			/*
 			 * Cross-device link.  No need to recheck
 			 * vp->v_type, since it cannot change, except
 			 * to VBAD.
 			 */
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vput(nd.ni_dvp);
 			vrele(vp);
 			return (EXDEV);
 		} else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
 			error = can_hardlink(vp, td->td_ucred);
 #ifdef MAC
 			if (error == 0)
 				error = mac_vnode_check_link(td->td_ucred,
 				    nd.ni_dvp, vp, &nd.ni_cnd);
 #endif
 			if (error != 0) {
 				vput(vp);
 				vput(nd.ni_dvp);
 				NDFREE(&nd, NDF_ONLY_PNBUF);
 				return (error);
 			}
 			error = vn_start_write(vp, &mp, V_NOWAIT);
 			if (error != 0) {
 				vput(vp);
 				vput(nd.ni_dvp);
 				NDFREE(&nd, NDF_ONLY_PNBUF);
 				error = vn_start_write(NULL, &mp,
 				    V_XSLEEP | PCATCH);
 				if (error != 0)
 					return (error);
 				return (EAGAIN);
 			}
 			error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
 			VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
 			vn_finished_write(mp);
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vp = NULL;
 		} else {
 			vput(nd.ni_dvp);
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vrele(vp);
 			return (EAGAIN);
 		}
 	}
 	if (vp != NULL)
 		vrele(vp);
 	return (error);
 }
 
 /*
  * Make a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct symlink_args {
 	char	*path;
 	char	*link;
 };
 #endif
 int
 sys_symlink(struct thread *td, struct symlink_args *uap)
 {
 
 	return (kern_symlinkat(td, uap->path, AT_FDCWD, uap->link,
 	    UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct symlinkat_args {
 	char	*path;
 	int	fd;
 	char	*path2;
 };
 #endif
 int
 sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
 {
 
 	return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
 	    UIO_USERSPACE));
 }
 
 int
 kern_symlinkat(struct thread *td, const char *path1, int fd, const char *path2,
     enum uio_seg segflg)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	const char *syspath;
 	char *tmppath;
 	struct nameidata nd;
 	int error;
 
 	if (segflg == UIO_SYSSPACE) {
 		syspath = path1;
 	} else {
 		tmppath = uma_zalloc(namei_zone, M_WAITOK);
 		if ((error = copyinstr(path1, tmppath, MAXPATHLEN, NULL)) != 0)
 			goto out;
 		syspath = tmppath;
 	}
 	AUDIT_ARG_TEXT(syspath);
 	NDPREINIT(&nd);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 	    NOCACHE, segflg, path2, fd, &cap_symlinkat_rights,
 	    td);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	if (nd.ni_vp) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		vrele(nd.ni_vp);
 		nd.ni_vp = NULL;
 		error = EEXIST;
 		goto out;
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_pd->pd_cmask;
 #ifdef MAC
 	vattr.va_type = VLNK;
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error != 0)
 		goto out2;
 #endif
 	error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
 #ifdef MAC
 out2:
 #endif
 	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
 	vn_finished_write(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error == ERELOOKUP)
 		goto restart;
 out:
 	if (segflg != UIO_SYSSPACE)
 		uma_zfree(namei_zone, tmppath);
 	return (error);
 }
 
 /*
  * Delete a whiteout from the filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct undelete_args {
 	char *path;
 };
 #endif
 int
 sys_undelete(struct thread *td, struct undelete_args *uap)
 {
 	struct mount *mp;
 	struct nameidata nd;
 	int error;
 
 	NDPREINIT(&nd);
 restart:
 	bwillwrite();
 	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 
 	if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if (nd.ni_vp == nd.ni_dvp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if (nd.ni_vp)
 			vrele(nd.ni_vp);
 		return (EEXIST);
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	vn_finished_write(mp);
 	if (error == ERELOOKUP)
 		goto restart;
 	return (error);
 }
 
 /*
  * Delete a name from the filesystem.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unlink_args {
 	char	*path;
 };
 #endif
 int
 sys_unlink(struct thread *td, struct unlink_args *uap)
 {
 
 	return (kern_funlinkat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
 	    0, 0));
 }
 
 static int
 kern_funlinkat_ex(struct thread *td, int dfd, const char *path, int fd,
     int flag, enum uio_seg pathseg, ino_t oldinum)
 {
 
 	if ((flag & ~(AT_REMOVEDIR | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 
 	if ((flag & AT_REMOVEDIR) != 0)
 		return (kern_frmdirat(td, dfd, path, fd, UIO_USERSPACE, 0));
 
 	return (kern_funlinkat(td, dfd, path, fd, UIO_USERSPACE, 0, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct unlinkat_args {
 	int	fd;
 	char	*path;
 	int	flag;
 };
 #endif
 int
 sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
 {
 
 	return (kern_funlinkat_ex(td, uap->fd, uap->path, FD_NONE, uap->flag,
 	    UIO_USERSPACE, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct funlinkat_args {
 	int		dfd;
 	const char	*path;
 	int		fd;
 	int		flag;
 };
 #endif
 int
 sys_funlinkat(struct thread *td, struct funlinkat_args *uap)
 {
 
 	return (kern_funlinkat_ex(td, uap->dfd, uap->path, uap->fd, uap->flag,
 	    UIO_USERSPACE, 0));
 }
 
 int
 kern_funlinkat(struct thread *td, int dfd, const char *path, int fd,
     enum uio_seg pathseg, int flag, ino_t oldinum)
 {
 	struct mount *mp;
 	struct file *fp;
 	struct vnode *vp;
 	struct nameidata nd;
 	struct stat sb;
 	int error;
 
 	fp = NULL;
 	if (fd != FD_NONE) {
 		error = getvnode_path(td, fd, &cap_no_rights, &fp);
 		if (error != 0)
 			return (error);
 	}
 
 	NDPREINIT(&nd);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
 	    at2cnpflags(flag, AT_RESOLVE_BENEATH),
 	    pathseg, path, dfd, &cap_unlinkat_rights, td);
 	if ((error = namei(&nd)) != 0) {
 		if (error == EINVAL)
 			error = EPERM;
 		goto fdout;
 	}
 	vp = nd.ni_vp;
 	if (vp->v_type == VDIR && oldinum == 0) {
 		error = EPERM;		/* POSIX */
 	} else if (oldinum != 0 &&
-	    ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
+	    ((error = VOP_STAT(vp, &sb, td->td_ucred, NOCRED)) == 0) &&
 	    sb.st_ino != oldinum) {
 		error = EIDRM;	/* Identifier removed */
 	} else if (fp != NULL && fp->f_vnode != vp) {
 		if (VN_IS_DOOMED(fp->f_vnode))
 			error = EBADF;
 		else
 			error = EDEADLK;
 	} else {
 		/*
 		 * The root of a mounted filesystem cannot be deleted.
 		 *
 		 * XXX: can this only be a VDIR case?
 		 */
 		if (vp->v_vflag & VV_ROOT)
 			error = EBUSY;
 	}
 	if (error == 0) {
 		if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			vput(nd.ni_dvp);
 			if (vp == nd.ni_dvp)
 				vrele(vp);
 			else
 				vput(vp);
 			if ((error = vn_start_write(NULL, &mp,
 			    V_XSLEEP | PCATCH)) != 0) {
 				goto fdout;
 			}
 			goto restart;
 		}
 #ifdef MAC
 		error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 		    &nd.ni_cnd);
 		if (error != 0)
 			goto out;
 #endif
 		vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 		error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 #ifdef MAC
 out:
 #endif
 		vn_finished_write(mp);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 	if (error == ERELOOKUP)
 		goto restart;
 fdout:
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Reposition read/write file offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lseek_args {
 	int	fd;
 	int	pad;
 	off_t	offset;
 	int	whence;
 };
 #endif
 int
 sys_lseek(struct thread *td, struct lseek_args *uap)
 {
 
 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 }
 
 int
 kern_lseek(struct thread *td, int fd, off_t offset, int whence)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = fget(td, fd, &cap_seek_rights, &fp);
 	if (error != 0)
 		return (error);
 	error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
 	    fo_seek(fp, offset, whence, td) : ESPIPE;
 	fdrop(fp, td);
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Reposition read/write file offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olseek_args {
 	int	fd;
 	long	offset;
 	int	whence;
 };
 #endif
 int
 olseek(struct thread *td, struct olseek_args *uap)
 {
 
 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_FREEBSD6)
 /* Version with the 'pad' argument */
 int
 freebsd6_lseek(struct thread *td, struct freebsd6_lseek_args *uap)
 {
 
 	return (kern_lseek(td, uap->fd, uap->offset, uap->whence));
 }
 #endif
 
 /*
  * Check access permissions using passed credentials.
  */
 static int
 vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
      struct thread *td)
 {
 	accmode_t accmode;
 	int error;
 
 	/* Flags == 0 means only check for existence. */
 	if (user_flags == 0)
 		return (0);
 
 	accmode = 0;
 	if (user_flags & R_OK)
 		accmode |= VREAD;
 	if (user_flags & W_OK)
 		accmode |= VWRITE;
 	if (user_flags & X_OK)
 		accmode |= VEXEC;
 #ifdef MAC
 	error = mac_vnode_check_access(cred, vp, accmode);
 	if (error != 0)
 		return (error);
 #endif
 	if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
 		error = VOP_ACCESS(vp, accmode, cred, td);
 	return (error);
 }
 
 /*
  * Check access permissions using "real" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct access_args {
 	char	*path;
 	int	amode;
 };
 #endif
 int
 sys_access(struct thread *td, struct access_args *uap)
 {
 
 	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    0, uap->amode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct faccessat_args {
 	int	dirfd;
 	char	*path;
 	int	amode;
 	int	flag;
 }
 #endif
 int
 sys_faccessat(struct thread *td, struct faccessat_args *uap)
 {
 
 	return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
 	    uap->amode));
 }
 
 int
 kern_accessat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, int flag, int amode)
 {
 	struct ucred *cred, *usecred;
 	struct vnode *vp;
 	struct nameidata nd;
 	int error;
 
 	if ((flag & ~(AT_EACCESS | AT_RESOLVE_BENEATH | AT_EMPTY_PATH)) != 0)
 		return (EINVAL);
 	if (amode != F_OK && (amode & ~(R_OK | W_OK | X_OK)) != 0)
 		return (EINVAL);
 
 	/*
 	 * Create and modify a temporary credential instead of one that
 	 * is potentially shared (if we need one).
 	 */
 	cred = td->td_ucred;
 	if ((flag & AT_EACCESS) == 0 &&
 	    ((cred->cr_uid != cred->cr_ruid ||
 	    cred->cr_rgid != cred->cr_groups[0]))) {
 		usecred = crdup(cred);
 		usecred->cr_uid = cred->cr_ruid;
 		usecred->cr_groups[0] = cred->cr_rgid;
 		td->td_ucred = usecred;
 	} else
 		usecred = cred;
 	AUDIT_ARG_VALUE(amode);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
 	    AUDITVNODE1 | at2cnpflags(flag, AT_RESOLVE_BENEATH |
 	    AT_EMPTY_PATH), pathseg, path, fd, &cap_fstat_rights, td);
 	if ((error = namei(&nd)) != 0)
 		goto out;
 	vp = nd.ni_vp;
 
 	error = vn_access(vp, amode, usecred, td);
 	NDFREE_NOTHING(&nd);
 	vput(vp);
 out:
 	if (usecred != cred) {
 		td->td_ucred = cred;
 		crfree(usecred);
 	}
 	return (error);
 }
 
 /*
  * Check access permissions using "effective" credentials.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct eaccess_args {
 	char	*path;
 	int	amode;
 };
 #endif
 int
 sys_eaccess(struct thread *td, struct eaccess_args *uap)
 {
 
 	return (kern_accessat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    AT_EACCESS, uap->amode));
 }
 
 #if defined(COMPAT_43)
 /*
  * Get file status; this version follows links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ostat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 ostat(struct thread *td, struct ostat_args *uap)
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error != 0)
 		return (error);
 	cvtstat(&sb, &osb);
 	return (copyout(&osb, uap->ub, sizeof (osb)));
 }
 
 /*
  * Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct olstat_args {
 	char	*path;
 	struct ostat *ub;
 };
 #endif
 int
 olstat(struct thread *td, struct olstat_args *uap)
 {
 	struct stat sb;
 	struct ostat osb;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error != 0)
 		return (error);
 	cvtstat(&sb, &osb);
 	return (copyout(&osb, uap->ub, sizeof (osb)));
 }
 
 /*
  * Convert from an old to a new stat structure.
  * XXX: many values are blindly truncated.
  */
 void
 cvtstat(struct stat *st, struct ostat *ost)
 {
 
 	bzero(ost, sizeof(*ost));
 	ost->st_dev = st->st_dev;
 	ost->st_ino = st->st_ino;
 	ost->st_mode = st->st_mode;
 	ost->st_nlink = st->st_nlink;
 	ost->st_uid = st->st_uid;
 	ost->st_gid = st->st_gid;
 	ost->st_rdev = st->st_rdev;
 	ost->st_size = MIN(st->st_size, INT32_MAX);
 	ost->st_atim = st->st_atim;
 	ost->st_mtim = st->st_mtim;
 	ost->st_ctim = st->st_ctim;
 	ost->st_blksize = st->st_blksize;
 	ost->st_blocks = st->st_blocks;
 	ost->st_flags = st->st_flags;
 	ost->st_gen = st->st_gen;
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
 int ino64_trunc_error;
 SYSCTL_INT(_vfs, OID_AUTO, ino64_trunc_error, CTLFLAG_RW,
     &ino64_trunc_error, 0,
     "Error on truncation of device, file or inode number, or link count");
 
 int
 freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost)
 {
 
 	ost->st_dev = st->st_dev;
 	if (ost->st_dev != st->st_dev) {
 		switch (ino64_trunc_error) {
 		default:
 			/*
 			 * Since dev_t is almost raw, don't clamp to the
 			 * maximum for case 2, but ignore the error.
 			 */
 			break;
 		case 1:
 			return (EOVERFLOW);
 		}
 	}
 	ost->st_ino = st->st_ino;
 	if (ost->st_ino != st->st_ino) {
 		switch (ino64_trunc_error) {
 		default:
 		case 0:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		case 2:
 			ost->st_ino = UINT32_MAX;
 			break;
 		}
 	}
 	ost->st_mode = st->st_mode;
 	ost->st_nlink = st->st_nlink;
 	if (ost->st_nlink != st->st_nlink) {
 		switch (ino64_trunc_error) {
 		default:
 		case 0:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		case 2:
 			ost->st_nlink = UINT16_MAX;
 			break;
 		}
 	}
 	ost->st_uid = st->st_uid;
 	ost->st_gid = st->st_gid;
 	ost->st_rdev = st->st_rdev;
 	if (ost->st_rdev != st->st_rdev) {
 		switch (ino64_trunc_error) {
 		default:
 			break;
 		case 1:
 			return (EOVERFLOW);
 		}
 	}
 	ost->st_atim = st->st_atim;
 	ost->st_mtim = st->st_mtim;
 	ost->st_ctim = st->st_ctim;
 	ost->st_size = st->st_size;
 	ost->st_blocks = st->st_blocks;
 	ost->st_blksize = st->st_blksize;
 	ost->st_flags = st->st_flags;
 	ost->st_gen = st->st_gen;
 	ost->st_lspare = 0;
 	ost->st_birthtim = st->st_birthtim;
 	bzero((char *)&ost->st_birthtim + sizeof(ost->st_birthtim),
 	    sizeof(*ost) - offsetof(struct freebsd11_stat,
 	    st_birthtim) - sizeof(ost->st_birthtim));
 	return (0);
 }
 
 int
 freebsd11_stat(struct thread *td, struct freebsd11_stat_args* uap)
 {
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->ub, sizeof(osb));
 	return (error);
 }
 
 int
 freebsd11_lstat(struct thread *td, struct freebsd11_lstat_args* uap)
 {
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->ub, sizeof(osb));
 	return (error);
 }
 
 int
 freebsd11_fhstat(struct thread *td, struct freebsd11_fhstat_args* uap)
 {
 	struct fhandle fh;
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error != 0)
 		return (error);
 	error = kern_fhstat(td, fh, &sb);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->sb, sizeof(osb));
 	return (error);
 }
 
 int
 freebsd11_fstatat(struct thread *td, struct freebsd11_fstatat_args* uap)
 {
 	struct stat sb;
 	struct freebsd11_stat osb;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error != 0)
 		return (error);
 	error = freebsd11_cvtstat(&sb, &osb);
 	if (error == 0)
 		error = copyout(&osb, uap->buf, sizeof(osb));
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD11 */
 
 /*
  * Get file status
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstatat_args {
 	int	fd;
 	char	*path;
 	struct stat	*buf;
 	int	flag;
 }
 #endif
 int
 sys_fstatat(struct thread *td, struct fstatat_args *uap)
 {
 	struct stat sb;
 	int error;
 
 	error = kern_statat(td, uap->flag, uap->fd, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error == 0)
 		error = copyout(&sb, uap->buf, sizeof (sb));
 	return (error);
 }
 
 int
 kern_statat(struct thread *td, int flag, int fd, const char *path,
     enum uio_seg pathseg, struct stat *sbp,
     void (*hook)(struct vnode *vp, struct stat *sbp))
 {
 	struct nameidata nd;
 	int error;
 
 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 	    AT_EMPTY_PATH)) != 0)
 		return (EINVAL);
 
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_RESOLVE_BENEATH |
 	    AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH) | LOCKSHARED | LOCKLEAF |
 	    AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, td);
 
 	if ((error = namei(&nd)) != 0) {
 		if (error == ENOTDIR &&
 		    (nd.ni_resflags & NIRES_EMPTYPATH) != 0)
 			error = kern_fstat(td, fd, sbp);
 		return (error);
 	}
-	error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED, td);
+	error = VOP_STAT(nd.ni_vp, sbp, td->td_ucred, NOCRED);
 	if (error == 0) {
 		if (__predict_false(hook != NULL))
 			hook(nd.ni_vp, sbp);
 	}
 	NDFREE_NOTHING(&nd);
 	vput(nd.ni_vp);
 #ifdef __STAT_TIME_T_EXT
 	sbp->st_atim_ext = 0;
 	sbp->st_mtim_ext = 0;
 	sbp->st_ctim_ext = 0;
 	sbp->st_btim_ext = 0;
 #endif
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrstat_error(sbp, error);
 #endif
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD11)
 /*
  * Implementation of the NetBSD [l]stat() functions.
  */
 void
 freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb)
 {
 
 	bzero(nsb, sizeof(*nsb));
 	nsb->st_dev = sb->st_dev;
 	nsb->st_ino = sb->st_ino;
 	nsb->st_mode = sb->st_mode;
 	nsb->st_nlink = sb->st_nlink;
 	nsb->st_uid = sb->st_uid;
 	nsb->st_gid = sb->st_gid;
 	nsb->st_rdev = sb->st_rdev;
 	nsb->st_atim = sb->st_atim;
 	nsb->st_mtim = sb->st_mtim;
 	nsb->st_ctim = sb->st_ctim;
 	nsb->st_size = sb->st_size;
 	nsb->st_blocks = sb->st_blocks;
 	nsb->st_blksize = sb->st_blksize;
 	nsb->st_flags = sb->st_flags;
 	nsb->st_gen = sb->st_gen;
 	nsb->st_birthtim = sb->st_birthtim;
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd11_nstat_args {
 	char	*path;
 	struct nstat *ub;
 };
 #endif
 int
 freebsd11_nstat(struct thread *td, struct freebsd11_nstat_args *uap)
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_statat(td, 0, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    &sb, NULL);
 	if (error != 0)
 		return (error);
 	freebsd11_cvtnstat(&sb, &nsb);
 	return (copyout(&nsb, uap->ub, sizeof (nsb)));
 }
 
 /*
  * NetBSD lstat.  Get file status; this version does not follow links.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd11_nlstat_args {
 	char	*path;
 	struct nstat *ub;
 };
 #endif
 int
 freebsd11_nlstat(struct thread *td, struct freebsd11_nlstat_args *uap)
 {
 	struct stat sb;
 	struct nstat nsb;
 	int error;
 
 	error = kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->path,
 	    UIO_USERSPACE, &sb, NULL);
 	if (error != 0)
 		return (error);
 	freebsd11_cvtnstat(&sb, &nsb);
 	return (copyout(&nsb, uap->ub, sizeof (nsb)));
 }
 #endif /* COMPAT_FREEBSD11 */
 
 /*
  * Get configurable pathname variables.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pathconf_args {
 	char	*path;
 	int	name;
 };
 #endif
 int
 sys_pathconf(struct thread *td, struct pathconf_args *uap)
 {
 	long value;
 	int error;
 
 	error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW,
 	    &value);
 	if (error == 0)
 		td->td_retval[0] = value;
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct lpathconf_args {
 	char	*path;
 	int	name;
 };
 #endif
 int
 sys_lpathconf(struct thread *td, struct lpathconf_args *uap)
 {
 	long value;
 	int error;
 
 	error = kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
 	    NOFOLLOW, &value);
 	if (error == 0)
 		td->td_retval[0] = value;
 	return (error);
 }
 
 int
 kern_pathconf(struct thread *td, const char *path, enum uio_seg pathseg,
     int name, u_long flags, long *valuep)
 {
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 
 	error = VOP_PATHCONF(nd.ni_vp, name, valuep);
 	vput(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct readlink_args {
 	char	*path;
 	char	*buf;
 	size_t	count;
 };
 #endif
 int
 sys_readlink(struct thread *td, struct readlink_args *uap)
 {
 
 	return (kern_readlinkat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->buf, UIO_USERSPACE, uap->count));
 }
 #ifndef _SYS_SYSPROTO_H_
 struct readlinkat_args {
 	int	fd;
 	char	*path;
 	char	*buf;
 	size_t	bufsize;
 };
 #endif
 int
 sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
 {
 
 	return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->buf, UIO_USERSPACE, uap->bufsize));
 }
 
 int
 kern_readlinkat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, char *buf, enum uio_seg bufseg, size_t count)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	int error;
 
 	if (count > IOSIZE_MAX)
 		return (EINVAL);
 
 	NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
 	    EMPTYPATH, pathseg, path, fd, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	vp = nd.ni_vp;
 
 	error = kern_readlink_vp(vp, buf, bufseg, count, td);
 	vput(vp);
 
 	return (error);
 }
 
 /*
  * Helper function to readlink from a vnode
  */
 static int
 kern_readlink_vp(struct vnode *vp, char *buf, enum uio_seg bufseg, size_t count,
     struct thread *td)
 {
 	struct iovec aiov;
 	struct uio auio;
 	int error;
 
 	ASSERT_VOP_LOCKED(vp, "kern_readlink_vp(): vp not locked");
 #ifdef MAC
 	error = mac_vnode_check_readlink(td->td_ucred, vp);
 	if (error != 0)
 		return (error);
 #endif
 	if (vp->v_type != VLNK && (vp->v_vflag & VV_READLINK) == 0)
 		return (EINVAL);
 
 	aiov.iov_base = buf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = 0;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = bufseg;
 	auio.uio_td = td;
 	auio.uio_resid = count;
 	error = VOP_READLINK(vp, &auio, td->td_ucred);
 	td->td_retval[0] = count - auio.uio_resid;
 	return (error);
 }
 
 /*
  * Common implementation code for chflags() and fchflags().
  */
 static int
 setfflags(struct thread *td, struct vnode *vp, u_long flags)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 
 	/* We can't support the value matching VNOVAL. */
 	if (flags == VNOVAL)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Prevent non-root users from setting flags on devices.  When
 	 * a device is reused, users can retain ownership of the device
 	 * if they are allowed to set flags and programs assume that
 	 * chown can't fail when done as root.
 	 */
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
 		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
 		if (error != 0)
 			return (error);
 	}
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	VATTR_NULL(&vattr);
 	vattr.va_flags = flags;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change flags of a file given a path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chflags_args {
 	const char *path;
 	u_long	flags;
 };
 #endif
 int
 sys_chflags(struct thread *td, struct chflags_args *uap)
 {
 
 	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->flags, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct chflagsat_args {
 	int	fd;
 	const char *path;
 	u_long	flags;
 	int	atflag;
 }
 #endif
 int
 sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
 {
 
 	if ((uap->atflag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 	    AT_EMPTY_PATH)) != 0)
 		return (EINVAL);
 
 	return (kern_chflagsat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->flags, uap->atflag));
 }
 
 /*
  * Same as chflags() but doesn't follow symlinks.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchflags_args {
 	const char *path;
 	u_long flags;
 };
 #endif
 int
 sys_lchflags(struct thread *td, struct lchflags_args *uap)
 {
 
 	return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->flags, AT_SYMLINK_NOFOLLOW));
 }
 
 static int
 kern_chflagsat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, u_long flags, int atflag)
 {
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_FFLAGS(flags);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(atflag, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
 	    fd, &cap_fchflags_rights, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setfflags(td, nd.ni_vp, flags);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Change flags of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchflags_args {
 	int	fd;
 	u_long	flags;
 };
 #endif
 int
 sys_fchflags(struct thread *td, struct fchflags_args *uap)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_FFLAGS(uap->flags);
 	error = getvnode(td, uap->fd, &cap_fchflags_rights,
 	    &fp);
 	if (error != 0)
 		return (error);
 #ifdef AUDIT
 	if (AUDITING_TD(td)) {
 		vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 		AUDIT_ARG_VNODE1(fp->f_vnode);
 		VOP_UNLOCK(fp->f_vnode);
 	}
 #endif
 	error = setfflags(td, fp->f_vnode, uap->flags);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for chmod(), lchmod() and fchmod().
  */
 int
 setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VATTR_NULL(&vattr);
 	vattr.va_mode = mode & ALLPERMS;
 #ifdef MAC
 	error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, cred);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Change mode of a file given path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_chmod(struct thread *td, struct chmod_args *uap)
 {
 
 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fchmodat_args {
 	int	dirfd;
 	char	*path;
 	mode_t	mode;
 	int	flag;
 }
 #endif
 int
 sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
 {
 
 	if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 	    AT_EMPTY_PATH)) != 0)
 		return (EINVAL);
 
 	return (kern_fchmodat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->mode, uap->flag));
 }
 
 /*
  * Change mode of a file given path name (don't follow links.)
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchmod_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_lchmod(struct thread *td, struct lchmod_args *uap)
 {
 
 	return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode, AT_SYMLINK_NOFOLLOW));
 }
 
 int
 kern_fchmodat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, mode_t mode, int flag)
 {
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_MODE(mode);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
 	    fd, &cap_fchmod_rights, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Change mode of a file given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchmod_args {
 	int	fd;
 	int	mode;
 };
 #endif
 int
 sys_fchmod(struct thread *td, struct fchmod_args *uap)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_MODE(uap->mode);
 
 	error = fget(td, uap->fd, &cap_fchmod_rights, &fp);
 	if (error != 0)
 		return (error);
 	error = fo_chmod(fp, uap->mode, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation for chown(), lchown(), and fchown()
  */
 int
 setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VATTR_NULL(&vattr);
 	vattr.va_uid = uid;
 	vattr.va_gid = gid;
 #ifdef MAC
 	error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
 	    vattr.va_gid);
 	if (error == 0)
 #endif
 		error = VOP_SETATTR(vp, &vattr, cred);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set ownership given a path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct chown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 sys_chown(struct thread *td, struct chown_args *uap)
 {
 
 	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE, uap->uid,
 	    uap->gid, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fchownat_args {
 	int fd;
 	const char * path;
 	uid_t uid;
 	gid_t gid;
 	int flag;
 };
 #endif
 int
 sys_fchownat(struct thread *td, struct fchownat_args *uap)
 {
 
 	if ((uap->flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 	    AT_EMPTY_PATH)) != 0)
 		return (EINVAL);
 
 	return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
 	    uap->gid, uap->flag));
 }
 
 int
 kern_fchownat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, int uid, int gid, int flag)
 {
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_OWNER(uid, gid);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1, pathseg, path,
 	    fd, &cap_fchown_rights, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Set ownership given a path name, do not cross symlinks.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lchown_args {
 	char	*path;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 sys_lchown(struct thread *td, struct lchown_args *uap)
 {
 
 	return (kern_fchownat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->uid, uap->gid, AT_SYMLINK_NOFOLLOW));
 }
 
 /*
  * Set ownership given a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fchown_args {
 	int	fd;
 	int	uid;
 	int	gid;
 };
 #endif
 int
 sys_fchown(struct thread *td, struct fchown_args *uap)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(uap->fd);
 	AUDIT_ARG_OWNER(uap->uid, uap->gid);
 	error = fget(td, uap->fd, &cap_fchown_rights, &fp);
 	if (error != 0)
 		return (error);
 	error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), and futimes().
  */
 static int
 getutimes(const struct timeval *usrtvp, enum uio_seg tvpseg,
     struct timespec *tsp)
 {
 	struct timeval tv[2];
 	const struct timeval *tvp;
 	int error;
 
 	if (usrtvp == NULL) {
 		vfs_timestamp(&tsp[0]);
 		tsp[1] = tsp[0];
 	} else {
 		if (tvpseg == UIO_SYSSPACE) {
 			tvp = usrtvp;
 		} else {
 			if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
 				return (error);
 			tvp = tv;
 		}
 
 		if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
 		    tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
 			return (EINVAL);
 		TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
 		TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
 	}
 	return (0);
 }
 
 /*
  * Common implementation code for futimens(), utimensat().
  */
 #define	UTIMENS_NULL	0x1
 #define	UTIMENS_EXIT	0x2
 static int
 getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
     struct timespec *tsp, int *retflags)
 {
 	struct timespec tsnow;
 	int error;
 
 	vfs_timestamp(&tsnow);
 	*retflags = 0;
 	if (usrtsp == NULL) {
 		tsp[0] = tsnow;
 		tsp[1] = tsnow;
 		*retflags |= UTIMENS_NULL;
 		return (0);
 	}
 	if (tspseg == UIO_SYSSPACE) {
 		tsp[0] = usrtsp[0];
 		tsp[1] = usrtsp[1];
 	} else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
 		return (error);
 	if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
 		*retflags |= UTIMENS_EXIT;
 	if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
 		*retflags |= UTIMENS_NULL;
 	if (tsp[0].tv_nsec == UTIME_OMIT)
 		tsp[0].tv_sec = VNOVAL;
 	else if (tsp[0].tv_nsec == UTIME_NOW)
 		tsp[0] = tsnow;
 	else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
 		return (EINVAL);
 	if (tsp[1].tv_nsec == UTIME_OMIT)
 		tsp[1].tv_sec = VNOVAL;
 	else if (tsp[1].tv_nsec == UTIME_NOW)
 		tsp[1] = tsnow;
 	else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
  * and utimensat().
  */
 static int
 setutimes(struct thread *td, struct vnode *vp, const struct timespec *ts,
     int numtimes, int nullflag)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	int error;
 	bool setbirthtime;
 
 	setbirthtime = false;
 	vattr.va_birthtime.tv_sec = VNOVAL;
 	vattr.va_birthtime.tv_nsec = 0;
 
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		return (error);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (numtimes < 3 && VOP_GETATTR(vp, &vattr, td->td_ucred) == 0 &&
 	    timespeccmp(&ts[1], &vattr.va_birthtime, < ))
 		setbirthtime = true;
 	VATTR_NULL(&vattr);
 	vattr.va_atime = ts[0];
 	vattr.va_mtime = ts[1];
 	if (setbirthtime)
 		vattr.va_birthtime = ts[1];
 	if (numtimes > 2)
 		vattr.va_birthtime = ts[2];
 	if (nullflag)
 		vattr.va_vaflags |= VA_UTIMES_NULL;
 #ifdef MAC
 	error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
 	    vattr.va_mtime);
 #endif
 	if (error == 0)
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct utimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 sys_utimes(struct thread *td, struct utimes_args *uap)
 {
 
 	return (kern_utimesat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->tptr, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct futimesat_args {
 	int fd;
 	const char * path;
 	const struct timeval * times;
 };
 #endif
 int
 sys_futimesat(struct thread *td, struct futimesat_args *uap)
 {
 
 	return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->times, UIO_USERSPACE));
 }
 
 int
 kern_utimesat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct nameidata nd;
 	struct timespec ts[2];
 	int error;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
 	    &cap_futimes_rights, td);
 
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lutimes_args {
 	char	*path;
 	struct	timeval *tptr;
 };
 #endif
 int
 sys_lutimes(struct thread *td, struct lutimes_args *uap)
 {
 
 	return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
 	    UIO_USERSPACE));
 }
 
 int
 kern_lutimes(struct thread *td, const char *path, enum uio_seg pathseg,
     struct timeval *tptr, enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct nameidata nd;
 	int error;
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Set the access and modification times of a file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct futimes_args {
 	int	fd;
 	struct	timeval *tptr;
 };
 #endif
 int
 sys_futimes(struct thread *td, struct futimes_args *uap)
 {
 
 	return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
 }
 
 int
 kern_futimes(struct thread *td, int fd, struct timeval *tptr,
     enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getutimes(tptr, tptrseg, ts);
 	if (error != 0)
 		return (error);
 	error = getvnode(td, fd, &cap_futimes_rights, &fp);
 	if (error != 0)
 		return (error);
 #ifdef AUDIT
 	if (AUDITING_TD(td)) {
 		vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 		AUDIT_ARG_VNODE1(fp->f_vnode);
 		VOP_UNLOCK(fp->f_vnode);
 	}
 #endif
 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_futimens(struct thread *td, struct futimens_args *uap)
 {
 
 	return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
 }
 
 int
 kern_futimens(struct thread *td, int fd, struct timespec *tptr,
     enum uio_seg tptrseg)
 {
 	struct timespec ts[2];
 	struct file *fp;
 	int error, flags;
 
 	AUDIT_ARG_FD(fd);
 	error = getutimens(tptr, tptrseg, ts, &flags);
 	if (error != 0)
 		return (error);
 	if (flags & UTIMENS_EXIT)
 		return (0);
 	error = getvnode(td, fd, &cap_futimes_rights, &fp);
 	if (error != 0)
 		return (error);
 #ifdef AUDIT
 	if (AUDITING_TD(td)) {
 		vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
 		AUDIT_ARG_VNODE1(fp->f_vnode);
 		VOP_UNLOCK(fp->f_vnode);
 	}
 #endif
 	error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_utimensat(struct thread *td, struct utimensat_args *uap)
 {
 
 	return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->times, UIO_USERSPACE, uap->flag));
 }
 
 int
 kern_utimensat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg,
     int flag)
 {
 	struct nameidata nd;
 	struct timespec ts[2];
 	int error, flags;
 
 	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH |
 	    AT_EMPTY_PATH)) != 0)
 		return (EINVAL);
 
 	if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
 		return (error);
 	NDINIT_ATRIGHTS(&nd, LOOKUP, at2cnpflags(flag, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH | AT_EMPTY_PATH) | AUDITVNODE1,
 	    pathseg, path, fd, &cap_futimes_rights, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	/*
 	 * We are allowed to call namei() regardless of 2xUTIME_OMIT.
 	 * POSIX states:
 	 * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
 	 * "Search permission is denied by a component of the path prefix."
 	 */
 	NDFREE_NOTHING(&nd);
 	if ((flags & UTIMENS_EXIT) == 0)
 		error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
 	vrele(nd.ni_vp);
 	return (error);
 }
 
 /*
  * Truncate a file given its path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct truncate_args {
 	char	*path;
 	int	pad;
 	off_t	length;
 };
 #endif
 int
 sys_truncate(struct thread *td, struct truncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 }
 
 int
 kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg,
     off_t length)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	void *rl_cookie;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	if (length < 0)
 		return (EINVAL);
 	NDPREINIT(&nd);
 retry:
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
 		vn_rangelock_unlock(vp, rl_cookie);
 		vrele(vp);
 		return (error);
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR)
 		error = EISDIR;
 #ifdef MAC
 	else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
 	}
 #endif
 	else if ((error = vn_writechk(vp)) == 0 &&
 	    (error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		error = VOP_SETATTR(vp, &vattr, td->td_ucred);
 	}
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	vn_rangelock_unlock(vp, rl_cookie);
 	vrele(vp);
 	if (error == ERELOOKUP)
 		goto retry;
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Truncate a file given its path name.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct otruncate_args {
 	char	*path;
 	long	length;
 };
 #endif
 int
 otruncate(struct thread *td, struct otruncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_FREEBSD6)
 /* Versions with the pad argument */
 int
 freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
 {
 
 	return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
 }
 
 int
 freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
 {
 
 	return (kern_ftruncate(td, uap->fd, uap->length));
 }
 #endif
 
 int
 kern_fsync(struct thread *td, int fd, bool fullsync)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getvnode(td, fd, &cap_fsync_rights, &fp);
 	if (error != 0)
 		return (error);
 	vp = fp->f_vnode;
 #if 0
 	if (!fullsync)
 		/* XXXKIB: compete outstanding aio writes */;
 #endif
 retry:
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error != 0)
 		goto drop;
 	vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if (vp->v_object != NULL) {
 		VM_OBJECT_WLOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
 		VM_OBJECT_WUNLOCK(vp->v_object);
 	}
 	error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td);
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 	if (error == ERELOOKUP)
 		goto retry;
 drop:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Sync an open file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fsync_args {
 	int	fd;
 };
 #endif
 int
 sys_fsync(struct thread *td, struct fsync_args *uap)
 {
 
 	return (kern_fsync(td, uap->fd, true));
 }
 
 int
 sys_fdatasync(struct thread *td, struct fdatasync_args *uap)
 {
 
 	return (kern_fsync(td, uap->fd, false));
 }
 
 /*
  * Rename files.  Source and destination must either both be directories, or
  * both not be directories.  If target is a directory, it must be empty.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rename_args {
 	char	*from;
 	char	*to;
 };
 #endif
 int
 sys_rename(struct thread *td, struct rename_args *uap)
 {
 
 	return (kern_renameat(td, AT_FDCWD, uap->from, AT_FDCWD,
 	    uap->to, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct renameat_args {
 	int	oldfd;
 	char	*old;
 	int	newfd;
 	char	*new;
 };
 #endif
 int
 sys_renameat(struct thread *td, struct renameat_args *uap)
 {
 
 	return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
 	    UIO_USERSPACE));
 }
 
 #ifdef MAC
 static int
 kern_renameat_mac(struct thread *td, int oldfd, const char *old, int newfd,
     const char *new, enum uio_seg pathseg, struct nameidata *fromnd)
 {
 	int error;
 
 	NDINIT_ATRIGHTS(fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
 	    AUDITVNODE1, pathseg, old, oldfd, &cap_renameat_source_rights, td);
 	if ((error = namei(fromnd)) != 0)
 		return (error);
 	error = mac_vnode_check_rename_from(td->td_ucred, fromnd->ni_dvp,
 	    fromnd->ni_vp, &fromnd->ni_cnd);
 	VOP_UNLOCK(fromnd->ni_dvp);
 	if (fromnd->ni_dvp != fromnd->ni_vp)
 		VOP_UNLOCK(fromnd->ni_vp);
 	if (error != 0) {
 		NDFREE(fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd->ni_dvp);
 		vrele(fromnd->ni_vp);
 		if (fromnd->ni_startdir)
 			vrele(fromnd->ni_startdir);
 	}
 	return (error);
 }
 #endif
 
 int
 kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
     const char *new, enum uio_seg pathseg)
 {
 	struct mount *mp = NULL;
 	struct vnode *tvp, *fvp, *tdvp;
 	struct nameidata fromnd, tond;
 	u_int64_t tondflags;
 	int error;
 
 again:
 	bwillwrite();
 #ifdef MAC
 	if (mac_vnode_check_rename_from_enabled()) {
 		error = kern_renameat_mac(td, oldfd, old, newfd, new, pathseg,
 		    &fromnd);
 		if (error != 0)
 			return (error);
 	} else {
 #endif
 	NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
 	    pathseg, old, oldfd, &cap_renameat_source_rights, td);
 	if ((error = namei(&fromnd)) != 0)
 		return (error);
 #ifdef MAC
 	}
 #endif
 	fvp = fromnd.ni_vp;
 	tondflags = LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | AUDITVNODE2;
 	if (fromnd.ni_vp->v_type == VDIR)
 		tondflags |= WILLBEDIR;
 	NDINIT_ATRIGHTS(&tond, RENAME, tondflags, pathseg, new, newfd,
 	    &cap_renameat_target_rights, td);
 	if ((error = namei(&tond)) != 0) {
 		/* Translate error code for rename("dir1", "dir2/."). */
 		if (error == EISDIR && fvp->v_type == VDIR)
 			error = EINVAL;
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		goto out1;
 	}
 	tdvp = tond.ni_dvp;
 	tvp = tond.ni_vp;
 	error = vn_start_write(fvp, &mp, V_NOWAIT);
 	if (error != 0) {
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 		if (tvp != NULL)
 			vput(tvp);
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 		vrele(tond.ni_startdir);
 		if (fromnd.ni_startdir != NULL)
 			vrele(fromnd.ni_startdir);
 		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
 		if (error != 0)
 			return (error);
 		goto again;
 	}
 	if (tvp != NULL) {
 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
 			error = ENOTDIR;
 			goto out;
 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
 			error = EISDIR;
 			goto out;
 		}
 #ifdef CAPABILITIES
 		if (newfd != AT_FDCWD && (tond.ni_resflags & NIRES_ABS) == 0) {
 			/*
 			 * If the target already exists we require CAP_UNLINKAT
 			 * from 'newfd', when newfd was used for the lookup.
 			 */
 			error = cap_check(&tond.ni_filecaps.fc_rights,
 			    &cap_unlinkat_rights);
 			if (error != 0)
 				goto out;
 		}
 #endif
 	}
 	if (fvp == tdvp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * If the source is the same as the destination (that is, if they
 	 * are links to the same vnode), then there is nothing to do.
 	 */
 	if (fvp == tvp)
 		error = ERESTART;
 #ifdef MAC
 	else
 		error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
 		    tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
 #endif
 out:
 	if (error == 0) {
 		error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
 		    tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 	} else {
 		NDFREE(&fromnd, NDF_ONLY_PNBUF);
 		NDFREE(&tond, NDF_ONLY_PNBUF);
 		if (tvp != NULL)
 			vput(tvp);
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		vrele(fromnd.ni_dvp);
 		vrele(fvp);
 	}
 	vrele(tond.ni_startdir);
 	vn_finished_write(mp);
 out1:
 	if (fromnd.ni_startdir)
 		vrele(fromnd.ni_startdir);
 	if (error == ERESTART)
 		return (0);
 	if (error == ERELOOKUP)
 		goto again;
 	return (error);
 }
 
 /*
  * Make a directory file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct mkdir_args {
 	char	*path;
 	int	mode;
 };
 #endif
 int
 sys_mkdir(struct thread *td, struct mkdir_args *uap)
 {
 
 	return (kern_mkdirat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
 	    uap->mode));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct mkdirat_args {
 	int	fd;
 	char	*path;
 	mode_t	mode;
 };
 #endif
 int
 sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
 {
 
 	return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
 }
 
 int
 kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg,
     int mode)
 {
 	struct mount *mp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	AUDIT_ARG_MODE(mode);
 	NDPREINIT(&nd);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
 	    NC_NOMAKEENTRY | NC_KEEPPOSENTRY | FAILIFEXISTS | WILLBEDIR,
 	    segflg, path, fd, &cap_mkdirat_rights, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			return (error);
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VDIR;
 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_pd->pd_cmask;
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 #ifdef MAC
 out:
 #endif
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	VOP_VPUT_PAIR(nd.ni_dvp, error == 0 ? &nd.ni_vp : NULL, true);
 	vn_finished_write(mp);
 	if (error == ERELOOKUP)
 		goto restart;
 	return (error);
 }
 
 /*
  * Remove a directory file.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rmdir_args {
 	char	*path;
 };
 #endif
 int
 sys_rmdir(struct thread *td, struct rmdir_args *uap)
 {
 
 	return (kern_frmdirat(td, AT_FDCWD, uap->path, FD_NONE, UIO_USERSPACE,
 	    0));
 }
 
 int
 kern_frmdirat(struct thread *td, int dfd, const char *path, int fd,
     enum uio_seg pathseg, int flag)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct file *fp;
 	struct nameidata nd;
 	cap_rights_t rights;
 	int error;
 
 	fp = NULL;
 	if (fd != FD_NONE) {
 		error = getvnode(td, fd, cap_rights_init_one(&rights,
 		    CAP_LOOKUP), &fp);
 		if (error != 0)
 			return (error);
 	}
 
 	NDPREINIT(&nd);
 restart:
 	bwillwrite();
 	NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1 |
 	    at2cnpflags(flag, AT_RESOLVE_BENEATH),
 	    pathseg, path, dfd, &cap_unlinkat_rights, td);
 	if ((error = namei(&nd)) != 0)
 		goto fdout;
 	vp = nd.ni_vp;
 	if (vp->v_type != VDIR) {
 		error = ENOTDIR;
 		goto out;
 	}
 	/*
 	 * No rmdir "." please.
 	 */
 	if (nd.ni_dvp == vp) {
 		error = EINVAL;
 		goto out;
 	}
 	/*
 	 * The root of a mounted filesystem cannot be deleted.
 	 */
 	if (vp->v_vflag & VV_ROOT) {
 		error = EBUSY;
 		goto out;
 	}
 
 	if (fp != NULL && fp->f_vnode != vp) {
 		if (VN_IS_DOOMED(fp->f_vnode))
 			error = EBADF;
 		else
 			error = EDEADLK;
 		goto out;
 	}
 
 #ifdef MAC
 	error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
 	    &nd.ni_cnd);
 	if (error != 0)
 		goto out;
 #endif
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vput(vp);
 		if (nd.ni_dvp == vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto fdout;
 		goto restart;
 	}
 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
 	error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
 	vn_finished_write(mp);
 out:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	if (nd.ni_dvp == vp)
 		vrele(nd.ni_dvp);
 	else
 		vput(nd.ni_dvp);
 	if (error == ERELOOKUP)
 		goto restart;
 fdout:
 	if (fp != NULL)
 		fdrop(fp, td);
 	return (error);
 }
 
 #if defined(COMPAT_43) || defined(COMPAT_FREEBSD11)
 int
 freebsd11_kern_getdirentries(struct thread *td, int fd, char *ubuf, u_int count,
     long *basep, void (*func)(struct freebsd11_dirent *))
 {
 	struct freebsd11_dirent dstdp;
 	struct dirent *dp, *edp;
 	char *dirbuf;
 	off_t base;
 	ssize_t resid, ucount;
 	int error;
 
 	/* XXX arbitrary sanity limit on `count'. */
 	count = min(count, 64 * 1024);
 
 	dirbuf = malloc(count, M_TEMP, M_WAITOK);
 
 	error = kern_getdirentries(td, fd, dirbuf, count, &base, &resid,
 	    UIO_SYSSPACE);
 	if (error != 0)
 		goto done;
 	if (basep != NULL)
 		*basep = base;
 
 	ucount = 0;
 	for (dp = (struct dirent *)dirbuf,
 	    edp = (struct dirent *)&dirbuf[count - resid];
 	    ucount < count && dp < edp; ) {
 		if (dp->d_reclen == 0)
 			break;
 		MPASS(dp->d_reclen >= _GENERIC_DIRLEN(0));
 		if (dp->d_namlen >= sizeof(dstdp.d_name))
 			continue;
 		dstdp.d_type = dp->d_type;
 		dstdp.d_namlen = dp->d_namlen;
 		dstdp.d_fileno = dp->d_fileno;		/* truncate */
 		if (dstdp.d_fileno != dp->d_fileno) {
 			switch (ino64_trunc_error) {
 			default:
 			case 0:
 				break;
 			case 1:
 				error = EOVERFLOW;
 				goto done;
 			case 2:
 				dstdp.d_fileno = UINT32_MAX;
 				break;
 			}
 		}
 		dstdp.d_reclen = sizeof(dstdp) - sizeof(dstdp.d_name) +
 		    ((dp->d_namlen + 1 + 3) &~ 3);
 		bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
 		bzero(dstdp.d_name + dstdp.d_namlen,
 		    dstdp.d_reclen - offsetof(struct freebsd11_dirent, d_name) -
 		    dstdp.d_namlen);
 		MPASS(dstdp.d_reclen <= dp->d_reclen);
 		MPASS(ucount + dstdp.d_reclen <= count);
 		if (func != NULL)
 			func(&dstdp);
 		error = copyout(&dstdp, ubuf + ucount, dstdp.d_reclen);
 		if (error != 0)
 			break;
 		dp = (struct dirent *)((char *)dp + dp->d_reclen);
 		ucount += dstdp.d_reclen;
 	}
 
 done:
 	free(dirbuf, M_TEMP);
 	if (error == 0)
 		td->td_retval[0] = ucount;
 	return (error);
 }
 #endif /* COMPAT */
 
 #ifdef COMPAT_43
 static void
 ogetdirentries_cvt(struct freebsd11_dirent *dp)
 {
 #if (BYTE_ORDER == LITTLE_ENDIAN)
 	/*
 	 * The expected low byte of dp->d_namlen is our dp->d_type.
 	 * The high MBZ byte of dp->d_namlen is our dp->d_namlen.
 	 */
 	dp->d_type = dp->d_namlen;
 	dp->d_namlen = 0;
 #else
 	/*
 	 * The dp->d_type is the high byte of the expected dp->d_namlen,
 	 * so must be zero'ed.
 	 */
 	dp->d_type = 0;
 #endif
 }
 
 /*
  * Read a block of directory entries in a filesystem independent format.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ogetdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 int
 ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
 {
 	long loff;
 	int error;
 
 	error = kern_ogetdirentries(td, uap, &loff);
 	if (error == 0)
 		error = copyout(&loff, uap->basep, sizeof(long));
 	return (error);
 }
 
 int
 kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
     long *ploff)
 {
 	long base;
 	int error;
 
 	/* XXX arbitrary sanity limit on `count'. */
 	if (uap->count > 64 * 1024)
 		return (EINVAL);
 
 	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
 	    &base, ogetdirentries_cvt);
 
 	if (error == 0 && uap->basep != NULL)
 		error = copyout(&base, uap->basep, sizeof(long));
 
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_FREEBSD11)
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd11_getdirentries_args {
 	int	fd;
 	char	*buf;
 	u_int	count;
 	long	*basep;
 };
 #endif
 int
 freebsd11_getdirentries(struct thread *td,
     struct freebsd11_getdirentries_args *uap)
 {
 	long base;
 	int error;
 
 	error = freebsd11_kern_getdirentries(td, uap->fd, uap->buf, uap->count,
 	    &base, NULL);
 
 	if (error == 0 && uap->basep != NULL)
 		error = copyout(&base, uap->basep, sizeof(long));
 	return (error);
 }
 
 int
 freebsd11_getdents(struct thread *td, struct freebsd11_getdents_args *uap)
 {
 	struct freebsd11_getdirentries_args ap;
 
 	ap.fd = uap->fd;
 	ap.buf = uap->buf;
 	ap.count = uap->count;
 	ap.basep = NULL;
 	return (freebsd11_getdirentries(td, &ap));
 }
 #endif /* COMPAT_FREEBSD11 */
 
 /*
  * Read a block of directory entries in a filesystem independent format.
  */
 int
 sys_getdirentries(struct thread *td, struct getdirentries_args *uap)
 {
 	off_t base;
 	int error;
 
 	error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
 	    NULL, UIO_USERSPACE);
 	if (error != 0)
 		return (error);
 	if (uap->basep != NULL)
 		error = copyout(&base, uap->basep, sizeof(off_t));
 	return (error);
 }
 
 int
 kern_getdirentries(struct thread *td, int fd, char *buf, size_t count,
     off_t *basep, ssize_t *residp, enum uio_seg bufseg)
 {
 	struct vnode *vp;
 	struct file *fp;
 	struct uio auio;
 	struct iovec aiov;
 	off_t loff;
 	int error, eofflag;
 	off_t foffset;
 
 	AUDIT_ARG_FD(fd);
 	if (count > IOSIZE_MAX)
 		return (EINVAL);
 	auio.uio_resid = count;
 	error = getvnode(td, fd, &cap_read_rights, &fp);
 	if (error != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 unionread:
 	if (vp->v_type != VDIR) {
 		error = EINVAL;
 		goto fail;
 	}
 	aiov.iov_base = buf;
 	aiov.iov_len = count;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = bufseg;
 	auio.uio_td = td;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	loff = auio.uio_offset = foffset;
 #ifdef MAC
 	error = mac_vnode_check_readdir(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 		    NULL);
 	foffset = auio.uio_offset;
 	if (error != 0) {
 		VOP_UNLOCK(vp);
 		goto fail;
 	}
 	if (count == auio.uio_resid &&
 	    (vp->v_vflag & VV_ROOT) &&
 	    (vp->v_mount->mnt_flag & MNT_UNION)) {
 		struct vnode *tvp = vp;
 
 		vp = vp->v_mount->mnt_vnodecovered;
 		VREF(vp);
 		fp->f_vnode = vp;
 		foffset = 0;
 		vput(tvp);
 		goto unionread;
 	}
 	VOP_UNLOCK(vp);
 	*basep = loff;
 	if (residp != NULL)
 		*residp = auio.uio_resid;
 	td->td_retval[0] = count - auio.uio_resid;
 fail:
 	foffset_unlock(fp, foffset, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Set the mode mask for creation of filesystem nodes.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct umask_args {
 	int	newmask;
 };
 #endif
 int
 sys_umask(struct thread *td, struct umask_args *uap)
 {
 	struct pwddesc *pdp;
 
 	pdp = td->td_proc->p_pd;
 	PWDDESC_XLOCK(pdp);
 	td->td_retval[0] = pdp->pd_cmask;
 	pdp->pd_cmask = uap->newmask & ALLPERMS;
 	PWDDESC_XUNLOCK(pdp);
 	return (0);
 }
 
 /*
  * Void all references to file by ripping underlying filesystem away from
  * vnode.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct revoke_args {
 	char	*path;
 };
 #endif
 int
 sys_revoke(struct thread *td, struct revoke_args *uap)
 {
 	struct vnode *vp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error;
 
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
 	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	NDFREE_NOTHING(&nd);
 	if (vp->v_type != VCHR || vp->v_rdev == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_revoke(td->td_ucred, vp);
 	if (error != 0)
 		goto out;
 #endif
 	error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 	if (error != 0)
 		goto out;
 	if (td->td_ucred->cr_uid != vattr.va_uid) {
 		error = priv_check(td, PRIV_VFS_ADMIN);
 		if (error != 0)
 			goto out;
 	}
 	if (devfs_usecount(vp) > 0)
 		VOP_REVOKE(vp, REVOKEALL);
 out:
 	vput(vp);
 	return (error);
 }
 
 /*
  * This variant of getvnode() allows O_PATH files.  Caller should
  * ensure that returned file and vnode are only used for compatible
  * semantics.
  */
 int
 getvnode_path(struct thread *td, int fd, cap_rights_t *rightsp,
     struct file **fpp)
 {
 	struct file *fp;
 	int error;
 
 	error = fget_unlocked(td->td_proc->p_fd, fd, rightsp, &fp);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * The file could be not of the vnode type, or it may be not
 	 * yet fully initialized, in which case the f_vnode pointer
 	 * may be set, but f_ops is still badfileops.  E.g.,
 	 * devfs_open() transiently create such situation to
 	 * facilitate csw d_fdopen().
 	 *
 	 * Dupfdopen() handling in kern_openat() installs the
 	 * half-baked file into the process descriptor table, allowing
 	 * other thread to dereference it. Guard against the race by
 	 * checking f_ops.
 	 */
 	if (__predict_false(fp->f_vnode == NULL || fp->f_ops == &badfileops)) {
 		fdrop(fp, td);
 		return (EINVAL);
 	}
 
 	*fpp = fp;
 	return (0);
 }
 
 /*
  * Convert a user file descriptor to a kernel file entry and check
  * that, if it is a capability, the correct rights are present.
  * A reference on the file entry is held upon returning.
  */
 int
 getvnode(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	int error;
 
 	error = getvnode_path(td, fd, rightsp, fpp);
 	if (__predict_false(error != 0))
 		return (error);
 
 	/*
 	 * Filter out O_PATH file descriptors, most getvnode() callers
 	 * do not call fo_ methods.
 	 */
 	if (__predict_false((*fpp)->f_ops == &path_fileops)) {
 		fdrop(*fpp, td);
 		error = EBADF;
 	}
 
 	return (error);
 }
 
 /*
  * Get an (NFS) file handle.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lgetfh_args {
 	char *fname;
 	fhandle_t *fhp;
 };
 #endif
 int
 sys_lgetfh(struct thread *td, struct lgetfh_args *uap)
 {
 
 	return (kern_getfhat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, uap->fname,
 	    UIO_USERSPACE, uap->fhp, UIO_USERSPACE));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getfh_args {
 	char *fname;
 	fhandle_t *fhp;
 };
 #endif
 int
 sys_getfh(struct thread *td, struct getfh_args *uap)
 {
 
 	return (kern_getfhat(td, 0, AT_FDCWD, uap->fname, UIO_USERSPACE,
 	    uap->fhp, UIO_USERSPACE));
 }
 
 /*
  * syscall for the rpc.lockd to use to translate an open descriptor into
  * a NFS file handle.
  *
  * warning: do not remove the priv_check() call or this becomes one giant
  * security hole.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getfhat_args {
 	int fd;
 	char *path;
 	fhandle_t *fhp;
 	int flags;
 };
 #endif
 int
 sys_getfhat(struct thread *td, struct getfhat_args *uap)
 {
 
 	if ((uap->flags & ~(AT_SYMLINK_NOFOLLOW | AT_RESOLVE_BENEATH)) != 0)
 		return (EINVAL);
 	return (kern_getfhat(td, uap->flags, uap->fd, uap->path, UIO_USERSPACE,
 	    uap->fhp, UIO_USERSPACE));
 }
 
 int
 kern_getfhat(struct thread *td, int flags, int fd, const char *path,
     enum uio_seg pathseg, fhandle_t *fhp, enum uio_seg fhseg)
 {
 	struct nameidata nd;
 	fhandle_t fh;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error != 0)
 		return (error);
 	NDINIT_AT(&nd, LOOKUP, at2cnpflags(flags, AT_SYMLINK_NOFOLLOW |
 	    AT_RESOLVE_BENEATH) | LOCKLEAF | AUDITVNODE1, pathseg, path,
 	    fd, td);
 	error = namei(&nd);
 	if (error != 0)
 		return (error);
 	NDFREE_NOTHING(&nd);
 	vp = nd.ni_vp;
 	bzero(&fh, sizeof(fh));
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
 	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
 	if (error == 0) {
 		if (fhseg == UIO_USERSPACE)
 			error = copyout(&fh, fhp, sizeof (fh));
 		else
 			memcpy(fhp, &fh, sizeof(fh));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fhlink_args {
 	fhandle_t *fhp;
 	const char *to;
 };
 #endif
 int
 sys_fhlink(struct thread *td, struct fhlink_args *uap)
 {
 
 	return (kern_fhlinkat(td, AT_FDCWD, uap->to, UIO_USERSPACE, uap->fhp));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fhlinkat_args {
 	fhandle_t *fhp;
 	int tofd;
 	const char *to;
 };
 #endif
 int
 sys_fhlinkat(struct thread *td, struct fhlinkat_args *uap)
 {
 
 	return (kern_fhlinkat(td, uap->tofd, uap->to, UIO_USERSPACE, uap->fhp));
 }
 
 static int
 kern_fhlinkat(struct thread *td, int fd, const char *path,
     enum uio_seg pathseg, fhandle_t *fhp)
 {
 	fhandle_t fh;
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error != 0)
 		return (error);
 	error = copyin(fhp, &fh, sizeof(fh));
 	if (error != 0)
 		return (error);
 	do {
 		bwillwrite();
 		if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 			return (ESTALE);
 		error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
 		vfs_unbusy(mp);
 		if (error != 0)
 			return (error);
 		VOP_UNLOCK(vp);
 		error = kern_linkat_vp(td, vp, fd, path, pathseg);
 	} while (error == EAGAIN || error == ERELOOKUP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct fhreadlink_args {
 	fhandle_t *fhp;
 	char *buf;
 	size_t bufsize;
 };
 #endif
 int
 sys_fhreadlink(struct thread *td, struct fhreadlink_args *uap)
 {
 	fhandle_t fh;
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error != 0)
 		return (error);
 	if (uap->bufsize > IOSIZE_MAX)
 		return (EINVAL);
 	error = copyin(uap->fhp, &fh, sizeof(fh));
 	if (error != 0)
 		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_SHARED, &vp);
 	vfs_unbusy(mp);
 	if (error != 0)
 		return (error);
 	error = kern_readlink_vp(vp, uap->buf, UIO_USERSPACE, uap->bufsize, td);
 	vput(vp);
 	return (error);
 }
 
 /*
  * syscall for the rpc.lockd to use to translate a NFS file handle into an
  * open descriptor.
  *
  * warning: do not remove the priv_check() call or this becomes one giant
  * security hole.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhopen_args {
 	const struct fhandle *u_fhp;
 	int flags;
 };
 #endif
 int
 sys_fhopen(struct thread *td, struct fhopen_args *uap)
 {
 	return (kern_fhopen(td, uap->u_fhp, uap->flags));
 }
 
 int
 kern_fhopen(struct thread *td, const struct fhandle *u_fhp, int flags)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	struct fhandle fhp;
 	struct file *fp;
 	int fmode, error;
 	int indx;
 
 	error = priv_check(td, PRIV_VFS_FHOPEN);
 	if (error != 0)
 		return (error);
 	indx = -1;
 	fmode = FFLAGS(flags);
 	/* why not allow a non-read/write open for our lockd? */
 	if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
 		return (EINVAL);
 	error = copyin(u_fhp, &fhp, sizeof(fhp));
 	if (error != 0)
 		return(error);
 	/* find the mount point */
 	mp = vfs_busyfs(&fhp.fh_fsid);
 	if (mp == NULL)
 		return (ESTALE);
 	/* now give me my vnode, it gets returned to me locked */
 	error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
 	if (error != 0)
 		return (error);
 
 	error = falloc_noinstall(td, &fp);
 	if (error != 0) {
 		vput(vp);
 		return (error);
 	}
 	/*
 	 * An extra reference on `fp' has been held for us by
 	 * falloc_noinstall().
 	 */
 
 #ifdef INVARIANTS
 	td->td_dupfd = -1;
 #endif
 	error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
 	if (error != 0) {
 		KASSERT(fp->f_ops == &badfileops,
 		    ("VOP_OPEN in fhopen() set f_ops"));
 		KASSERT(td->td_dupfd < 0,
 		    ("fhopen() encountered fdopen()"));
 
 		vput(vp);
 		goto bad;
 	}
 #ifdef INVARIANTS
 	td->td_dupfd = 0;
 #endif
 	fp->f_vnode = vp;
 	finit_vnode(fp, fmode, NULL, &vnops);
 	VOP_UNLOCK(vp);
 	if ((fmode & O_TRUNC) != 0) {
 		error = fo_truncate(fp, 0, td->td_ucred, td);
 		if (error != 0)
 			goto bad;
 	}
 
 	error = finstall(td, fp, &indx, fmode, NULL);
 bad:
 	fdrop(fp, td);
 	td->td_retval[0] = indx;
 	return (error);
 }
 
 /*
  * Stat an (NFS) file handle.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstat_args {
 	struct fhandle *u_fhp;
 	struct stat *sb;
 };
 #endif
 int
 sys_fhstat(struct thread *td, struct fhstat_args *uap)
 {
 	struct stat sb;
 	struct fhandle fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fh));
 	if (error != 0)
 		return (error);
 	error = kern_fhstat(td, fh, &sb);
 	if (error == 0)
 		error = copyout(&sb, uap->sb, sizeof(sb));
 	return (error);
 }
 
 int
 kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTAT);
 	if (error != 0)
 		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
 	vfs_unbusy(mp);
 	if (error != 0)
 		return (error);
-	error = VOP_STAT(vp, sb, td->td_ucred, NOCRED, td);
+	error = VOP_STAT(vp, sb, td->td_ucred, NOCRED);
 	vput(vp);
 	return (error);
 }
 
 /*
  * Implement fstatfs() for (NFS) file handles.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstatfs_args {
 	struct fhandle *u_fhp;
 	struct statfs *buf;
 };
 #endif
 int
 sys_fhstatfs(struct thread *td, struct fhstatfs_args *uap)
 {
 	struct statfs *sfp;
 	fhandle_t fh;
 	int error;
 
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error != 0)
 		return (error);
 	sfp = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK);
 	error = kern_fhstatfs(td, fh, sfp);
 	if (error == 0)
 		error = copyout(sfp, uap->buf, sizeof(*sfp));
 	free(sfp, M_STATFS);
 	return (error);
 }
 
 int
 kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	int error;
 
 	error = priv_check(td, PRIV_VFS_FHSTATFS);
 	if (error != 0)
 		return (error);
 	if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
 	if (error != 0) {
 		vfs_unbusy(mp);
 		return (error);
 	}
 	vput(vp);
 	error = prison_canseemount(td->td_ucred, mp);
 	if (error != 0)
 		goto out;
 #ifdef MAC
 	error = mac_mount_check_stat(td->td_ucred, mp);
 	if (error != 0)
 		goto out;
 #endif
 	error = VFS_STATFS(mp, buf);
 out:
 	vfs_unbusy(mp);
 	return (error);
 }
 
 /*
  * Unlike madvise(2), we do not make a best effort to remember every
  * possible caching hint.  Instead, we remember the last setting with
  * the exception that we will allow POSIX_FADV_NORMAL to adjust the
  * region of any current setting.
  */
 int
 kern_posix_fadvise(struct thread *td, int fd, off_t offset, off_t len,
     int advice)
 {
 	struct fadvise_info *fa, *new;
 	struct file *fp;
 	struct vnode *vp;
 	off_t end;
 	int error;
 
 	if (offset < 0 || len < 0 || offset > OFF_MAX - len)
 		return (EINVAL);
 	AUDIT_ARG_VALUE(advice);
 	switch (advice) {
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_RANDOM:
 	case POSIX_FADV_NOREUSE:
 		new = malloc(sizeof(*fa), M_FADVISE, M_WAITOK);
 		break;
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_WILLNEED:
 	case POSIX_FADV_DONTNEED:
 		new = NULL;
 		break;
 	default:
 		return (EINVAL);
 	}
 	/* XXX: CAP_POSIX_FADVISE? */
 	AUDIT_ARG_FD(fd);
 	error = fget(td, fd, &cap_no_rights, &fp);
 	if (error != 0)
 		goto out;
 	AUDIT_ARG_FILE(td->td_proc, fp);
 	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
 		error = ESPIPE;
 		goto out;
 	}
 	if (fp->f_type != DTYPE_VNODE) {
 		error = ENODEV;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG) {
 		error = ENODEV;
 		goto out;
 	}
 	if (len == 0)
 		end = OFF_MAX;
 	else
 		end = offset + len - 1;
 	switch (advice) {
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_RANDOM:
 	case POSIX_FADV_NOREUSE:
 		/*
 		 * Try to merge any existing non-standard region with
 		 * this new region if possible, otherwise create a new
 		 * non-standard region for this request.
 		 */
 		mtx_pool_lock(mtxpool_sleep, fp);
 		fa = fp->f_advice;
 		if (fa != NULL && fa->fa_advice == advice &&
 		    ((fa->fa_start <= end && fa->fa_end >= offset) ||
 		    (end != OFF_MAX && fa->fa_start == end + 1) ||
 		    (fa->fa_end != OFF_MAX && fa->fa_end + 1 == offset))) {
 			if (offset < fa->fa_start)
 				fa->fa_start = offset;
 			if (end > fa->fa_end)
 				fa->fa_end = end;
 		} else {
 			new->fa_advice = advice;
 			new->fa_start = offset;
 			new->fa_end = end;
 			fp->f_advice = new;
 			new = fa;
 		}
 		mtx_pool_unlock(mtxpool_sleep, fp);
 		break;
 	case POSIX_FADV_NORMAL:
 		/*
 		 * If a the "normal" region overlaps with an existing
 		 * non-standard region, trim or remove the
 		 * non-standard region.
 		 */
 		mtx_pool_lock(mtxpool_sleep, fp);
 		fa = fp->f_advice;
 		if (fa != NULL) {
 			if (offset <= fa->fa_start && end >= fa->fa_end) {
 				new = fa;
 				fp->f_advice = NULL;
 			} else if (offset <= fa->fa_start &&
 			    end >= fa->fa_start)
 				fa->fa_start = end + 1;
 			else if (offset <= fa->fa_end && end >= fa->fa_end)
 				fa->fa_end = offset - 1;
 			else if (offset >= fa->fa_start && end <= fa->fa_end) {
 				/*
 				 * If the "normal" region is a middle
 				 * portion of the existing
 				 * non-standard region, just remove
 				 * the whole thing rather than picking
 				 * one side or the other to
 				 * preserve.
 				 */
 				new = fa;
 				fp->f_advice = NULL;
 			}
 		}
 		mtx_pool_unlock(mtxpool_sleep, fp);
 		break;
 	case POSIX_FADV_WILLNEED:
 	case POSIX_FADV_DONTNEED:
 		error = VOP_ADVISE(vp, offset, end, advice);
 		break;
 	}
 out:
 	if (fp != NULL)
 		fdrop(fp, td);
 	free(new, M_FADVISE);
 	return (error);
 }
 
 int
 sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
 {
 	int error;
 
 	error = kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
 	    uap->advice);
 	return (kern_posix_error(td, error));
 }
 
 int
 kern_copy_file_range(struct thread *td, int infd, off_t *inoffp, int outfd,
     off_t *outoffp, size_t len, unsigned int flags)
 {
 	struct file *infp, *outfp;
 	struct vnode *invp, *outvp;
 	int error;
 	size_t retlen;
 	void *rl_rcookie, *rl_wcookie;
 	off_t savinoff, savoutoff;
 
 	infp = outfp = NULL;
 	rl_rcookie = rl_wcookie = NULL;
 	savinoff = -1;
 	error = 0;
 	retlen = 0;
 
 	if (flags != 0) {
 		error = EINVAL;
 		goto out;
 	}
 	if (len > SSIZE_MAX)
 		/*
 		 * Although the len argument is size_t, the return argument
 		 * is ssize_t (which is signed).  Therefore a size that won't
 		 * fit in ssize_t can't be returned.
 		 */
 		len = SSIZE_MAX;
 
 	/* Get the file structures for the file descriptors. */
 	error = fget_read(td, infd, &cap_read_rights, &infp);
 	if (error != 0)
 		goto out;
 	if (infp->f_ops == &badfileops) {
 		error = EBADF;
 		goto out;
 	}
 	if (infp->f_vnode == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 	error = fget_write(td, outfd, &cap_write_rights, &outfp);
 	if (error != 0)
 		goto out;
 	if (outfp->f_ops == &badfileops) {
 		error = EBADF;
 		goto out;
 	}
 	if (outfp->f_vnode == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Set the offset pointers to the correct place. */
 	if (inoffp == NULL)
 		inoffp = &infp->f_offset;
 	if (outoffp == NULL)
 		outoffp = &outfp->f_offset;
 	savinoff = *inoffp;
 	savoutoff = *outoffp;
 
 	invp = infp->f_vnode;
 	outvp = outfp->f_vnode;
 	/* Sanity check the f_flag bits. */
 	if ((outfp->f_flag & (FWRITE | FAPPEND)) != FWRITE ||
 	    (infp->f_flag & FREAD) == 0) {
 		error = EBADF;
 		goto out;
 	}
 
 	/* If len == 0, just return 0. */
 	if (len == 0)
 		goto out;
 
 	/*
 	 * If infp and outfp refer to the same file, the byte ranges cannot
 	 * overlap.
 	 */
 	if (invp == outvp && ((savinoff <= savoutoff && savinoff + len >
 	    savoutoff) || (savinoff > savoutoff && savoutoff + len >
 	    savinoff))) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Range lock the byte ranges for both invp and outvp. */
 	for (;;) {
 		rl_wcookie = vn_rangelock_wlock(outvp, *outoffp, *outoffp +
 		    len);
 		rl_rcookie = vn_rangelock_tryrlock(invp, *inoffp, *inoffp +
 		    len);
 		if (rl_rcookie != NULL)
 			break;
 		vn_rangelock_unlock(outvp, rl_wcookie);
 		rl_rcookie = vn_rangelock_rlock(invp, *inoffp, *inoffp + len);
 		vn_rangelock_unlock(invp, rl_rcookie);
 	}
 
 	retlen = len;
 	error = vn_copy_file_range(invp, inoffp, outvp, outoffp, &retlen,
 	    flags, infp->f_cred, outfp->f_cred, td);
 out:
 	if (rl_rcookie != NULL)
 		vn_rangelock_unlock(invp, rl_rcookie);
 	if (rl_wcookie != NULL)
 		vn_rangelock_unlock(outvp, rl_wcookie);
 	if (savinoff != -1 && (error == EINTR || error == ERESTART)) {
 		*inoffp = savinoff;
 		*outoffp = savoutoff;
 	}
 	if (outfp != NULL)
 		fdrop(outfp, td);
 	if (infp != NULL)
 		fdrop(infp, td);
 	td->td_retval[0] = retlen;
 	return (error);
 }
 
 int
 sys_copy_file_range(struct thread *td, struct copy_file_range_args *uap)
 {
 	off_t inoff, outoff, *inoffp, *outoffp;
 	int error;
 
 	inoffp = outoffp = NULL;
 	if (uap->inoffp != NULL) {
 		error = copyin(uap->inoffp, &inoff, sizeof(off_t));
 		if (error != 0)
 			return (error);
 		inoffp = &inoff;
 	}
 	if (uap->outoffp != NULL) {
 		error = copyin(uap->outoffp, &outoff, sizeof(off_t));
 		if (error != 0)
 			return (error);
 		outoffp = &outoff;
 	}
 	error = kern_copy_file_range(td, uap->infd, inoffp, uap->outfd,
 	    outoffp, uap->len, uap->flags);
 	if (error == 0 && uap->inoffp != NULL)
 		error = copyout(inoffp, uap->inoffp, sizeof(off_t));
 	if (error == 0 && uap->outoffp != NULL)
 		error = copyout(outoffp, uap->outoffp, sizeof(off_t));
 	return (error);
 }
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 2c25a25da6c6..d6c472995489 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -1,3708 +1,3707 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Copyright (c) 2012 Konstantin Belousov <kib@FreeBSD.org>
  * Copyright (c) 2013, 2014 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_vnops.c	8.2 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/disk.h>
 #include <sys/fail.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/stat.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/filio.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/prng.h>
 #include <sys/sx.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/ttycom.h>
 #include <sys/conf.h>
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/ktrace.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
 static fo_rdwr_t	vn_io_fault;
 static fo_truncate_t	vn_truncate;
 static fo_ioctl_t	vn_ioctl;
 static fo_poll_t	vn_poll;
 static fo_kqfilter_t	vn_kqfilter;
 static fo_close_t	vn_closefile;
 static fo_mmap_t	vn_mmap;
 static fo_fallocate_t	vn_fallocate;
 static fo_fspacectl_t	vn_fspacectl;
 
 struct 	fileops vnops = {
 	.fo_read = vn_io_fault,
 	.fo_write = vn_io_fault,
 	.fo_truncate = vn_truncate,
 	.fo_ioctl = vn_ioctl,
 	.fo_poll = vn_poll,
 	.fo_kqfilter = vn_kqfilter,
 	.fo_stat = vn_statfile,
 	.fo_close = vn_closefile,
 	.fo_chmod = vn_chmod,
 	.fo_chown = vn_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_mmap = vn_mmap,
 	.fo_fallocate = vn_fallocate,
 	.fo_fspacectl = vn_fspacectl,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 const u_int io_hold_cnt = 16;
 static int vn_io_fault_enable = 1;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_enable, CTLFLAG_RWTUN,
     &vn_io_fault_enable, 0, "Enable vn_io_fault lock avoidance");
 static int vn_io_fault_prefault = 0;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_fault_prefault, CTLFLAG_RWTUN,
     &vn_io_fault_prefault, 0, "Enable vn_io_fault prefaulting");
 static int vn_io_pgcache_read_enable = 1;
 SYSCTL_INT(_debug, OID_AUTO, vn_io_pgcache_read_enable, CTLFLAG_RWTUN,
     &vn_io_pgcache_read_enable, 0,
     "Enable copying from page cache for reads, avoiding fs");
 static u_long vn_io_faults_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_io_faults, CTLFLAG_RD,
     &vn_io_faults_cnt, 0, "Count of vn_io_fault lock avoidance triggers");
 
 static int vfs_allow_read_dir = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, allow_read_dir, CTLFLAG_RW,
     &vfs_allow_read_dir, 0,
     "Enable read(2) of directory by root for filesystems that support it");
 
 /*
  * Returns true if vn_io_fault mode of handling the i/o request should
  * be used.
  */
 static bool
 do_vn_io_fault(struct vnode *vp, struct uio *uio)
 {
 	struct mount *mp;
 
 	return (uio->uio_segflg == UIO_USERSPACE && vp->v_type == VREG &&
 	    (mp = vp->v_mount) != NULL &&
 	    (mp->mnt_kern_flag & MNTK_NO_IOPF) != 0 && vn_io_fault_enable);
 }
 
 /*
  * Structure used to pass arguments to vn_io_fault1(), to do either
  * file- or vnode-based I/O calls.
  */
 struct vn_io_fault_args {
 	enum {
 		VN_IO_FAULT_FOP,
 		VN_IO_FAULT_VOP
 	} kind;
 	struct ucred *cred;
 	int flags;
 	union {
 		struct fop_args_tag {
 			struct file *fp;
 			fo_rdwr_t *doio;
 		} fop_args;
 		struct vop_args_tag {
 			struct vnode *vp;
 		} vop_args;
 	} args;
 };
 
 static int vn_io_fault1(struct vnode *vp, struct uio *uio,
     struct vn_io_fault_args *args, struct thread *td);
 
 int
 vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp)
 {
 	struct thread *td = curthread;
 
 	return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
 }
 
 static uint64_t
 open2nameif(int fmode, u_int vn_open_flags)
 {
 	uint64_t res;
 
 	res = ISOPEN | LOCKLEAF;
 	if ((fmode & O_RESOLVE_BENEATH) != 0)
 		res |= RBENEATH;
 	if ((fmode & O_EMPTY_PATH) != 0)
 		res |= EMPTYPATH;
 	if ((fmode & FREAD) != 0)
 		res |= OPENREAD;
 	if ((fmode & FWRITE) != 0)
 		res |= OPENWRITE;
 	if ((vn_open_flags & VN_OPEN_NOAUDIT) == 0)
 		res |= AUDITVNODE1;
 	if ((vn_open_flags & VN_OPEN_NOCAPCHECK) != 0)
 		res |= NOCAPCHECK;
 	return (res);
 }
 
 /*
  * Common code for vnode open operations via a name lookup.
  * Lookup the vnode and invoke VOP_CREATE if needed.
  * Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
  *
  * Note that this does NOT free nameidata for the successful case,
  * due to the NDINIT being done elsewhere.
  */
 int
 vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
     struct ucred *cred, struct file *fp)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int fmode, error;
 	bool first_open;
 
 restart:
 	first_open = false;
 	fmode = *flagp;
 	if ((fmode & (O_CREAT | O_EXCL | O_DIRECTORY)) == (O_CREAT |
 	    O_EXCL | O_DIRECTORY) ||
 	    (fmode & (O_CREAT | O_EMPTY_PATH)) == (O_CREAT | O_EMPTY_PATH))
 		return (EINVAL);
 	else if ((fmode & (O_CREAT | O_DIRECTORY)) == O_CREAT) {
 		ndp->ni_cnd.cn_nameiop = CREATE;
 		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 		/*
 		 * Set NOCACHE to avoid flushing the cache when
 		 * rolling in many files at once.
 		 *
 		 * Set NC_KEEPPOSENTRY to keep positive entries if they already
 		 * exist despite NOCACHE.
 		 */
 		ndp->ni_cnd.cn_flags |= LOCKPARENT | NOCACHE | NC_KEEPPOSENTRY;
 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 			ndp->ni_cnd.cn_flags |= FOLLOW;
 		if ((vn_open_flags & VN_OPEN_INVFS) == 0)
 			bwillwrite();
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		if (ndp->ni_vp == NULL) {
 			VATTR_NULL(vap);
 			vap->va_type = VREG;
 			vap->va_mode = cmode;
 			if (fmode & O_EXCL)
 				vap->va_vaflags |= VA_EXCLUSIVE;
 			if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				vput(ndp->ni_dvp);
 				if ((error = vn_start_write(NULL, &mp,
 				    V_XSLEEP | PCATCH)) != 0)
 					return (error);
 				NDREINIT(ndp);
 				goto restart;
 			}
 			if ((vn_open_flags & VN_OPEN_NAMECACHE) != 0)
 				ndp->ni_cnd.cn_flags |= MAKEENTRY;
 #ifdef MAC
 			error = mac_vnode_check_create(cred, ndp->ni_dvp,
 			    &ndp->ni_cnd, vap);
 			if (error == 0)
 #endif
 				error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
 				    &ndp->ni_cnd, vap);
 			vp = ndp->ni_vp;
 			if (error == 0 && (fmode & O_EXCL) != 0 &&
 			    (fmode & (O_EXLOCK | O_SHLOCK)) != 0) {
 				VI_LOCK(vp);
 				vp->v_iflag |= VI_FOPENING;
 				VI_UNLOCK(vp);
 				first_open = true;
 			}
 			VOP_VPUT_PAIR(ndp->ni_dvp, error == 0 ? &vp : NULL,
 			    false);
 			vn_finished_write(mp);
 			if (error) {
 				NDFREE(ndp, NDF_ONLY_PNBUF);
 				if (error == ERELOOKUP) {
 					NDREINIT(ndp);
 					goto restart;
 				}
 				return (error);
 			}
 			fmode &= ~O_TRUNC;
 		} else {
 			if (ndp->ni_dvp == ndp->ni_vp)
 				vrele(ndp->ni_dvp);
 			else
 				vput(ndp->ni_dvp);
 			ndp->ni_dvp = NULL;
 			vp = ndp->ni_vp;
 			if (fmode & O_EXCL) {
 				error = EEXIST;
 				goto bad;
 			}
 			if (vp->v_type == VDIR) {
 				error = EISDIR;
 				goto bad;
 			}
 			fmode &= ~O_CREAT;
 		}
 	} else {
 		ndp->ni_cnd.cn_nameiop = LOOKUP;
 		ndp->ni_cnd.cn_flags = open2nameif(fmode, vn_open_flags);
 		ndp->ni_cnd.cn_flags |= (fmode & O_NOFOLLOW) != 0 ? NOFOLLOW :
 		    FOLLOW;
 		if ((fmode & FWRITE) == 0)
 			ndp->ni_cnd.cn_flags |= LOCKSHARED;
 		if ((error = namei(ndp)) != 0)
 			return (error);
 		vp = ndp->ni_vp;
 	}
 	error = vn_open_vnode(vp, fmode, cred, curthread, fp);
 	if (first_open) {
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_FOPENING;
 		wakeup(vp);
 		VI_UNLOCK(vp);
 	}
 	if (error)
 		goto bad;
 	*flagp = fmode;
 	return (0);
 bad:
 	NDFREE(ndp, NDF_ONLY_PNBUF);
 	vput(vp);
 	*flagp = fmode;
 	ndp->ni_vp = NULL;
 	return (error);
 }
 
 static int
 vn_open_vnode_advlock(struct vnode *vp, int fmode, struct file *fp)
 {
 	struct flock lf;
 	int error, lock_flags, type;
 
 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode_advlock");
 	if ((fmode & (O_EXLOCK | O_SHLOCK)) == 0)
 		return (0);
 	KASSERT(fp != NULL, ("open with flock requires fp"));
 	if (fp->f_type != DTYPE_NONE && fp->f_type != DTYPE_VNODE)
 		return (EOPNOTSUPP);
 
 	lock_flags = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp);
 
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	lf.l_type = (fmode & O_EXLOCK) != 0 ? F_WRLCK : F_RDLCK;
 	type = F_FLOCK;
 	if ((fmode & FNONBLOCK) == 0)
 		type |= F_WAIT;
 	if ((fmode & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 		type |= F_FIRSTOPEN;
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type);
 	if (error == 0)
 		fp->f_flag |= FHASLOCK;
 
 	vn_lock(vp, lock_flags | LK_RETRY);
 	return (error);
 }
 
 /*
  * Common code for vnode open operations once a vnode is located.
  * Check permissions, and call the VOP_OPEN routine.
  */
 int
 vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
     struct thread *td, struct file *fp)
 {
 	accmode_t accmode;
 	int error;
 
 	if (vp->v_type == VLNK) {
 		if ((fmode & O_PATH) == 0 || (fmode & FEXEC) != 0)
 			return (EMLINK);
 	}
 	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 		return (ENOTDIR);
 
 	accmode = 0;
 	if ((fmode & O_PATH) == 0) {
 		if (vp->v_type == VSOCK)
 			return (EOPNOTSUPP);
 		if ((fmode & (FWRITE | O_TRUNC)) != 0) {
 			if (vp->v_type == VDIR)
 				return (EISDIR);
 			accmode |= VWRITE;
 		}
 		if ((fmode & FREAD) != 0)
 			accmode |= VREAD;
 		if ((fmode & O_APPEND) && (fmode & FWRITE))
 			accmode |= VAPPEND;
 #ifdef MAC
 		if ((fmode & O_CREAT) != 0)
 			accmode |= VCREAT;
 #endif
 	}
 	if ((fmode & FEXEC) != 0)
 		accmode |= VEXEC;
 #ifdef MAC
 	if ((fmode & O_VERIFY) != 0)
 		accmode |= VVERIFY;
 	error = mac_vnode_check_open(cred, vp, accmode);
 	if (error != 0)
 		return (error);
 
 	accmode &= ~(VCREAT | VVERIFY);
 #endif
 	if ((fmode & O_CREAT) == 0 && accmode != 0) {
 		error = VOP_ACCESS(vp, accmode, cred, td);
 		if (error != 0)
 			return (error);
 	}
 	if ((fmode & O_PATH) != 0) {
 		if (vp->v_type != VFIFO && vp->v_type != VSOCK &&
 		    VOP_ACCESS(vp, VREAD, cred, td) == 0)
 			fp->f_flag |= FKQALLOWED;
 		return (0);
 	}
 
 	if (vp->v_type == VFIFO && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 		vn_lock(vp, LK_UPGRADE | LK_RETRY);
 	error = VOP_OPEN(vp, fmode, cred, td, fp);
 	if (error != 0)
 		return (error);
 
 	error = vn_open_vnode_advlock(vp, fmode, fp);
 	if (error == 0 && (fmode & FWRITE) != 0) {
 		error = VOP_ADD_WRITECOUNT(vp, 1);
 		if (error == 0) {
 			CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 			     __func__, vp, vp->v_writecount);
 		}
 	}
 
 	/*
 	 * Error from advlock or VOP_ADD_WRITECOUNT() still requires
 	 * calling VOP_CLOSE() to pair with earlier VOP_OPEN().
 	 */
 	if (error != 0) {
 		if (fp != NULL) {
 			/*
 			 * Arrange the call by having fdrop() to use
 			 * vn_closefile().  This is to satisfy
 			 * filesystems like devfs or tmpfs, which
 			 * override fo_close().
 			 */
 			fp->f_flag |= FOPENFAILED;
 			fp->f_vnode = vp;
 			if (fp->f_ops == &badfileops) {
 				fp->f_type = DTYPE_VNODE;
 				fp->f_ops = &vnops;
 			}
 			vref(vp);
 		} else {
 			/*
 			 * If there is no fp, due to kernel-mode open,
 			 * we can call VOP_CLOSE() now.
 			 */
 			if (vp->v_type != VFIFO && (fmode & FWRITE) != 0 &&
 			    !MNT_EXTENDED_SHARED(vp->v_mount) &&
 			    VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
 				vn_lock(vp, LK_UPGRADE | LK_RETRY);
 			(void)VOP_CLOSE(vp, fmode & (FREAD | FWRITE | FEXEC),
 			    cred, td);
 		}
 	}
 
 	ASSERT_VOP_LOCKED(vp, "vn_open_vnode");
 	return (error);
 
 }
 
 /*
  * Check for write permissions on the specified vnode.
  * Prototype text segments cannot be written.
  * It is racy.
  */
 int
 vn_writechk(struct vnode *vp)
 {
 
 	ASSERT_VOP_LOCKED(vp, "vn_writechk");
 	/*
 	 * If there's shared text associated with
 	 * the vnode, try to free it up once.  If
 	 * we fail, we can't allow writing.
 	 */
 	if (VOP_IS_TEXT(vp))
 		return (ETXTBSY);
 
 	return (0);
 }
 
 /*
  * Vnode close call
  */
 static int
 vn_close1(struct vnode *vp, int flags, struct ucred *file_cred,
     struct thread *td, bool keep_ref)
 {
 	struct mount *mp;
 	int error, lock_flags;
 
 	if (vp->v_type != VFIFO && (flags & FWRITE) == 0 &&
 	    MNT_EXTENDED_SHARED(vp->v_mount))
 		lock_flags = LK_SHARED;
 	else
 		lock_flags = LK_EXCLUSIVE;
 
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, lock_flags | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if ((flags & (FWRITE | FOPENFAILED)) == FWRITE) {
 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	error = VOP_CLOSE(vp, flags, file_cred, td);
 	if (keep_ref)
 		VOP_UNLOCK(vp);
 	else
 		vput(vp);
 	vn_finished_write(mp);
 	return (error);
 }
 
 int
 vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
     struct thread *td)
 {
 
 	return (vn_close1(vp, flags, file_cred, td, false));
 }
 
 /*
  * Heuristic to detect sequential operation.
  */
 static int
 sequential_heuristic(struct uio *uio, struct file *fp)
 {
 	enum uio_rw rw;
 
 	ASSERT_VOP_LOCKED(fp->f_vnode, __func__);
 
 	rw = uio->uio_rw;
 	if (fp->f_flag & FRDAHEAD)
 		return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 
 	/*
 	 * Offset 0 is handled specially.  open() sets f_seqcount to 1 so
 	 * that the first I/O is normally considered to be slightly
 	 * sequential.  Seeking to offset 0 doesn't change sequentiality
 	 * unless previous seeks have reduced f_seqcount to 0, in which
 	 * case offset 0 is not special.
 	 */
 	if ((uio->uio_offset == 0 && fp->f_seqcount[rw] > 0) ||
 	    uio->uio_offset == fp->f_nextoff[rw]) {
 		/*
 		 * f_seqcount is in units of fixed-size blocks so that it
 		 * depends mainly on the amount of sequential I/O and not
 		 * much on the number of sequential I/O's.  The fixed size
 		 * of 16384 is hard-coded here since it is (not quite) just
 		 * a magic size that works well here.  This size is more
 		 * closely related to the best I/O size for real disks than
 		 * to any block size used by software.
 		 */
 		if (uio->uio_resid >= IO_SEQMAX * 16384)
 			fp->f_seqcount[rw] = IO_SEQMAX;
 		else {
 			fp->f_seqcount[rw] += howmany(uio->uio_resid, 16384);
 			if (fp->f_seqcount[rw] > IO_SEQMAX)
 				fp->f_seqcount[rw] = IO_SEQMAX;
 		}
 		return (fp->f_seqcount[rw] << IO_SEQSHIFT);
 	}
 
 	/* Not sequential.  Quickly draw-down sequentiality. */
 	if (fp->f_seqcount[rw] > 1)
 		fp->f_seqcount[rw] = 1;
 	else
 		fp->f_seqcount[rw] = 0;
 	return (0);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.
  */
 int
 vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
     enum uio_seg segflg, int ioflg, struct ucred *active_cred,
     struct ucred *file_cred, ssize_t *aresid, struct thread *td)
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
 	void *rl_cookie;
 	struct vn_io_fault_args args;
 	int error, lock_flags;
 
 	if (offset < 0 && vp->v_type != VCHR)
 		return (EINVAL);
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	aiov.iov_base = base;
 	aiov.iov_len = len;
 	auio.uio_resid = len;
 	auio.uio_offset = offset;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = rw;
 	auio.uio_td = td;
 	error = 0;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((ioflg & IO_RANGELOCKED) == 0) {
 			if (rw == UIO_READ) {
 				rl_cookie = vn_rangelock_rlock(vp, offset,
 				    offset + len);
 			} else if ((ioflg & IO_APPEND) != 0) {
 				rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 			} else {
 				rl_cookie = vn_rangelock_wlock(vp, offset,
 				    offset + len);
 			}
 		} else
 			rl_cookie = NULL;
 		mp = NULL;
 		if (rw == UIO_WRITE) { 
 			if (vp->v_type != VCHR &&
 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 			    != 0)
 				goto out;
 			lock_flags = vn_lktype_write(mp, vp);
 		} else
 			lock_flags = LK_SHARED;
 		vn_lock(vp, lock_flags | LK_RETRY);
 	} else
 		rl_cookie = NULL;
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
 		if (rw == UIO_READ)
 			error = mac_vnode_check_read(active_cred, file_cred,
 			    vp);
 		else
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 	}
 #endif
 	if (error == 0) {
 		if (file_cred != NULL)
 			cred = file_cred;
 		else
 			cred = active_cred;
 		if (do_vn_io_fault(vp, &auio)) {
 			args.kind = VN_IO_FAULT_VOP;
 			args.cred = cred;
 			args.flags = ioflg;
 			args.args.vop_args.vp = vp;
 			error = vn_io_fault1(vp, &auio, &args, td);
 		} else if (rw == UIO_READ) {
 			error = VOP_READ(vp, &auio, ioflg, cred);
 		} else /* if (rw == UIO_WRITE) */ {
 			error = VOP_WRITE(vp, &auio, ioflg, cred);
 		}
 	}
 	if (aresid)
 		*aresid = auio.uio_resid;
 	else
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		VOP_UNLOCK(vp);
 		if (mp != NULL)
 			vn_finished_write(mp);
 	}
  out:
 	if (rl_cookie != NULL)
 		vn_rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
 /*
  * Package up an I/O request on a vnode into a uio and do it.  The I/O
  * request is split up into smaller chunks and we try to avoid saturating
  * the buffer cache while potentially holding a vnode locked, so we 
  * check bwillwrite() before calling vn_rdwr().  We also call kern_yield()
  * to give other processes a chance to lock the vnode (either other processes
  * core'ing the same binary, or unrelated processes scanning the directory).
  */
 int
 vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len,
     off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred,
     struct ucred *file_cred, size_t *aresid, struct thread *td)
 {
 	int error = 0;
 	ssize_t iaresid;
 
 	do {
 		int chunk;
 
 		/*
 		 * Force `offset' to a multiple of MAXBSIZE except possibly
 		 * for the first chunk, so that filesystems only need to
 		 * write full blocks except possibly for the first and last
 		 * chunks.
 		 */
 		chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
 
 		if (chunk > len)
 			chunk = len;
 		if (rw != UIO_READ && vp->v_type == VREG)
 			bwillwrite();
 		iaresid = 0;
 		error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
 		    ioflg, active_cred, file_cred, &iaresid, td);
 		len -= chunk;	/* aresid calc already includes length */
 		if (error)
 			break;
 		offset += chunk;
 		base = (char *)base + chunk;
 		kern_yield(PRI_USER);
 	} while (len);
 	if (aresid)
 		*aresid = len + iaresid;
 	return (error);
 }
 
 #if OFF_MAX <= LONG_MAX
 off_t
 foffset_lock(struct file *fp, int flags)
 {
 	volatile short *flagsp;
 	off_t res;
 	short state;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	if ((flags & FOF_NOLOCK) != 0)
 		return (atomic_load_long(&fp->f_offset));
 
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
 	flagsp = &fp->f_vnread_flags;
 	if (atomic_cmpset_acq_16(flagsp, 0, FOFFSET_LOCKED))
 		return (atomic_load_long(&fp->f_offset));
 
 	sleepq_lock(&fp->f_vnread_flags);
 	state = atomic_load_16(flagsp);
 	for (;;) {
 		if ((state & FOFFSET_LOCKED) == 0) {
 			if (!atomic_fcmpset_acq_16(flagsp, &state,
 			    FOFFSET_LOCKED))
 				continue;
 			break;
 		}
 		if ((state & FOFFSET_LOCK_WAITING) == 0) {
 			if (!atomic_fcmpset_acq_16(flagsp, &state,
 			    state | FOFFSET_LOCK_WAITING))
 				continue;
 		}
 		DROP_GIANT();
 		sleepq_add(&fp->f_vnread_flags, NULL, "vofflock", 0, 0);
 		sleepq_wait(&fp->f_vnread_flags, PUSER -1);
 		PICKUP_GIANT();
 		sleepq_lock(&fp->f_vnread_flags);
 		state = atomic_load_16(flagsp);
 	}
 	res = atomic_load_long(&fp->f_offset);
 	sleepq_release(&fp->f_vnread_flags);
 	return (res);
 }
 
 void
 foffset_unlock(struct file *fp, off_t val, int flags)
 {
 	volatile short *flagsp;
 	short state;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	if ((flags & FOF_NOUPDATE) == 0)
 		atomic_store_long(&fp->f_offset, val);
 	if ((flags & FOF_NEXTOFF_R) != 0)
 		fp->f_nextoff[UIO_READ] = val;
 	if ((flags & FOF_NEXTOFF_W) != 0)
 		fp->f_nextoff[UIO_WRITE] = val;
 
 	if ((flags & FOF_NOLOCK) != 0)
 		return;
 
 	flagsp = &fp->f_vnread_flags;
 	state = atomic_load_16(flagsp);
 	if ((state & FOFFSET_LOCK_WAITING) == 0 &&
 	    atomic_cmpset_rel_16(flagsp, state, 0))
 		return;
 
 	sleepq_lock(&fp->f_vnread_flags);
 	MPASS((fp->f_vnread_flags & FOFFSET_LOCKED) != 0);
 	MPASS((fp->f_vnread_flags & FOFFSET_LOCK_WAITING) != 0);
 	fp->f_vnread_flags = 0;
 	sleepq_broadcast(&fp->f_vnread_flags, SLEEPQ_SLEEP, 0, 0);
 	sleepq_release(&fp->f_vnread_flags);
 }
 #else
 off_t
 foffset_lock(struct file *fp, int flags)
 {
 	struct mtx *mtxp;
 	off_t res;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if ((flags & FOF_NOLOCK) == 0) {
 		while (fp->f_vnread_flags & FOFFSET_LOCKED) {
 			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
 			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
 			    "vofflock", 0);
 		}
 		fp->f_vnread_flags |= FOFFSET_LOCKED;
 	}
 	res = fp->f_offset;
 	mtx_unlock(mtxp);
 	return (res);
 }
 
 void
 foffset_unlock(struct file *fp, off_t val, int flags)
 {
 	struct mtx *mtxp;
 
 	KASSERT((flags & FOF_OFFSET) == 0, ("FOF_OFFSET passed"));
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if ((flags & FOF_NOUPDATE) == 0)
 		fp->f_offset = val;
 	if ((flags & FOF_NEXTOFF_R) != 0)
 		fp->f_nextoff[UIO_READ] = val;
 	if ((flags & FOF_NEXTOFF_W) != 0)
 		fp->f_nextoff[UIO_WRITE] = val;
 	if ((flags & FOF_NOLOCK) == 0) {
 		KASSERT((fp->f_vnread_flags & FOFFSET_LOCKED) != 0,
 		    ("Lost FOFFSET_LOCKED"));
 		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
 			wakeup(&fp->f_vnread_flags);
 		fp->f_vnread_flags = 0;
 	}
 	mtx_unlock(mtxp);
 }
 #endif
 
 void
 foffset_lock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) == 0)
 		uio->uio_offset = foffset_lock(fp, flags);
 }
 
 void
 foffset_unlock_uio(struct file *fp, struct uio *uio, int flags)
 {
 
 	if ((flags & FOF_OFFSET) == 0)
 		foffset_unlock(fp, uio->uio_offset, flags);
 }
 
 static int
 get_advice(struct file *fp, struct uio *uio)
 {
 	struct mtx *mtxp;
 	int ret;
 
 	ret = POSIX_FADV_NORMAL;
 	if (fp->f_advice == NULL || fp->f_vnode->v_type != VREG)
 		return (ret);
 
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 	mtx_lock(mtxp);
 	if (fp->f_advice != NULL &&
 	    uio->uio_offset >= fp->f_advice->fa_start &&
 	    uio->uio_offset + uio->uio_resid <= fp->f_advice->fa_end)
 		ret = fp->f_advice->fa_advice;
 	mtx_unlock(mtxp);
 	return (ret);
 }
 
 static int
 get_write_ioflag(struct file *fp)
 {
 	int ioflag;
 	struct mount *mp;
 	struct vnode *vp;
 
 	ioflag = 0;
 	vp = fp->f_vnode;
 	mp = atomic_load_ptr(&vp->v_mount);
 
 	if ((fp->f_flag & O_DIRECT) != 0)
 		ioflag |= IO_DIRECT;
 
 	if ((fp->f_flag & O_FSYNC) != 0 ||
 	    (mp != NULL && (mp->mnt_flag & MNT_SYNCHRONOUS) != 0))
 		ioflag |= IO_SYNC;
 
 	/*
 	 * For O_DSYNC we set both IO_SYNC and IO_DATASYNC, so that VOP_WRITE()
 	 * or VOP_DEALLOCATE() implementations that don't understand IO_DATASYNC
 	 * fall back to full O_SYNC behavior.
 	 */
 	if ((fp->f_flag & O_DSYNC) != 0)
 		ioflag |= IO_SYNC | IO_DATASYNC;
 
 	return (ioflag);
 }
 
 int
 vn_read_from_obj(struct vnode *vp, struct uio *uio)
 {
 	vm_object_t obj;
 	vm_page_t ma[io_hold_cnt + 2];
 	off_t off, vsz;
 	ssize_t resid;
 	int error, i, j;
 
 	MPASS(uio->uio_resid <= ptoa(io_hold_cnt + 2));
 	obj = atomic_load_ptr(&vp->v_object);
 	if (obj == NULL)
 		return (EJUSTRETURN);
 
 	/*
 	 * Depends on type stability of vm_objects.
 	 */
 	vm_object_pip_add(obj, 1);
 	if ((obj->flags & OBJ_DEAD) != 0) {
 		/*
 		 * Note that object might be already reused from the
 		 * vnode, and the OBJ_DEAD flag cleared.  This is fine,
 		 * we recheck for DOOMED vnode state after all pages
 		 * are busied, and retract then.
 		 *
 		 * But we check for OBJ_DEAD to ensure that we do not
 		 * busy pages while vm_object_terminate_pages()
 		 * processes the queue.
 		 */
 		error = EJUSTRETURN;
 		goto out_pip;
 	}
 
 	resid = uio->uio_resid;
 	off = uio->uio_offset;
 	for (i = 0; resid > 0; i++) {
 		MPASS(i < io_hold_cnt + 2);
 		ma[i] = vm_page_grab_unlocked(obj, atop(off),
 		    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY |
 		    VM_ALLOC_NOWAIT);
 		if (ma[i] == NULL)
 			break;
 
 		/*
 		 * Skip invalid pages.  Valid mask can be partial only
 		 * at EOF, and we clip later.
 		 */
 		if (vm_page_none_valid(ma[i])) {
 			vm_page_sunbusy(ma[i]);
 			break;
 		}
 
 		resid -= PAGE_SIZE;
 		off += PAGE_SIZE;
 	}
 	if (i == 0) {
 		error = EJUSTRETURN;
 		goto out_pip;
 	}
 
 	/*
 	 * Check VIRF_DOOMED after we busied our pages.  Since
 	 * vgonel() terminates the vnode' vm_object, it cannot
 	 * process past pages busied by us.
 	 */
 	if (VN_IS_DOOMED(vp)) {
 		error = EJUSTRETURN;
 		goto out;
 	}
 
 	resid = PAGE_SIZE - (uio->uio_offset & PAGE_MASK) + ptoa(i - 1);
 	if (resid > uio->uio_resid)
 		resid = uio->uio_resid;
 
 	/*
 	 * Unlocked read of vnp_size is safe because truncation cannot
 	 * pass busied page.  But we load vnp_size into a local
 	 * variable so that possible concurrent extension does not
 	 * break calculation.
 	 */
 #if defined(__powerpc__) && !defined(__powerpc64__)
 	vsz = obj->un_pager.vnp.vnp_size;
 #else
 	vsz = atomic_load_64(&obj->un_pager.vnp.vnp_size);
 #endif
 	if (uio->uio_offset >= vsz) {
 		error = EJUSTRETURN;
 		goto out;
 	}
 	if (uio->uio_offset + resid > vsz)
 		resid = vsz - uio->uio_offset;
 
 	error = vn_io_fault_pgmove(ma, uio->uio_offset & PAGE_MASK, resid, uio);
 
 out:
 	for (j = 0; j < i; j++) {
 		if (error == 0)
 			vm_page_reference(ma[j]);
 		vm_page_sunbusy(ma[j]);
 	}
 out_pip:
 	vm_object_pip_wakeup(obj);
 	if (error != 0)
 		return (error);
 	return (uio->uio_resid == 0 ? 0 : EJUSTRETURN);
 }
 
 /*
  * File table vnode read routine.
  */
 static int
 vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
     struct thread *td)
 {
 	struct vnode *vp;
 	off_t orig_offset;
 	int error, ioflag;
 	int advice;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
 
 	/*
 	 * Try to read from page cache.  VIRF_DOOMED check is racy but
 	 * allows us to avoid unneeded work outright.
 	 */
 	if (vn_io_pgcache_read_enable && !mac_vnode_check_read_enabled() &&
 	    (vn_irflag_read(vp) & (VIRF_DOOMED | VIRF_PGREAD)) == VIRF_PGREAD) {
 		error = VOP_READ_PGCACHE(vp, uio, ioflag, fp->f_cred);
 		if (error == 0) {
 			fp->f_nextoff[UIO_READ] = uio->uio_offset;
 			return (0);
 		}
 		if (error != EJUSTRETURN)
 			return (error);
 	}
 
 	advice = get_advice(fp, uio);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* Disable read-ahead for random I/O. */
 		break;
 	}
 	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
 	fp->f_nextoff[UIO_READ] = uio->uio_offset;
 	VOP_UNLOCK(vp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    orig_offset != uio->uio_offset)
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
 		 * for the backing file after a POSIX_FADV_NOREUSE
 		 * read(2).
 		 */
 		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
 		    POSIX_FADV_DONTNEED);
 	return (error);
 }
 
 /*
  * File table vnode write routine.
  */
 static int
 vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
     struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	off_t orig_offset;
 	int error, ioflag;
 	int advice;
 	bool need_finished_write;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	KASSERT(flags & FOF_OFFSET, ("No FOF_OFFSET"));
 	vp = fp->f_vnode;
 	if (vp->v_type == VREG)
 		bwillwrite();
 	ioflag = IO_UNIT;
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND) != 0)
 		ioflag |= IO_APPEND;
 	if ((fp->f_flag & FNONBLOCK) != 0)
 		ioflag |= IO_NDELAY;
 	ioflag |= get_write_ioflag(fp);
 
 	mp = NULL;
 	need_finished_write = false;
 	if (vp->v_type != VCHR) {
 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 		if (error != 0)
 			goto unlock;
 		need_finished_write = true;
 	}
 
 	advice = get_advice(fp, uio);
 
 	vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
 	case POSIX_FADV_SEQUENTIAL:
 	case POSIX_FADV_NOREUSE:
 		ioflag |= sequential_heuristic(uio, fp);
 		break;
 	case POSIX_FADV_RANDOM:
 		/* XXX: Is this correct? */
 		break;
 	}
 	orig_offset = uio->uio_offset;
 
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error == 0)
 #endif
 		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
 	fp->f_nextoff[UIO_WRITE] = uio->uio_offset;
 	VOP_UNLOCK(vp);
 	if (need_finished_write)
 		vn_finished_write(mp);
 	if (error == 0 && advice == POSIX_FADV_NOREUSE &&
 	    orig_offset != uio->uio_offset)
 		/*
 		 * Use POSIX_FADV_DONTNEED to flush pages and buffers
 		 * for the backing file after a POSIX_FADV_NOREUSE
 		 * write(2).
 		 */
 		error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1,
 		    POSIX_FADV_DONTNEED);
 unlock:
 	return (error);
 }
 
 /*
  * The vn_io_fault() is a wrapper around vn_read() and vn_write() to
  * prevent the following deadlock:
  *
  * Assume that the thread A reads from the vnode vp1 into userspace
  * buffer buf1 backed by the pages of vnode vp2.  If a page in buf1 is
  * currently not resident, then system ends up with the call chain
  *   vn_read() -> VOP_READ(vp1) -> uiomove() -> [Page Fault] ->
  *     vm_fault(buf1) -> vnode_pager_getpages(vp2) -> VOP_GETPAGES(vp2)
  * which establishes lock order vp1->vn_lock, then vp2->vn_lock.
  * If, at the same time, thread B reads from vnode vp2 into buffer buf2
  * backed by the pages of vnode vp1, and some page in buf2 is not
  * resident, we get a reversed order vp2->vn_lock, then vp1->vn_lock.
  *
  * To prevent the lock order reversal and deadlock, vn_io_fault() does
  * not allow page faults to happen during VOP_READ() or VOP_WRITE().
  * Instead, it first tries to do the whole range i/o with pagefaults
  * disabled. If all pages in the i/o buffer are resident and mapped,
  * VOP will succeed (ignoring the genuine filesystem errors).
  * Otherwise, we get back EFAULT, and vn_io_fault() falls back to do
  * i/o in chunks, with all pages in the chunk prefaulted and held
  * using vm_fault_quick_hold_pages().
  *
  * Filesystems using this deadlock avoidance scheme should use the
  * array of the held pages from uio, saved in the curthread->td_ma,
  * instead of doing uiomove().  A helper function
  * vn_io_fault_uiomove() converts uiomove request into
  * uiomove_fromphys() over td_ma array.
  *
  * Since vnode locks do not cover the whole i/o anymore, rangelocks
  * make the current i/o request atomic with respect to other i/os and
  * truncations.
  */
 
 /*
  * Decode vn_io_fault_args and perform the corresponding i/o.
  */
 static int
 vn_io_fault_doio(struct vn_io_fault_args *args, struct uio *uio,
     struct thread *td)
 {
 	int error, save;
 
 	error = 0;
 	save = vm_fault_disable_pagefaults();
 	switch (args->kind) {
 	case VN_IO_FAULT_FOP:
 		error = (args->args.fop_args.doio)(args->args.fop_args.fp,
 		    uio, args->cred, args->flags, td);
 		break;
 	case VN_IO_FAULT_VOP:
 		if (uio->uio_rw == UIO_READ) {
 			error = VOP_READ(args->args.vop_args.vp, uio,
 			    args->flags, args->cred);
 		} else if (uio->uio_rw == UIO_WRITE) {
 			error = VOP_WRITE(args->args.vop_args.vp, uio,
 			    args->flags, args->cred);
 		}
 		break;
 	default:
 		panic("vn_io_fault_doio: unknown kind of io %d %d",
 		    args->kind, uio->uio_rw);
 	}
 	vm_fault_enable_pagefaults(save);
 	return (error);
 }
 
 static int
 vn_io_fault_touch(char *base, const struct uio *uio)
 {
 	int r;
 
 	r = fubyte(base);
 	if (r == -1 || (uio->uio_rw == UIO_READ && subyte(base, r) == -1))
 		return (EFAULT);
 	return (0);
 }
 
 static int
 vn_io_fault_prefault_user(const struct uio *uio)
 {
 	char *base;
 	const struct iovec *iov;
 	size_t len;
 	ssize_t resid;
 	int error, i;
 
 	KASSERT(uio->uio_segflg == UIO_USERSPACE,
 	    ("vn_io_fault_prefault userspace"));
 
 	error = i = 0;
 	iov = uio->uio_iov;
 	resid = uio->uio_resid;
 	base = iov->iov_base;
 	len = iov->iov_len;
 	while (resid > 0) {
 		error = vn_io_fault_touch(base, uio);
 		if (error != 0)
 			break;
 		if (len < PAGE_SIZE) {
 			if (len != 0) {
 				error = vn_io_fault_touch(base + len - 1, uio);
 				if (error != 0)
 					break;
 				resid -= len;
 			}
 			if (++i >= uio->uio_iovcnt)
 				break;
 			iov = uio->uio_iov + i;
 			base = iov->iov_base;
 			len = iov->iov_len;
 		} else {
 			len -= PAGE_SIZE;
 			base += PAGE_SIZE;
 			resid -= PAGE_SIZE;
 		}
 	}
 	return (error);
 }
 
 /*
  * Common code for vn_io_fault(), agnostic to the kind of i/o request.
  * Uses vn_io_fault_doio() to make the call to an actual i/o function.
  * Used from vn_rdwr() and vn_io_fault(), which encode the i/o request
  * into args and call vn_io_fault1() to handle faults during the user
  * mode buffer accesses.
  */
 static int
 vn_io_fault1(struct vnode *vp, struct uio *uio, struct vn_io_fault_args *args,
     struct thread *td)
 {
 	vm_page_t ma[io_hold_cnt + 2];
 	struct uio *uio_clone, short_uio;
 	struct iovec short_iovec[1];
 	vm_page_t *prev_td_ma;
 	vm_prot_t prot;
 	vm_offset_t addr, end;
 	size_t len, resid;
 	ssize_t adv;
 	int error, cnt, saveheld, prev_td_ma_cnt;
 
 	if (vn_io_fault_prefault) {
 		error = vn_io_fault_prefault_user(uio);
 		if (error != 0)
 			return (error); /* Or ignore ? */
 	}
 
 	prot = uio->uio_rw == UIO_READ ? VM_PROT_WRITE : VM_PROT_READ;
 
 	/*
 	 * The UFS follows IO_UNIT directive and replays back both
 	 * uio_offset and uio_resid if an error is encountered during the
 	 * operation.  But, since the iovec may be already advanced,
 	 * uio is still in an inconsistent state.
 	 *
 	 * Cache a copy of the original uio, which is advanced to the redo
 	 * point using UIO_NOCOPY below.
 	 */
 	uio_clone = cloneuio(uio);
 	resid = uio->uio_resid;
 
 	short_uio.uio_segflg = UIO_USERSPACE;
 	short_uio.uio_rw = uio->uio_rw;
 	short_uio.uio_td = uio->uio_td;
 
 	error = vn_io_fault_doio(args, uio, td);
 	if (error != EFAULT)
 		goto out;
 
 	atomic_add_long(&vn_io_faults_cnt, 1);
 	uio_clone->uio_segflg = UIO_NOCOPY;
 	uiomove(NULL, resid - uio->uio_resid, uio_clone);
 	uio_clone->uio_segflg = uio->uio_segflg;
 
 	saveheld = curthread_pflags_set(TDP_UIOHELD);
 	prev_td_ma = td->td_ma;
 	prev_td_ma_cnt = td->td_ma_cnt;
 
 	while (uio_clone->uio_resid != 0) {
 		len = uio_clone->uio_iov->iov_len;
 		if (len == 0) {
 			KASSERT(uio_clone->uio_iovcnt >= 1,
 			    ("iovcnt underflow"));
 			uio_clone->uio_iov++;
 			uio_clone->uio_iovcnt--;
 			continue;
 		}
 		if (len > ptoa(io_hold_cnt))
 			len = ptoa(io_hold_cnt);
 		addr = (uintptr_t)uio_clone->uio_iov->iov_base;
 		end = round_page(addr + len);
 		if (end < addr) {
 			error = EFAULT;
 			break;
 		}
 		cnt = atop(end - trunc_page(addr));
 		/*
 		 * A perfectly misaligned address and length could cause
 		 * both the start and the end of the chunk to use partial
 		 * page.  +2 accounts for such a situation.
 		 */
 		cnt = vm_fault_quick_hold_pages(&td->td_proc->p_vmspace->vm_map,
 		    addr, len, prot, ma, io_hold_cnt + 2);
 		if (cnt == -1) {
 			error = EFAULT;
 			break;
 		}
 		short_uio.uio_iov = &short_iovec[0];
 		short_iovec[0].iov_base = (void *)addr;
 		short_uio.uio_iovcnt = 1;
 		short_uio.uio_resid = short_iovec[0].iov_len = len;
 		short_uio.uio_offset = uio_clone->uio_offset;
 		td->td_ma = ma;
 		td->td_ma_cnt = cnt;
 
 		error = vn_io_fault_doio(args, &short_uio, td);
 		vm_page_unhold_pages(ma, cnt);
 		adv = len - short_uio.uio_resid;
 
 		uio_clone->uio_iov->iov_base =
 		    (char *)uio_clone->uio_iov->iov_base + adv;
 		uio_clone->uio_iov->iov_len -= adv;
 		uio_clone->uio_resid -= adv;
 		uio_clone->uio_offset += adv;
 
 		uio->uio_resid -= adv;
 		uio->uio_offset += adv;
 
 		if (error != 0 || adv == 0)
 			break;
 	}
 	td->td_ma = prev_td_ma;
 	td->td_ma_cnt = prev_td_ma_cnt;
 	curthread_pflags_restore(saveheld);
 out:
 	free(uio_clone, M_IOV);
 	return (error);
 }
 
 static int
 vn_io_fault(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	fo_rdwr_t *doio;
 	struct vnode *vp;
 	void *rl_cookie;
 	struct vn_io_fault_args args;
 	int error;
 
 	doio = uio->uio_rw == UIO_READ ? vn_read : vn_write;
 	vp = fp->f_vnode;
 
 	/*
 	 * The ability to read(2) on a directory has historically been
 	 * allowed for all users, but this can and has been the source of
 	 * at least one security issue in the past.  As such, it is now hidden
 	 * away behind a sysctl for those that actually need it to use it, and
 	 * restricted to root when it's turned on to make it relatively safe to
 	 * leave on for longer sessions of need.
 	 */
 	if (vp->v_type == VDIR) {
 		KASSERT(uio->uio_rw == UIO_READ,
 		    ("illegal write attempted on a directory"));
 		if (!vfs_allow_read_dir)
 			return (EISDIR);
 		if ((error = priv_check(td, PRIV_VFS_READ_DIR)) != 0)
 			return (EISDIR);
 	}
 
 	foffset_lock_uio(fp, uio, flags);
 	if (do_vn_io_fault(vp, uio)) {
 		args.kind = VN_IO_FAULT_FOP;
 		args.args.fop_args.fp = fp;
 		args.args.fop_args.doio = doio;
 		args.cred = active_cred;
 		args.flags = flags | FOF_OFFSET;
 		if (uio->uio_rw == UIO_READ) {
 			rl_cookie = vn_rangelock_rlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		} else if ((fp->f_flag & O_APPEND) != 0 ||
 		    (flags & FOF_OFFSET) == 0) {
 			/* For appenders, punt and lock the whole range. */
 			rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 		} else {
 			rl_cookie = vn_rangelock_wlock(vp, uio->uio_offset,
 			    uio->uio_offset + uio->uio_resid);
 		}
 		error = vn_io_fault1(vp, uio, &args, td);
 		vn_rangelock_unlock(vp, rl_cookie);
 	} else {
 		error = doio(fp, uio, active_cred, flags | FOF_OFFSET, td);
 	}
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 /*
  * Helper function to perform the requested uiomove operation using
  * the held pages for io->uio_iov[0].iov_base buffer instead of
  * copyin/copyout.  Access to the pages with uiomove_fromphys()
  * instead of iov_base prevents page faults that could occur due to
  * pmap_collect() invalidating the mapping created by
  * vm_fault_quick_hold_pages(), or pageout daemon, page laundry or
  * object cleanup revoking the write access from page mappings.
  *
  * Filesystems specified MNTK_NO_IOPF shall use vn_io_fault_uiomove()
  * instead of plain uiomove().
  */
 int
 vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio)
 {
 	struct uio transp_uio;
 	struct iovec transp_iov[1];
 	struct thread *td;
 	size_t adv;
 	int error, pgadv;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 	    uio->uio_segflg != UIO_USERSPACE)
 		return (uiomove(data, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	transp_iov[0].iov_base = data;
 	transp_uio.uio_iov = &transp_iov[0];
 	transp_uio.uio_iovcnt = 1;
 	if (xfersize > uio->uio_resid)
 		xfersize = uio->uio_resid;
 	transp_uio.uio_resid = transp_iov[0].iov_len = xfersize;
 	transp_uio.uio_offset = 0;
 	transp_uio.uio_segflg = UIO_SYSSPACE;
 	/*
 	 * Since transp_iov points to data, and td_ma page array
 	 * corresponds to original uio->uio_iov, we need to invert the
 	 * direction of the i/o operation as passed to
 	 * uiomove_fromphys().
 	 */
 	switch (uio->uio_rw) {
 	case UIO_WRITE:
 		transp_uio.uio_rw = UIO_READ;
 		break;
 	case UIO_READ:
 		transp_uio.uio_rw = UIO_WRITE;
 		break;
 	}
 	transp_uio.uio_td = uio->uio_td;
 	error = uiomove_fromphys(td->td_ma,
 	    ((vm_offset_t)uio->uio_iov->iov_base) & PAGE_MASK,
 	    xfersize, &transp_uio);
 	adv = xfersize - transp_uio.uio_resid;
 	pgadv =
 	    (((vm_offset_t)uio->uio_iov->iov_base + adv) >> PAGE_SHIFT) -
 	    (((vm_offset_t)uio->uio_iov->iov_base) >> PAGE_SHIFT);
 	td->td_ma += pgadv;
 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 	    pgadv));
 	td->td_ma_cnt -= pgadv;
 	uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + adv;
 	uio->uio_iov->iov_len -= adv;
 	uio->uio_resid -= adv;
 	uio->uio_offset += adv;
 	return (error);
 }
 
 int
 vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
     struct uio *uio)
 {
 	struct thread *td;
 	vm_offset_t iov_base;
 	int cnt, pgadv;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_UIOHELD) == 0 ||
 	    uio->uio_segflg != UIO_USERSPACE)
 		return (uiomove_fromphys(ma, offset, xfersize, uio));
 
 	KASSERT(uio->uio_iovcnt == 1, ("uio_iovcnt %d", uio->uio_iovcnt));
 	cnt = xfersize > uio->uio_resid ? uio->uio_resid : xfersize;
 	iov_base = (vm_offset_t)uio->uio_iov->iov_base;
 	switch (uio->uio_rw) {
 	case UIO_WRITE:
 		pmap_copy_pages(td->td_ma, iov_base & PAGE_MASK, ma,
 		    offset, cnt);
 		break;
 	case UIO_READ:
 		pmap_copy_pages(ma, offset, td->td_ma, iov_base & PAGE_MASK,
 		    cnt);
 		break;
 	}
 	pgadv = ((iov_base + cnt) >> PAGE_SHIFT) - (iov_base >> PAGE_SHIFT);
 	td->td_ma += pgadv;
 	KASSERT(td->td_ma_cnt >= pgadv, ("consumed pages %d %d", td->td_ma_cnt,
 	    pgadv));
 	td->td_ma_cnt -= pgadv;
 	uio->uio_iov->iov_base = (char *)(iov_base + cnt);
 	uio->uio_iov->iov_len -= cnt;
 	uio->uio_resid -= cnt;
 	uio->uio_offset += cnt;
 	return (0);
 }
 
 /*
  * File table truncate routine.
  */
 static int
 vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	void *rl_cookie;
 	int error;
 
 	vp = fp->f_vnode;
 
 retry:
 	/*
 	 * Lock the whole range for truncation.  Otherwise split i/o
 	 * might happen partly before and partly after the truncation.
 	 */
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 	if (error)
 		goto out1;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
 	if (error)
 		goto out;
 #endif
 	error = vn_truncate_locked(vp, length, (fp->f_flag & O_FSYNC) != 0,
 	    fp->f_cred);
 out:
 	VOP_UNLOCK(vp);
 	vn_finished_write(mp);
 out1:
 	vn_rangelock_unlock(vp, rl_cookie);
 	if (error == ERELOOKUP)
 		goto retry;
 	return (error);
 }
 
 /*
  * Truncate a file that is already locked.
  */
 int
 vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
     struct ucred *cred)
 {
 	struct vattr vattr;
 	int error;
 
 	error = VOP_ADD_WRITECOUNT(vp, 1);
 	if (error == 0) {
 		VATTR_NULL(&vattr);
 		vattr.va_size = length;
 		if (sync)
 			vattr.va_vaflags |= VA_SYNC;
 		error = VOP_SETATTR(vp, &vattr, cred);
 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 	}
 	return (error);
 }
 
 /*
  * File table vnode stat routine.
  */
 int
-vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred,
-    struct thread *td)
+vn_statfile(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
 	struct vnode *vp = fp->f_vnode;
 	int error;
 
 	vn_lock(vp, LK_SHARED | LK_RETRY);
-	error = VOP_STAT(vp, sb, active_cred, fp->f_cred, td);
+	error = VOP_STAT(vp, sb, active_cred, fp->f_cred);
 	VOP_UNLOCK(vp);
 
 	return (error);
 }
 
 /*
  * File table vnode ioctl routine.
  */
 static int
 vn_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vattr vattr;
 	struct vnode *vp;
 	struct fiobmap2_arg *bmarg;
 	int error;
 
 	vp = fp->f_vnode;
 	switch (vp->v_type) {
 	case VDIR:
 	case VREG:
 		switch (com) {
 		case FIONREAD:
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			error = VOP_GETATTR(vp, &vattr, active_cred);
 			VOP_UNLOCK(vp);
 			if (error == 0)
 				*(int *)data = vattr.va_size - fp->f_offset;
 			return (error);
 		case FIOBMAP2:
 			bmarg = (struct fiobmap2_arg *)data;
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 #ifdef MAC
 			error = mac_vnode_check_read(active_cred, fp->f_cred,
 			    vp);
 			if (error == 0)
 #endif
 				error = VOP_BMAP(vp, bmarg->bn, NULL,
 				    &bmarg->bn, &bmarg->runp, &bmarg->runb);
 			VOP_UNLOCK(vp);
 			return (error);
 		case FIONBIO:
 		case FIOASYNC:
 			return (0);
 		default:
 			return (VOP_IOCTL(vp, com, data, fp->f_flag,
 			    active_cred, td));
 		}
 		break;
 	case VCHR:
 		return (VOP_IOCTL(vp, com, data, fp->f_flag,
 		    active_cred, td));
 	default:
 		return (ENOTTY);
 	}
 }
 
 /*
  * File table vnode poll routine.
  */
 static int
 vn_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 	int error;
 
 	vp = fp->f_vnode;
 #if defined(MAC) || defined(AUDIT)
 	if (AUDITING_TD(td) || mac_vnode_check_poll_enabled()) {
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		AUDIT_ARG_VNODE1(vp);
 		error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
 		VOP_UNLOCK(vp);
 		if (error != 0)
 			return (error);
 	}
 #endif
 	error = VOP_POLL(vp, events, fp->f_cred, td);
 	return (error);
 }
 
 /*
  * Acquire the requested lock and then check for validity.  LK_RETRY
  * permits vn_lock to return doomed vnodes.
  */
 static int __noinline
 _vn_lock_fallback(struct vnode *vp, int flags, const char *file, int line,
     int error)
 {
 
 	KASSERT((flags & LK_RETRY) == 0 || error == 0,
 	    ("vn_lock: error %d incompatible with flags %#x", error, flags));
 
 	if (error == 0)
 		VNASSERT(VN_IS_DOOMED(vp), vp, ("vnode not doomed"));
 
 	if ((flags & LK_RETRY) == 0) {
 		if (error == 0) {
 			VOP_UNLOCK(vp);
 			error = ENOENT;
 		}
 		return (error);
 	}
 
 	/*
 	 * LK_RETRY case.
 	 *
 	 * Nothing to do if we got the lock.
 	 */
 	if (error == 0)
 		return (0);
 
 	/*
 	 * Interlock was dropped by the call in _vn_lock.
 	 */
 	flags &= ~LK_INTERLOCK;
 	do {
 		error = VOP_LOCK1(vp, flags, file, line);
 	} while (error != 0);
 	return (0);
 }
 
 int
 _vn_lock(struct vnode *vp, int flags, const char *file, int line)
 {
 	int error;
 
 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 	    ("vn_lock: no locktype (%d passed)", flags));
 	VNPASS(vp->v_holdcnt > 0, vp);
 	error = VOP_LOCK1(vp, flags, file, line);
 	if (__predict_false(error != 0 || VN_IS_DOOMED(vp)))
 		return (_vn_lock_fallback(vp, flags, file, line, error));
 	return (0);
 }
 
 /*
  * File table vnode close routine.
  */
 static int
 vn_closefile(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	int error;
 	bool ref;
 
 	vp = fp->f_vnode;
 	fp->f_ops = &badfileops;
 	ref = (fp->f_flag & FHASLOCK) != 0;
 
 	error = vn_close1(vp, fp->f_flag, fp->f_cred, td, ref);
 
 	if (__predict_false(ref)) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
 		lf.l_len = 0;
 		lf.l_type = F_UNLCK;
 		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 		vrele(vp);
 	}
 	return (error);
 }
 
 /*
  * Preparing to start a filesystem write operation. If the operation is
  * permitted, then we bump the count of operations in progress and
  * proceed. If a suspend request is in progress, we wait until the
  * suspension is over, and then proceed.
  */
 static int
 vn_start_write_refed(struct mount *mp, int flags, bool mplocked)
 {
 	struct mount_pcpu *mpcpu;
 	int error, mflags;
 
 	if (__predict_true(!mplocked) && (flags & V_XSLEEP) == 0 &&
 	    vfs_op_thread_enter(mp, mpcpu)) {
 		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
 		vfs_mp_count_add_pcpu(mpcpu, writeopcount, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 		return (0);
 	}
 
 	if (mplocked)
 		mtx_assert(MNT_MTX(mp), MA_OWNED);
 	else
 		MNT_ILOCK(mp);
 
 	error = 0;
 
 	/*
 	 * Check on status of suspension.
 	 */
 	if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
 	    mp->mnt_susp_owner != curthread) {
 		mflags = ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ?
 		    (flags & PCATCH) : 0) | (PUSER - 1);
 		while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 			if (flags & V_NOWAIT) {
 				error = EWOULDBLOCK;
 				goto unlock;
 			}
 			error = msleep(&mp->mnt_flag, MNT_MTX(mp), mflags,
 			    "suspfs", 0);
 			if (error)
 				goto unlock;
 		}
 	}
 	if (flags & V_XSLEEP)
 		goto unlock;
 	mp->mnt_writeopcount++;
 unlock:
 	if (error != 0 || (flags & V_XSLEEP) != 0)
 		MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 int
 vn_start_write(struct vnode *vp, struct mount **mpp, int flags)
 {
 	struct mount *mp;
 	int error;
 
 	KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
 	    ("V_MNTREF requires mp"));
 
 	error = 0;
 	/*
 	 * If a vnode is provided, get and return the mount point that
 	 * to which it will write.
 	 */
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	if (vp == NULL && (flags & V_MNTREF) == 0)
 		vfs_ref(mp);
 
 	return (vn_start_write_refed(mp, flags, false));
 }
 
 /*
  * Secondary suspension. Used by operations such as vop_inactive
  * routines that are needed by the higher level functions. These
  * are allowed to proceed until all the higher level functions have
  * completed (indicated by mnt_writeopcount dropping to zero). At that
  * time, these operations are halted until the suspension is over.
  */
 int
 vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags)
 {
 	struct mount *mp;
 	int error;
 
 	KASSERT((flags & V_MNTREF) == 0 || (*mpp != NULL && vp == NULL),
 	    ("V_MNTREF requires mp"));
 
  retry:
 	if (vp != NULL) {
 		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
 			*mpp = NULL;
 			if (error != EOPNOTSUPP)
 				return (error);
 			return (0);
 		}
 	}
 	/*
 	 * If we are not suspended or have not yet reached suspended
 	 * mode, then let the operation proceed.
 	 */
 	if ((mp = *mpp) == NULL)
 		return (0);
 
 	/*
 	 * VOP_GETWRITEMOUNT() returns with the mp refcount held through
 	 * a vfs_ref().
 	 * As long as a vnode is not provided we need to acquire a
 	 * refcount for the provided mountpoint too, in order to
 	 * emulate a vfs_ref().
 	 */
 	MNT_ILOCK(mp);
 	if (vp == NULL && (flags & V_MNTREF) == 0)
 		MNT_REF(mp);
 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 		mp->mnt_secondary_writes++;
 		mp->mnt_secondary_accwrites++;
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	if (flags & V_NOWAIT) {
 		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Wait for the suspension to finish.
 	 */
 	error = msleep(&mp->mnt_flag, MNT_MTX(mp), (PUSER - 1) | PDROP |
 	    ((mp->mnt_vfc->vfc_flags & VFCF_SBDRY) != 0 ? (flags & PCATCH) : 0),
 	    "suspfs", 0);
 	vfs_rel(mp);
 	if (error == 0)
 		goto retry;
 	return (error);
 }
 
 /*
  * Filesystem write operation has completed. If we are suspending and this
  * operation is the last one, notify the suspender that the suspension is
  * now in effect.
  */
 void
 vn_finished_write(struct mount *mp)
 {
 	struct mount_pcpu *mpcpu;
 	int c;
 
 	if (mp == NULL)
 		return;
 
 	if (vfs_op_thread_enter(mp, mpcpu)) {
 		vfs_mp_count_sub_pcpu(mpcpu, writeopcount, 1);
 		vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
 		vfs_op_thread_exit(mp, mpcpu);
 		return;
 	}
 
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	MNT_REL(mp);
 	c = --mp->mnt_writeopcount;
 	if (mp->mnt_vfs_ops == 0) {
 		MPASS((mp->mnt_kern_flag & MNTK_SUSPEND) == 0);
 		MNT_IUNLOCK(mp);
 		return;
 	}
 	if (c < 0)
 		vfs_dump_mount_counters(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 && c == 0)
 		wakeup(&mp->mnt_writeopcount);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Filesystem secondary write operation has completed. If we are
  * suspending and this operation is the last one, notify the suspender
  * that the suspension is now in effect.
  */
 void
 vn_finished_secondary_write(struct mount *mp)
 {
 	if (mp == NULL)
 		return;
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	mp->mnt_secondary_writes--;
 	if (mp->mnt_secondary_writes < 0)
 		panic("vn_finished_secondary_write: neg cnt");
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
 	    mp->mnt_secondary_writes <= 0)
 		wakeup(&mp->mnt_secondary_writes);
 	MNT_IUNLOCK(mp);
 }
 
 /*
  * Request a filesystem to suspend write operations.
  */
 int
 vfs_write_suspend(struct mount *mp, int flags)
 {
 	int error;
 
 	vfs_op_enter(mp);
 
 	MNT_ILOCK(mp);
 	vfs_assert_mount_counters(mp);
 	if (mp->mnt_susp_owner == curthread) {
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
 		return (EALREADY);
 	}
 	while (mp->mnt_kern_flag & MNTK_SUSPEND)
 		msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
 
 	/*
 	 * Unmount holds a write reference on the mount point.  If we
 	 * own busy reference and drain for writers, we deadlock with
 	 * the reference draining in the unmount path.  Callers of
 	 * vfs_write_suspend() must specify VS_SKIP_UNMOUNT if
 	 * vfs_busy() reference is owned and caller is not in the
 	 * unmount context.
 	 */
 	if ((flags & VS_SKIP_UNMOUNT) != 0 &&
 	    (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
 		vfs_op_exit_locked(mp);
 		MNT_IUNLOCK(mp);
 		return (EBUSY);
 	}
 
 	mp->mnt_kern_flag |= MNTK_SUSPEND;
 	mp->mnt_susp_owner = curthread;
 	if (mp->mnt_writeopcount > 0)
 		(void) msleep(&mp->mnt_writeopcount, 
 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 	else
 		MNT_IUNLOCK(mp);
 	if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0) {
 		vfs_write_resume(mp, 0);
 		/* vfs_write_resume does vfs_op_exit() for us */
 	}
 	return (error);
 }
 
 /*
  * Request a filesystem to resume write operations.
  */
 void
 vfs_write_resume(struct mount *mp, int flags)
 {
 
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
 		KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
 		mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
 				       MNTK_SUSPENDED);
 		mp->mnt_susp_owner = NULL;
 		wakeup(&mp->mnt_writeopcount);
 		wakeup(&mp->mnt_flag);
 		curthread->td_pflags &= ~TDP_IGNSUSP;
 		if ((flags & VR_START_WRITE) != 0) {
 			MNT_REF(mp);
 			mp->mnt_writeopcount++;
 		}
 		MNT_IUNLOCK(mp);
 		if ((flags & VR_NO_SUSPCLR) == 0)
 			VFS_SUSP_CLEAN(mp);
 		vfs_op_exit(mp);
 	} else if ((flags & VR_START_WRITE) != 0) {
 		MNT_REF(mp);
 		vn_start_write_refed(mp, 0, true);
 	} else {
 		MNT_IUNLOCK(mp);
 	}
 }
 
 /*
  * Helper loop around vfs_write_suspend() for filesystem unmount VFS
  * methods.
  */
 int
 vfs_write_suspend_umnt(struct mount *mp)
 {
 	int error;
 
 	KASSERT((curthread->td_pflags & TDP_IGNSUSP) == 0,
 	    ("vfs_write_suspend_umnt: recursed"));
 
 	/* dounmount() already called vn_start_write(). */
 	for (;;) {
 		vn_finished_write(mp);
 		error = vfs_write_suspend(mp, 0);
 		if (error != 0) {
 			vn_start_write(NULL, &mp, V_WAIT);
 			return (error);
 		}
 		MNT_ILOCK(mp);
 		if ((mp->mnt_kern_flag & MNTK_SUSPENDED) != 0)
 			break;
 		MNT_IUNLOCK(mp);
 		vn_start_write(NULL, &mp, V_WAIT);
 	}
 	mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2);
 	wakeup(&mp->mnt_flag);
 	MNT_IUNLOCK(mp);
 	curthread->td_pflags |= TDP_IGNSUSP;
 	return (0);
 }
 
 /*
  * Implement kqueues for files by translating it to vnode operation.
  */
 static int
 vn_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (VOP_KQFILTER(fp->f_vnode, kn));
 }
 
 int
 vn_kqfilter_opath(struct file *fp, struct knote *kn)
 {
 	if ((fp->f_flag & FKQALLOWED) == 0)
 		return (EBADF);
 	return (vn_kqfilter(fp, kn));
 }
 
 /*
  * Simplified in-kernel wrapper calls for extended attribute access.
  * Both calls pass in a NULL credential, authorizing as "kernel" access.
  * Set IO_NODELOCKED in ioflg if the vnode is already locked.
  */
 int
 vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int *buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	int	error;
 
 	iov.iov_len = *buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_READ;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = *buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute retrieval as kernel */
 	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
 	    td);
 
 	if ((ioflg & IO_NODELOCKED) == 0)
 		VOP_UNLOCK(vp);
 
 	if (error == 0) {
 		*buflen = *buflen - auio.uio_resid;
 	}
 
 	return (error);
 }
 
 /*
  * XXX failure mode if partially written?
  */
 int
 vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, int buflen, char *buf, struct thread *td)
 {
 	struct uio	auio;
 	struct iovec	iov;
 	struct mount	*mp;
 	int	error;
 
 	iov.iov_len = buflen;
 	iov.iov_base = buf;
 
 	auio.uio_iov = &iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;
 	auio.uio_resid = buflen;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute setting as kernel */
 	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp);
 	}
 
 	return (error);
 }
 
 int
 vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
     const char *attrname, struct thread *td)
 {
 	struct mount	*mp;
 	int	error;
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
 			return (error);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
 
 	/* authorize attribute removal as kernel */
 	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
 	if (error == EOPNOTSUPP)
 		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
 		    NULL, td);
 
 	if ((ioflg & IO_NODELOCKED) == 0) {
 		vn_finished_write(mp);
 		VOP_UNLOCK(vp);
 	}
 
 	return (error);
 }
 
 static int
 vn_get_ino_alloc_vget(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 
 	return (VFS_VGET(mp, *(ino_t *)arg, lkflags, rvp));
 }
 
 int
 vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
 {
 
 	return (vn_vget_ino_gen(vp, vn_get_ino_alloc_vget, &ino,
 	    lkflags, rvp));
 }
 
 int
 vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg,
     int lkflags, struct vnode **rvp)
 {
 	struct mount *mp;
 	int ltype, error;
 
 	ASSERT_VOP_LOCKED(vp, "vn_vget_ino_get");
 	mp = vp->v_mount;
 	ltype = VOP_ISLOCKED(vp);
 	KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
 	    ("vn_vget_ino: vp not locked"));
 	error = vfs_busy(mp, MBF_NOWAIT);
 	if (error != 0) {
 		vfs_ref(mp);
 		VOP_UNLOCK(vp);
 		error = vfs_busy(mp, 0);
 		vn_lock(vp, ltype | LK_RETRY);
 		vfs_rel(mp);
 		if (error != 0)
 			return (ENOENT);
 		if (VN_IS_DOOMED(vp)) {
 			vfs_unbusy(mp);
 			return (ENOENT);
 		}
 	}
 	VOP_UNLOCK(vp);
 	error = alloc(mp, alloc_arg, lkflags, rvp);
 	vfs_unbusy(mp);
 	if (error != 0 || *rvp != vp)
 		vn_lock(vp, ltype | LK_RETRY);
 	if (VN_IS_DOOMED(vp)) {
 		if (error == 0) {
 			if (*rvp == vp)
 				vunref(vp);
 			else
 				vput(*rvp);
 		}
 		error = ENOENT;
 	}
 	return (error);
 }
 
 int
 vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
     struct thread *td)
 {
 	off_t lim;
 	bool ktr_write;
 
 	if (td == NULL)
 		return (0);
 
 	/*
 	 * There are conditions where the limit is to be ignored.
 	 * However, since it is almost never reached, check it first.
 	 */
 	ktr_write = (td->td_pflags & TDP_INKTRACE) != 0;
 	lim = lim_cur(td, RLIMIT_FSIZE);
 	if (__predict_false(ktr_write))
 		lim = td->td_ktr_io_lim;
 	if (__predict_true((uoff_t)uio->uio_offset + uio->uio_resid <= lim))
 		return (0);
 
 	/*
 	 * The limit is reached.
 	 */
 	if (vp->v_type != VREG ||
 	    (td->td_pflags2 & TDP2_ACCT) != 0)
 		return (0);
 
 	if (!ktr_write || ktr_filesize_limit_signal) {
 		PROC_LOCK(td->td_proc);
 		kern_psignal(td->td_proc, SIGXFSZ);
 		PROC_UNLOCK(td->td_proc);
 	}
 	return (EFBIG);
 }
 
 int
 vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp);
 #endif
 	return (setfmode(td, active_cred, vp, mode));
 }
 
 int
 vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct vnode *vp;
 
 	vp = fp->f_vnode;
 #ifdef AUDIT
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	AUDIT_ARG_VNODE1(vp);
 	VOP_UNLOCK(vp);
 #endif
 	return (setfown(td, active_cred, vp, uid, gid));
 }
 
 void
 vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_object_t object;
 
 	if ((object = vp->v_object) == NULL)
 		return;
 	VM_OBJECT_WLOCK(object);
 	vm_object_page_remove(object, start, end, 0);
 	VM_OBJECT_WUNLOCK(object);
 }
 
 int
 vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
     struct ucred *cred)
 {
 	struct vattr va;
 	daddr_t bn, bnp;
 	uint64_t bsize;
 	off_t noff;
 	int error;
 
 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 	    ("%s: Wrong command %lu", __func__, cmd));
 	ASSERT_VOP_LOCKED(vp, "vn_bmap_seekhole_locked");
 
 	if (vp->v_type != VREG) {
 		error = ENOTTY;
 		goto out;
 	}
 	error = VOP_GETATTR(vp, &va, cred);
 	if (error != 0)
 		goto out;
 	noff = *off;
 	if (noff >= va.va_size) {
 		error = ENXIO;
 		goto out;
 	}
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	for (bn = noff / bsize; noff < va.va_size; bn++, noff += bsize -
 	    noff % bsize) {
 		error = VOP_BMAP(vp, bn, NULL, &bnp, NULL, NULL);
 		if (error == EOPNOTSUPP) {
 			error = ENOTTY;
 			goto out;
 		}
 		if ((bnp == -1 && cmd == FIOSEEKHOLE) ||
 		    (bnp != -1 && cmd == FIOSEEKDATA)) {
 			noff = bn * bsize;
 			if (noff < *off)
 				noff = *off;
 			goto out;
 		}
 	}
 	if (noff > va.va_size)
 		noff = va.va_size;
 	/* noff == va.va_size. There is an implicit hole at the end of file. */
 	if (cmd == FIOSEEKDATA)
 		error = ENXIO;
 out:
 	if (error == 0)
 		*off = noff;
 	return (error);
 }
 
 int
 vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred)
 {
 	int error;
 
 	KASSERT(cmd == FIOSEEKHOLE || cmd == FIOSEEKDATA,
 	    ("%s: Wrong command %lu", __func__, cmd));
 
 	if (vn_lock(vp, LK_SHARED) != 0)
 		return (EBADF);
 	error = vn_bmap_seekhole_locked(vp, cmd, off, cred);
 	VOP_UNLOCK(vp);
 	return (error);
 }
 
 int
 vn_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct ucred *cred;
 	struct vnode *vp;
 	struct vattr vattr;
 	off_t foffset, size;
 	int error, noneg;
 
 	cred = td->td_ucred;
 	vp = fp->f_vnode;
 	foffset = foffset_lock(fp, 0);
 	noneg = (vp->v_type != VCHR);
 	error = 0;
 	switch (whence) {
 	case L_INCR:
 		if (noneg &&
 		    (foffset < 0 ||
 		    (offset > 0 && foffset > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += foffset;
 		break;
 	case L_XTND:
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_GETATTR(vp, &vattr, cred);
 		VOP_UNLOCK(vp);
 		if (error)
 			break;
 
 		/*
 		 * If the file references a disk device, then fetch
 		 * the media size and use that to determine the ending
 		 * offset.
 		 */
 		if (vattr.va_size == 0 && vp->v_type == VCHR &&
 		    fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
 			vattr.va_size = size;
 		if (noneg &&
 		    (vattr.va_size > OFF_MAX ||
 		    (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
 			error = EOVERFLOW;
 			break;
 		}
 		offset += vattr.va_size;
 		break;
 	case L_SET:
 		break;
 	case SEEK_DATA:
 		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
 		if (error == ENOTTY)
 			error = EINVAL;
 		break;
 	case SEEK_HOLE:
 		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
 		if (error == ENOTTY)
 			error = EINVAL;
 		break;
 	default:
 		error = EINVAL;
 	}
 	if (error == 0 && noneg && offset < 0)
 		error = EINVAL;
 	if (error != 0)
 		goto drop;
 	VFS_KNOTE_UNLOCKED(vp, 0);
 	td->td_uretoff.tdu_off = offset;
 drop:
 	foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
 	return (error);
 }
 
 int
 vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred,
     struct thread *td)
 {
 	int error;
 
 	/*
 	 * Grant permission if the caller is the owner of the file, or
 	 * the super-user, or has ACL_WRITE_ATTRIBUTES permission on
 	 * on the file.  If the time pointer is null, then write
 	 * permission on the file is also sufficient.
 	 *
 	 * From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
 	 * A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
 	 * will be allowed to set the times [..] to the current
 	 * server time.
 	 */
 	error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred, td);
 	if (error != 0 && (vap->va_vaflags & VA_UTIMES_NULL) != 0)
 		error = VOP_ACCESS(vp, VWRITE, cred, td);
 	return (error);
 }
 
 int
 vn_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct vnode *vp;
 	int error;
 
 	if (fp->f_type == DTYPE_FIFO)
 		kif->kf_type = KF_TYPE_FIFO;
 	else
 		kif->kf_type = KF_TYPE_VNODE;
 	vp = fp->f_vnode;
 	vref(vp);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fill_kinfo_vnode(vp, kif);
 	vrele(vp);
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 static inline void
 vn_fill_junk(struct kinfo_file *kif)
 {
 	size_t len, olen;
 
 	/*
 	 * Simulate vn_fullpath returning changing values for a given
 	 * vp during e.g. coredump.
 	 */
 	len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
 	olen = strlen(kif->kf_path);
 	if (len < olen)
 		strcpy(&kif->kf_path[len - 1], "$");
 	else
 		for (; olen < len; olen++)
 			strcpy(&kif->kf_path[olen], "A");
 }
 
 int
 vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif)
 {
 	struct vattr va;
 	char *fullpath, *freepath;
 	int error;
 
 	kif->kf_un.kf_file.kf_file_type = vntype_to_kinfo(vp->v_type);
 	freepath = NULL;
 	fullpath = "-";
 	error = vn_fullpath(vp, &fullpath, &freepath);
 	if (error == 0) {
 		strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
 	}
 	if (freepath != NULL)
 		free(freepath, M_TEMP);
 
 	KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
 		vn_fill_junk(kif);
 	);
 
 	/*
 	 * Retrieve vnode attributes.
 	 */
 	va.va_fsid = VNOVAL;
 	va.va_rdev = NODEV;
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	error = VOP_GETATTR(vp, &va, curthread->td_ucred);
 	VOP_UNLOCK(vp);
 	if (error != 0)
 		return (error);
 	if (va.va_fsid != VNOVAL)
 		kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
 	else
 		kif->kf_un.kf_file.kf_file_fsid =
 		    vp->v_mount->mnt_stat.f_fsid.val[0];
 	kif->kf_un.kf_file.kf_file_fsid_freebsd11 =
 	    kif->kf_un.kf_file.kf_file_fsid; /* truncate */
 	kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
 	kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
 	kif->kf_un.kf_file.kf_file_size = va.va_size;
 	kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
 	kif->kf_un.kf_file.kf_file_rdev_freebsd11 =
 	    kif->kf_un.kf_file.kf_file_rdev; /* truncate */
 	return (0);
 }
 
 int
 vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 #ifdef HWPMC_HOOKS
 	struct pmckern_map_in pkm;
 #endif
 	struct mount *mp;
 	struct vnode *vp;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	boolean_t writecounted;
 	int error;
 
 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
 	/*
 	 * POSIX shared-memory objects are defined to have
 	 * kernel persistence, and are not defined to support
 	 * read(2)/write(2) -- or even open(2).  Thus, we can
 	 * use MAP_ASYNC to trade on-disk coherence for speed.
 	 * The shm_open(3) library routine turns on the FPOSIXSHM
 	 * flag to request this behavior.
 	 */
 	if ((fp->f_flag & FPOSIXSHM) != 0)
 		flags |= MAP_NOSYNC;
 #endif
 	vp = fp->f_vnode;
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.  Note that we only worry about
 	 * writability if mapping is shared; in this case,
 	 * current and max prot are dictated by the open file.
 	 * XXX use the vnode instead?  Problem is: what
 	 * credentials do we use for determination? What if
 	 * proc does a setuid?
 	 */
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		maxprot = VM_PROT_NONE;
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 	} else
 		maxprot = VM_PROT_EXECUTE;
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_READ;
 	else if ((prot & VM_PROT_READ) != 0)
 		return (EACCES);
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) != 0)
 			maxprot |= VM_PROT_WRITE;
 		else if ((prot & VM_PROT_WRITE) != 0)
 			return (EACCES);
 	} else {
 		maxprot |= VM_PROT_WRITE;
 		cap_maxprot |= VM_PROT_WRITE;
 	}
 	maxprot &= cap_maxprot;
 
 	/*
 	 * For regular files and shared memory, POSIX requires that
 	 * the value of foff be a legitimate offset within the data
 	 * object.  In particular, negative offsets are invalid.
 	 * Blocking negative offsets and overflows here avoids
 	 * possible wraparound or user-level access into reserved
 	 * ranges of the data object later.  In contrast, POSIX does
 	 * not dictate how offsets are used by device drivers, so in
 	 * the case of a device mapping a negative offset is passed
 	 * on.
 	 */
 	if (
 #ifdef _LP64
 	    size > OFF_MAX ||
 #endif
 	    foff > OFF_MAX - size)
 		return (EINVAL);
 
 	writecounted = FALSE;
 	error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
 	    &foff, &object, &writecounted);
 	if (error != 0)
 		return (error);
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, writecounted, td);
 	if (error != 0) {
 		/*
 		 * If this mapping was accounted for in the vnode's
 		 * writecount, then undo that now.
 		 */
 		if (writecounted)
 			vm_pager_release_writecount(object, 0, size);
 		vm_object_deallocate(object);
 	}
 #ifdef HWPMC_HOOKS
 	/* Inform hwpmc(4) if an executable is being mapped. */
 	if (PMC_HOOK_INSTALLED(PMC_FN_MMAP)) {
 		if ((prot & VM_PROT_EXECUTE) != 0 && error == 0) {
 			pkm.pm_file = vp;
 			pkm.pm_address = (uintptr_t) *addr;
 			PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_MMAP, (void *) &pkm);
 		}
 	}
 #endif
 	return (error);
 }
 
 void
 vn_fsid(struct vnode *vp, struct vattr *va)
 {
 	fsid_t *f;
 
 	f = &vp->v_mount->mnt_stat.f_fsid;
 	va->va_fsid = (uint32_t)f->val[1];
 	va->va_fsid <<= sizeof(f->val[1]) * NBBY;
 	va->va_fsid += (uint32_t)f->val[0];
 }
 
 int
 vn_fsync_buf(struct vnode *vp, int waitfor)
 {
 	struct buf *bp, *nbp;
 	struct bufobj *bo;
 	struct mount *mp;
 	int error, maxretry;
 
 	error = 0;
 	maxretry = 10000;     /* large, arbitrarily chosen */
 	mp = NULL;
 	if (vp->v_type == VCHR) {
 		VI_LOCK(vp);
 		mp = vp->v_rdev->si_mountpt;
 		VI_UNLOCK(vp);
 	}
 	bo = &vp->v_bufobj;
 	BO_LOCK(bo);
 loop1:
 	/*
 	 * MARK/SCAN initialization to avoid infinite loops.
 	 */
         TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
 		bp->b_vflags &= ~BV_SCANNED;
 		bp->b_error = 0;
 	}
 
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
 loop2:
 	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
 		if ((bp->b_vflags & BV_SCANNED) != 0)
 			continue;
 		bp->b_vflags |= BV_SCANNED;
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
 			if (waitfor != MNT_WAIT)
 				continue;
 			if (BUF_LOCK(bp,
 			    LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
 			    BO_LOCKPTR(bo)) != 0) {
 				BO_LOCK(bo);
 				goto loop1;
 			}
 			BO_LOCK(bo);
 		}
 		BO_UNLOCK(bo);
 		KASSERT(bp->b_bufobj == bo,
 		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		if ((bp->b_flags & B_DELWRI) == 0)
 			panic("fsync: not dirty");
 		if ((vp->v_object != NULL) && (bp->b_flags & B_CLUSTEROK)) {
 			vfs_bio_awrite(bp);
 		} else {
 			bremfree(bp);
 			bawrite(bp);
 		}
 		if (maxretry < 1000)
 			pause("dirty", hz < 1000 ? 1 : hz / 1000);
 		BO_LOCK(bo);
 		goto loop2;
 	}
 
 	/*
 	 * If synchronous the caller expects us to completely resolve all
 	 * dirty buffers in the system.  Wait for in-progress I/O to
 	 * complete (which could include background bitmap writes), then
 	 * retry if dirty blocks still exist.
 	 */
 	if (waitfor == MNT_WAIT) {
 		bufobj_wwait(bo, 0, 0);
 		if (bo->bo_dirty.bv_cnt > 0) {
 			/*
 			 * If we are unable to write any of these buffers
 			 * then we fail now rather than trying endlessly
 			 * to write them out.
 			 */
 			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
 				if ((error = bp->b_error) != 0)
 					break;
 			if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
 			    (error == 0 && --maxretry >= 0))
 				goto loop1;
 			if (error == 0)
 				error = EAGAIN;
 		}
 	}
 	BO_UNLOCK(bo);
 	if (error != 0)
 		vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
 
 	return (error);
 }
 
 /*
  * Copies a byte range from invp to outvp.  Calls VOP_COPY_FILE_RANGE()
  * or vn_generic_copy_file_range() after rangelocking the byte ranges,
  * to do the actual copy.
  * vn_generic_copy_file_range() is factored out, so it can be called
  * from a VOP_COPY_FILE_RANGE() call as well, but handles vnodes from
  * different file systems.
  */
 int
 vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp,
     off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred,
     struct ucred *outcred, struct thread *fsize_td)
 {
 	int error;
 	size_t len;
 	uint64_t uval;
 
 	len = *lenp;
 	*lenp = 0;		/* For error returns. */
 	error = 0;
 
 	/* Do some sanity checks on the arguments. */
 	if (invp->v_type == VDIR || outvp->v_type == VDIR)
 		error = EISDIR;
 	else if (*inoffp < 0 || *outoffp < 0 ||
 	    invp->v_type != VREG || outvp->v_type != VREG)
 		error = EINVAL;
 	if (error != 0)
 		goto out;
 
 	/* Ensure offset + len does not wrap around. */
 	uval = *inoffp;
 	uval += len;
 	if (uval > INT64_MAX)
 		len = INT64_MAX - *inoffp;
 	uval = *outoffp;
 	uval += len;
 	if (uval > INT64_MAX)
 		len = INT64_MAX - *outoffp;
 	if (len == 0)
 		goto out;
 
 	/*
 	 * If the two vnode are for the same file system, call
 	 * VOP_COPY_FILE_RANGE(), otherwise call vn_generic_copy_file_range()
 	 * which can handle copies across multiple file systems.
 	 */
 	*lenp = len;
 	if (invp->v_mount == outvp->v_mount)
 		error = VOP_COPY_FILE_RANGE(invp, inoffp, outvp, outoffp,
 		    lenp, flags, incred, outcred, fsize_td);
 	else
 		error = vn_generic_copy_file_range(invp, inoffp, outvp,
 		    outoffp, lenp, flags, incred, outcred, fsize_td);
 out:
 	return (error);
 }
 
 /*
  * Test len bytes of data starting at dat for all bytes == 0.
  * Return true if all bytes are zero, false otherwise.
  * Expects dat to be well aligned.
  */
 static bool
 mem_iszero(void *dat, int len)
 {
 	int i;
 	const u_int *p;
 	const char *cp;
 
 	for (p = dat; len > 0; len -= sizeof(*p), p++) {
 		if (len >= sizeof(*p)) {
 			if (*p != 0)
 				return (false);
 		} else {
 			cp = (const char *)p;
 			for (i = 0; i < len; i++, cp++)
 				if (*cp != '\0')
 					return (false);
 		}
 	}
 	return (true);
 }
 
 /*
  * Look for a hole in the output file and, if found, adjust *outoffp
  * and *xferp to skip past the hole.
  * *xferp is the entire hole length to be written and xfer2 is how many bytes
  * to be written as 0's upon return.
  */
 static off_t
 vn_skip_hole(struct vnode *outvp, off_t xfer2, off_t *outoffp, off_t *xferp,
     off_t *dataoffp, off_t *holeoffp, struct ucred *cred)
 {
 	int error;
 	off_t delta;
 
 	if (*holeoffp == 0 || *holeoffp <= *outoffp) {
 		*dataoffp = *outoffp;
 		error = VOP_IOCTL(outvp, FIOSEEKDATA, dataoffp, 0, cred,
 		    curthread);
 		if (error == 0) {
 			*holeoffp = *dataoffp;
 			error = VOP_IOCTL(outvp, FIOSEEKHOLE, holeoffp, 0, cred,
 			    curthread);
 		}
 		if (error != 0 || *holeoffp == *dataoffp) {
 			/*
 			 * Since outvp is unlocked, it may be possible for
 			 * another thread to do a truncate(), lseek(), write()
 			 * creating a hole at startoff between the above
 			 * VOP_IOCTL() calls, if the other thread does not do
 			 * rangelocking.
 			 * If that happens, *holeoffp == *dataoffp and finding
 			 * the hole has failed, so disable vn_skip_hole().
 			 */
 			*holeoffp = -1;	/* Disable use of vn_skip_hole(). */
 			return (xfer2);
 		}
 		KASSERT(*dataoffp >= *outoffp,
 		    ("vn_skip_hole: dataoff=%jd < outoff=%jd",
 		    (intmax_t)*dataoffp, (intmax_t)*outoffp));
 		KASSERT(*holeoffp > *dataoffp,
 		    ("vn_skip_hole: holeoff=%jd <= dataoff=%jd",
 		    (intmax_t)*holeoffp, (intmax_t)*dataoffp));
 	}
 
 	/*
 	 * If there is a hole before the data starts, advance *outoffp and
 	 * *xferp past the hole.
 	 */
 	if (*dataoffp > *outoffp) {
 		delta = *dataoffp - *outoffp;
 		if (delta >= *xferp) {
 			/* Entire *xferp is a hole. */
 			*outoffp += *xferp;
 			*xferp = 0;
 			return (0);
 		}
 		*xferp -= delta;
 		*outoffp += delta;
 		xfer2 = MIN(xfer2, *xferp);
 	}
 
 	/*
 	 * If a hole starts before the end of this xfer2, reduce this xfer2 so
 	 * that the write ends at the start of the hole.
 	 * *holeoffp should always be greater than *outoffp, but for the
 	 * non-INVARIANTS case, check this to make sure xfer2 remains a sane
 	 * value.
 	 */
 	if (*holeoffp > *outoffp && *holeoffp < *outoffp + xfer2)
 		xfer2 = *holeoffp - *outoffp;
 	return (xfer2);
 }
 
 /*
  * Write an xfer sized chunk to outvp in blksize blocks from dat.
  * dat is a maximum of blksize in length and can be written repeatedly in
  * the chunk.
  * If growfile == true, just grow the file via vn_truncate_locked() instead
  * of doing actual writes.
  * If checkhole == true, a hole is being punched, so skip over any hole
  * already in the output file.
  */
 static int
 vn_write_outvp(struct vnode *outvp, char *dat, off_t outoff, off_t xfer,
     u_long blksize, bool growfile, bool checkhole, struct ucred *cred)
 {
 	struct mount *mp;
 	off_t dataoff, holeoff, xfer2;
 	int error;
 
 	/*
 	 * Loop around doing writes of blksize until write has been completed.
 	 * Lock/unlock on each loop iteration so that a bwillwrite() can be
 	 * done for each iteration, since the xfer argument can be very
 	 * large if there is a large hole to punch in the output file.
 	 */
 	error = 0;
 	holeoff = 0;
 	do {
 		xfer2 = MIN(xfer, blksize);
 		if (checkhole) {
 			/*
 			 * Punching a hole.  Skip writing if there is
 			 * already a hole in the output file.
 			 */
 			xfer2 = vn_skip_hole(outvp, xfer2, &outoff, &xfer,
 			    &dataoff, &holeoff, cred);
 			if (xfer == 0)
 				break;
 			if (holeoff < 0)
 				checkhole = false;
 			KASSERT(xfer2 > 0, ("vn_write_outvp: xfer2=%jd",
 			    (intmax_t)xfer2));
 		}
 		bwillwrite();
 		mp = NULL;
 		error = vn_start_write(outvp, &mp, V_WAIT);
 		if (error != 0)
 			break;
 		if (growfile) {
 			error = vn_lock(outvp, LK_EXCLUSIVE);
 			if (error == 0) {
 				error = vn_truncate_locked(outvp, outoff + xfer,
 				    false, cred);
 				VOP_UNLOCK(outvp);
 			}
 		} else {
 			error = vn_lock(outvp, vn_lktype_write(mp, outvp));
 			if (error == 0) {
 				error = vn_rdwr(UIO_WRITE, outvp, dat, xfer2,
 				    outoff, UIO_SYSSPACE, IO_NODELOCKED,
 				    curthread->td_ucred, cred, NULL, curthread);
 				outoff += xfer2;
 				xfer -= xfer2;
 				VOP_UNLOCK(outvp);
 			}
 		}
 		if (mp != NULL)
 			vn_finished_write(mp);
 	} while (!growfile && xfer > 0 && error == 0);
 	return (error);
 }
 
 /*
  * Copy a byte range of one file to another.  This function can handle the
  * case where invp and outvp are on different file systems.
  * It can also be called by a VOP_COPY_FILE_RANGE() to do the work, if there
  * is no better file system specific way to do it.
  */
 int
 vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
     struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags,
     struct ucred *incred, struct ucred *outcred, struct thread *fsize_td)
 {
 	struct vattr va, inva;
 	struct mount *mp;
 	struct uio io;
 	off_t startoff, endoff, xfer, xfer2;
 	u_long blksize;
 	int error, interrupted;
 	bool cantseek, readzeros, eof, lastblock, holetoeof;
 	ssize_t aresid;
 	size_t copylen, len, rem, savlen;
 	char *dat;
 	long holein, holeout;
 	struct timespec curts, endts;
 
 	holein = holeout = 0;
 	savlen = len = *lenp;
 	error = 0;
 	interrupted = 0;
 	dat = NULL;
 
 	error = vn_lock(invp, LK_SHARED);
 	if (error != 0)
 		goto out;
 	if (VOP_PATHCONF(invp, _PC_MIN_HOLE_SIZE, &holein) != 0)
 		holein = 0;
 	if (holein > 0)
 		error = VOP_GETATTR(invp, &inva, incred);
 	VOP_UNLOCK(invp);
 	if (error != 0)
 		goto out;
 
 	mp = NULL;
 	error = vn_start_write(outvp, &mp, V_WAIT);
 	if (error == 0)
 		error = vn_lock(outvp, LK_EXCLUSIVE);
 	if (error == 0) {
 		/*
 		 * If fsize_td != NULL, do a vn_rlimit_fsize() call,
 		 * now that outvp is locked.
 		 */
 		if (fsize_td != NULL) {
 			io.uio_offset = *outoffp;
 			io.uio_resid = len;
 			error = vn_rlimit_fsize(outvp, &io, fsize_td);
 			if (error != 0)
 				error = EFBIG;
 		}
 		if (VOP_PATHCONF(outvp, _PC_MIN_HOLE_SIZE, &holeout) != 0)
 			holeout = 0;
 		/*
 		 * Holes that are past EOF do not need to be written as a block
 		 * of zero bytes.  So, truncate the output file as far as
 		 * possible and then use va.va_size to decide if writing 0
 		 * bytes is necessary in the loop below.
 		 */
 		if (error == 0)
 			error = VOP_GETATTR(outvp, &va, outcred);
 		if (error == 0 && va.va_size > *outoffp && va.va_size <=
 		    *outoffp + len) {
 #ifdef MAC
 			error = mac_vnode_check_write(curthread->td_ucred,
 			    outcred, outvp);
 			if (error == 0)
 #endif
 				error = vn_truncate_locked(outvp, *outoffp,
 				    false, outcred);
 			if (error == 0)
 				va.va_size = *outoffp;
 		}
 		VOP_UNLOCK(outvp);
 	}
 	if (mp != NULL)
 		vn_finished_write(mp);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Set the blksize to the larger of the hole sizes for invp and outvp.
 	 * If hole sizes aren't available, set the blksize to the larger 
 	 * f_iosize of invp and outvp.
 	 * This code expects the hole sizes and f_iosizes to be powers of 2.
 	 * This value is clipped at 4Kbytes and 1Mbyte.
 	 */
 	blksize = MAX(holein, holeout);
 
 	/* Clip len to end at an exact multiple of hole size. */
 	if (blksize > 1) {
 		rem = *inoffp % blksize;
 		if (rem > 0)
 			rem = blksize - rem;
 		if (len > rem && len - rem > blksize)
 			len = savlen = rounddown(len - rem, blksize) + rem;
 	}
 
 	if (blksize <= 1)
 		blksize = MAX(invp->v_mount->mnt_stat.f_iosize,
 		    outvp->v_mount->mnt_stat.f_iosize);
 	if (blksize < 4096)
 		blksize = 4096;
 	else if (blksize > 1024 * 1024)
 		blksize = 1024 * 1024;
 	dat = malloc(blksize, M_TEMP, M_WAITOK);
 
 	/*
 	 * If VOP_IOCTL(FIOSEEKHOLE) works for invp, use it and FIOSEEKDATA
 	 * to find holes.  Otherwise, just scan the read block for all 0s
 	 * in the inner loop where the data copying is done.
 	 * Note that some file systems such as NFSv3, NFSv4.0 and NFSv4.1 may
 	 * support holes on the server, but do not support FIOSEEKHOLE.
 	 * The kernel flag COPY_FILE_RANGE_TIMEO1SEC is used to indicate
 	 * that this function should return after 1second with a partial
 	 * completion.
 	 */
 	if ((flags & COPY_FILE_RANGE_TIMEO1SEC) != 0) {
 		getnanouptime(&endts);
 		endts.tv_sec++;
 	} else
 		timespecclear(&endts);
 	holetoeof = eof = false;
 	while (len > 0 && error == 0 && !eof && interrupted == 0) {
 		endoff = 0;			/* To shut up compilers. */
 		cantseek = true;
 		startoff = *inoffp;
 		copylen = len;
 
 		/*
 		 * Find the next data area.  If there is just a hole to EOF,
 		 * FIOSEEKDATA should fail with ENXIO.
 		 * (I do not know if any file system will report a hole to
 		 *  EOF via FIOSEEKHOLE, but I am pretty sure FIOSEEKDATA
 		 *  will fail for those file systems.)
 		 *
 		 * For input files that don't support FIOSEEKDATA/FIOSEEKHOLE,
 		 * the code just falls through to the inner copy loop.
 		 */
 		error = EINVAL;
 		if (holein > 0) {
 			error = VOP_IOCTL(invp, FIOSEEKDATA, &startoff, 0,
 			    incred, curthread);
 			if (error == ENXIO) {
 				startoff = endoff = inva.va_size;
 				eof = holetoeof = true;
 				error = 0;
 			}
 		}
 		if (error == 0 && !holetoeof) {
 			endoff = startoff;
 			error = VOP_IOCTL(invp, FIOSEEKHOLE, &endoff, 0,
 			    incred, curthread);
 			/*
 			 * Since invp is unlocked, it may be possible for
 			 * another thread to do a truncate(), lseek(), write()
 			 * creating a hole at startoff between the above
 			 * VOP_IOCTL() calls, if the other thread does not do
 			 * rangelocking.
 			 * If that happens, startoff == endoff and finding
 			 * the hole has failed, so set an error.
 			 */
 			if (error == 0 && startoff == endoff)
 				error = EINVAL; /* Any error. Reset to 0. */
 		}
 		if (error == 0) {
 			if (startoff > *inoffp) {
 				/* Found hole before data block. */
 				xfer = MIN(startoff - *inoffp, len);
 				if (*outoffp < va.va_size) {
 					/* Must write 0s to punch hole. */
 					xfer2 = MIN(va.va_size - *outoffp,
 					    xfer);
 					memset(dat, 0, MIN(xfer2, blksize));
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer2, blksize, false,
 					    holeout > 0, outcred);
 				}
 
 				if (error == 0 && *outoffp + xfer >
 				    va.va_size && (xfer == len || holetoeof)) {
 					/* Grow output file (hole at end). */
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer, blksize, true,
 					    false, outcred);
 				}
 				if (error == 0) {
 					*inoffp += xfer;
 					*outoffp += xfer;
 					len -= xfer;
 					if (len < savlen) {
 						interrupted = sig_intr();
 						if (timespecisset(&endts) &&
 						    interrupted == 0) {
 							getnanouptime(&curts);
 							if (timespeccmp(&curts,
 							    &endts, >=))
 								interrupted =
 								    EINTR;
 						}
 					}
 				}
 			}
 			copylen = MIN(len, endoff - startoff);
 			cantseek = false;
 		} else {
 			cantseek = true;
 			startoff = *inoffp;
 			copylen = len;
 			error = 0;
 		}
 
 		xfer = blksize;
 		if (cantseek) {
 			/*
 			 * Set first xfer to end at a block boundary, so that
 			 * holes are more likely detected in the loop below via
 			 * the for all bytes 0 method.
 			 */
 			xfer -= (*inoffp % blksize);
 		}
 		/* Loop copying the data block. */
 		while (copylen > 0 && error == 0 && !eof && interrupted == 0) {
 			if (copylen < xfer)
 				xfer = copylen;
 			error = vn_lock(invp, LK_SHARED);
 			if (error != 0)
 				goto out;
 			error = vn_rdwr(UIO_READ, invp, dat, xfer,
 			    startoff, UIO_SYSSPACE, IO_NODELOCKED,
 			    curthread->td_ucred, incred, &aresid,
 			    curthread);
 			VOP_UNLOCK(invp);
 			lastblock = false;
 			if (error == 0 && aresid > 0) {
 				/* Stop the copy at EOF on the input file. */
 				xfer -= aresid;
 				eof = true;
 				lastblock = true;
 			}
 			if (error == 0) {
 				/*
 				 * Skip the write for holes past the initial EOF
 				 * of the output file, unless this is the last
 				 * write of the output file at EOF.
 				 */
 				readzeros = cantseek ? mem_iszero(dat, xfer) :
 				    false;
 				if (xfer == len)
 					lastblock = true;
 				if (!cantseek || *outoffp < va.va_size ||
 				    lastblock || !readzeros)
 					error = vn_write_outvp(outvp, dat,
 					    *outoffp, xfer, blksize,
 					    readzeros && lastblock &&
 					    *outoffp >= va.va_size, false,
 					    outcred);
 				if (error == 0) {
 					*inoffp += xfer;
 					startoff += xfer;
 					*outoffp += xfer;
 					copylen -= xfer;
 					len -= xfer;
 					if (len < savlen) {
 						interrupted = sig_intr();
 						if (timespecisset(&endts) &&
 						    interrupted == 0) {
 							getnanouptime(&curts);
 							if (timespeccmp(&curts,
 							    &endts, >=))
 								interrupted =
 								    EINTR;
 						}
 					}
 				}
 			}
 			xfer = blksize;
 		}
 	}
 out:
 	*lenp = savlen - len;
 	free(dat, M_TEMP);
 	return (error);
 }
 
 static int
 vn_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	off_t olen, ooffset;
 	int error;
 #ifdef AUDIT
 	int audited_vnode1 = 0;
 #endif
 
 	vp = fp->f_vnode;
 	if (vp->v_type != VREG)
 		return (ENODEV);
 
 	/* Allocating blocks may take a long time, so iterate. */
 	for (;;) {
 		olen = len;
 		ooffset = offset;
 
 		bwillwrite();
 		mp = NULL;
 		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
 		if (error != 0)
 			break;
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			vn_finished_write(mp);
 			break;
 		}
 #ifdef AUDIT
 		if (!audited_vnode1) {
 			AUDIT_ARG_VNODE1(vp);
 			audited_vnode1 = 1;
 		}
 #endif
 #ifdef MAC
 		error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
 		if (error == 0)
 #endif
 			error = VOP_ALLOCATE(vp, &offset, &len);
 		VOP_UNLOCK(vp);
 		vn_finished_write(mp);
 
 		if (olen + ooffset != offset + len) {
 			panic("offset + len changed from %jx/%jx to %jx/%jx",
 			    ooffset, olen, offset, len);
 		}
 		if (error != 0 || len == 0)
 			break;
 		KASSERT(olen > len, ("Iteration did not make progress?"));
 		maybe_yield();
 	}
 
 	return (error);
 }
 
 static int
 vn_deallocate_impl(struct vnode *vp, off_t *offset, off_t *length, int flags,
     int ioflag, struct ucred *cred, struct ucred *active_cred,
     struct ucred *file_cred)
 {
 	struct mount *mp;
 	void *rl_cookie;
 	off_t off, len;
 	int error;
 #ifdef AUDIT
 	bool audited_vnode1 = false;
 #endif
 
 	rl_cookie = NULL;
 	error = 0;
 	mp = NULL;
 	off = *offset;
 	len = *length;
 
 	if ((ioflag & (IO_NODELOCKED | IO_RANGELOCKED)) == 0)
 		rl_cookie = vn_rangelock_wlock(vp, off, off + len);
 	while (len > 0 && error == 0) {
 		/*
 		 * Try to deallocate the longest range in one pass.
 		 * In case a pass takes too long to be executed, it returns
 		 * partial result. The residue will be proceeded in the next
 		 * pass.
 		 */
 
 		if ((ioflag & IO_NODELOCKED) == 0) {
 			bwillwrite();
 			if ((error = vn_start_write(vp, &mp,
 			    V_WAIT | PCATCH)) != 0)
 				goto out;
 			vn_lock(vp, vn_lktype_write(mp, vp) | LK_RETRY);
 		}
 #ifdef AUDIT
 		if (!audited_vnode1) {
 			AUDIT_ARG_VNODE1(vp);
 			audited_vnode1 = true;
 		}
 #endif
 
 #ifdef MAC
 		if ((ioflag & IO_NOMACCHECK) == 0)
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 #endif
 		if (error == 0)
 			error = VOP_DEALLOCATE(vp, &off, &len, flags, ioflag,
 			    cred);
 
 		if ((ioflag & IO_NODELOCKED) == 0) {
 			VOP_UNLOCK(vp);
 			if (mp != NULL) {
 				vn_finished_write(mp);
 				mp = NULL;
 			}
 		}
 		if (error == 0 && len != 0)
 			maybe_yield();
 	}
 out:
 	if (rl_cookie != NULL)
 		vn_rangelock_unlock(vp, rl_cookie);
 	*offset = off;
 	*length = len;
 	return (error);
 }
 
 /*
  * This function is supposed to be used in the situations where the deallocation
  * is not triggered by a user request.
  */
 int
 vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
     int ioflag, struct ucred *active_cred, struct ucred *file_cred)
 {
 	struct ucred *cred;
 
 	if (*offset < 0 || *length <= 0 || *length > OFF_MAX - *offset ||
 	    flags != 0)
 		return (EINVAL);
 	if (vp->v_type != VREG)
 		return (ENODEV);
 
 	cred = file_cred != NOCRED ? file_cred : active_cred;
 	return (vn_deallocate_impl(vp, offset, length, flags, ioflag, cred,
 	    active_cred, file_cred));
 }
 
 static int
 vn_fspacectl(struct file *fp, int cmd, off_t *offset, off_t *length, int flags,
     struct ucred *active_cred, struct thread *td)
 {
 	int error;
 	struct vnode *vp;
 	int ioflag;
 
 	vp = fp->f_vnode;
 
 	if (cmd != SPACECTL_DEALLOC || *offset < 0 || *length <= 0 ||
 	    *length > OFF_MAX - *offset || flags != 0)
 		return (EINVAL);
 	if (vp->v_type != VREG)
 		return (ENODEV);
 
 	ioflag = get_write_ioflag(fp);
 
 	switch (cmd) {
 	case SPACECTL_DEALLOC:
 		error = vn_deallocate_impl(vp, offset, length, flags, ioflag,
 		    active_cred, active_cred, fp->f_cred);
 		break;
 	default:
 		panic("vn_fspacectl: unknown cmd %d", cmd);
 	}
 
 	return (error);
 }
 
 static u_long vn_lock_pair_pause_cnt;
 SYSCTL_ULONG(_debug, OID_AUTO, vn_lock_pair_pause, CTLFLAG_RD,
     &vn_lock_pair_pause_cnt, 0,
     "Count of vn_lock_pair deadlocks");
 
 u_int vn_lock_pair_pause_max;
 SYSCTL_UINT(_debug, OID_AUTO, vn_lock_pair_pause_max, CTLFLAG_RW,
     &vn_lock_pair_pause_max, 0,
     "Max ticks for vn_lock_pair deadlock avoidance sleep");
 
 static void
 vn_lock_pair_pause(const char *wmesg)
 {
 	atomic_add_long(&vn_lock_pair_pause_cnt, 1);
 	pause(wmesg, prng32_bounded(vn_lock_pair_pause_max));
 }
 
 /*
  * Lock pair of vnodes vp1, vp2, avoiding lock order reversal.
  * vp1_locked indicates whether vp1 is exclusively locked; if not, vp1
  * must be unlocked.  Same for vp2 and vp2_locked.  One of the vnodes
  * can be NULL.
  *
  * The function returns with both vnodes exclusively locked, and
  * guarantees that it does not create lock order reversal with other
  * threads during its execution.  Both vnodes could be unlocked
  * temporary (and reclaimed).
  */
 void
 vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
     bool vp2_locked)
 {
 	int error;
 
 	if (vp1 == NULL && vp2 == NULL)
 		return;
 	if (vp1 != NULL) {
 		if (vp1_locked)
 			ASSERT_VOP_ELOCKED(vp1, "vp1");
 		else
 			ASSERT_VOP_UNLOCKED(vp1, "vp1");
 	} else {
 		vp1_locked = true;
 	}
 	if (vp2 != NULL) {
 		if (vp2_locked)
 			ASSERT_VOP_ELOCKED(vp2, "vp2");
 		else
 			ASSERT_VOP_UNLOCKED(vp2, "vp2");
 	} else {
 		vp2_locked = true;
 	}
 	if (!vp1_locked && !vp2_locked) {
 		vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
 		vp1_locked = true;
 	}
 
 	for (;;) {
 		if (vp1_locked && vp2_locked)
 			break;
 		if (vp1_locked && vp2 != NULL) {
 			if (vp1 != NULL) {
 				error = VOP_LOCK1(vp2, LK_EXCLUSIVE | LK_NOWAIT,
 				    __FILE__, __LINE__);
 				if (error == 0)
 					break;
 				VOP_UNLOCK(vp1);
 				vp1_locked = false;
 				vn_lock_pair_pause("vlp1");
 			}
 			vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY);
 			vp2_locked = true;
 		}
 		if (vp2_locked && vp1 != NULL) {
 			if (vp2 != NULL) {
 				error = VOP_LOCK1(vp1, LK_EXCLUSIVE | LK_NOWAIT,
 				    __FILE__, __LINE__);
 				if (error == 0)
 					break;
 				VOP_UNLOCK(vp2);
 				vp2_locked = false;
 				vn_lock_pair_pause("vlp2");
 			}
 			vn_lock(vp1, LK_EXCLUSIVE | LK_RETRY);
 			vp1_locked = true;
 		}
 	}
 	if (vp1 != NULL)
 		ASSERT_VOP_ELOCKED(vp1, "vp1 ret");
 	if (vp2 != NULL)
 		ASSERT_VOP_ELOCKED(vp2, "vp2 ret");
 }
 
 int
 vn_lktype_write(struct mount *mp, struct vnode *vp)
 {
 	if (MNT_SHARED_WRITES(mp) ||
 	    (mp == NULL && MNT_SHARED_WRITES(vp->v_mount)))
 		return (LK_SHARED);
 	return (LK_EXCLUSIVE);
 }
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index ff57d1c9a28e..d10758019b87 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -1,839 +1,838 @@
 #-
 # Copyright (c) 1992, 1993
 #	The Regents of the University of California.  All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 # 3. Neither the name of the University nor the names of its contributors
 #    may be used to endorse or promote products derived from this software
 #    without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
 #	@(#)vnode_if.src	8.12 (Berkeley) 5/14/95
 # $FreeBSD$
 #
 
 #
 # Above each of the vop descriptors in lines starting with %%
 # is a specification of the locking protocol used by each vop call.
 # The first column is the name of the variable, the remaining three
 # columns are in, out and error respectively.  The "in" column defines
 # the lock state on input, the "out" column defines the state on successful
 # return, and the "error" column defines the locking state on error exit.
 #
 # The locking value can take the following values:
 # L: locked; not converted to type of lock.
 # E: locked with exclusive lock for this process.
 # U: unlocked.
 # -: not applicable.  vnode does not yet (or no longer) exists.
 # =: the same on input and output, may be either L or U.
 #
 # The paramater named "vpp" is assumed to be always used with double
 # indirection (**vpp) and that name is hard-coded in vnode_if.awk !
 #
 # Lines starting with %! specify a pre or post-condition function
 # to call before/after the vop call.
 #
 # If other such parameters are introduced, they have to be added to
 # the AWK script at the head of the definition of "add_debug_code()".
 #
 
 vop_islocked {
 	IN struct vnode *vp;
 };
 
 
 %% lookup	dvp	L L L
 %% lookup	vpp	- L -
 
 # XXX - the lookup locking protocol defies simple description and depends
 #	on the flags and operation fields in the (cnp) structure.  Note
 #	especially that *vpp may equal dvp and both may be locked.
 
 vop_lookup {
 	IN struct vnode *dvp;
 	INOUT struct vnode **vpp;
 	IN struct componentname *cnp;
 };
 
 
 %% cachedlookup	dvp	L L L
 %% cachedlookup	vpp	- L -
 
 # This must be an exact copy of lookup.  See kern/vfs_cache.c for details.
 
 vop_cachedlookup {
 	IN struct vnode *dvp;
 	INOUT struct vnode **vpp;
 	IN struct componentname *cnp;
 };
 
 
 %% create	dvp	E E E
 %% create	vpp	- L -
 %! create	pre	vop_create_pre
 %! create	post	vop_create_post
 
 vop_create {
 	IN struct vnode *dvp;
 	OUT struct vnode **vpp;
 	IN struct componentname *cnp;
 	IN struct vattr *vap;
 };
 
 
 %% whiteout	dvp	E E E
 %! whiteout	pre	vop_whiteout_pre
 %! whiteout	post	vop_whiteout_post
 
 vop_whiteout {
 	IN struct vnode *dvp;
 	IN struct componentname *cnp;
 	IN int flags;
 };
 
 
 %% mknod	dvp	E E E
 %% mknod	vpp	- L -
 %! mknod	pre	vop_mknod_pre
 %! mknod	post	vop_mknod_post
 
 vop_mknod {
 	IN struct vnode *dvp;
 	OUT struct vnode **vpp;
 	IN struct componentname *cnp;
 	IN struct vattr *vap;
 };
 
 
 %% open		vp	L L L
 %! open		post	vop_open_post
 
 vop_open {
 	IN struct vnode *vp;
 	IN int mode;
 	IN struct ucred *cred;
 	IN struct thread *td;
 	IN struct file *fp;
 };
 
 
 %% close	vp	L L L
 %! close	post	vop_close_post
 
 vop_close {
 	IN struct vnode *vp;
 	IN int fflag;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% fplookup_vexec	vp	- - -
 %! fplookup_vexec	debugpre	vop_fplookup_vexec_debugpre
 %! fplookup_vexec	debugpost	vop_fplookup_vexec_debugpost
 
 vop_fplookup_vexec {
 	IN struct vnode *vp;
 	IN struct ucred *cred;
 };
 
 
 %% fplookup_symlink	vp	- - -
 %! fplookup_symlink	debugpre	vop_fplookup_symlink_debugpre
 %! fplookup_symlink	debugpost	vop_fplookup_symlink_debugpost
 
 vop_fplookup_symlink {
 	IN struct vnode *vp;
 	IN struct cache_fpl *fpl;
 };
 
 
 %% access	vp	L L L
 
 vop_access {
 	IN struct vnode *vp;
 	IN accmode_t accmode;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% accessx	vp	L L L
 
 vop_accessx {
 	IN struct vnode *vp;
 	IN accmode_t accmode;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% stat	vp	L L L
 
 vop_stat {
 	IN struct vnode *vp;
 	OUT struct stat *sb;
 	IN struct ucred *active_cred;
 	IN struct ucred *file_cred;
-	IN struct thread *td;
 };
 
 
 %% getattr	vp	L L L
 
 vop_getattr {
 	IN struct vnode *vp;
 	OUT struct vattr *vap;
 	IN struct ucred *cred;
 };
 
 
 %% setattr	vp	E E E
 %! setattr	pre	vop_setattr_pre
 %! setattr	post	vop_setattr_post
 
 vop_setattr {
 	IN struct vnode *vp;
 	IN struct vattr *vap;
 	IN struct ucred *cred;
 };
 
 
 %% mmapped	vp	L L L
 
 vop_mmapped {
 	IN struct vnode *vp;
 };
 
 
 %% read		vp	L L L
 %! read		post	vop_read_post
 
 vop_read {
 	IN struct vnode *vp;
 	INOUT struct uio *uio;
 	IN int ioflag;
 	IN struct ucred *cred;
 };
 
 
 %% read_pgcache	vp	- - -
 %! read_pgcache	post	vop_read_pgcache_post
 
 vop_read_pgcache {
 	IN struct vnode *vp;
 	INOUT struct uio *uio;
 	IN int ioflag;
 	IN struct ucred *cred;
 };
 
 
 %% write	vp	L L L
 %! write	pre	VOP_WRITE_PRE
 %! write	post	VOP_WRITE_POST
 
 vop_write {
 	IN struct vnode *vp;
 	INOUT struct uio *uio;
 	IN int ioflag;
 	IN struct ucred *cred;
 };
 
 
 %% ioctl	vp	U U U
 
 vop_ioctl {
 	IN struct vnode *vp;
 	IN u_long command;
 	IN void *data;
 	IN int fflag;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% poll		vp	U U U
 
 vop_poll {
 	IN struct vnode *vp;
 	IN int events;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% kqfilter	vp	U U U
 
 vop_kqfilter {
 	IN struct vnode *vp;
 	IN struct knote *kn;
 };
 
 
 %% revoke	vp	L L L
 
 vop_revoke {
 	IN struct vnode *vp;
 	IN int flags;
 };
 
 
 %% fsync	vp	L L L
 
 vop_fsync {
 	IN struct vnode *vp;
 	IN int waitfor;
 	IN struct thread *td;
 };
 
 
 %% remove	dvp	E E E
 %% remove	vp	E E E
 %! remove	pre	vop_remove_pre
 %! remove	post	vop_remove_post
 
 vop_remove {
 	IN struct vnode *dvp;
 	IN struct vnode *vp;
 	IN struct componentname *cnp;
 };
 
 
 %% link		tdvp	E E E
 %% link		vp	E E E
 %! link		pre	vop_link_pre
 %! link		post	vop_link_post
 
 vop_link {
 	IN struct vnode *tdvp;
 	IN struct vnode *vp;
 	IN struct componentname *cnp;
 };
 
 
 %! rename	pre	vop_rename_pre
 %! rename	post	vop_rename_post
 
 vop_rename {
 	IN WILLRELE struct vnode *fdvp;
 	IN WILLRELE struct vnode *fvp;
 	IN struct componentname *fcnp;
 	IN WILLRELE struct vnode *tdvp;
 	IN WILLRELE struct vnode *tvp;
 	IN struct componentname *tcnp;
 };
 
 
 %% mkdir	dvp	E E E
 %% mkdir	vpp	- E -
 %! mkdir	pre	vop_mkdir_pre
 %! mkdir	post	vop_mkdir_post
 %! mkdir	debugpost vop_mkdir_debugpost
 
 vop_mkdir {
 	IN struct vnode *dvp;
 	OUT struct vnode **vpp;
 	IN struct componentname *cnp;
 	IN struct vattr *vap;
 };
 
 
 %% rmdir	dvp	E E E
 %% rmdir	vp	E E E
 %! rmdir	pre	vop_rmdir_pre
 %! rmdir	post	vop_rmdir_post
 
 vop_rmdir {
 	IN struct vnode *dvp;
 	IN struct vnode *vp;
 	IN struct componentname *cnp;
 };
 
 
 %% symlink	dvp	E E E
 %% symlink	vpp	- E -
 %! symlink	pre	vop_symlink_pre
 %! symlink	post	vop_symlink_post
 
 vop_symlink {
 	IN struct vnode *dvp;
 	OUT struct vnode **vpp;
 	IN struct componentname *cnp;
 	IN struct vattr *vap;
 	IN const char *target;
 };
 
 
 %% readdir	vp	L L L
 %! readdir	post	vop_readdir_post
 
 vop_readdir {
 	IN struct vnode *vp;
 	INOUT struct uio *uio;
 	IN struct ucred *cred;
 	INOUT int *eofflag;
 	OUT int *ncookies;
 	INOUT u_long **cookies;
 };
 
 
 %% readlink	vp	L L L
 
 vop_readlink {
 	IN struct vnode *vp;
 	INOUT struct uio *uio;
 	IN struct ucred *cred;
 };
 
 
 %% inactive	vp	E E E
 
 vop_inactive {
 	IN struct vnode *vp;
 };
 
 %! need_inactive	debugpre	vop_need_inactive_debugpre
 %! need_inactive	debugpost	vop_need_inactive_debugpost
 
 vop_need_inactive {
         IN struct vnode *vp;
 };
 
 %% reclaim	vp	E E E
 %! reclaim	post	vop_reclaim_post
 
 vop_reclaim {
 	IN struct vnode *vp;
 };
 
 
 %! lock1	debugpre	vop_lock_debugpre
 %! lock1	debugpost	vop_lock_debugpost
 
 vop_lock1 {
 	IN struct vnode *vp;
 	IN int flags;
 	IN const char *file;
 	IN int line;
 };
 
 
 %! unlock	debugpre	vop_unlock_debugpre
 
 vop_unlock {
 	IN struct vnode *vp;
 };
 
 
 %% bmap		vp	L L L
 
 vop_bmap {
 	IN struct vnode *vp;
 	IN daddr_t bn;
 	OUT struct bufobj **bop;
 	IN daddr_t *bnp;
 	OUT int *runp;
 	OUT int *runb;
 };
 
 
 %% strategy	vp	L L L
 %! strategy	debugpre	vop_strategy_debugpre
 
 vop_strategy {
 	IN struct vnode *vp;
 	IN struct buf *bp;
 };
 
 
 %% getwritemount vp	= = =
 
 vop_getwritemount {
 	IN struct vnode *vp;
 	OUT struct mount **mpp;
 };
 
 
 %% print	vp	- - -
 
 vop_print {
 	IN struct vnode *vp;
 };
 
 
 %% pathconf	vp	L L L
 
 vop_pathconf {
 	IN struct vnode *vp;
 	IN int name;
 	OUT long *retval;
 };
 
 
 %% advlock	vp	U U U
 
 vop_advlock {
 	IN struct vnode *vp;
 	IN void *id;
 	IN int op;
 	IN struct flock *fl;
 	IN int flags;
 };
 
 
 %% advlockasync	vp	U U U
 
 vop_advlockasync {
 	IN struct vnode *vp;
 	IN void *id;
 	IN int op;
 	IN struct flock *fl;
 	IN int flags;
 	IN struct task *task;	
 	INOUT void **cookiep;
 };
 
 
 %% advlockpurge	vp	E E E
 
 vop_advlockpurge {
 	IN struct vnode *vp;
 };
 
 
 %% reallocblks	vp	E E E
 
 vop_reallocblks {
 	IN struct vnode *vp;
 	IN struct cluster_save *buflist;
 };
 
 
 %% getpages	vp	L L L
 
 vop_getpages {
 	IN struct vnode *vp;
 	IN vm_page_t *m;
 	IN int count;
 	IN int *rbehind;
 	IN int *rahead;
 };
 
 
 %% getpages_async	vp	L L L
 
 vop_getpages_async {
 	IN struct vnode *vp;
 	IN vm_page_t *m;
 	IN int count;
 	IN int *rbehind;
 	IN int *rahead;
 	IN vop_getpages_iodone_t *iodone;
 	IN void *arg;
 };
 
 
 %% putpages	vp	L L L
 
 vop_putpages {
 	IN struct vnode *vp;
 	IN vm_page_t *m;
 	IN int count;
 	IN int sync;
 	IN int *rtvals;
 };
 
 
 %% getacl	vp	L L L
 
 vop_getacl {
 	IN struct vnode *vp;
 	IN acl_type_t type;
 	OUT struct acl *aclp;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% setacl	vp	E E E
 %! setacl	pre	vop_setacl_pre
 %! setacl	post	vop_setacl_post
 
 vop_setacl {
 	IN struct vnode *vp;
 	IN acl_type_t type;
 	IN struct acl *aclp;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% aclcheck	vp	= = =
 
 vop_aclcheck {
 	IN struct vnode *vp;
 	IN acl_type_t type;
 	IN struct acl *aclp;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% closeextattr	vp	L L L
 
 vop_closeextattr {
 	IN struct vnode *vp;
 	IN int commit;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% getextattr	vp	L L L
 
 vop_getextattr {
 	IN struct vnode *vp;
 	IN int attrnamespace;
 	IN const char *name;
 	INOUT struct uio *uio;
 	OUT size_t *size;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% listextattr	vp	L L L
 
 vop_listextattr {
 	IN struct vnode *vp;
 	IN int attrnamespace;
 	INOUT struct uio *uio;
 	OUT size_t *size;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% openextattr	vp	L L L
 
 vop_openextattr {
 	IN struct vnode *vp;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% deleteextattr	vp	E E E
 %! deleteextattr	pre	vop_deleteextattr_pre
 %! deleteextattr	post	vop_deleteextattr_post
 
 vop_deleteextattr {
 	IN struct vnode *vp;
 	IN int attrnamespace;
 	IN const char *name;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% setextattr	vp	E E E
 %! setextattr	pre	vop_setextattr_pre
 %! setextattr	post	vop_setextattr_post
 
 vop_setextattr {
 	IN struct vnode *vp;
 	IN int attrnamespace;
 	IN const char *name;
 	INOUT struct uio *uio;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% setlabel	vp	E E E
 
 vop_setlabel {
 	IN struct vnode *vp;
 	IN struct label *label;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
 
 %% vptofh	vp	= = =
 
 vop_vptofh {
 	IN struct vnode *vp;
 	IN struct fid *fhp;
 };
 
 
 %% vptocnp		vp	L L L
 %% vptocnp		vpp	- U -
 
 vop_vptocnp {
 	IN struct vnode *vp;
 	OUT struct vnode **vpp;
 	INOUT char *buf;
 	INOUT size_t *buflen;
 };
 
 
 %% allocate	vp	E E E
 
 vop_allocate {
 	IN struct vnode *vp;
 	INOUT off_t *offset;
 	INOUT off_t *len;
 };
 
 
 %% advise	vp	U U U
 
 vop_advise {
 	IN struct vnode *vp;
 	IN off_t start;
 	IN off_t end;
 	IN int advice;
 };
 
 
 %% unp_bind	vp	E E E
 
 vop_unp_bind {
 	IN struct vnode *vp;
 	IN struct unpcb *unpcb;
 };
 
 
 %% unp_connect	vp	L L L
 
 vop_unp_connect {
 	IN struct vnode *vp;
 	OUT struct unpcb **unpcb;
 };
 
 
 %% unp_detach	vp	= = =
 
 vop_unp_detach {
 	IN struct vnode *vp;
 };
 
 
 %% is_text	vp	L L L
 
 vop_is_text {
 	IN struct vnode *vp;
 };
 
 
 %% set_text	vp	= = =
 
 vop_set_text {
 	IN struct vnode *vp;
 };
 
 
 %% vop_unset_text	vp	L L L
 
 vop_unset_text {
 	IN struct vnode *vp;
 };
 
 
 %% add_writecount	vp	L L L
 
 vop_add_writecount {
 	IN struct vnode *vp;
 	IN int inc;
 };
 
 
 %% fdatasync	vp	L L L
 
 vop_fdatasync {
 	IN struct vnode *vp;
 	IN struct thread *td;
 };
 
 
 %% copy_file_range	invp	U U U
 %% copy_file_range	outvp	U U U
 
 vop_copy_file_range {
 	IN struct vnode *invp;
 	INOUT off_t *inoffp;
 	IN struct vnode *outvp;
 	INOUT off_t *outoffp;
 	INOUT size_t *lenp;
 	IN unsigned int flags;
 	IN struct ucred *incred;
 	IN struct ucred *outcred;
 	IN struct thread *fsizetd;
 };
 
 
 %% vput_pair	dvp	E - -
 
 vop_vput_pair {
 	IN struct vnode *dvp;
 	INOUT struct vnode **vpp;
 	IN bool unlock_vp;
 };
 
 
 %% deallocate	vp	L L L
 
 vop_deallocate {
 	IN struct vnode *vp;
 	INOUT off_t *offset;
 	INOUT off_t *len;
 	IN int flags;
 	IN int ioflag;
 	IN struct ucred *cred;
 };
 
 
 # The VOPs below are spares at the end of the table to allow new VOPs to be
 # added in stable branches without breaking the KBI.  New VOPs in HEAD should
 # be added above these spares.  When merging a new VOP to a stable branch,
 # the new VOP should replace one of the spares.
 
 vop_spare1 {
 	IN struct vnode *vp;
 };
 
 vop_spare2 {
 	IN struct vnode *vp;
 };
 
 vop_spare3 {
 	IN struct vnode *vp;
 };
 
 vop_spare4 {
 	IN struct vnode *vp;
 };
 
 vop_spare5 {
 	IN struct vnode *vp;
 };
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 8a790a25fc6b..5a6f0b1d2071 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -1,492 +1,491 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)file.h	8.3 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_FILE_H_
 #define	_SYS_FILE_H_
 
 #ifndef _KERNEL
 #include <sys/types.h> /* XXX */
 #include <sys/fcntl.h>
 #include <sys/unistd.h>
 #else
 #include <sys/queue.h>
 #include <sys/refcount.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <vm/vm.h>
 
 struct filedesc;
 struct stat;
 struct thread;
 struct uio;
 struct knote;
 struct vnode;
 struct nameidata;
 
 #endif /* _KERNEL */
 
 #define	DTYPE_NONE	0	/* not yet initialized */
 #define	DTYPE_VNODE	1	/* file */
 #define	DTYPE_SOCKET	2	/* communications endpoint */
 #define	DTYPE_PIPE	3	/* pipe */
 #define	DTYPE_FIFO	4	/* fifo (named pipe) */
 #define	DTYPE_KQUEUE	5	/* event queue */
 #define	DTYPE_CRYPTO	6	/* crypto */
 #define	DTYPE_MQUEUE	7	/* posix message queue */
 #define	DTYPE_SHM	8	/* swap-backed shared memory */
 #define	DTYPE_SEM	9	/* posix semaphore */
 #define	DTYPE_PTS	10	/* pseudo teletype master device */
 #define	DTYPE_DEV	11	/* Device specific fd type */
 #define	DTYPE_PROCDESC	12	/* process descriptor */
 #define	DTYPE_EVENTFD	13	/* eventfd */
 #define	DTYPE_LINUXTFD	14	/* emulation timerfd type */
 
 #ifdef _KERNEL
 
 struct file;
 struct filecaps;
 struct kaiocb;
 struct kinfo_file;
 struct ucred;
 
 #define	FOF_OFFSET	0x01	/* Use the offset in uio argument */
 #define	FOF_NOLOCK	0x02	/* Do not take FOFFSET_LOCK */
 #define	FOF_NEXTOFF_R	0x04	/* Also update f_nextoff[UIO_READ] */
 #define	FOF_NEXTOFF_W	0x08	/* Also update f_nextoff[UIO_WRITE] */
 #define	FOF_NOUPDATE	0x10	/* Do not update f_offset */
 off_t foffset_lock(struct file *fp, int flags);
 void foffset_lock_uio(struct file *fp, struct uio *uio, int flags);
 void foffset_unlock(struct file *fp, off_t val, int flags);
 void foffset_unlock_uio(struct file *fp, struct uio *uio, int flags);
 
 static inline off_t
 foffset_get(struct file *fp)
 {
 
 	return (foffset_lock(fp, FOF_NOLOCK));
 }
 
 typedef int fo_rdwr_t(struct file *fp, struct uio *uio,
 		    struct ucred *active_cred, int flags,
 		    struct thread *td);
 typedef	int fo_truncate_t(struct file *fp, off_t length,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_ioctl_t(struct file *fp, u_long com, void *data,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_poll_t(struct file *fp, int events,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_kqfilter_t(struct file *fp, struct knote *kn);
 typedef	int fo_stat_t(struct file *fp, struct stat *sb,
-		    struct ucred *active_cred, struct thread *td);
+		    struct ucred *active_cred);
 typedef	int fo_close_t(struct file *fp, struct thread *td);
 typedef	int fo_chmod_t(struct file *fp, mode_t mode,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_chown_t(struct file *fp, uid_t uid, gid_t gid,
 		    struct ucred *active_cred, struct thread *td);
 typedef int fo_sendfile_t(struct file *fp, int sockfd, struct uio *hdr_uio,
 		    struct uio *trl_uio, off_t offset, size_t nbytes,
 		    off_t *sent, int flags, struct thread *td);
 typedef int fo_seek_t(struct file *fp, off_t offset, int whence,
 		    struct thread *td);
 typedef int fo_fill_kinfo_t(struct file *fp, struct kinfo_file *kif,
 		    struct filedesc *fdp);
 typedef int fo_mmap_t(struct file *fp, vm_map_t map, vm_offset_t *addr,
 		    vm_size_t size, vm_prot_t prot, vm_prot_t cap_maxprot,
 		    int flags, vm_ooffset_t foff, struct thread *td);
 typedef int fo_aio_queue_t(struct file *fp, struct kaiocb *job);
 typedef int fo_add_seals_t(struct file *fp, int flags);
 typedef int fo_get_seals_t(struct file *fp, int *flags);
 typedef int fo_fallocate_t(struct file *fp, off_t offset, off_t len,
 		    struct thread *td);
 typedef int fo_fspacectl_t(struct file *fp, int cmd,
 		    off_t *offset, off_t *length, int flags,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_flags_t;
 
 struct fileops {
 	fo_rdwr_t	*fo_read;
 	fo_rdwr_t	*fo_write;
 	fo_truncate_t	*fo_truncate;
 	fo_ioctl_t	*fo_ioctl;
 	fo_poll_t	*fo_poll;
 	fo_kqfilter_t	*fo_kqfilter;
 	fo_stat_t	*fo_stat;
 	fo_close_t	*fo_close;
 	fo_chmod_t	*fo_chmod;
 	fo_chown_t	*fo_chown;
 	fo_sendfile_t	*fo_sendfile;
 	fo_seek_t	*fo_seek;
 	fo_fill_kinfo_t	*fo_fill_kinfo;
 	fo_mmap_t	*fo_mmap;
 	fo_aio_queue_t	*fo_aio_queue;
 	fo_add_seals_t	*fo_add_seals;
 	fo_get_seals_t	*fo_get_seals;
 	fo_fallocate_t	*fo_fallocate;
 	fo_fspacectl_t	*fo_fspacectl;
 	fo_flags_t	fo_flags;	/* DFLAG_* below */
 };
 
 #define DFLAG_PASSABLE	0x01	/* may be passed via unix sockets. */
 #define DFLAG_SEEKABLE	0x02	/* seekable / nonsequential */
 #endif /* _KERNEL */
 
 #if defined(_KERNEL) || defined(_WANT_FILE)
 /*
  * Kernel descriptor table.
  * One entry for each open kernel vnode and socket.
  *
  * Below is the list of locks that protects members in struct file.
  *
  * (a) f_vnode lock required (shared allows both reads and writes)
  * (f) updated with atomics and blocking on sleepq
  * (d) cdevpriv_mtx
  * none	not locked
  */
 
 struct fadvise_info {
 	int		fa_advice;	/* (f) FADV_* type. */
 	off_t		fa_start;	/* (f) Region start. */
 	off_t		fa_end;		/* (f) Region end. */
 };
 
 struct file {
 	volatile u_int	f_flag;		/* see fcntl.h */
 	volatile u_int 	f_count;	/* reference count */
 	void		*f_data;	/* file descriptor specific data */
 	struct fileops	*f_ops;		/* File operations */
 	struct vnode 	*f_vnode;	/* NULL or applicable vnode */
 	struct ucred	*f_cred;	/* associated credentials. */
 	short		f_type;		/* descriptor type */
 	short		f_vnread_flags; /* (f) Sleep lock for f_offset */
 	/*
 	 *  DTYPE_VNODE specific fields.
 	 */
 	union {
 		int16_t	f_seqcount[2];	/* (a) Count of seq. reads and writes. */
 		int	f_pipegen;
 	};
 	off_t		f_nextoff[2];	/* next expected read/write offset. */
 	union {
 		struct cdev_privdata *fvn_cdevpriv;
 					/* (d) Private data for the cdev. */
 		struct fadvise_info *fvn_advice;
 	} f_vnun;
 	/*
 	 *  DFLAG_SEEKABLE specific fields
 	 */
 	off_t		f_offset;
 };
 
 #define	f_cdevpriv	f_vnun.fvn_cdevpriv
 #define	f_advice	f_vnun.fvn_advice
 
 #define	FOFFSET_LOCKED       0x1
 #define	FOFFSET_LOCK_WAITING 0x2
 
 #endif /* _KERNEL || _WANT_FILE */
 
 /*
  * Userland version of struct file, for sysctl
  */
 struct xfile {
 	ksize_t	xf_size;	/* size of struct xfile */
 	pid_t	xf_pid;		/* owning process */
 	uid_t	xf_uid;		/* effective uid of owning process */
 	int	xf_fd;		/* descriptor number */
 	int	_xf_int_pad1;
 	kvaddr_t xf_file;	/* address of struct file */
 	short	xf_type;	/* descriptor type */
 	short	_xf_short_pad1;
 	int	xf_count;	/* reference count */
 	int	xf_msgcount;	/* references from message queue */
 	int	_xf_int_pad2;
 	off_t	xf_offset;	/* file offset */
 	kvaddr_t xf_data;	/* file descriptor specific data */
 	kvaddr_t xf_vnode;	/* vnode pointer */
 	u_int	xf_flag;	/* flags (see fcntl.h) */
 	int	_xf_int_pad3;
 	int64_t	_xf_int64_pad[6];
 };
 
 #ifdef _KERNEL
 
 extern struct fileops vnops;
 extern struct fileops badfileops;
 extern struct fileops path_fileops;
 extern struct fileops socketops;
 extern int maxfiles;		/* kernel limit on number of open files */
 extern int maxfilesperproc;	/* per process limit on number of open files */
 
 int fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp);
 int fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp,
     vm_prot_t *maxprotp, struct file **fpp);
 int fget_read(struct thread *td, int fd, cap_rights_t *rightsp,
     struct file **fpp);
 int fget_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct file **fpp);
 int fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp,
     int needfcntl, struct file **fpp);
 int _fdrop(struct file *fp, struct thread *td);
 
 fo_rdwr_t	invfo_rdwr;
 fo_truncate_t	invfo_truncate;
 fo_ioctl_t	invfo_ioctl;
 fo_poll_t	invfo_poll;
 fo_kqfilter_t	invfo_kqfilter;
 fo_chmod_t	invfo_chmod;
 fo_chown_t	invfo_chown;
 fo_sendfile_t	invfo_sendfile;
 fo_stat_t	vn_statfile;
 fo_sendfile_t	vn_sendfile;
 fo_seek_t	vn_seek;
 fo_fill_kinfo_t	vn_fill_kinfo;
 fo_kqfilter_t	vn_kqfilter_opath;
 int vn_fill_kinfo_vnode(struct vnode *vp, struct kinfo_file *kif);
 
 void finit(struct file *, u_int, short, void *, struct fileops *);
 void finit_vnode(struct file *, u_int, void *, struct fileops *);
 int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp);
 int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp);
 int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct filecaps *havecaps, struct vnode **vpp);
 int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp);
 int fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp);
 int fgetvp_lookup_smr(int fd, struct nameidata *ndp, struct vnode **vpp, bool *fsearch);
 
 static __inline __result_use_check bool
 fhold(struct file *fp)
 {
 	return (refcount_acquire_checked(&fp->f_count));
 }
 
 #define	fdrop(fp, td)		({				\
 	struct file *_fp;					\
 	int _error;						\
 								\
 	_error = 0;						\
 	_fp = (fp);						\
 	if (__predict_false(refcount_release(&_fp->f_count)))	\
 		_error = _fdrop(_fp, td);			\
 	_error;							\
 })
 
 #define	fdrop_close(fp, td)		({			\
 	struct file *_fp;					\
 	int _error;						\
 								\
 	_error = 0;						\
 	_fp = (fp);						\
 	if (__predict_true(refcount_release(&_fp->f_count)))	\
 		_error = _fdrop(_fp, td);			\
 	_error;							\
 })
 
 static __inline fo_rdwr_t	fo_read;
 static __inline fo_rdwr_t	fo_write;
 static __inline fo_truncate_t	fo_truncate;
 static __inline fo_ioctl_t	fo_ioctl;
 static __inline fo_poll_t	fo_poll;
 static __inline fo_kqfilter_t	fo_kqfilter;
 static __inline fo_stat_t	fo_stat;
 static __inline fo_close_t	fo_close;
 static __inline fo_chmod_t	fo_chmod;
 static __inline fo_chown_t	fo_chown;
 static __inline fo_sendfile_t	fo_sendfile;
 
 static __inline int
 fo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_read)(fp, uio, active_cred, flags, td));
 }
 
 static __inline int
 fo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_write)(fp, uio, active_cred, flags, td));
 }
 
 static __inline int
 fo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_truncate)(fp, length, active_cred, td));
 }
 
 static __inline int
 fo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_ioctl)(fp, com, data, active_cred, td));
 }
 
 static __inline int
 fo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_poll)(fp, events, active_cred, td));
 }
 
 static __inline int
-fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
-    struct thread *td)
+fo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred)
 {
 
-	return ((*fp->f_ops->fo_stat)(fp, sb, active_cred, td));
+	return ((*fp->f_ops->fo_stat)(fp, sb, active_cred));
 }
 
 static __inline int
 fo_close(struct file *fp, struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_close)(fp, td));
 }
 
 static __inline int
 fo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return ((*fp->f_ops->fo_kqfilter)(fp, kn));
 }
 
 static __inline int
 fo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_chmod)(fp, mode, active_cred, td));
 }
 
 static __inline int
 fo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_chown)(fp, uid, gid, active_cred, td));
 }
 
 static __inline int
 fo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_sendfile)(fp, sockfd, hdr_uio, trl_uio, offset,
 	    nbytes, sent, flags, td));
 }
 
 static __inline int
 fo_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 
 	return ((*fp->f_ops->fo_seek)(fp, offset, whence, td));
 }
 
 static __inline int
 fo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	return ((*fp->f_ops->fo_fill_kinfo)(fp, kif, fdp));
 }
 
 static __inline int
 fo_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 
 	if (fp->f_ops->fo_mmap == NULL)
 		return (ENODEV);
 	return ((*fp->f_ops->fo_mmap)(fp, map, addr, size, prot, cap_maxprot,
 	    flags, foff, td));
 }
 
 static __inline int
 fo_aio_queue(struct file *fp, struct kaiocb *job)
 {
 
 	return ((*fp->f_ops->fo_aio_queue)(fp, job));
 }
 
 static __inline int
 fo_add_seals(struct file *fp, int seals)
 {
 
 	if (fp->f_ops->fo_add_seals == NULL)
 		return (EINVAL);
 	return ((*fp->f_ops->fo_add_seals)(fp, seals));
 }
 
 static __inline int
 fo_get_seals(struct file *fp, int *seals)
 {
 
 	if (fp->f_ops->fo_get_seals == NULL)
 		return (EINVAL);
 	return ((*fp->f_ops->fo_get_seals)(fp, seals));
 }
 
 static __inline int
 fo_fallocate(struct file *fp, off_t offset, off_t len, struct thread *td)
 {
 
 	if (fp->f_ops->fo_fallocate == NULL)
 		return (ENODEV);
 	return ((*fp->f_ops->fo_fallocate)(fp, offset, len, td));
 }
 
 static __inline int fo_fspacectl(struct file *fp, int cmd, off_t *offset,
     off_t *length, int flags, struct ucred *active_cred, struct thread *td)
 {
 
 	if (fp->f_ops->fo_fspacectl == NULL)
 		return (ENODEV);
 	return ((*fp->f_ops->fo_fspacectl)(fp, cmd, offset, length, flags,
 	    active_cred, td));
 }
 
 
 #endif /* _KERNEL */
 
 #endif /* !SYS_FILE_H */
diff --git a/sys/sys/param.h b/sys/sys/param.h
index 9a016e35323e..50d861a61985 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -1,392 +1,392 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)param.h	8.3 (Berkeley) 4/4/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PARAM_H_
 #define _SYS_PARAM_H_
 
 #include <sys/_null.h>
 
 #define	BSD	199506		/* System version (year & month). */
 #define BSD4_3	1
 #define BSD4_4	1
 
 /*
  * __FreeBSD_version numbers are documented in the Porter's Handbook.
  * If you bump the version for any reason, you should update the documentation
  * there.
  * Currently this lives here in the doc/ repository:
  *
  *	documentation/content/en/books/porters-handbook/versions/_index.adoc
  *
  * Encoding:	<major><two digit minor>Rxx
  *		'R' is in the range 0 to 4 if this is a release branch or
  *		X.0-CURRENT before releng/X.0 is created, otherwise 'R' is
  *		in the range 5 to 9.
  * Short hand: MMmmXXX
  *
  * __FreeBSD_version is bumped every time there's a change in the base system
  * that's noteworthy. A noteworthy change is any change which changes the
  * kernel's KBI in -CURRENT, one that changes some detail about the system that
  * external software (or the ports system) would want to know about, one that
  * adds a system call, one that adds or deletes a shipped library, a security
  * fix, or similar change not specifically noted here. Bumps should be limited
  * to one per day / a couple per week except for security fixes.
  *
  * The approved way to obtain this from a shell script is:
  *	awk '/^\#define[[:space:]]*__FreeBSD_version/ {print $3}'
  * Other methods to parse this file may work, but are not guaranteed against
  * future changes. The above script works back to FreeBSD 3.x when this macro
  * was introduced. This number is propagated to other places needing it that
  * cannot include sys/param.h and should only be updated here.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1400036
+#define __FreeBSD_version 1400037
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
  * which by definition is always true on FreeBSD. This macro is also defined
  * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD.
  *
  * It is tempting to use this macro in userland code when we want to enable
  * kernel-specific routines, and in fact it's fine to do this in code that
  * is part of FreeBSD itself.  However, be aware that as presence of this
  * macro is still not widespread (e.g. older FreeBSD versions, 3rd party
  * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in
  * external applications without also checking for __FreeBSD__ as an
  * alternative.
  */
 #undef __FreeBSD_kernel__
 #define __FreeBSD_kernel__
 
 #if defined(_KERNEL) || defined(IN_RTLD)
 #define	P_OSREL_SIGWAIT			700000
 #define	P_OSREL_SIGSEGV			700004
 #define	P_OSREL_MAP_ANON		800104
 #define	P_OSREL_MAP_FSTRICT		1100036
 #define	P_OSREL_SHUTDOWN_ENOTCONN	1100077
 #define	P_OSREL_MAP_GUARD		1200035
 #define	P_OSREL_WRFSBASE		1200041
 #define	P_OSREL_CK_CYLGRP		1200046
 #define	P_OSREL_VMTOTAL64		1200054
 #define	P_OSREL_CK_SUPERBLOCK		1300000
 #define	P_OSREL_CK_INODE		1300005
 #define	P_OSREL_POWERPC_NEW_AUX_ARGS	1300070
 
 #define	P_OSREL_MAJOR(x)		((x) / 100000)
 #endif
 
 #ifndef LOCORE
 #include <sys/types.h>
 #endif
 
 /*
  * Machine-independent constants (some used in following include files).
  * Redefined constants are from POSIX 1003.1 limits file.
  *
  * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>)
  */
 #include <sys/syslimits.h>
 
 #define	MAXCOMLEN	19		/* max command name remembered */
 #define	MAXINTERP	PATH_MAX	/* max interpreter file name length */
 #define	MAXLOGNAME	33		/* max login name length (incl. NUL) */
 #define	MAXUPRC		CHILD_MAX	/* max simultaneous processes */
 #define	NCARGS		ARG_MAX		/* max bytes for an exec function */
 #define	NGROUPS		(NGROUPS_MAX+1)	/* max number groups */
 #define	NOFILE		OPEN_MAX	/* max open files per process */
 #define	NOGROUP		65535		/* marker for empty group set member */
 #define MAXHOSTNAMELEN	256		/* max hostname size */
 #define SPECNAMELEN	255		/* max length of devicename */
 
 /* More types and definitions used throughout the kernel. */
 #ifdef _KERNEL
 #include <sys/cdefs.h>
 #include <sys/errno.h>
 #ifndef LOCORE
 #include <sys/time.h>
 #include <sys/priority.h>
 #endif
 
 #ifndef FALSE
 #define	FALSE	0
 #endif
 #ifndef TRUE
 #define	TRUE	1
 #endif
 #endif
 
 #ifndef _KERNEL
 #ifndef LOCORE
 /* Signals. */
 #include <sys/signal.h>
 #endif
 #endif
 
 /* Machine type dependent parameters. */
 #include <machine/param.h>
 #ifndef _KERNEL
 #include <sys/limits.h>
 #endif
 
 #ifndef DEV_BSHIFT
 #define	DEV_BSHIFT	9		/* log2(DEV_BSIZE) */
 #endif
 #define	DEV_BSIZE	(1<<DEV_BSHIFT)
 
 #ifndef BLKDEV_IOSIZE
 #define BLKDEV_IOSIZE  PAGE_SIZE	/* default block device I/O size */
 #endif
 #ifndef DFLTPHYS
 #define DFLTPHYS	(64 * 1024)	/* default max raw I/O transfer size */
 #endif
 #ifndef MAXPHYS				/* max raw I/O transfer size */
 #ifdef __ILP32__
 #define MAXPHYS		(128 * 1024)
 #else
 #define MAXPHYS		(1024 * 1024)
 #endif
 #endif
 #ifndef MAXDUMPPGS
 #define MAXDUMPPGS	(DFLTPHYS/PAGE_SIZE)
 #endif
 
 /*
  * Constants related to network buffer management.
  * MCLBYTES must be no larger than PAGE_SIZE.
  */
 #ifndef	MSIZE
 #define	MSIZE		256		/* size of an mbuf */
 #endif
 
 #ifndef	MCLSHIFT
 #define MCLSHIFT	11		/* convert bytes to mbuf clusters */
 #endif	/* MCLSHIFT */
 
 #define MCLBYTES	(1 << MCLSHIFT)	/* size of an mbuf cluster */
 
 #if PAGE_SIZE < 2048
 #define	MJUMPAGESIZE	MCLBYTES
 #elif PAGE_SIZE <= 8192
 #define	MJUMPAGESIZE	PAGE_SIZE
 #else
 #define	MJUMPAGESIZE	(8 * 1024)
 #endif
 
 #define	MJUM9BYTES	(9 * 1024)	/* jumbo cluster 9k */
 #define	MJUM16BYTES	(16 * 1024)	/* jumbo cluster 16k */
 
 /*
  * Some macros for units conversion
  */
 
 /* clicks to bytes */
 #ifndef ctob
 #define ctob(x)	((x)<<PAGE_SHIFT)
 #endif
 
 /* bytes to clicks */
 #ifndef btoc
 #define btoc(x)	(((vm_offset_t)(x)+PAGE_MASK)>>PAGE_SHIFT)
 #endif
 
 /*
  * btodb() is messy and perhaps slow because `bytes' may be an off_t.  We
  * want to shift an unsigned type to avoid sign extension and we don't
  * want to widen `bytes' unnecessarily.  Assume that the result fits in
  * a daddr_t.
  */
 #ifndef btodb
 #define btodb(bytes)	 		/* calculates (bytes / DEV_BSIZE) */ \
 	(sizeof (bytes) > sizeof(long) \
 	 ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \
 	 : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT))
 #endif
 
 #ifndef dbtob
 #define dbtob(db)			/* calculates (db * DEV_BSIZE) */ \
 	((off_t)(db) << DEV_BSHIFT)
 #endif
 
 #define	PRIMASK		0x0ff
 #define	PCATCH		0x100	/* OR'd with pri for tsleep to check signals */
 #define	PDROP		0x200	/* OR'd with pri to stop re-entry of interlock mutex */
 #define	PNOLOCK		0x400	/* OR'd with pri to allow sleeping w/o a lock */
 #define	PRILASTFLAG	0x400	/* Last flag defined above */
 
 #define	NZERO	0		/* default "nice" */
 
 #define	NBBY	8		/* number of bits in a byte */
 #define	NBPW	sizeof(int)	/* number of bytes per word (integer) */
 
 #define	CMASK	022		/* default file mask: S_IWGRP|S_IWOTH */
 
 #define	NODEV	(dev_t)(-1)	/* non-existent device */
 
 /*
  * File system parameters and macros.
  *
  * MAXBSIZE -	Filesystems are made out of blocks of at most MAXBSIZE bytes
  *		per block.  MAXBSIZE may be made larger without effecting
  *		any existing filesystems as long as it does not exceed MAXPHYS,
  *		and may be made smaller at the risk of not being able to use
  *		filesystems which require a block size exceeding MAXBSIZE.
  *
  * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache.  This must
  *		be >= MAXBSIZE and can be set differently for different
  *		architectures by defining it in <machine/param.h>.
  *		Making this larger allows NFS to do larger reads/writes.
  *
  * BKVASIZE -	Nominal buffer space per buffer, in bytes.  BKVASIZE is the
  *		minimum KVM memory reservation the kernel is willing to make.
  *		Filesystems can of course request smaller chunks.  Actual
  *		backing memory uses a chunk size of a page (PAGE_SIZE).
  *		The default value here can be overridden on a per-architecture
  *		basis by defining it in <machine/param.h>.
  *
  *		If you make BKVASIZE too small you risk seriously fragmenting
  *		the buffer KVM map which may slow things down a bit.  If you
  *		make it too big the kernel will not be able to optimally use
  *		the KVM memory reserved for the buffer cache and will wind
  *		up with too-few buffers.
  *
  *		The default is 16384, roughly 2x the block size used by a
  *		normal UFS filesystem.
  */
 #define MAXBSIZE	65536	/* must be power of 2 */
 #ifndef	MAXBCACHEBUF
 #define	MAXBCACHEBUF	MAXBSIZE /* must be a power of 2 >= MAXBSIZE */
 #endif
 #ifndef	BKVASIZE
 #define BKVASIZE	16384	/* must be power of 2 */
 #endif
 #define BKVAMASK	(BKVASIZE-1)
 
 /*
  * MAXPATHLEN defines the longest permissible path length after expanding
  * symbolic links. It is used to allocate a temporary buffer from the buffer
  * pool in which to do the name expansion, hence should be a power of two,
  * and must be less than or equal to MAXBSIZE.  MAXSYMLINKS defines the
  * maximum number of symbolic links that may be expanded in a path name.
  * It should be set high enough to allow all legitimate uses, but halt
  * infinite loops reasonably quickly.
  */
 #define	MAXPATHLEN	PATH_MAX
 #define MAXSYMLINKS	32
 
 /* Bit map related macros. */
 #define	setbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY))
 #define	clrbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY)))
 #define	isset(a,i)							\
 	(((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY)))
 #define	isclr(a,i)							\
 	((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
 
 /* Macros for counting and rounding. */
 #ifndef howmany
 #define	howmany(x, y)	(((x)+((y)-1))/(y))
 #endif
 #define	nitems(x)	(sizeof((x)) / sizeof((x)[0]))
 #define	rounddown(x, y)	(((x)/(y))*(y))
 #define	rounddown2(x, y) __align_down(x, y) /* if y is power of two */
 #define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))  /* to any y */
 #define	roundup2(x, y)	__align_up(x, y) /* if y is powers of two */
 #define powerof2(x)	((((x)-1)&(x))==0)
 
 /* Macros for min/max. */
 #define	MIN(a,b) (((a)<(b))?(a):(b))
 #define	MAX(a,b) (((a)>(b))?(a):(b))
 
 #ifdef _KERNEL
 /*
  * Basic byte order function prototypes for non-inline functions.
  */
 #ifndef LOCORE
 #ifndef _BYTEORDER_PROTOTYPED
 #define	_BYTEORDER_PROTOTYPED
 __BEGIN_DECLS
 __uint32_t	 htonl(__uint32_t);
 __uint16_t	 htons(__uint16_t);
 __uint32_t	 ntohl(__uint32_t);
 __uint16_t	 ntohs(__uint16_t);
 __END_DECLS
 #endif
 #endif
 
 #ifndef _BYTEORDER_FUNC_DEFINED
 #define	_BYTEORDER_FUNC_DEFINED
 #define	htonl(x)	__htonl(x)
 #define	htons(x)	__htons(x)
 #define	ntohl(x)	__ntohl(x)
 #define	ntohs(x)	__ntohs(x)
 #endif /* !_BYTEORDER_FUNC_DEFINED */
 #endif /* _KERNEL */
 
 /*
  * Scale factor for scaled integers used to count %cpu time and load avgs.
  *
  * The number of CPU `tick's that map to a unique `%age' can be expressed
  * by the formula (1 / (2 ^ (FSHIFT - 11))).  The maximum load average that
  * can be calculated (assuming 32 bits) can be closely approximated using
  * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
  *
  * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
  * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
  */
 #define	FSHIFT	11		/* bits to right of fixed binary point */
 #define FSCALE	(1<<FSHIFT)
 
 #define dbtoc(db)			/* calculates devblks to pages */ \
 	((db + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))
 
 #define ctodb(db)			/* calculates pages to devblks */ \
 	((db) << (PAGE_SHIFT - DEV_BSHIFT))
 
 /*
  * Old spelling of __containerof().
  */
 #define	member2struct(s, m, x)						\
 	((struct s *)(void *)((char *)(x) - offsetof(struct s, m)))
 
 /*
  * Access a variable length array that has been declared as a fixed
  * length array.
  */
 #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset])
 
 #endif	/* _SYS_PARAM_H_ */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index 81f3f3d5489c..a9b57dd4c25d 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -1,1143 +1,1143 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vnode.h	8.7 (Berkeley) 2/4/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VNODE_H_
 #define	_SYS_VNODE_H_
 
 #include <sys/bufobj.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
 #include <sys/rangelock.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
 #include <sys/ktr.h>
 #include <sys/_seqc.h>
 
 /*
  * The vnode is the focus of all file activity in UNIX.  There is a
  * unique vnode allocated for each active file, each current directory,
  * each mounted-on file, text file, and the root.
  */
 
 /*
  * Vnode types.  VNON means no type.
  */
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
 		  VMARKER };
 
 enum vgetstate	{ VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT };
 /*
  * Each underlying filesystem allocates its own private area and hangs
  * it from v_data.  If non-null, this area is freed in getnewvnode().
  */
 
 struct namecache;
 struct cache_fpl;
 
 struct vpollinfo {
 	struct	mtx vpi_lock;		/* lock to protect below */
 	struct	selinfo vpi_selinfo;	/* identity of poller(s) */
 	short	vpi_events;		/* what they are looking for */
 	short	vpi_revents;		/* what has happened */
 };
 
 /*
  * Reading or writing any of these items requires holding the appropriate lock.
  *
  * Lock reference:
  *	c - namecache mutex
  *	i - interlock
  *	l - mp mnt_listmtx or freelist mutex
  *	I - updated with atomics, 0->1 and 1->0 transitions with interlock held
  *	m - mount point interlock
  *	p - pollinfo lock
  *	u - Only a reference to the vnode is needed to read.
  *	v - vnode lock
  *
  * Vnodes may be found on many lists.  The general way to deal with operating
  * on a vnode that is on a list is:
  *	1) Lock the list and find the vnode.
  *	2) Lock interlock so that the vnode does not go away.
  *	3) Unlock the list to avoid lock order reversals.
  *	4) vget with LK_INTERLOCK and check for ENOENT, or
  *	5) Check for DOOMED if the vnode lock is not required.
  *	6) Perform your operation, then vput().
  */
 
 #if defined(_KERNEL) || defined(_KVM_VNODE)
 
 struct vnode {
 	/*
 	 * Fields which define the identity of the vnode.  These fields are
 	 * owned by the filesystem (XXX: and vgone() ?)
 	 */
 	enum	vtype v_type:8;			/* u vnode type */
 	short	v_irflag;			/* i frequently read flags */
 	seqc_t	v_seqc;				/* i modification count */
 	uint32_t v_nchash;			/* u namecache hash */
 	u_int	v_hash;
 	struct	vop_vector *v_op;		/* u vnode operations vector */
 	void	*v_data;			/* u private data for fs */
 
 	/*
 	 * Filesystem instance stuff
 	 */
 	struct	mount *v_mount;			/* u ptr to vfs we are in */
 	TAILQ_ENTRY(vnode) v_nmntvnodes;	/* m vnodes for mount point */
 
 	/*
 	 * Type specific fields, only one applies to any given vnode.
 	 */
 	union {
 		struct mount	*v_mountedhere;	/* v ptr to mountpoint (VDIR) */
 		struct unpcb	*v_unpcb;	/* v unix domain net (VSOCK) */
 		struct cdev	*v_rdev; 	/* v device (VCHR, VBLK) */
 		struct fifoinfo	*v_fifoinfo;	/* v fifo (VFIFO) */
 	};
 
 	/*
 	 * vfs_hash: (mount + inode) -> vnode hash.  The hash value
 	 * itself is grouped with other int fields, to avoid padding.
 	 */
 	LIST_ENTRY(vnode)	v_hashlist;
 
 	/*
 	 * VFS_namecache stuff
 	 */
 	LIST_HEAD(, namecache) v_cache_src;	/* c Cache entries from us */
 	TAILQ_HEAD(, namecache) v_cache_dst;	/* c Cache entries to us */
 	struct namecache *v_cache_dd;		/* c Cache entry for .. vnode */
 
 	/*
 	 * Locking
 	 */
 	struct	lock v_lock;			/* u (if fs don't have one) */
 	struct	mtx v_interlock;		/* lock for "i" things */
 	struct	lock *v_vnlock;			/* u pointer to vnode lock */
 
 	/*
 	 * The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_vnodelist;		/* l vnode lists */
 	TAILQ_ENTRY(vnode) v_lazylist;		/* l vnode lazy list */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
 	 * Hooks for various subsystems and features.
 	 */
 	struct vpollinfo *v_pollinfo;		/* i Poll events, p for *v_pi */
 	struct label *v_label;			/* MAC label for vnode */
 	struct lockf *v_lockf;		/* Byte-level advisory lock list */
 	struct rangelock v_rl;			/* Byte-range lock */
 
 	u_int	v_holdcnt;			/* I prevents recycling. */
 	u_int	v_usecount;			/* I ref count of users */
 	u_short	v_iflag;			/* i vnode flags (see below) */
 	u_short	v_vflag;			/* v vnode flags */
 	u_short	v_mflag;			/* l mnt-specific vnode flags */
 	short	v_dbatchcpu;			/* i LRU requeue deferral batch */
 	int	v_writecount;			/* I ref count of writers or
 						   (negative) text users */
 	int	v_seqc_users;			/* i modifications pending */
 };
 
 #ifndef DEBUG_LOCKS
 #ifdef _LP64
 /*
  * Not crossing 448 bytes fits 9 vnodes per page. If you have to add fields
  * to the structure and there is nothing which can be done to prevent growth
  * then so be it. But don't grow it without a good reason.
  */
 _Static_assert(sizeof(struct vnode) <= 448, "vnode size crosses 448 bytes");
 #endif
 #endif
 
 #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */
 
 #define	bo2vnode(bo)	__containerof((bo), struct vnode, v_bufobj)
 
 /* XXX: These are temporary to avoid a source sweep at this time */
 #define v_object	v_bufobj.bo_object
 
 /*
  * Userland version of struct vnode, for sysctl.
  */
 struct xvnode {
 	size_t	xv_size;			/* sizeof(struct xvnode) */
 	void	*xv_vnode;			/* address of real vnode */
 	u_long	xv_flag;			/* vnode vflags */
 	int	xv_usecount;			/* reference count of users */
 	int	xv_writecount;			/* reference count of writers */
 	int	xv_holdcnt;			/* page & buffer references */
 	u_long	xv_id;				/* capability identifier */
 	void	*xv_mount;			/* address of parent mount */
 	long	xv_numoutput;			/* num of writes in progress */
 	enum	vtype xv_type;			/* vnode type */
 	union {
 		void	*xvu_socket;		/* unpcb, if VSOCK */
 		void	*xvu_fifo;		/* fifo, if VFIFO */
 		dev_t	xvu_rdev;		/* maj/min, if VBLK/VCHR */
 		struct {
 			dev_t	xvu_dev;	/* device, if VDIR/VREG/VLNK */
 			ino_t	xvu_ino;	/* id, if VDIR/VREG/VLNK */
 		} xv_uns;
 	} xv_un;
 };
 #define xv_socket	xv_un.xvu_socket
 #define xv_fifo		xv_un.xvu_fifo
 #define xv_rdev		xv_un.xvu_rdev
 #define xv_dev		xv_un.xv_uns.xvu_dev
 #define xv_ino		xv_un.xv_uns.xvu_ino
 
 /* We don't need to lock the knlist */
 #define	VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL ||	\
 	    KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note))
 
 #define VN_KNOTE(vp, b, a)					\
 	do {							\
 		if (!VN_KNLIST_EMPTY(vp))			\
 			KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \
 			    (a) | KNF_NOKQLOCK);		\
 	} while (0)
 #define	VN_KNOTE_LOCKED(vp, b)		VN_KNOTE(vp, b, KNF_LISTLOCKED)
 #define	VN_KNOTE_UNLOCKED(vp, b)	VN_KNOTE(vp, b, 0)
 
 /*
  * Vnode flags.
  *	VI flags are protected by interlock and live in v_iflag
  *	VV flags are protected by the vnode lock and live in v_vflag
  *
  *	VIRF_DOOMED is doubly protected by the interlock and vnode lock.  Both
  *	are required for writing but the status may be checked with either.
  */
 #define	VHOLD_NO_SMR	(1<<29)	/* Disable vhold_smr */
 #define VHOLD_ALL_FLAGS (VHOLD_NO_SMR)
 
 #define	VIRF_DOOMED	0x0001	/* This vnode is being recycled */
 #define	VIRF_PGREAD	0x0002	/* Direct reads from the page cache are permitted,
 				   never cleared once set */
 #define	VIRF_MOUNTPOINT	0x0004	/* This vnode is mounted on */
 
 #define	VI_TEXT_REF	0x0001	/* Text ref grabbed use ref */
 #define	VI_MOUNT	0x0002	/* Mount in progress */
 #define	VI_DOINGINACT	0x0004	/* VOP_INACTIVE is in progress */
 #define	VI_OWEINACT	0x0008	/* Need to call inactive */
 #define	VI_DEFINACT	0x0010	/* deferred inactive */
 #define	VI_FOPENING	0x0020	/* In open, with opening process having the
 				   first right to advlock file */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
 #define	VV_NOSYNC	0x0004	/* unlinked, stop syncing */
 #define	VV_ETERNALDEV	0x0008	/* device that is never destroyed */
 #define	VV_CACHEDLABEL	0x0010	/* Vnode has valid cached MAC label */
 #define	VV_VMSIZEVNLOCK	0x0020	/* object size check requires vnode lock */
 #define	VV_COPYONWRITE	0x0040	/* vnode is doing copy-on-write */
 #define	VV_SYSTEM	0x0080	/* vnode being used by kernel */
 #define	VV_PROCDEP	0x0100	/* vnode is process dependent */
 #define	VV_NOKNOTE	0x0200	/* don't activate knotes on this vnode */
 #define	VV_DELETED	0x0400	/* should be removed */
 #define	VV_MD		0x0800	/* vnode backs the md device */
 #define	VV_FORCEINSMQ	0x1000	/* force the insmntque to succeed */
 #define	VV_READLINK	0x2000	/* fdescfs linux vnode */
 #define	VV_UNREF	0x4000	/* vunref, do not drop lock in inactive() */
 
 #define	VMP_LAZYLIST	0x0001	/* Vnode is on mnt's lazy list */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
  * is unavailable (getattr) or which is not to be changed (setattr).
  */
 struct vattr {
 	enum vtype	va_type;	/* vnode type (for create) */
 	u_short		va_mode;	/* files access mode and type */
 	u_short		va_padding0;
 	uid_t		va_uid;		/* owner user id */
 	gid_t		va_gid;		/* owner group id */
 	nlink_t		va_nlink;	/* number of references to file */
 	dev_t		va_fsid;	/* filesystem id */
 	ino_t		va_fileid;	/* file id */
 	u_quad_t	va_size;	/* file size in bytes */
 	long		va_blocksize;	/* blocksize preferred for i/o */
 	struct timespec	va_atime;	/* time of last access */
 	struct timespec	va_mtime;	/* time of last modification */
 	struct timespec	va_ctime;	/* time file changed */
 	struct timespec	va_birthtime;	/* time file created */
 	u_long		va_gen;		/* generation number of file */
 	u_long		va_flags;	/* flags defined for file */
 	dev_t		va_rdev;	/* device the special file represents */
 	u_quad_t	va_bytes;	/* bytes of disk space held by file */
 	u_quad_t	va_filerev;	/* file modification number */
 	u_int		va_vaflags;	/* operations flags, see below */
 	long		va_spare;	/* remain quad aligned */
 };
 
 /*
  * Flags for va_vaflags.
  */
 #define	VA_UTIMES_NULL	0x01		/* utimes argument was NULL */
 #define	VA_EXCLUSIVE	0x02		/* exclusive create request */
 #define	VA_SYNC		0x04		/* O_SYNC truncation */
 
 /*
  * Flags for ioflag. (high 16 bits used to ask for read-ahead and
  * help with write clustering)
  * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h
  */
 #define	IO_UNIT		0x0001		/* do I/O as atomic unit */
 #define	IO_APPEND	0x0002		/* append write to end */
 #define	IO_NDELAY	0x0004		/* FNDELAY flag set in file table */
 #define	IO_NODELOCKED	0x0008		/* underlying node already locked */
 #define	IO_ASYNC	0x0010		/* bawrite rather then bdwrite */
 #define	IO_VMIO		0x0020		/* data already in VMIO space */
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
 #define	IO_NOREUSE	0x0200		/* VMIO data won't be reused */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
 #define	IO_BUFLOCKED	0x2000		/* ffs flag; indir buf is locked */
 #define	IO_RANGELOCKED	0x4000		/* range locked */
 #define	IO_DATASYNC	0x8000		/* do only data I/O synchronously */
 
 #define IO_SEQMAX	0x7F		/* seq heuristic max value */
 #define IO_SEQSHIFT	16		/* seq heuristic in upper 16 bits */
 
 /*
  * Flags for accmode_t.
  */
 #define	VEXEC			000000000100 /* execute/search permission */
 #define	VWRITE			000000000200 /* write permission */
 #define	VREAD			000000000400 /* read permission */
 #define	VADMIN			000000010000 /* being the file owner */
 #define	VAPPEND			000000040000 /* permission to write/append */
 /*
  * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
  * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
  * and 0 otherwise.  This never happens with ordinary unix access rights
  * or POSIX.1e ACLs.  Obviously, VEXPLICIT_DENY must be OR-ed with
  * some other V* constant.
  */
 #define	VEXPLICIT_DENY		000000100000
 #define	VREAD_NAMED_ATTRS 	000000200000 /* not used */
 #define	VWRITE_NAMED_ATTRS 	000000400000 /* not used */
 #define	VDELETE_CHILD	 	000001000000
 #define	VREAD_ATTRIBUTES 	000002000000 /* permission to stat(2) */
 #define	VWRITE_ATTRIBUTES 	000004000000 /* change {m,c,a}time */
 #define	VDELETE		 	000010000000
 #define	VREAD_ACL	 	000020000000 /* read ACL and file mode */
 #define	VWRITE_ACL	 	000040000000 /* change ACL and/or file mode */
 #define	VWRITE_OWNER	 	000100000000 /* change file owner */
 #define	VSYNCHRONIZE	 	000200000000 /* not used */
 #define	VCREAT			000400000000 /* creating new file */
 #define	VVERIFY			001000000000 /* verification required */
 
 /*
  * Permissions that were traditionally granted only to the file owner.
  */
 #define VADMIN_PERMS	(VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
     VWRITE_OWNER)
 
 /*
  * Permissions that were traditionally granted to everyone.
  */
 #define VSTAT_PERMS	(VREAD_ATTRIBUTES | VREAD_ACL)
 
 /*
  * Permissions that allow to change the state of the file in any way.
  */
 #define VMODIFY_PERMS	(VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
     VDELETE)
 
 /*
  * Token indicating no attribute value yet assigned.
  */
 #define	VNOVAL	(-1)
 
 /*
  * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon)
  */
 #define VLKTIMEOUT	(hz / 20 + 1)
 
 #ifdef _KERNEL
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_VNODE);
 #endif
 
 extern u_int ncsizefactor;
 extern const u_int io_hold_cnt;
 
 /*
  * Convert between vnode types and inode formats (since POSIX.1
  * defines mode word of stat structure in terms of inode formats).
  */
 extern enum vtype	iftovt_tab[];
 extern int		vttoif_tab[];
 #define	IFTOVT(mode)	(iftovt_tab[((mode) & S_IFMT) >> 12])
 #define	VTTOIF(indx)	(vttoif_tab[(int)(indx)])
 #define	MAKEIMODE(indx, mode)	(int)(VTTOIF(indx) | (mode))
 
 /*
  * Flags to various vnode functions.
  */
 #define	SKIPSYSTEM	0x0001	/* vflush: skip vnodes marked VSYSTEM */
 #define	FORCECLOSE	0x0002	/* vflush: force file closure */
 #define	WRITECLOSE	0x0004	/* vflush: only close writable files */
 #define	EARLYFLUSH	0x0008	/* vflush: early call for ffs_flushfiles */
 #define	V_SAVE		0x0001	/* vinvalbuf: sync file first */
 #define	V_ALT		0x0002	/* vinvalbuf: invalidate only alternate bufs */
 #define	V_NORMAL	0x0004	/* vinvalbuf: invalidate only regular bufs */
 #define	V_CLEANONLY	0x0008	/* vinvalbuf: invalidate only clean bufs */
 #define	V_VMIO		0x0010	/* vinvalbuf: called during pageout */
 #define	V_ALLOWCLEAN	0x0020	/* vinvalbuf: allow clean buffers after flush */
 #define	REVOKEALL	0x0001	/* vop_revoke: revoke all aliases */
 #define	V_WAIT		0x0001	/* vn_start_write: sleep for suspend */
 #define	V_NOWAIT	0x0002	/* vn_start_write: don't sleep for suspend */
 #define	V_XSLEEP	0x0004	/* vn_start_write: just return after sleep */
 #define	V_MNTREF	0x0010	/* vn_start_write: mp is already ref-ed */
 
 #define	VR_START_WRITE	0x0001	/* vfs_write_resume: start write atomically */
 #define	VR_NO_SUSPCLR	0x0002	/* vfs_write_resume: do not clear suspension */
 
 #define	VS_SKIP_UNMOUNT	0x0001	/* vfs_write_suspend: fail if the
 				   filesystem is being unmounted */
 
 #define	VREF(vp)	vref(vp)
 
 #ifdef DIAGNOSTIC
 #define	VATTR_NULL(vap)	vattr_null(vap)
 #else
 #define	VATTR_NULL(vap)	(*(vap) = va_null)	/* initialize a vattr */
 #endif /* DIAGNOSTIC */
 
 #define	NULLVP	((struct vnode *)NULL)
 
 /*
  * Global vnode data.
  */
 extern	struct vnode *rootvnode;	/* root (i.e. "/") vnode */
 extern	struct mount *rootdevmp;	/* "/dev" mount */
 extern	u_long desiredvnodes;		/* number of vnodes desired */
 extern	struct uma_zone *namei_zone;
 extern	struct vattr va_null;		/* predefined null vattr structure */
 
 extern	u_int vn_lock_pair_pause_max;
 
 #define	VI_LOCK(vp)	mtx_lock(&(vp)->v_interlock)
 #define	VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags))
 #define	VI_TRYLOCK(vp)	mtx_trylock(&(vp)->v_interlock)
 #define	VI_UNLOCK(vp)	mtx_unlock(&(vp)->v_interlock)
 #define	VI_MTX(vp)	(&(vp)->v_interlock)
 
 #define	VN_LOCK_AREC(vp)	lockallowrecurse((vp)->v_vnlock)
 #define	VN_LOCK_ASHARE(vp)	lockallowshare((vp)->v_vnlock)
 #define	VN_LOCK_DSHARE(vp)	lockdisableshare((vp)->v_vnlock)
 
 #endif /* _KERNEL */
 
 /*
  * Mods for extensibility.
  */
 
 /*
  * Flags for vdesc_flags:
  */
 #define	VDESC_MAX_VPS		16
 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
 #define	VDESC_VP0_WILLRELE	0x0001
 #define	VDESC_VP1_WILLRELE	0x0002
 #define	VDESC_VP2_WILLRELE	0x0004
 #define	VDESC_VP3_WILLRELE	0x0008
 
 /*
  * A generic structure.
  * This can be used by bypass routines to identify generic arguments.
  */
 struct vop_generic_args {
 	struct vnodeop_desc *a_desc;
 	/* other random data follows, presumably */
 };
 
 typedef int vop_bypass_t(struct vop_generic_args *);
 
 /*
  * VDESC_NO_OFFSET is used to identify the end of the offset list
  * and in places where no such field exists.
  */
 #define VDESC_NO_OFFSET -1
 
 /*
  * This structure describes the vnode operation taking place.
  */
 struct vnodeop_desc {
 	char	*vdesc_name;		/* a readable name for debugging */
 	int	 vdesc_flags;		/* VDESC_* flags */
 	int	vdesc_vop_offset;
 	vop_bypass_t	*vdesc_call;	/* Function to call */
 
 	/*
 	 * These ops are used by bypass routines to map and locate arguments.
 	 * Creds and procs are not needed in bypass routines, but sometimes
 	 * they are useful to (for example) transport layers.
 	 * Nameidata is useful because it has a cred in it.
 	 */
 	int	*vdesc_vp_offsets;	/* list ended by VDESC_NO_OFFSET */
 	int	vdesc_vpp_offset;	/* return vpp location */
 	int	vdesc_cred_offset;	/* cred location, if any */
 	int	vdesc_thread_offset;	/* thread location, if any */
 	int	vdesc_componentname_offset; /* if any */
 };
 
 #ifdef _KERNEL
 /*
  * A list of all the operation descs.
  */
 extern struct vnodeop_desc *vnodeop_descs[];
 
 #define	VOPARG_OFFSETOF(s_type, field)	__offsetof(s_type, field)
 #define	VOPARG_OFFSETTO(s_type, s_offset, struct_p) \
     ((s_type)(((char*)(struct_p)) + (s_offset)))
 
 #ifdef DEBUG_VFS_LOCKS
 /*
  * Support code to aid in debugging VFS locking problems.  Not totally
  * reliable since if the thread sleeps between changing the lock
  * state and checking it with the assert, some other thread could
  * change the state.  They are good enough for debugging a single
  * filesystem using a single-threaded test.  Note that the unreliability is
  * limited to false negatives; efforts were made to ensure that false
  * positives cannot occur.
  */
 void	assert_vi_locked(struct vnode *vp, const char *str);
 void	assert_vi_unlocked(struct vnode *vp, const char *str);
 void	assert_vop_elocked(struct vnode *vp, const char *str);
 void	assert_vop_locked(struct vnode *vp, const char *str);
 void	assert_vop_unlocked(struct vnode *vp, const char *str);
 
 #define	ASSERT_VI_LOCKED(vp, str)	assert_vi_locked((vp), (str))
 #define	ASSERT_VI_UNLOCKED(vp, str)	assert_vi_unlocked((vp), (str))
 #define	ASSERT_VOP_ELOCKED(vp, str)	assert_vop_elocked((vp), (str))
 #define	ASSERT_VOP_LOCKED(vp, str)	assert_vop_locked((vp), (str))
 #define	ASSERT_VOP_UNLOCKED(vp, str)	assert_vop_unlocked((vp), (str))
 
 #define ASSERT_VOP_IN_SEQC(vp)	do {				\
 	struct vnode *_vp = (vp);				\
 								\
 	VNPASS(seqc_in_modify(_vp->v_seqc), _vp);		\
 } while (0)
 
 #define ASSERT_VOP_NOT_IN_SEQC(vp)	do {			\
 	struct vnode *_vp = (vp);				\
 								\
 	VNPASS(!seqc_in_modify(_vp->v_seqc), _vp);		\
 } while (0)
 
 #else /* !DEBUG_VFS_LOCKS */
 
 #define	ASSERT_VI_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VI_UNLOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_ELOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_LOCKED(vp, str)	((void)0)
 #define	ASSERT_VOP_UNLOCKED(vp, str)	((void)0)
 
 #define ASSERT_VOP_IN_SEQC(vp)		((void)0)
 #define ASSERT_VOP_NOT_IN_SEQC(vp)	((void)0)
 
 #endif /* DEBUG_VFS_LOCKS */
 
 /*
  * This call works for vnodes in the kernel.
  */
 #define VCALL(c) ((c)->a_desc->vdesc_call(c))
 
 #define DOINGASYNC(vp)	   					\
 	(((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 &&	\
 	 ((curthread->td_pflags & TDP_SYNCIO) == 0))
 
 /*
  * VMIO support inline
  */
 
 extern int vmiodirenable;
 
 static __inline int
 vn_canvmio(struct vnode *vp)
 {
       if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR)))
 		return(TRUE);
 	return(FALSE);
 }
 
 /*
  * Finally, include the default set of vnode operations.
  */
 typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int);
 #include "vnode_if.h"
 
 /* vn_open_flags */
 #define	VN_OPEN_NOAUDIT		0x00000001
 #define	VN_OPEN_NOCAPCHECK	0x00000002
 #define	VN_OPEN_NAMECACHE	0x00000004
 #define	VN_OPEN_INVFS		0x00000008
 
 /* copy_file_range kernel flags */
 #define	COPY_FILE_RANGE_KFLAGS		0xff000000
 #define	COPY_FILE_RANGE_TIMEO1SEC	0x01000000	/* Return after 1sec. */
 
 /*
  * Public vnode manipulation functions.
  */
 struct componentname;
 struct file;
 struct mount;
 struct nameidata;
 struct ostat;
 struct freebsd11_stat;
 struct thread;
 struct proc;
 struct stat;
 struct nstat;
 struct ucred;
 struct uio;
 struct vattr;
 struct vfsops;
 struct vnode;
 
 typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
 
 int	bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn,
 	    daddr_t endn);
 /* cache_* may belong in namei.h. */
 void	cache_changesize(u_long newhashsize);
 
 #define	VFS_CACHE_DROPOLD	0x1
 
 void	cache_enter_time_flags(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp, struct timespec *tsp,
 	    struct timespec *dtsp, int flags);
 #define	cache_enter(dvp, vp, cnp)					\
 	cache_enter_time(dvp, vp, cnp, NULL, NULL)
 void	cache_enter_time(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp, struct timespec *tsp,
 	    struct timespec *dtsp);
 int	cache_lookup(struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp, struct timespec *tsp, int *ticksp);
 void	cache_vnode_init(struct vnode *vp);
 void	cache_purge(struct vnode *vp);
 void	cache_purge_vgone(struct vnode *vp);
 void	cache_purge_negative(struct vnode *vp);
 void	cache_purgevfs(struct mount *mp);
 char	*cache_symlink_alloc(size_t size, int flags);
 void	cache_symlink_free(char *string, size_t size);
 int	cache_symlink_resolve(struct cache_fpl *fpl, const char *string,
 	    size_t len);
 void	cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp);
 void	cache_vop_rmdir(struct vnode *dvp, struct vnode *vp);
 #ifdef INVARIANTS
 void	cache_validate(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp);
 #else
 static inline void
 cache_validate(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 }
 #endif
 void	cache_fast_lookup_enabled_recalc(void);
 int	change_dir(struct vnode *vp, struct thread *td);
 void	cvtstat(struct stat *st, struct ostat *ost);
 void	freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
 int	freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost);
 int	getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
 	    struct vnode **vpp);
 void	getnewvnode_reserve(void);
 void	getnewvnode_drop_reserve(void);
 int	insmntque1(struct vnode *vp, struct mount *mp,
 	    void (*dtr)(struct vnode *, void *), void *dtr_arg);
 int	insmntque(struct vnode *vp, struct mount *mp);
 u_quad_t init_va_filerev(void);
 int	speedup_syncer(void);
 int	vn_vptocnp(struct vnode **vp, char *buf, size_t *buflen);
 int	vn_getcwd(char *buf, char **retbuf, size_t *buflen);
 int	vn_fullpath(struct vnode *vp, char **retbuf, char **freebuf);
 int	vn_fullpath_global(struct vnode *vp, char **retbuf, char **freebuf);
 struct vnode *
 	vn_dir_dd_ino(struct vnode *vp);
 int	vn_commname(struct vnode *vn, char *buf, u_int buflen);
 int	vn_path_to_global_path(struct thread *td, struct vnode *vp,
 	    char *path, u_int pathlen);
 int	vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
 	    gid_t file_gid, accmode_t accmode, struct ucred *cred);
 int	vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid,
 	    struct ucred *cred);
 int	vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid,
 	    struct acl *aclp, accmode_t accmode, struct ucred *cred);
 int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    gid_t file_gid, struct acl *acl, accmode_t accmode,
 	    struct ucred *cred);
 void	vattr_null(struct vattr *vap);
 void	vlazy(struct vnode *);
 void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int flags);
 enum vgetstate	vget_prep_smr(struct vnode *vp);
 enum vgetstate	vget_prep(struct vnode *vp);
 int	vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
 void	vget_finish_ref(struct vnode *vp, enum vgetstate vs);
 void	vget_abort(struct vnode *vp, enum vgetstate vs);
 void	vgone(struct vnode *vp);
 void	vhold(struct vnode *);
 void	vholdnz(struct vnode *);
 bool	vhold_smr(struct vnode *);
 int	vinactive(struct vnode *vp);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, off_t length, int blksize);
 void	v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
 	    int blksize);
 void	vunref(struct vnode *);
 void	vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3);
 int	vrecycle(struct vnode *vp);
 int	vrecyclel(struct vnode *vp);
 int	vn_bmap_seekhole_locked(struct vnode *vp, u_long cmd, off_t *off,
 	    struct ucred *cred);
 int	vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off,
 	    struct ucred *cred);
 int	vn_close(struct vnode *vp,
 	    int flags, struct ucred *file_cred, struct thread *td);
 int	vn_copy_file_range(struct vnode *invp, off_t *inoffp,
 	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
 	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
 	    struct thread *fsize_td);
 int	vn_deallocate(struct vnode *vp, off_t *offset, off_t *length, int flags,
 	    int ioflg, struct ucred *active_cred, struct ucred *file_cred);
 void	vn_finished_write(struct mount *mp);
 void	vn_finished_secondary_write(struct mount *mp);
 int	vn_fsync_buf(struct vnode *vp, int waitfor);
 int	vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp,
 	    struct vnode *outvp, off_t *outoffp, size_t *lenp,
 	    unsigned int flags, struct ucred *incred, struct ucred *outcred,
 	    struct thread *fsize_td);
 int	vn_need_pageq_flush(struct vnode *vp);
 bool	vn_isdisk_error(struct vnode *vp, int *errp);
 bool	vn_isdisk(struct vnode *vp);
 int	_vn_lock(struct vnode *vp, int flags, const char *file, int line);
 #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__)
 void	vn_lock_pair(struct vnode *vp1, bool vp1_locked, struct vnode *vp2,
 	    bool vp2_locked);
 int	vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp);
 int	vn_open_cred(struct nameidata *ndp, int *flagp, int cmode,
 	    u_int vn_open_flags, struct ucred *cred, struct file *fp);
 int	vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 	    struct thread *td, struct file *fp);
 void	vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end);
 int	vn_pollrecord(struct vnode *vp, struct thread *p, int events);
 int	vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base,
 	    int len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid,
 	    struct thread *td);
 int	vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base,
 	    size_t len, off_t offset, enum uio_seg segflg, int ioflg,
 	    struct ucred *active_cred, struct ucred *file_cred, size_t *aresid,
 	    struct thread *td);
 int	vn_read_from_obj(struct vnode *vp, struct uio *uio);
 int	vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
 	    struct thread *td);
 int	vn_start_write(struct vnode *vp, struct mount **mpp, int flags);
 int	vn_start_secondary_write(struct vnode *vp, struct mount **mpp,
 	    int flags);
 int	vn_truncate_locked(struct vnode *vp, off_t length, bool sync,
 	    struct ucred *cred);
 int	vn_writechk(struct vnode *vp);
 int	vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int *buflen, char *buf, struct thread *td);
 int	vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, int buflen, char *buf, struct thread *td);
 int	vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
 	    const char *attrname, struct thread *td);
 int	vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags,
 	    struct vnode **rvp);
 int	vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc,
 	    void *alloc_arg, int lkflags, struct vnode **rvp);
 int	vn_utimes_perm(struct vnode *vp, struct vattr *vap,
 	    struct ucred *cred, struct thread *td);
 
 int	vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio);
 int	vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
 	    struct uio *uio);
 
 void	vn_seqc_write_begin_locked(struct vnode *vp);
 void	vn_seqc_write_begin(struct vnode *vp);
 void	vn_seqc_write_end_locked(struct vnode *vp);
 void	vn_seqc_write_end(struct vnode *vp);
 #define	vn_seqc_read_any(vp)		seqc_read_any(&(vp)->v_seqc)
 #define	vn_seqc_read_notmodify(vp)	seqc_read_notmodify(&(vp)->v_seqc)
 #define	vn_seqc_consistent(vp, seq)	seqc_consistent(&(vp)->v_seqc, seq)
 
 #define	vn_rangelock_unlock(vp, cookie)					\
 	rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp))
 #define	vn_rangelock_unlock_range(vp, cookie, start, end)		\
 	rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), 	\
 	    VI_MTX(vp))
 #define	vn_rangelock_rlock(vp, start, end)				\
 	rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_tryrlock(vp, start, end)				\
 	rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_wlock(vp, start, end)				\
 	rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 #define	vn_rangelock_trywlock(vp, start, end)				\
 	rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp))
 
 #define	vn_irflag_read(vp)	atomic_load_short(&(vp)->v_irflag)
 void	vn_irflag_set_locked(struct vnode *vp, short toset);
 void	vn_irflag_set(struct vnode *vp, short toset);
 void	vn_irflag_set_cond_locked(struct vnode *vp, short toset);
 void	vn_irflag_set_cond(struct vnode *vp, short toset);
 void	vn_irflag_unset_locked(struct vnode *vp, short tounset);
 void	vn_irflag_unset(struct vnode *vp, short tounset);
 
 int	vfs_cache_lookup(struct vop_lookup_args *ap);
 int	vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp);
 void	vfs_timestamp(struct timespec *);
 void	vfs_write_resume(struct mount *mp, int flags);
 int	vfs_write_suspend(struct mount *mp, int flags);
 int	vfs_write_suspend_umnt(struct mount *mp);
 struct vnode *vnlru_alloc_marker(void);
 void	vnlru_free_marker(struct vnode *);
 void	vnlru_free_vfsops(int, struct vfsops *, struct vnode *);
 int	vop_stdbmap(struct vop_bmap_args *);
 int	vop_stdfdatasync_buf(struct vop_fdatasync_args *);
 int	vop_stdfsync(struct vop_fsync_args *);
 int	vop_stdgetwritemount(struct vop_getwritemount_args *);
 int	vop_stdgetpages(struct vop_getpages_args *);
 int	vop_stdinactive(struct vop_inactive_args *);
 int	vop_stdioctl(struct vop_ioctl_args *);
 int	vop_stdneed_inactive(struct vop_need_inactive_args *);
 int	vop_stdkqfilter(struct vop_kqfilter_args *);
 int	vop_stdlock(struct vop_lock1_args *);
 int	vop_stdunlock(struct vop_unlock_args *);
 int	vop_stdislocked(struct vop_islocked_args *);
 int	vop_lock(struct vop_lock1_args *);
 int	vop_unlock(struct vop_unlock_args *);
 int	vop_islocked(struct vop_islocked_args *);
 int	vop_stdputpages(struct vop_putpages_args *);
 int	vop_nopoll(struct vop_poll_args *);
 int	vop_stdaccess(struct vop_access_args *ap);
 int	vop_stdaccessx(struct vop_accessx_args *ap);
 int	vop_stdadvise(struct vop_advise_args *ap);
 int	vop_stdadvlock(struct vop_advlock_args *ap);
 int	vop_stdadvlockasync(struct vop_advlockasync_args *ap);
 int	vop_stdadvlockpurge(struct vop_advlockpurge_args *ap);
 int	vop_stdallocate(struct vop_allocate_args *ap);
 int	vop_stddeallocate(struct vop_deallocate_args *ap);
 int	vop_stdset_text(struct vop_set_text_args *ap);
 int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_stdvptocnp(struct vop_vptocnp_args *ap);
 int	vop_stdvptofh(struct vop_vptofh_args *ap);
 int	vop_stdunp_bind(struct vop_unp_bind_args *ap);
 int	vop_stdunp_connect(struct vop_unp_connect_args *ap);
 int	vop_stdunp_detach(struct vop_unp_detach_args *ap);
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
 int	vop_enoent(struct vop_generic_args *ap);
 int	vop_enotty(struct vop_generic_args *ap);
 int	vop_eagain(struct vop_generic_args *ap);
 int	vop_null(struct vop_generic_args *ap);
 int	vop_panic(struct vop_generic_args *ap);
 int	dead_poll(struct vop_poll_args *ap);
 int	dead_read(struct vop_read_args *ap);
 int	dead_write(struct vop_write_args *ap);
 
 /* These are called from within the actual VOPS. */
 void	vop_close_post(void *a, int rc);
 void	vop_create_pre(void *a);
 void	vop_create_post(void *a, int rc);
 void	vop_whiteout_pre(void *a);
 void	vop_whiteout_post(void *a, int rc);
 void	vop_deleteextattr_pre(void *a);
 void	vop_deleteextattr_post(void *a, int rc);
 void	vop_link_pre(void *a);
 void	vop_link_post(void *a, int rc);
 void	vop_lookup_post(void *a, int rc);
 void	vop_lookup_pre(void *a);
 void	vop_mkdir_pre(void *a);
 void	vop_mkdir_post(void *a, int rc);
 void	vop_mknod_pre(void *a);
 void	vop_mknod_post(void *a, int rc);
 void	vop_open_post(void *a, int rc);
 void	vop_read_post(void *a, int rc);
 void	vop_read_pgcache_post(void *ap, int rc);
 void	vop_readdir_post(void *a, int rc);
 void	vop_reclaim_post(void *a, int rc);
 void	vop_remove_pre(void *a);
 void	vop_remove_post(void *a, int rc);
 void	vop_rename_post(void *a, int rc);
 void	vop_rename_pre(void *a);
 void	vop_rmdir_pre(void *a);
 void	vop_rmdir_post(void *a, int rc);
 void	vop_setattr_pre(void *a);
 void	vop_setattr_post(void *a, int rc);
 void	vop_setacl_pre(void *a);
 void	vop_setacl_post(void *a, int rc);
 void	vop_setextattr_pre(void *a);
 void	vop_setextattr_post(void *a, int rc);
 void	vop_symlink_pre(void *a);
 void	vop_symlink_post(void *a, int rc);
 int	vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);
 
 #ifdef DEBUG_VFS_LOCKS
 void	vop_fplookup_vexec_debugpre(void *a);
 void	vop_fplookup_vexec_debugpost(void *a, int rc);
 void	vop_fplookup_symlink_debugpre(void *a);
 void	vop_fplookup_symlink_debugpost(void *a, int rc);
 void	vop_strategy_debugpre(void *a);
 void	vop_lock_debugpre(void *a);
 void	vop_lock_debugpost(void *a, int rc);
 void	vop_unlock_debugpre(void *a);
 void	vop_need_inactive_debugpre(void *a);
 void	vop_need_inactive_debugpost(void *a, int rc);
 void	vop_mkdir_debugpost(void *a, int rc);
 #else
 #define	vop_fplookup_vexec_debugpre(x)		do { } while (0)
 #define	vop_fplookup_vexec_debugpost(x, y)	do { } while (0)
 #define	vop_fplookup_symlink_debugpre(x)	do { } while (0)
 #define	vop_fplookup_symlink_debugpost(x, y)	do { } while (0)
 #define	vop_strategy_debugpre(x)		do { } while (0)
 #define	vop_lock_debugpre(x)			do { } while (0)
 #define	vop_lock_debugpost(x, y)		do { } while (0)
 #define	vop_unlock_debugpre(x)			do { } while (0)
 #define	vop_need_inactive_debugpre(x)		do { } while (0)
 #define	vop_need_inactive_debugpost(x, y)	do { } while (0)
 #define	vop_mkdir_debugpost(x, y)		do { } while (0)
 #endif
 
 void	vop_rename_fail(struct vop_rename_args *ap);
 
 #define	vop_stat_helper_pre(ap)	({						\
 	int _error;								\
 	AUDIT_ARG_VNODE1(ap->a_vp);						\
 	_error = mac_vnode_check_stat(ap->a_active_cred, ap->a_file_cred, ap->a_vp);\
 	if (__predict_true(_error == 0))					\
 		bzero(ap->a_sb, sizeof(*ap->a_sb));				\
 	_error;									\
 })
 
 #define	vop_stat_helper_post(ap, error)	({					\
 	int _error = (error);							\
-	if (priv_check_cred_vfs_generation(ap->a_td->td_ucred))			\
+	if (priv_check_cred_vfs_generation(ap->a_active_cred))			\
 		ap->a_sb->st_gen = 0;						\
 	_error;									\
 })
 
 #define	VOP_WRITE_PRE(ap)						\
 	struct vattr va;						\
 	int error;							\
 	off_t osize, ooffset, noffset;					\
 									\
 	osize = ooffset = noffset = 0;					\
 	if (!VN_KNLIST_EMPTY((ap)->a_vp)) {				\
 		error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred);	\
 		if (error)						\
 			return (error);					\
 		ooffset = (ap)->a_uio->uio_offset;			\
 		osize = (off_t)va.va_size;				\
 	}
 
 #define VOP_WRITE_POST(ap, ret)						\
 	noffset = (ap)->a_uio->uio_offset;				\
 	if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) {	\
 		VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE			\
 		    | (noffset > osize ? NOTE_EXTEND : 0));		\
 	}
 
 #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__)
 
 #ifdef INVARIANTS
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)				\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_ADD_WRITECOUNT((vp), (cnt));			\
 	VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_SET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_SET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #define	VOP_UNSET_TEXT_CHECKED(vp)					\
 do {									\
 	int error_;							\
 									\
 	error_ = VOP_UNSET_TEXT((vp));					\
 	VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d",	\
 	    error_));							\
 } while (0)
 #else
 #define	VOP_ADD_WRITECOUNT_CHECKED(vp, cnt)	VOP_ADD_WRITECOUNT((vp), (cnt))
 #define	VOP_SET_TEXT_CHECKED(vp)		VOP_SET_TEXT((vp))
 #define	VOP_UNSET_TEXT_CHECKED(vp)		VOP_UNSET_TEXT((vp))
 #endif
 
 #define	VN_IS_DOOMED(vp)	__predict_false((vn_irflag_read(vp) & VIRF_DOOMED) != 0)
 
 void	vput(struct vnode *vp);
 void	vrele(struct vnode *vp);
 void	vref(struct vnode *vp);
 void	vrefact(struct vnode *vp);
 void 	v_addpollinfo(struct vnode *vp);
 static __inline int
 vrefcnt(struct vnode *vp)
 {
 
 	return (vp->v_usecount);
 }
 
 #define	vholdl(vp)	do {						\
 	ASSERT_VI_LOCKED(vp, __func__);					\
 	vhold(vp);							\
 } while (0)
 
 #define	vrefl(vp)	do {						\
 	ASSERT_VI_LOCKED(vp, __func__);					\
 	vref(vp);							\
 } while (0)
 
 int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td);
 void vnode_destroy_vobject(struct vnode *vp);
 
 extern struct vop_vector fifo_specops;
 extern struct vop_vector dead_vnodeops;
 extern struct vop_vector default_vnodeops;
 
 #define VOP_PANIC	((void*)(uintptr_t)vop_panic)
 #define VOP_NULL	((void*)(uintptr_t)vop_null)
 #define VOP_EBADF	((void*)(uintptr_t)vop_ebadf)
 #define VOP_ENOTTY	((void*)(uintptr_t)vop_enotty)
 #define VOP_EINVAL	((void*)(uintptr_t)vop_einval)
 #define VOP_ENOENT	((void*)(uintptr_t)vop_enoent)
 #define VOP_EOPNOTSUPP	((void*)(uintptr_t)vop_eopnotsupp)
 #define VOP_EAGAIN	((void*)(uintptr_t)vop_eagain)
 
 /* fifo_vnops.c */
 int	fifo_printinfo(struct vnode *);
 
 /* vfs_hash.c */
 typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg);
 
 void vfs_hash_changesize(u_long newhashsize);
 int vfs_hash_get(const struct mount *mp, u_int hash, int flags,
     struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 u_int vfs_hash_index(struct vnode *vp);
 int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
     struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg);
 void vfs_hash_rehash(struct vnode *vp, u_int hash);
 void vfs_hash_remove(struct vnode *vp);
 
 int vfs_kqfilter(struct vop_kqfilter_args *);
 struct dirent;
 int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off);
 int vfs_emptydir(struct vnode *vp);
 
 int vfs_unixify_accmode(accmode_t *accmode);
 
 void vfs_unp_reclaim(struct vnode *vp);
 
 int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode);
 int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid,
     gid_t gid);
 int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td);
 int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td);
 
 void vn_fsid(struct vnode *vp, struct vattr *va);
 
 int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp);
 int vn_lktype_write(struct mount *mp, struct vnode *vp);
 
 #define VOP_UNLOCK_FLAGS(vp, flags)	({				\
 	struct vnode *_vp = (vp);					\
 	int _flags = (flags);						\
 	int _error;							\
 									\
         if ((_flags & ~(LK_INTERLOCK | LK_RELEASE)) != 0)		\
                 panic("%s: unsupported flags %x\n", __func__, flags);	\
         _error = VOP_UNLOCK(_vp);					\
         if (_flags & LK_INTERLOCK)					\
                 VI_UNLOCK(_vp);						\
         _error;								\
 })
 
 #include <sys/kernel.h>
 
 #define VFS_VOP_VECTOR_REGISTER(vnodeops) \
 	SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \
 	    vfs_vector_op_register, &vnodeops)
 
 #define VFS_SMR_DECLARE				\
 	extern smr_t vfs_smr
 
 #define VFS_SMR()	vfs_smr
 #define vfs_smr_enter()	smr_enter(VFS_SMR())
 #define vfs_smr_exit()	smr_exit(VFS_SMR())
 #define vfs_smr_synchronize()	smr_synchronize(VFS_SMR())
 #define vfs_smr_entered_load(ptr)	smr_entered_load((ptr), VFS_SMR())
 #define VFS_SMR_ASSERT_ENTERED()	SMR_ASSERT_ENTERED(VFS_SMR())
 #define VFS_SMR_ASSERT_NOT_ENTERED()	SMR_ASSERT_NOT_ENTERED(VFS_SMR())
 #define VFS_SMR_ZONE_SET(zone)	uma_zone_set_smr((zone), VFS_SMR())
 
 #define vn_load_v_data_smr(vp)	({		\
 	struct vnode *_vp = (vp);		\
 						\
 	VFS_SMR_ASSERT_ENTERED();		\
 	atomic_load_consume_ptr(&(_vp)->v_data);\
 })
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_VNODE_H_ */