diff --git a/lib/libc/sys/jail.2 b/lib/libc/sys/jail.2
index 2e13a6c3a381..82c2e97d4a7b 100644
--- a/lib/libc/sys/jail.2
+++ b/lib/libc/sys/jail.2
@@ -1,411 +1,414 @@
 .\" Copyright (c) 1999 Poul-Henning Kamp.
 .\" Copyright (c) 2009 James Gritton.
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd February 8, 2012
+.Dd February 19, 2021
 .Dt JAIL 2
 .Os
 .Sh NAME
 .Nm jail ,
 .Nm jail_get ,
 .Nm jail_set ,
 .Nm jail_remove ,
 .Nm jail_attach
 .Nd create and manage system jails
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In sys/param.h
 .In sys/jail.h
 .Ft int
 .Fn jail "struct jail *jail"
 .Ft int
 .Fn jail_attach "int jid"
 .Ft int
 .Fn jail_remove "int jid"
 .In sys/uio.h
 .Ft int
 .Fn jail_get "struct iovec *iov" "u_int niov" "int flags"
 .Ft int
 .Fn jail_set "struct iovec *iov" "u_int niov" "int flags"
 .Sh DESCRIPTION
 The
 .Fn jail
 system call sets up a jail and locks the current process in it.
 .Pp
 The argument is a pointer to a structure describing the prison:
 .Bd -literal -offset indent
 struct jail {
 	uint32_t	version;
 	char		*path;
 	char		*hostname;
 	char		*jailname;
 	unsigned int	ip4s;
 	unsigned int	ip6s;
 	struct in_addr	*ip4;
 	struct in6_addr	*ip6;
 };
 .Ed
 .Pp
 .Dq Li version
 defines the version of the API in use.
 .Dv JAIL_API_VERSION
 is defined for the current version.
 .Pp
 The
 .Dq Li path
 pointer should be set to the directory which is to be the root of the
 prison.
 .Pp
 The
 .Dq Li hostname
 pointer can be set to the hostname of the prison.
 This can be changed
 from the inside of the prison.
 .Pp
 The
 .Dq Li jailname
 pointer is an optional name that can be assigned to the jail
 for example for management purposes.
 .Pp
 The
 .Dq Li ip4s
 and
 .Dq Li ip6s
 give the numbers of IPv4 and IPv6 addresses that will be passed
 via their respective pointers.
 .Pp
 The
 .Dq Li ip4
 and
 .Dq Li ip6
 pointers can be set to arrays of IPv4 and IPv6 addresses to be assigned to
 the prison, or NULL if none.
 IPv4 addresses must be in network byte order.
 .Pp
 This is equivalent to, and deprecated in favor of, the
 .Fn jail_set
 system call (see below), with the parameters
 .Va path ,
 .Va host.hostname ,
 .Va name ,
 .Va ip4.addr ,
 and
 .Va ip6.addr ,
 and with the
 .Dv JAIL_ATTACH
 flag.
 .Pp
 The
 .Fn jail_set
 system call creates a new jail, or modifies an existing one, and optionally
 locks the current process in it.
 Jail parameters are passed as an array of name-value pairs in the array
 .Fa iov ,
 containing
 .Fa niov
 elements.
 Each parameter name is a null-terminated string, and values may be strings,
 integers, or other arbitrary data.
 Some parameters are boolean, and do not have a value (their length is zero)
 but are set by the name alone with or without a
 .Dq no
 prefix, e.g.
 .Va persist
 or
 .Va nopersist .
 Any parameters not set will be given default values, generally based on
 the current environment.
 .Pp
 Jails have a set of core parameters, and modules can add their own jail
 parameters.
 The current set of available parameters, and their formats, can be
 retrieved via the
 .Va security.jail.param
 sysctl MIB entry.
 Notable parameters include those mentioned in the
 .Fn jail
 description above, as well as
 .Va jid
 and
 .Va name ,
 which identify the jail being created or modified.
 See
 .Xr jail 8
 for more information on the core jail parameters.
 .Pp
 The
 .Fa flags
 argument consists of one or more of the following flags:
 .Bl -tag -width indent
 .It Dv JAIL_CREATE
 Create a new jail.
 If a
 .Va jid
 or
 .Va name
 parameter exists, it must not refer to an existing jail.
 .It Dv JAIL_UPDATE
 Modify an existing jail.
 One of the
 .Va jid
 or
 .Va name
 parameters must exist, and must refer to an existing jail.
 If both
 .Dv JAIL_CREATE
 and
 .Dv JAIL_UPDATE
 are set, a jail will be created if it does not yet exist, and modified if it
 does exist.
 .It Dv JAIL_ATTACH
 In addition to creating or modifying the jail, attach the current process to
 it, as with the
 .Fn jail_attach
 system call.
 .It Dv JAIL_DYING
 Allow setting a jail that is in the process of being removed.
 .El
 .Pp
 The
 .Fn jail_get
 system call retrieves jail parameters, using the same name-value list as
 .Fn jail_set
 in the
 .Fa iov
 and
 .Fa niov
 arguments.
 The jail to read can be specified by either
 .Va jid
 or
 .Va name
 by including those parameters in the list.
 If they are included but are not intended to be the search key, they
 should be cleared (zero and the empty string respectively).
 .Pp
 The special parameter
 .Va lastjid
 can be used to retrieve a list of all jails.
 It will fetch the jail with the lowest jid greater than the passed value.
 The first jail (usually but not always jid 1) can be found by passing a
 .Va lastjid
 of zero.
 .Pp
 The
 .Fa flags
 argument consists of one or more of the following flags:
 .Bl -tag -width indent
 .It Dv JAIL_DYING
 Allow getting a jail that is in the process of being removed.
 .El
 .Pp
 The
 .Fn jail_attach
 system call attaches the current process to an existing jail,
 identified by
 .Fa jid .
+It changes the process's root and current directories to the jail's
+.Va path
+directory.
 .Pp
 The
 .Fn jail_remove
 system call removes the jail identified by
 .Fa jid .
 It will kill all processes belonging to the jail, and remove any children
 of that jail.
 .Sh RETURN VALUES
 If successful,
 .Fn jail ,
 .Fn jail_set ,
 and
 .Fn jail_get
 return a non-negative integer, termed the jail identifier (JID).
 They return \-1 on failure, and set
 .Va errno
 to indicate the error.
 .Pp
 .Rv -std jail_attach jail_remove
 .Sh ERRORS
 The
 .Fn jail
 system call
 will fail if:
 .Bl -tag -width Er
 .It Bq Er EPERM
 This process is not allowed to create a jail, either because it is not
 the super-user, or because it would exceed the jail's
 .Va children.max
 limit.
 .It Bq Er EFAULT
 .Fa jail
 points to an address outside the allocated address space of the process.
 .It Bq Er EINVAL
 The version number of the argument is not correct.
 .It Bq Er EAGAIN
 No free JID could be found.
 .El
 .Pp
 The
 .Fn jail_set
 system call
 will fail if:
 .Bl -tag -width Er
 .It Bq Er EPERM
 This process is not allowed to create a jail, either because it is not
 the super-user, or because it would exceed the jail's
 .Va children.max
 limit.
 .It Bq Er EPERM
 A jail parameter was set to a less restrictive value than the current
 environment.
 .It Bq Er EFAULT
 .Fa iov ,
 or one of the addresses contained within it,
 points to an address outside the allocated address space of the process.
 .It Bq Er ENOENT
 The jail referred to by a
 .Va jid
 or
 .Va name
 parameter does not exist, and the
 .Dv JAIL_CREATE
 flag is not set.
 .It Bq Er ENOENT
 The jail referred to by a
 .Va jid
 is not accessible by the process, because the process is in a different
 jail.
 .It Bq Er EEXIST
 The jail referred to by a
 .Va jid
 or
 .Va name
 parameter exists, and the
 .Dv JAIL_UPDATE
 flag is not set.
 .It Bq Er EINVAL
 A supplied parameter is the wrong size.
 .It Bq Er EINVAL
 A supplied parameter is out of range.
 .It Bq Er EINVAL
 A supplied string parameter is not null-terminated.
 .It Bq Er EINVAL
 A supplied parameter name does not match any known parameters.
 .It Bq Er EINVAL
 One of the
 .Dv JAIL_CREATE
 or
 .Dv JAIL_UPDATE
 flags is not set.
 .It Bq Er ENAMETOOLONG
 A supplied string parameter is longer than allowed.
 .It Bq Er EAGAIN
 There are no jail IDs left.
 .El
 .Pp
 The
 .Fn jail_get
 system call
 will fail if:
 .Bl -tag -width Er
 .It Bq Er EFAULT
 .Fa iov ,
 or one of the addresses contained within it,
 points to an address outside the allocated address space of the process.
 .It Bq Er ENOENT
 The jail referred to by a
 .Va jid
 or
 .Va name
 parameter does not exist.
 .It Bq Er ENOENT
 The jail referred to by a
 .Va jid
 is not accessible by the process, because the process is in a different
 jail.
 .It Bq Er ENOENT
 The
 .Va lastjid
 parameter is greater than the highest current jail ID.
 .It Bq Er EINVAL
 A supplied parameter is the wrong size.
 .It Bq Er EINVAL
 A supplied parameter name does not match any known parameters.
 .El
 .Pp
 The
 .Fn jail_attach
 and
 .Fn jail_remove
 system calls
 will fail if:
 .Bl -tag -width Er
 .It Bq Er EPERM
 A user other than the super-user attempted to attach to or remove a jail.
 .It Bq Er EINVAL
 The jail specified by
 .Fa jid
 does not exist.
 .El
 .Pp
 Furthermore,
 .Fn jail ,
 .Fn jail_set ,
 and
 .Fn jail_attach
 call
 .Xr chroot 2
 internally, so they can fail for all the same reasons.
 Please consult the
 .Xr chroot 2
 manual page for details.
 .Sh SEE ALSO
 .Xr chdir 2 ,
 .Xr chroot 2 ,
 .Xr jail 8
 .Sh HISTORY
 The
 .Fn jail
 system call appeared in
 .Fx 4.0 .
 The
 .Fn jail_attach
 system call appeared in
 .Fx 5.1 .
 The
 .Fn jail_set ,
 .Fn jail_get ,
 and
 .Fn jail_remove
 system calls appeared in
 .Fx 8.0 .
 .Sh AUTHORS
 The jail feature was written by
 .An Poul-Henning Kamp
 for R&D Associates
 who contributed it to
 .Fx .
 .An James Gritton
 added the extensible jail parameters and hierarchical jails.
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 26b059c0a4b6..17f773d2b7de 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -1,4144 +1,4176 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/selinfo.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/kdb.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
 
 MALLOC_DECLARE(M_FADVISE);
 
 static __read_mostly uma_zone_t file_zone;
 static __read_mostly uma_zone_t filedesc0_zone;
 
 static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
 		    struct thread *td, int holdleaders);
 static int	fd_first_free(struct filedesc *fdp, int low, int size);
 static int	fd_last_used(struct filedesc *fdp, int size);
 static void	fdgrowtable(struct filedesc *fdp, int nfd);
 static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
 static int	getmaxfd(struct thread *td);
 
 /*
  * Each process has:
  *
  * - An array of open file descriptors (fd_ofiles)
  * - An array of file flags (fd_ofileflags)
  * - A bitmap recording which descriptors are in use (fd_map)
  *
  * A process starts out with NDFILE descriptors.  The value of NDFILE has
  * been selected based on the historical limit of 20 open files, and an
  * assumption that the majority of processes, especially short-lived
  * processes like shells, will never need more.
  *
  * If this initial allocation is exhausted, a larger descriptor table and
  * map are allocated dynamically, and the pointers in the process's struct
  * filedesc are updated to point to those.  This is repeated every time
  * the process runs out of file descriptors (provided it hasn't hit its
  * resource limit).
  *
  * Since threads may hold references to individual descriptor table
  * entries, the tables are never freed.  Instead, they are placed on a
  * linked list and freed only when the struct filedesc is released.
  */
 /* Number of descriptor slots in the initial, statically sized table. */
 #define NDFILE		20
 /* Size in bytes of one word of the in-use bitmap. */
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
 /* Number of descriptors (bits) tracked by one bitmap word. */
 #define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
 /* Index of the bitmap word that holds the bit for descriptor x. */
 #define NDSLOT(x)	((x) / NDENTRIES)
 /* Mask selecting descriptor x's bit within its bitmap word. */
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 /* Number of bitmap words needed to track x descriptors (rounded up). */
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
  * SLIST entry used to keep track of ofiles which must be reclaimed when
  * the process exits.
  */
 struct freetable {
 	/* A descriptor table that has been queued for later reclamation. */
 	struct fdescenttbl *ft_table;
 	/* Linkage on the singly-linked list of such tables. */
 	SLIST_ENTRY(freetable) ft_next;
 };
 
 /*
  * Initial allocation: a filedesc structure + the head of SLIST used to
  * keep track of old ofiles + enough space for NDFILE descriptors.
  */
 
 /*
  * Fixed-size descriptor table with room for exactly NDFILE entries, used
  * as the initial table embedded in struct filedesc0.
  */
 struct fdescenttbl0 {
 	/* Presumably the number of entries in fdt_ofiles -- TODO confirm. */
 	int	fdt_nfiles;
 	/* The NDFILE descriptor entries themselves. */
 	struct	filedescent fdt_ofiles[NDFILE];
 };
 
 /*
  * Initial per-process allocation: the filedesc itself, the list head for
  * old descriptor tables, and inline storage for NDFILE descriptors plus
  * the matching in-use bitmap.
  */
 struct filedesc0 {
 	/* The filedesc structure proper. */
 	struct filedesc fd_fd;
 	/* Head of the list of old tables that are freed later. */
 	SLIST_HEAD(, freetable) fd_free;
 	/* Inline descriptor table with room for NDFILE entries. */
 	struct	fdescenttbl0 fd_dfiles;
 	/* Inline in-use bitmap sized for NDFILE descriptors. */
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
 };
 
 /*
  * Descriptor management.
  */
 volatile int __exclusive_cache_line openfiles; /* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
 void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /*
  * Locate the lowest clear bit in fdp's in-use bitmap at position 'low' or
  * above, looking no further than position size - 1.
  *
  * Returns 'low' unchanged when low >= size, the index of the clear bit
  * when one is found, and 'size' when the searched words contain none.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *bitmap, candidates;
 	int slot, nslots, bit;
 
 	if (low >= size)
 		return (low);
 
 	bitmap = fdp->fd_map;
 	slot = NDSLOT(low);
 	bit = low % NDENTRIES;
 	if (bit != 0) {
 		/*
 		 * 'low' falls in the middle of a word: only the bits at or
 		 * above it within that word are candidates.
 		 */
 		candidates = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - bit));
 		candidates &= ~bitmap[slot];
 		if (candidates != 0UL)
 			return (slot * NDENTRIES + ffsl(candidates) - 1);
 		slot++;
 	}
 
 	/* Whole words from here on; skip those that are completely full. */
 	nslots = NDSLOTS(size);
 	while (slot < nslots) {
 		if (bitmap[slot] != ~0UL)
 			return (slot * NDENTRIES + ffsl(~bitmap[slot]) - 1);
 		slot++;
 	}
 	return (size);
 }
 
 /*
  * Locate the highest set bit in fdp's in-use bitmap among positions
  * 0 .. size - 1.  Returns its index, or -1 when no bit in that range is
  * set.
  */
 static int
 fd_last_used(struct filedesc *fdp, int size)
 {
 	NDSLOTTYPE *bitmap, inuse;
 	int slot, firstslot, bit;
 
 	bitmap = fdp->fd_map;
 	slot = NDSLOT(size);
 	bit = size % NDENTRIES;
 	if (bit != 0) {
 		/*
 		 * 'size' falls in the middle of a word: only the bits below
 		 * it within that word are of interest.
 		 */
 		inuse = ~(~(NDSLOTTYPE)0 << bit);
 		inuse &= bitmap[slot];
 		if (inuse != 0)
 			return (slot * NDENTRIES + flsl(inuse) - 1);
 		slot--;
 	}
 
 	/*
 	 * Walk the remaining whole words downwards.  The lower bound is kept
 	 * in an int so that the comparison stays signed and terminates once
 	 * slot drops below zero.
 	 */
 	firstslot = NDSLOT(0);
 	while (slot >= firstslot) {
 		if (bitmap[slot] != 0)
 			return (slot * NDENTRIES + flsl(bitmap[slot]) - 1);
 		slot--;
 	}
 	return (-1);
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
 	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
 
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
 /*
  * Mark a file descriptor as used.
  */
 static void
 fdused_init(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
 
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 }
 
 /*
  * Mark descriptor 'fd' as used and keep the table hints current:
  * fd_lastfile is raised if 'fd' is above it, and fd_freefile is
  * recomputed if 'fd' was the recorded first free slot.  The filedesc
  * exclusive lock must be held (asserted).
  */
 static void
 fdused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fdused_init(fdp, fd);
 	if (fdp->fd_lastfile < fd) {
 		fdp->fd_lastfile = fd;
 	}
 	if (fdp->fd_freefile == fd) {
 		/* The hinted free slot was just consumed; find the next. */
 		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
 	}
 }
 
 /*
  * Mark a file descriptor as unused.
  */
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("fd=%d is still in use", fd));
 
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
 		fdp->fd_lastfile = fd_last_used(fdp, fd);
 }
 
 /*
  * Release the capability data held by a descriptor table entry.
  *
  * This is the reduced form of freeing a descriptor: it does none of the
  * bitmap or hint bookkeeping, which avoids some work when fdp is about
  * to be destroyed.
  */
 static inline void
 fdefree_last(struct filedescent *fde)
 {
 	struct filecaps *caps;
 
 	caps = &fde->fde_caps;
 	filecaps_free(caps);
 }
 
 /*
  * Free descriptor 'fd' in fdp: drop the entry's capability data, clear
  * its file pointer and mark the slot unused.  With CAPABILITIES the whole
  * update is bracketed by the entry's sequence counter write section.
  */
 static inline void
 fdfree(struct filedesc *fdp, int fd)
 {
 	struct filedescent *entry = &fdp->fd_ofiles[fd];
 
 #ifdef CAPABILITIES
 	seq_write_begin(&entry->fde_seq);
 #endif
 	fdefree_last(entry);
 	entry->fde_file = NULL;
 	/* fdunused() asserts that the file pointer is already NULL. */
 	fdunused(fdp, fd);
 #ifdef CAPABILITIES
 	seq_write_end(&entry->fde_seq);
 #endif
 }
 
 /*
  * Make sure the current process has both a current directory and a root
  * directory vnode.  Whichever of the two is still NULL is pointed at
  * rootvnode, taking a reference on it for each pointer that is set.
  */
 void
 pwd_ensure_dirs(void)
 {
 	struct filedesc *fdp = curproc->p_fd;
 
 	FILEDESC_XLOCK(fdp);
 	/* Current working directory. */
 	if (fdp->fd_cdir == NULL) {
 		fdp->fd_cdir = rootvnode;
 		vrefact(rootvnode);
 	}
 	/* Root directory. */
 	if (fdp->fd_rdir == NULL) {
 		fdp->fd_rdir = rootvnode;
 		vrefact(rootvnode);
 	}
 	FILEDESC_XUNLOCK(fdp);
 }
 
 /*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdtablesize_args {
 	int	dummy;
 };
 #endif
 /*
  * getdtablesize system call: report the descriptor table size limit for
  * the calling thread.  The result is the smaller of the RLIMIT_NOFILE
  * soft limit and maxfilesperproc, further capped by the RACCT_NOFILE
  * limit when RACCT is compiled in.  Always succeeds.
  */
 /* ARGSUSED */
 int
 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
 {
 #ifdef	RACCT
 	struct proc *p;
 	uint64_t racct_max;
 #endif
 
 	td->td_retval[0] =
 	    min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc);
 #ifdef	RACCT
 	p = td->td_proc;
 	PROC_LOCK(p);
 	racct_max = racct_get_limit(p, RACCT_NOFILE);
 	PROC_UNLOCK(p);
 	if (td->td_retval[0] > racct_max)
 		td->td_retval[0] = racct_max;
 #endif
 	return (0);
 }
 
 /*
  * dup2 system call: duplicate descriptor 'from' onto the specific
  * descriptor number 'to' by way of kern_dup() in FDDUP_FIXED mode.
  *
  * Note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup2_args {
 	u_int	from;
 	u_int	to;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup2(struct thread *td, struct dup2_args *uap)
 {
 	int oldfd, newfd;
 
 	oldfd = (int)uap->from;
 	newfd = (int)uap->to;
 	return (kern_dup(td, FDDUP_FIXED, 0, oldfd, newfd));
 }
 
 /*
  * dup system call: duplicate descriptor 'fd' by way of kern_dup() in
  * FDDUP_NORMAL mode, with 0 passed as the requested new descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup_args {
 	u_int	fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup(struct thread *td, struct dup_args *uap)
 {
 	int oldfd;
 
 	oldfd = (int)uap->fd;
 	return (kern_dup(td, FDDUP_NORMAL, 0, oldfd, 0));
 }
 
 /*
  * fcntl system call entry point.  All of the work, including moving any
  * struct flock argument to and from user space, is delegated to
  * kern_fcntl_freebsd().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fcntl_args {
 	int	fd;
 	int	cmd;
 	long	arg;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fcntl(struct thread *td, struct fcntl_args *uap)
 {
 	int error;
 
 	error = kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg);
 	return (error);
 }
 
 /*
  * User-space facing wrapper around kern_fcntl().
  *
  * For the locking commands 'arg' is a user-space pointer to a lock
  * description; it is copied into a kernel struct flock and kern_fcntl()
  * is handed the address of that kernel copy.  The old-style commands
  * (F_OGETLK, F_OSETLK, F_OSETLKW) take a struct __oflock, which is
  * converted to a struct flock and mapped to the matching new command.
  * For every other command 'arg' is passed through unchanged.
  *
  * For F_GETLK and F_OGETLK the resulting lock description is copied back
  * out to user space in the caller's format.
  *
  * Returns 0 on success or an errno value from copyin(), kern_fcntl() or
  * copyout().
  */
 int
 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
 {
 	struct flock fl;
 	struct __oflock ofl;
 	intptr_t arg1;
 	int error, newcmd;
 
 	error = 0;
 	newcmd = cmd;
 	switch (cmd) {
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
 		/*
 		 * Convert old flock structure to new.
 		 */
 		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
 		/*
 		 * Bail out before touching ofl: after a failed copyin() it
 		 * is (at least partly) uninitialized stack memory.
 		 */
 		if (error != 0)
 			return (error);
 		fl.l_start = ofl.l_start;
 		fl.l_len = ofl.l_len;
 		fl.l_pid = ofl.l_pid;
 		fl.l_type = ofl.l_type;
 		fl.l_whence = ofl.l_whence;
 		fl.l_sysid = 0;
 
 		switch (cmd) {
 		case F_OGETLK:
 			newcmd = F_GETLK;
 			break;
 		case F_OSETLK:
 			newcmd = F_SETLK;
 			break;
 		case F_OSETLKW:
 			newcmd = F_SETLKW;
 			break;
 		}
 		arg1 = (intptr_t)&fl;
 		break;
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 	case F_SETLK_REMOTE:
 		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
 		arg1 = (intptr_t)&fl;
 		break;
 	default:
 		/* Plain integer argument; hand it through as is. */
 		arg1 = arg;
 		break;
 	}
 	if (error)
 		return (error);
 	error = kern_fcntl(td, fd, newcmd, arg1);
 	if (error)
 		return (error);
 	/* Only the "get lock" commands return data through 'arg'. */
 	if (cmd == F_OGETLK) {
 		ofl.l_start = fl.l_start;
 		ofl.l_len = fl.l_len;
 		ofl.l_pid = fl.l_pid;
 		ofl.l_type = fl.l_type;
 		ofl.l_whence = fl.l_whence;
 		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
 	} else if (cmd == F_GETLK) {
 		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
 	}
 	return (error);
 }
 
 /*
  * Kernel implementation of the fcntl commands.
  *
  * 'arg' is an integer for most commands.  For the locking commands
  * (F_GETLK, F_SETLK, F_SETLKW, F_SETLK_REMOTE) it is dereferenced
  * directly as a struct flock pointer, so it must already be a kernel
  * address (kern_fcntl_freebsd() passes the address of its own copy).
  * The struct flock may be modified in place.
  *
  * Integer results are returned through td->td_retval[0].  Returns 0 on
  * success or an errno value; unknown commands yield EINVAL.
  */
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
 	struct file *fp, *fp2;
 	struct filedescent *fde;
 	struct proc *p;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error, flg, tmp;
 	uint64_t bsize;
 	off_t foffset;
 
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	switch (cmd) {
 	/*
 	 * The four duplication commands all defer to kern_dup(), with 'arg'
 	 * as the requested new descriptor; they differ only in the mode and
 	 * in whether close-on-exec is requested for the new descriptor.
 	 */
 	case F_DUPFD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
 		break;
 
 	case F_DUPFD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_DUP2FD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
 		break;
 
 	case F_DUP2FD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_GETFD:
 		/* Report FD_CLOEXEC iff the entry has UF_EXCLOSE set. */
 		error = EBADF;
 		FILEDESC_SLOCK(fdp);
 		fde = fdeget_locked(fdp, fd);
 		if (fde != NULL) {
 			td->td_retval[0] =
 			    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 			error = 0;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		/* Set or clear UF_EXCLOSE according to FD_CLOEXEC in arg. */
 		error = EBADF;
 		FILEDESC_XLOCK(fdp);
 		fde = fdeget_locked(fdp, fd);
 		if (fde != NULL) {
 			fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
 			    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 			error = 0;
 		}
 		FILEDESC_XUNLOCK(fdp);
 		break;
 
 	case F_GETFL:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp);
 		if (error != 0)
 			break;
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		fdrop(fp, td);
 		break;
 
 	case F_SETFL:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp);
 		if (error != 0)
 			break;
 		/*
 		 * Replace the FCNTLFLAGS bits of f_flag with those derived
 		 * from arg, retrying the compare-and-set until it succeeds.
 		 */
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
 			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
 		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
 		/* Push the new non-blocking state down to the file. */
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		/* Likewise for the async state. */
 		tmp = fp->f_flag & FASYNC;
 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
 		if (error == 0) {
 			fdrop(fp, td);
 			break;
 		}
 		/*
 		 * FIOASYNC failed: clear FNONBLOCK again and tell the file
 		 * so, ignoring any error from that second ioctl.  The
 		 * FIOASYNC error is what gets returned.
 		 */
 		atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		tmp = 0;
 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_GETOWN:
 		/* Implemented via the FIOGETOWN ioctl on the file. */
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp);
 		if (error != 0)
 			break;
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
 		fdrop(fp, td);
 		break;
 
 	case F_SETOWN:
 		/* Implemented via the FIOSETOWN ioctl on the file. */
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp);
 		if (error != 0)
 			break;
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLK_REMOTE:
 		/* Requires PRIV_NFS_LOCKD; uses F_REMOTE, not F_POSIX. */
 		error = priv_check(td, PRIV_NFS_LOCKD);
 		if (error)
 			return (error);
 		flg = F_REMOTE;
 		goto do_setlk;
 
 	case F_SETLKW:
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
 	do_setlk:
 		cap_rights_init(&rights, CAP_FLOCK);
 		error = fget_unlocked(fdp, fd, &rights, &fp, NULL);
 		if (error != 0)
 			break;
 		/* Only vnode-backed files can be locked here. */
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 
 		flp = (struct flock *)arg;
 		/*
 		 * For SEEK_CUR, add the current file offset to l_start,
 		 * rejecting a negative offset or a sum that would exceed
 		 * OFF_MAX.
 		 */
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if (foffset < 0 ||
 			    (flp->l_start > 0 &&
 			     foffset > OFF_MAX - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 
 		vp = fp->f_vnode;
 		switch (flp->l_type) {
 		case F_RDLCK:
 			/* A read lock needs the file open for reading. */
 			if ((fp->f_flag & FREAD) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_WRLCK:
 			/* A write lock needs the file open for writing. */
 			if ((fp->f_flag & FWRITE) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_UNLCK:
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
 			    flp, flg);
 			break;
 		case F_UNLCKSYS:
 			/*
 			 * Temporary api for testing remote lock
 			 * infrastructure.
 			 */
 			if (flg != F_REMOTE) {
 				error = EINVAL;
 				break;
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCKSYS, flp, flg);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		/* Failures and unlock requests skip the close-race check. */
 		if (error != 0 || flp->l_type == F_UNLCK ||
 		    flp->l_type == F_UNLCKSYS) {
 			fdrop(fp, td);
 			break;
 		}
 
 		/*
 		 * Check for a race with close.
 		 *
 		 * The vnode is now advisory locked (or unlocked, but this case
 		 * is not really important) as the caller requested.
 		 * We had to drop the filedesc lock, so we need to recheck if
 		 * the descriptor is still valid, because if it was closed
 		 * in the meantime we need to remove advisory lock from the
 		 * vnode - close on any descriptor leading to an advisory
 		 * locked vnode, removes that lock.
 		 * We will return 0 on purpose in that case, as the result of
 		 * successful advisory lock might have been externally visible
 		 * already. This is fine - effectively we pretend to the caller
 		 * that the closing thread was a bit slower and that the
 		 * advisory lock succeeded before the close.
 		 */
 		error = fget_unlocked(fdp, fd, &rights, &fp2, NULL);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		/*
 		 * The descriptor now names a different file: undo by
 		 * unlocking the whole range (start 0, length 0) on the
 		 * vnode that was locked above.
 		 */
 		if (fp != fp2) {
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCK, flp, F_POSIX);
 		}
 		fdrop(fp, td);
 		fdrop(fp2, td);
 		break;
 
 	case F_GETLK:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights, CAP_FLOCK), &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
 			error = EINVAL;
 			fdrop(fp, td);
 			break;
 		}
 		/*
 		 * For SEEK_CUR, add the current file offset to l_start,
 		 * guarding the addition against overflow in either
 		 * direction.
 		 */
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if ((flp->l_start > 0 &&
 			    foffset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			    foffset < OFF_MIN - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 		vp = fp->f_vnode;
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
 		fdrop(fp, td);
 		break;
 
 	case F_RDAHEAD:
 		/* Boolean form: non-zero selects 128KB, zero selects 0. */
 		arg = arg ? 128 * 1024: 0;
 		/* FALLTHROUGH */
 	case F_READAHEAD:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights), &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		vp = fp->f_vnode;
 		if (vp->v_type != VREG) {
 			fdrop(fp, td);
 			error = ENOTTY;
 			break;
 		}
 
 		/*
 		 * Exclusive lock synchronizes against f_seqcount reads and
 		 * writes in sequential_heuristic().
 		 */
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		/*
 		 * A non-negative arg is a byte count: store it in f_seqcount
 		 * rounded up to units of the mount's f_iosize and set
 		 * FRDAHEAD.  A negative arg clears FRDAHEAD.
 		 */
 		if (arg >= 0) {
 			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
 			fp->f_seqcount = (arg + bsize - 1) / bsize;
 			atomic_set_int(&fp->f_flag, FRDAHEAD);
 		} else {
 			atomic_clear_int(&fp->f_flag, FRDAHEAD);
 		}
 		VOP_UNLOCK(vp, 0);
 		fdrop(fp, td);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Return the descriptor limit for the thread: the smaller of its
  * RLIMIT_NOFILE soft limit and maxfilesperproc.
  */
 static int
 getmaxfd(struct thread *td)
 {
 	int limit;
 
 	limit = min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc);
 	return (limit);
 }
 
 /*
  * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
  *
  * 'mode' selects how the new descriptor is chosen:
  *   FDDUP_NORMAL, FDDUP_FCNTL - allocated by fdalloc(), which is passed
  *                               'new' as its argument;
  *   FDDUP_FIXED               - exactly 'new', growing the table if
  *                               needed and replacing any file already
  *                               there;
  *   FDDUP_MUSTREPLACE         - exactly 'new', which must already be open.
  * 'flags' may contain only FDDUP_FLAG_CLOEXEC, which sets UF_EXCLOSE on
  * the new descriptor; without it UF_EXCLOSE is cleared on the copy.
  *
  * On success returns 0 with the new descriptor in td->td_retval[0];
  * otherwise returns an errno value.
  */
 int
 kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
 {
 	struct filedesc *fdp;
 	struct filedescent *oldfde, *newfde;
 	struct proc *p;
 	struct file *delfp;
 	int error, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
 	MPASS(mode < FDDUP_LASTMODE);
 
 	AUDIT_ARG_FD(old);
 	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
 
 	/*
 	 * Verify we have a valid descriptor to dup from and possibly to
 	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
 	 * return EINVAL when the new descriptor is out of bounds.
 	 */
 	if (old < 0)
 		return (EBADF);
 	if (new < 0)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 	maxfd = getmaxfd(td);
 	if (new >= maxfd)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 
 	/* EBADF is the result of every early "goto unlock" below. */
 	error = EBADF;
 	FILEDESC_XLOCK(fdp);
 	if (fget_locked(fdp, old) == NULL)
 		goto unlock;
 	/*
 	 * Duplicating a descriptor onto itself: nothing is copied, but a
 	 * requested close-on-exec flag is still applied.
 	 */
 	if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) {
 		td->td_retval[0] = new;
 		if (flags & FDDUP_FLAG_CLOEXEC)
 			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
 		error = 0;
 		goto unlock;
 	}
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
 	 * allocate a new descriptor the usual way.
 	 */
 	switch (mode) {
 	case FDDUP_NORMAL:
 	case FDDUP_FCNTL:
 		if ((error = fdalloc(td, new, &new)) != 0)
 			goto unlock;
 		break;
 	case FDDUP_MUSTREPLACE:
 		/* Target file descriptor must exist. */
 		if (fget_locked(fdp, new) == NULL)
 			goto unlock;
 		break;
 	case FDDUP_FIXED:
 		if (new >= fdp->fd_nfiles) {
 			/*
 			 * The resource limits are here instead of e.g.
 			 * fdalloc(), because the file descriptor table may be
 			 * shared between processes, so we can't really use
 			 * racct_add()/racct_sub().  Instead of counting the
 			 * number of actually allocated descriptors, just put
 			 * the limit on the size of the file descriptor table.
 			 */
 #ifdef RACCT
 			if (racct_enable) {
 				PROC_LOCK(p);
 				error = racct_set(p, RACCT_NOFILE, new + 1);
 				PROC_UNLOCK(p);
 				if (error != 0) {
 					error = EMFILE;
 					goto unlock;
 				}
 			}
 #endif
 			fdgrowtable_exp(fdp, new + 1);
 		}
 		/* Claim the slot unless a file already occupies it. */
 		if (!fdisused(fdp, new))
 			fdused(fdp, new);
 		break;
 	default:
 		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
 	}
 
 	KASSERT(old != new, ("new fd is same as old"));
 
 	/*
 	 * Entry pointers are taken only now, after any table growth above.
 	 * NOTE(review): presumably because growing the table can move
 	 * fd_ofiles -- confirm against fdgrowtable().
 	 */
 	oldfde = &fdp->fd_ofiles[old];
 	/* NOTE(review): presumably the reference owned by the new slot. */
 	fhold(oldfde->fde_file);
 	newfde = &fdp->fd_ofiles[new];
 	/* Remember any file being replaced so it can be closed below. */
 	delfp = newfde->fde_file;
 
 	/*
 	 * Duplicate the source descriptor.
 	 */
 #ifdef CAPABILITIES
 	seq_write_begin(&newfde->fde_seq);
 #endif
 	filecaps_free(&newfde->fde_caps);
 	memcpy(newfde, oldfde, fde_change_size);
 	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true);
 	if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
 		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
 	else
 		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
 #ifdef CAPABILITIES
 	seq_write_end(&newfde->fde_seq);
 #endif
 	td->td_retval[0] = new;
 
 	error = 0;
 
 	/*
 	 * When a file was replaced, closefp() is expected to return with
 	 * the filedesc lock released (asserted); its result is deliberately
 	 * ignored.  Otherwise the lock is dropped here, which is also where
 	 * the early-exit paths land.
 	 */
 	if (delfp != NULL) {
 		(void) closefp(fdp, new, delfp, td, 1);
 		FILEDESC_UNLOCK_ASSERT(fdp);
 	} else {
 unlock:
 		FILEDESC_XUNLOCK(fdp);
 	}
 
 	return (error);
 }
 
 /*
  * If sigio is on the list associated with a process or process group,
  * disable signalling from the device, remove sigio from the list and
  * free sigio.
  *
  * 'sigiop' points at the owner's sigio pointer; it is a no-op when that
  * pointer is NULL.
  */
 void
 funsetown(struct sigio **sigiop)
 {
 	struct sigio *sigio;
 
 	/* Unlocked fast path; re-checked below once SIGIO_LOCK is held. */
 	if (*sigiop == NULL)
 		return;
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	/* Clear the back-reference recorded in the sigio itself. */
 	*(sigio->sio_myref) = NULL;
 	/* A negative sio_pgid means the sigio sits on a pgrp list. */
 	if ((sigio)->sio_pgid < 0) {
 		struct pgrp *pg = (sigio)->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
 			    sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		struct proc *p = (sigio)->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
 			    sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	SIGIO_UNLOCK();
 	/* The credential reference and the memory are released unlocked. */
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 }
 
 /*
  * Free a list of sigio structures.
  * We only need to lock the SIGIO_LOCK because we have made ourselves
  * inaccessible to callers of fsetown and therefore do not need to lock
  * the proc or pgrp struct for the list manipulation.
  *
  * NOTE(review): despite the paragraph above, the loop below does take the
  * pgrp or proc lock around each SLIST_REMOVE().
  */
 void
 funsetownlst(struct sigiolst *sigiolst)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	struct sigio *sigio;
 
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL)
 		return;
 	p = NULL;
 	pg = NULL;
 
 	/*
 	 * Every entry of the list should belong
 	 * to a single proc or pgrp.
 	 */
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
 	} else /* if (sigio->sio_pgid > 0) */ {
 		p = sigio->sio_proc;
 		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	}
 
 	/*
 	 * Pop entries one at a time.  SIGIO_LOCK is dropped around
 	 * crfree()/free() and re-taken before the list head is read again.
 	 */
 	SIGIO_LOCK();
 	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
 		*(sigio->sio_myref) = NULL;
 		if (pg != NULL) {
 			KASSERT(sigio->sio_pgid < 0,
 			    ("Proc sigio in pgrp sigio list"));
 			KASSERT(sigio->sio_pgrp == pg,
 			    ("Bogus pgrp in sigio list"));
 			PGRP_LOCK(pg);
 			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PGRP_UNLOCK(pg);
 		} else /* if (p != NULL) */ {
 			KASSERT(sigio->sio_pgid > 0,
 			    ("Pgrp sigio in proc sigio list"));
 			KASSERT(sigio->sio_proc == p,
 			    ("Bogus proc in sigio list"));
 			PROC_LOCK(p);
 			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PROC_UNLOCK(p);
 		}
 		SIGIO_UNLOCK();
 		crfree(sigio->sio_ucred);
 		free(sigio, M_SIGIO);
 		SIGIO_LOCK();
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
  *
  * After permission checking, add a sigio structure to the sigio list for
  * the process or process group.
  *
  * 'pgid' > 0 names a process, 'pgid' < 0 names process group -pgid, and
  * 'pgid' == 0 just removes any existing owner.  'sigiop' is the owner's
  * sigio pointer, updated under SIGIO_LOCK on success.
  *
  * Returns 0 on success, ESRCH if the target cannot be found (or the target
  * process has P_WEXIT set), or EPERM if the target is in another session.
  */
 int
 fsetown(pid_t pgid, struct sigio **sigiop)
 {
 	struct proc *proc;
 	struct pgrp *pgrp;
 	struct sigio *sigio;
 	int ret;
 
 	if (pgid == 0) {
 		funsetown(sigiop);
 		return (0);
 	}
 
 	ret = 0;
 
 	/* Allocate and fill in the new sigio out of locks. */
 	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
 	sigio->sio_pgid = pgid;
 	sigio->sio_ucred = crhold(curthread->td_ucred);
 	sigio->sio_myref = sigiop;
 
 	/* proctree_lock is held shared from here until success or 'fail'. */
 	sx_slock(&proctree_lock);
 	if (pgid > 0) {
 		proc = pfind(pgid);
 		if (proc == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		PROC_UNLOCK(proc);
 		if (proc->p_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		pgrp = NULL;
 	} else /* if (pgid < 0) */ {
 		pgrp = pgfind(-pgid);
 		if (pgrp == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 		PGRP_UNLOCK(pgrp);
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		if (pgrp->pg_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		proc = NULL;
 	}
 	/* Drop any previous owner before linking in the new sigio. */
 	funsetown(sigiop);
 	if (pgid > 0) {
 		PROC_LOCK(proc);
 		/*
 		 * Since funsetownlst() is called without the proctree
 		 * locked, we need to check for P_WEXIT.
 		 * XXX: is ESRCH correct?
 		 */
 		if ((proc->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(proc);
 			ret = ESRCH;
 			goto fail;
 		}
 		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_proc = proc;
 		PROC_UNLOCK(proc);
 	} else {
 		PGRP_LOCK(pgrp);
 		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_pgrp = pgrp;
 		PGRP_UNLOCK(pgrp);
 	}
 	sx_sunlock(&proctree_lock);
 	/* Publish the new sigio through the owner's pointer. */
 	SIGIO_LOCK();
 	*sigiop = sigio;
 	SIGIO_UNLOCK();
 	return (0);
 
 fail:
 	/* Undo the allocation and the credential hold taken above. */
 	sx_sunlock(&proctree_lock);
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 	return (ret);
 }
 
 /*
  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
  *
  * 'sigiop' points at the owner's sigio pointer.  Returns the sio_pgid
  * recorded in the sigio it refers to, or 0 when no sigio is attached.  The
  * pointer is sampled under SIGIO_LOCK.
  */
 pid_t
 fgetown(struct sigio **sigiop)
 {
 	pid_t pgid;
 
 	SIGIO_LOCK();
 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
 	SIGIO_UNLOCK();
 	return (pgid);
 }
 
 /*
  * Function drops the filedesc lock on return.
  *
  * Finishes closing descriptor 'fd', whose file 'fp' the caller passes in,
  * with the table exclusively locked (asserted).  When 'holdleaders' is
  * non-zero and the process has a p_fdtol, fd_holdleaderscount is raised
  * for the duration of closef().  Returns the result of closef().
  */
 static int
 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     int holdleaders)
 {
 	int error;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (holdleaders) {
 		if (td->td_proc->p_fdtol != NULL) {
 			/*
 			 * Ask fdfree() to sleep to ensure that all relevant
 			 * process leaders can be traversed in closef().
 			 */
 			fdp->fd_holdleaderscount++;
 		} else {
 			/* Nothing to hold; skip the wakeup handling below. */
 			holdleaders = 0;
 		}
 	}
 
 	/*
 	 * We now hold the fp reference that used to be owned by the
 	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
 	 * knote_fdclose to prevent a race of the fd getting opened, a knote
 	 * added, and deleting a knote for the new fd.
 	 */
 	knote_fdclose(td, fd);
 
 	/*
 	 * We need to notify mqueue if the object is of type mqueue.
 	 */
 	if (fp->f_type == DTYPE_MQUEUE)
 		mq_fdclose(td, fd, fp);
 	FILEDESC_XUNLOCK(fdp);
 
 	error = closef(fp, td);
 	if (holdleaders) {
 		/*
 		 * Drop our hold; the last holder wakes a sleeper that
 		 * set fd_holdleaderswakeup.
 		 */
 		FILEDESC_XLOCK(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
 		FILEDESC_XUNLOCK(fdp);
 	}
 	return (error);
 }
 
 /*
  * Syscall entry point for closing a file descriptor; the work is done by
  * kern_close() on uap->fd.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct close_args {
 	int     fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_close(struct thread *td, struct close_args *uap)
 {
 	int error;
 
 	error = kern_close(td, uap->fd);
 	return (error);
 }
 
 /*
  * Close descriptor 'fd' of the calling thread's process.  Returns EBADF
  * when fget_locked() finds no file there, otherwise the result of
  * closefp().
  */
 int
 kern_close(struct thread *td, int fd)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *victim;
 
 	AUDIT_SYSCLOSE(td, fd);
 
 	FILEDESC_XLOCK(fdp);
 	victim = fget_locked(fdp, fd);
 	if (victim == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/* Release the table slot first; closefp() then drops the lock. */
 	fdfree(fdp, fd);
 	return (closefp(fdp, fd, victim, td, 1));
 }
 
 /*
  * Close every open descriptor numbered uap->lowfd or higher.  Always
  * returns 0; the results of the individual kern_close() calls are ignored.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct closefrom_args {
 	int	lowfd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_closefrom(struct thread *td, struct closefrom_args *uap)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int fd;
 
 	AUDIT_ARG_FD(uap->lowfd);
 
 	/* A negative start is clamped to 0, i.e. "close everything". */
 	if (uap->lowfd < 0)
 		uap->lowfd = 0;
 
 	FILEDESC_SLOCK(fdp);
 	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
 		if (fdp->fd_ofiles[fd].fde_file == NULL)
 			continue;
 		/*
 		 * kern_close() takes the exclusive lock itself, so the
 		 * shared lock is dropped around the call.
 		 */
 		FILEDESC_SUNLOCK(fdp);
 		(void)kern_close(td, fd);
 		FILEDESC_SLOCK(fdp);
 	}
 	FILEDESC_SUNLOCK(fdp);
 
 	return (0);
 }
 
 #if defined(COMPAT_43)
 /*
  * COMPAT_43 variant of fstat: obtain the status via kern_fstat(), convert
  * it with cvtstat() and copy the struct ostat out to uap->sb.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ofstat_args {
 	int	fd;
 	struct	ostat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
 {
 	struct ostat ost;
 	struct stat st;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &st);
 	if (error != 0)
 		return (error);
 
 	cvtstat(&st, &ost);
 	return (copyout(&ost, uap->sb, sizeof(ost)));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstat_args {
 	int	fd;
 	struct	stat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fstat(struct thread *td, struct fstat_args *uap)
 {
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0)
 		error = copyout(&ub, uap->sb, sizeof(ub));
 	return (error);
 }
 
 int
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 
 	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
 	if (error != 0)
 		return (error);
 
 	AUDIT_ARG_FILE(td->td_proc, fp);
 
 	error = fo_stat(fp, sbp, td->td_ucred, td);
 	fdrop(fp, td);
 #ifdef KTRACE
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
 		ktrstat(sbp);
 #endif
 	return (error);
 }
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nfstat_args {
 	int	fd;
 	struct	nstat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_nfstat(struct thread *td, struct nfstat_args *uap)
 {
 	struct nstat nub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtnstat(&ub, &nub);
 		error = copyout(&nub, uap->sb, sizeof(nub));
 	}
 	return (error);
 }
 
 /*
  * Syscall entry point for pathconf queries on a descriptor; forwards
  * uap->fd and uap->name to kern_fpathconf().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fpathconf_args {
 	int	fd;
 	int	name;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
 {
 	int error;
 
 	error = kern_fpathconf(td, uap->fd, uap->name);
 	return (error);
 }
 
 int
 kern_fpathconf(struct thread *td, int fd, int name)
 {
 	struct file *fp;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
 	if (error != 0)
 		return (error);
 
 	if (name == _PC_ASYNC_IO) {
 		td->td_retval[0] = _POSIX_ASYNCHRONOUS_IO;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_PATHCONF(vp, name, td->td_retval);
 		VOP_UNLOCK(vp, 0);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
 			error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
 	}
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Reset a filecaps structure: every byte is zeroed and fc_nioctls is then
  * set to -1.
  */
 void
 filecaps_init(struct filecaps *fcaps)
 {
 
 	bzero(fcaps, sizeof(struct filecaps));
 	fcaps->fc_nioctls = -1;
 }
 
 /*
  * Duplicate 'src' into 'dst', giving 'dst' its own copy of the ioctl array
  * when 'src' has one.
  *
  * 'locked' tells whether the fdtable is locked.  If it is not and 'src'
  * carries an ioctl array, nothing is copied and false is returned; the
  * caller then has to lock the table and retry.  In every other case true
  * is returned.
  *
  * Note that if the table was not locked, the caller has to check the relevant
  * sequence counter to determine whether the operation was successful.
  */
 bool
 filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
 {
 	size_t nbytes;
 
 	if (src->fc_ioctls == NULL) {
 		/* No array to duplicate: a plain structure copy suffices. */
 		*dst = *src;
 		return (true);
 	}
 	if (!locked)
 		return (false);
 
 	*dst = *src;
 
 	KASSERT(src->fc_nioctls > 0,
 	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 
 	nbytes = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	dst->fc_ioctls = malloc(nbytes, M_FILECAPS, M_WAITOK);
 	bcopy(src->fc_ioctls, dst->fc_ioctls, nbytes);
 	return (true);
 }
 
 /*
  * Transfer the contents of 'src' to 'dst' by structure assignment, then
  * zero 'src' so that it no longer refers to anything that was moved.
  */
 void
 filecaps_move(struct filecaps *src, struct filecaps *dst)
 {
 
 	*dst = *src;
 	bzero(src, sizeof(struct filecaps));
 }
 
 /*
  * Set up 'fcaps' as unrestricted: all rights via CAP_ALL(), all fcntls
  * via CAP_FCNTL_ALL, and no ioctl array (fc_ioctls NULL, fc_nioctls -1).
  */
 static void
 filecaps_fill(struct filecaps *fcaps)
 {
 
 	fcaps->fc_nioctls = -1;
 	fcaps->fc_ioctls = NULL;
 	fcaps->fc_fcntls = CAP_FCNTL_ALL;
 	CAP_ALL(&fcaps->fc_rights);
 }
 
 /*
  * Release the ioctl array owned by 'fcaps' and zero the whole structure.
  */
 void
 filecaps_free(struct filecaps *fcaps)
 {
 
 	free(fcaps->fc_ioctls, M_FILECAPS);
 	bzero(fcaps, sizeof(struct filecaps));
 }
 
 /*
  * Assert the internal consistency of 'fcaps'.  'func' is the caller's name
  * and only appears in the assertion messages.  The checks are KASSERTs and
  * nothing else is done here.
  */
 static void
 filecaps_validate(const struct filecaps *fcaps, const char *func)
 {
 
 	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
 	    ("%s: invalid rights", func));
 	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
 	    ("%s: invalid fcntls", func));
 	/* fcntl bits require the CAP_FCNTL right. */
 	KASSERT(fcaps->fc_fcntls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
 	    ("%s: fcntls without CAP_FCNTL", func));
 	/* An array needs a positive count; no array means -1 or 0. */
 	KASSERT((fcaps->fc_ioctls != NULL && fcaps->fc_nioctls > 0) ||
 	    (fcaps->fc_ioctls == NULL &&
 	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0)),
 	    ("%s: invalid ioctls", func));
 	/* Any non-zero ioctl count requires the CAP_IOCTL right. */
 	KASSERT(fcaps->fc_nioctls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
 	    ("%s: ioctls without CAP_IOCTL", func));
 }
 
 /*
  * Grow the file table to the larger of twice its current size and 'nfd'
  * entries.  The table must be exclusively locked (asserted).
  */
 static void
 fdgrowtable_exp(struct filedesc *fdp, int nfd)
 {
 	int doubled, target;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	doubled = fdp->fd_nfiles * 2;
 	target = (doubled < nfd) ? nfd : doubled;
 	fdgrowtable(fdp, target);
 }
 
 /*
  * Grow the file table to accommodate (at least) nfd descriptors.
  *
  * The request is rounded up to a multiple of NDENTRIES; nothing happens if
  * the table is already at least that large.  The old table is never freed
  * here (see below); the old bitmap is freed unless it is the static one.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft;
 	struct fdescenttbl *ntable;
 	struct fdescenttbl *otable;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap, *omap;
 
 	/*
 	 * If lastfile is -1 this struct filedesc was just allocated and we are
 	 * growing it to accommodate for the one we are going to copy from. There
 	 * is no need to have a lock on this one as it's not visible to anyone.
 	 */
 	if (fdp->fd_lastfile != -1)
 		FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
 
 	/* save old values */
 	onfiles = fdp->fd_nfiles;
 	otable = fdp->fd_files;
 	omap = fdp->fd_map;
 
 	/* compute the size of the new table */
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
 	/*
 	 * Allocate a new table.  We need enough space for the number of
 	 * entries, file entries themselves and the struct freetable we will use
 	 * when we decommission the table and place it on the freelist.
 	 * The struct freetable lives directly after the last file entry
 	 * (see the &otable->fdt_ofiles[onfiles] computation below).
 	 */
 	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
 	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
 	    sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	/* copy the old data */
 	ntable->fdt_nfiles = nnfiles;
 	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
 	    onfiles * sizeof(ntable->fdt_ofiles[0]));
 
 	/*
 	 * Allocate a new map only if the old is not large enough.  It will
 	 * grow at a slower rate than the table as it can map more
 	 * entries than the table can hold.
 	 */
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
 		    M_ZERO | M_WAITOK);
 		/* copy over the old data and update the pointer */
 		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
 		fdp->fd_map = nmap;
 	}
 
 	/*
 	 * Make sure that ntable is correctly initialized before we replace
 	 * fd_files pointer. Otherwise fget_unlocked() may see inconsistent
 	 * data.
 	 */
 	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
 
 	/*
 	 * Do not free the old file table, as some threads may still
 	 * reference entries within it.  Instead, place it on a freelist
 	 * which will be processed when the struct filedesc is released.
 	 *
 	 * Note that if onfiles == NDFILE, we're dealing with the original
 	 * static allocation contained within (struct filedesc0 *)fdp,
 	 * which must not be freed.
 	 */
 	if (onfiles > NDFILE) {
 		ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
 		fdp0 = (struct filedesc0 *)fdp;
 		ft->ft_table = otable;
 		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
 	}
 	/*
 	 * The map does not have the same possibility of threads still
 	 * holding references to it.  So always free it as long as it
 	 * does not reference the original static allocation.
 	 */
 	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 		free(omap, M_FILEDESC);
 }
 
 /*
  * Allocate a file descriptor for the process.
  *
  * Finds the first free descriptor that is >= 'minfd' (and >= fd_freefile),
  * growing the table when the search runs past its end, marks it used and
  * stores it in *result.  The table must be exclusively locked (asserted).
  *
  * Returns 0 on success, or EMFILE when the descriptor would reach
  * getmaxfd() or racct_set() refuses the larger table.
  */
 int
 fdalloc(struct thread *td, int minfd, int *result)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int fd, maxfd, allocfd;
 #ifdef RACCT
 	int error;
 #endif
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	/* Start the search no lower than fd_freefile. */
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;
 
 	maxfd = getmaxfd(td);
 
 	/*
 	 * Search the bitmap for a free descriptor starting at minfd.
 	 * If none is found, grow the file table.
 	 */
 	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 	if (fd >= maxfd)
 		return (EMFILE);
 	if (fd >= fdp->fd_nfiles) {
 		/* Ask for twice the index, capped at the limit. */
 		allocfd = min(fd * 2, maxfd);
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(p);
 			error = racct_set(p, RACCT_NOFILE, allocfd);
 			PROC_UNLOCK(p);
 			if (error != 0)
 				return (EMFILE);
 		}
 #endif
 		/*
 		 * fd is already equal to first free descriptor >= minfd, so
 		 * we only need to grow the table and we are done.
 		 */
 		fdgrowtable_exp(fdp, allocfd);
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
 	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
 	    ("invalid descriptor %d", fd));
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("file descriptor isn't free"));
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
 }
 
 /*
  * Allocate 'n' file descriptors for the process, storing them in fds[].
  *
  * Either all 'n' descriptors are allocated and 0 is returned, or the ones
  * obtained so far are released with fdunused() and EMFILE is returned.
  * The table must be exclusively locked (asserted).
  *
  * NOTE(review): 'minfd' is not consulted; every fdalloc() call below is
  * made with a minimum of 0.
  */
 int
 fdallocn(struct thread *td, int minfd, int *fds, int n)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int got;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	got = 0;
 	while (got < n && fdalloc(td, 0, &fds[got]) == 0)
 		got++;
 
 	if (got == n)
 		return (0);
 
 	/* Roll back in reverse order of allocation. */
 	while (--got >= 0)
 		fdunused(fdp, fds[got]);
 	return (EMFILE);
 }
 
 /*
  * Create a new open file structure and allocate a file descriptor for the
  * process that refers to it.  We add one reference to the file for the
  * descriptor table and one reference for resultfp. This is to prevent us
  * being preempted and the entry in the descriptor table closed after we
  * release the FILEDESC lock.
  */
 int
 falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
     struct filecaps *fcaps)
 {
 	struct file *fp;
 	int error, fd;
 
 	error = falloc_noinstall(td, &fp);
 	if (error)
 		return (error);		/* no reference held on error */
 
 	error = finstall(td, fp, &fd, flags, fcaps);
 	if (error) {
 		fdrop(fp, td);		/* one reference (fp only) */
 		return (error);
 	}
 
 	if (resultfp != NULL)
 		*resultfp = fp;		/* copy out result */
 	else
 		fdrop(fp, td);		/* release local reference */
 
 	if (resultfd != NULL)
 		*resultfd = fd;
 
 	return (0);
 }
 
 /*
  * Create a new open file structure without allocating a file descriptor.
  *
  * On success *resultfp receives a zeroed file with a reference count of 1,
  * a held copy of the thread's credentials and badfileops as its operations
  * vector, and 0 is returned.  ENFILE is returned when the global count of
  * open files would reach maxfiles, or would reach the 95% mark
  * (maxfiles - maxfiles / 20) without PRIV_MAXFILES.
  */
 int
 falloc_noinstall(struct thread *td, struct file **resultfp)
 {
 	struct file *fp;
 	int maxuserfiles = maxfiles - (maxfiles / 20);
 	int openfiles_new;
 	/* State for ppsratecheck(): limits the console message rate. */
 	static struct timeval lastfail;
 	static int curfail;
 
 	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
 
 	/* Reserve a slot in the global count first, undo it on failure. */
 	openfiles_new = atomic_fetchadd_int(&openfiles, 1) + 1;
 	if ((openfiles_new >= maxuserfiles &&
 	    priv_check(td, PRIV_MAXFILES) != 0) ||
 	    openfiles_new >= maxfiles) {
 		atomic_subtract_int(&openfiles, 1);
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, (%s) "
 			    "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
 		}
 		return (ENFILE);
 	}
 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
 	refcount_init(&fp->f_count, 1);
 	fp->f_cred = crhold(td->td_ucred);
 	fp->f_ops = &badfileops;
 	*resultfp = fp;
 	return (0);
 }
 
 /*
  * Install file 'fp' into slot 'fd' of the descriptor table 'fdp'.
  *
  * The table must be exclusively locked (asserted).  UF_EXCLOSE is set on
  * the slot exactly when 'flags' contains O_CLOEXEC.  If 'fcaps' is given it
  * is validated and moved into the slot; otherwise the slot receives full
  * rights via filecaps_fill().
  */
 void
 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedescent *slot;
 
 	MPASS(fp != NULL);
 	if (fcaps != NULL)
 		filecaps_validate(fcaps, __func__);
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	slot = &fdp->fd_ofiles[fd];
 #ifdef CAPABILITIES
 	seq_write_begin(&slot->fde_seq);
 #endif
 	slot->fde_file = fp;
 	if ((flags & O_CLOEXEC) != 0)
 		slot->fde_flags = UF_EXCLOSE;
 	else
 		slot->fde_flags = 0;
 	if (fcaps == NULL)
 		filecaps_fill(&slot->fde_caps);
 	else
 		filecaps_move(fcaps, &slot->fde_caps);
 #ifdef CAPABILITIES
 	seq_write_end(&slot->fde_seq);
 #endif
 }
 
 int
 finstall(struct thread *td, struct file *fp, int *fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 
 	MPASS(fd != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	if ((error = fdalloc(td, 0, fd))) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 	fhold(fp);
 	_finstall(fdp, fp, *fd, flags, fcaps);
 	FILEDESC_XUNLOCK(fdp);
 	return (0);
 }
 
 /*
  * Build a new filedesc structure from another.
  * Copy the current, root, and jail root vnode references.
  *
  * If fdp is NULL, only a fresh, empty structure is returned.
  *
  * If fdp is not NULL and prepfiles is true, the new table is grown until
  * it can hold fdp->fd_lastfile and the function returns with fdp shared
  * locked.  If prepfiles is false, fdp's shared lock is dropped before
  * returning.
  */
 struct filedesc *
 fdinit(struct filedesc *fdp, bool prepfiles)
 {
 	struct filedesc0 *newfdp0;
 	struct filedesc *newfdp;
 
 	newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
 	newfdp = &newfdp0->fd_fd;
 
 	/* Create the file descriptor table. */
 	FILEDESC_LOCK_INIT(newfdp);
 	refcount_init(&newfdp->fd_refcnt, 1);
 	refcount_init(&newfdp->fd_holdcnt, 1);
 	newfdp->fd_cmask = CMASK;
 	/* Start out on the map and table embedded in struct filedesc0. */
 	newfdp->fd_map = newfdp0->fd_dmap;
 	newfdp->fd_lastfile = -1;
 	newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
 	newfdp->fd_files->fdt_nfiles = NDFILE;
 
 	if (fdp == NULL)
 		return (newfdp);
 
 	/*
 	 * First sizing attempt, made before fdp is locked; the loop at the
 	 * end re-checks fd_lastfile with the lock held.
 	 */
 	if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles)
 		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 
 	FILEDESC_SLOCK(fdp);
 	newfdp->fd_cdir = fdp->fd_cdir;
 	if (newfdp->fd_cdir)
 		vrefact(newfdp->fd_cdir);
 	newfdp->fd_rdir = fdp->fd_rdir;
 	if (newfdp->fd_rdir)
 		vrefact(newfdp->fd_rdir);
 	newfdp->fd_jdir = fdp->fd_jdir;
 	if (newfdp->fd_jdir)
 		vrefact(newfdp->fd_jdir);
 
 	if (!prepfiles) {
 		FILEDESC_SUNLOCK(fdp);
 	} else {
 		/*
 		 * The lock is dropped around each fdgrowtable() call and
 		 * the size is re-checked after it is re-taken.
 		 */
 		while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
 			FILEDESC_SUNLOCK(fdp);
 			fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 			FILEDESC_SLOCK(fdp);
 		}
 	}
 
 	return (newfdp);
 }
 
 /*
  * Take a hold (fd_holdcnt) on the process's filedesc and return it, or
  * return NULL when the process has none.  The process lock must be owned
  * (asserted).
  */
 static struct filedesc *
 fdhold(struct proc *p)
 {
 	struct filedesc *fdp;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	fdp = p->p_fd;
 	if (fdp == NULL)
 		return (NULL);
 	refcount_acquire(&fdp->fd_holdcnt);
 	return (fdp);
 }
 
 /*
  * Drop a hold on 'fdp'.  When fd_holdcnt is observed at 1 or lower, or
  * refcount_release() reports a non-zero result, the lock is destroyed and
  * the structure is returned to filedesc0_zone.
  */
 static void
 fddrop(struct filedesc *fdp)
 {
 
 	if (fdp->fd_holdcnt > 1 &&
 	    refcount_release(&fdp->fd_holdcnt) == 0)
 		return;
 
 	FILEDESC_LOCK_DESTROY(fdp);
 	uma_zfree(filedesc0_zone, fdp);
 }
 
 /*
  * Share a filedesc structure.
  *
  * Takes one more fd_refcnt reference on 'fdp' and returns the same
  * pointer; nothing is copied.
  */
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
 
 	refcount_acquire(&fdp->fd_refcnt);
 	return (fdp);
 }
 
 /*
  * Give the thread's process a private descriptor table.  Nothing is done
  * when fd_refcnt is already 1; otherwise the table is duplicated with
  * fdcopy(), the old one is released through fdescfree() and the copy is
  * installed as p_fd.
  */
 void
 fdunshare(struct thread *td)
 {
 	struct proc *p;
 	struct filedesc *copy;
 
 	p = td->td_proc;
 	if (p->p_fd->fd_refcnt == 1)
 		return;
 
 	copy = fdcopy(p->p_fd);
 	fdescfree(td);
 	p->p_fd = copy;
 }
 
 /*
  * Replace the descriptor table of the thread's process with 'fdp': the
  * current table is released via fdescfree() and 'fdp' is then stored in
  * p_fd.
  */
 void
 fdinstall_remapped(struct thread *td, struct filedesc *fdp)
 {
 
 	fdescfree(td);
 	td->td_proc->p_fd = fdp;
 }
 
 /*
  * Copy a filedesc structure.  'fdp' must not be NULL (asserted).
  *
  * Every open descriptor whose file operations carry DFLAG_PASSABLE is
  * duplicated at the same index, with its own copy of the capabilities and
  * an extra file reference.  fdinit() leaves 'fdp' shared locked; that lock
  * is dropped here before returning.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int i;
 
 	MPASS(fdp != NULL);
 
 	newfdp = fdinit(fdp, true);
 	/* copy all passable descriptors (i.e. not kqueue) */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
 		ofde = &fdp->fd_ofiles[i];
 		if (ofde->fde_file == NULL ||
 		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 			/* Remember the first index left unused. */
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
 			continue;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 		fhold(nfde->fde_file);
 		fdused_init(newfdp, i);
 		newfdp->fd_lastfile = i;
 	}
 	/* No hole found: the first free index follows the last one copied. */
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_SUNLOCK(fdp);
 	return (newfdp);
 }
 
 /*
  * Copies a filedesc structure, while remapping all file descriptors
  * stored inside using a translation table.
  *
  * File descriptors are copied over to the new file descriptor table,
  * regardless of whether the close-on-exec flag is set.
  *
  * Entry i of the new table is a copy of descriptor fds[i] of 'fdp', for
  * i in [0, nfds).  On success the new table is stored in *ret and 0 is
  * returned.  Errors: E2BIG if 'nfds' exceeds fdp->fd_lastfile + 1, EBADF
  * if an fds[] entry is out of range or not open, EINVAL if a file lacks
  * DFLAG_PASSABLE.  On error the partially built table is released and
  * *ret is left untouched.
  */
 int
 fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
     struct filedesc **ret)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int error, i;
 
 	MPASS(fdp != NULL);
 
 	/* fdinit(fdp, true) returns with fdp shared locked. */
 	newfdp = fdinit(fdp, true);
 	if (nfds > fdp->fd_lastfile + 1) {
 		/* New table cannot be larger than the old one. */
 		error = E2BIG;
 		goto bad;
 	}
 	/* Copy all passable descriptors (i.e. not kqueue). */
 	newfdp->fd_freefile = nfds;
 	for (i = 0; i < nfds; ++i) {
 		if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) {
 			/* File descriptor out of bounds. */
 			error = EBADF;
 			goto bad;
 		}
 		ofde = &fdp->fd_ofiles[fds[i]];
 		if (ofde->fde_file == NULL) {
 			/* Unused file descriptor. */
 			error = EBADF;
 			goto bad;
 		}
 		if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 			/* File descriptor cannot be passed. */
 			error = EINVAL;
 			goto bad;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 		fhold(nfde->fde_file);
 		fdused_init(newfdp, i);
 		newfdp->fd_lastfile = i;
 	}
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_SUNLOCK(fdp);
 	*ret = newfdp;
 	return (0);
 bad:
 	/* Unlock the source, then tear down what was built so far. */
 	FILEDESC_SUNLOCK(fdp);
 	fdescfree_remapped(newfdp);
 	return (error);
 }
 
 /*
  * Clear POSIX style locks. This is only used when fdp loses a reference (i.e.
  * one of processes using it exits) and the table used to be shared.
  *
  * Also drops this process's reference on its filedesc_to_leader (p_fdtol),
  * unlinking and freeing it when no references or holds remain, and clears
  * p_fdtol.  p_fdtol must not be NULL on entry (asserted).
  */
 static void
 fdclearlocks(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedesc_to_leader *fdtol;
 	struct flock lf;
 	struct file *fp;
 	struct proc *p;
 	struct vnode *vp;
 	int i;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	fdtol = p->p_fdtol;
 	MPASS(fdtol != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	KASSERT(fdtol->fdl_refcount > 0,
 	    ("filedesc_to_refcount botch: fdl_refcount=%d",
 	    fdtol->fdl_refcount));
 	/*
 	 * Last reference and the leader has P_ADVLOCK set: unlock the whole
 	 * range of every vnode-backed file on behalf of the leader.
 	 */
 	if (fdtol->fdl_refcount == 1 &&
 	    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 		for (i = 0; i <= fdp->fd_lastfile; i++) {
 			fp = fdp->fd_ofiles[i].fde_file;
 			if (fp == NULL || fp->f_type != DTYPE_VNODE)
 				continue;
 			/*
 			 * Keep the file referenced while the table lock is
 			 * dropped around VOP_ADVLOCK().
 			 */
 			fhold(fp);
 			FILEDESC_XUNLOCK(fdp);
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			vp = fp->f_vnode;
 			(void) VOP_ADVLOCK(vp,
 			    (caddr_t)p->p_leader, F_UNLCK,
 			    &lf, F_POSIX);
 			FILEDESC_XLOCK(fdp);
 			fdrop(fp, td);
 		}
 	}
 retry:
 	/* Re-evaluated from scratch after every sleep. */
 	if (fdtol->fdl_refcount == 1) {
 		if (fdp->fd_holdleaderscount > 0 &&
 		    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 			/*
 			 * close() or kern_dup() has cleared a reference
 			 * in a shared file descriptor table.
 			 */
 			fdp->fd_holdleaderswakeup = 1;
 			sx_sleep(&fdp->fd_holdleaderscount,
 			    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 			goto retry;
 		}
 		if (fdtol->fdl_holdcount > 0) {
 			/*
 			 * Ensure that fdtol->fdl_leader remains
 			 * valid in closef().
 			 */
 			fdtol->fdl_wakeup = 1;
 			sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
 			    "fdlhold", 0);
 			goto retry;
 		}
 	}
 	fdtol->fdl_refcount--;
 	if (fdtol->fdl_refcount == 0 &&
 	    fdtol->fdl_holdcount == 0) {
 		/* Unlink from the doubly linked list; freed after unlock. */
 		fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 		fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 	} else
 		fdtol = NULL;
 	p->p_fdtol = NULL;
 	FILEDESC_XUNLOCK(fdp);
 	if (fdtol != NULL)
 		free(fdtol, M_FILEDESC_TO_LEADER);
 }
 
 /*
  * Release a filedesc structure.
  *
  * Every open entry is cleared with fdefree_last() and its file is either
  * closed with closef() ('needclose' true) or merely dereferenced with
  * fdrop().  Dynamically allocated map and table storage, plus every old
  * table parked on the fd_free list, is then freed and the hold on 'fdp' is
  * dropped via fddrop().
  */
 static void
 fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft, *tft;
 	struct filedescent *fde;
 	struct file *fp;
 	int i;
 
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL) {
 			fdefree_last(fde);
 			if (needclose)
 				(void) closef(fp, td);
 			else
 				fdrop(fp, td);
 		}
 	}
 
 	/* Sizes at or below NDFILE mean storage that is not freed here. */
 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 		free(fdp->fd_map, M_FILEDESC);
 	if (fdp->fd_nfiles > NDFILE)
 		free(fdp->fd_files, M_FILEDESC);
 
 	/* Free the superseded tables queued on fd_free. */
 	fdp0 = (struct filedesc0 *)fdp;
 	SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
 		free(ft->ft_table, M_FILEDESC);
 
 	fddrop(fdp);
 }
 
 /*
  * Detach the descriptor table from the thread's process and drop the
  * process's reference on it.  p_fd must not be NULL on entry (asserted)
  * and is NULL afterwards.  When refcount_release() reports a non-zero
  * result, the directory vnodes are released and the table is torn down,
  * closing its files; otherwise the function returns right after the
  * reference drop.
  */
 void
 fdescfree(struct thread *td)
 {
 	struct proc *p;
 	struct filedesc *fdp;
 	struct vnode *cdir, *jdir, *rdir;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	MPASS(fdp != NULL);
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		racct_set(p, RACCT_NOFILE, 0);
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	if (p->p_fdtol != NULL)
 		fdclearlocks(td);
 
 	/* p_fd is cleared under the process lock. */
 	PROC_LOCK(p);
 	p->p_fd = NULL;
 	PROC_UNLOCK(p);
 
 	if (refcount_release(&fdp->fd_refcnt) == 0)
 		return;
 
 	/*
 	 * Detach the directory vnodes under the lock; the references are
 	 * released only after it has been dropped.
 	 */
 	FILEDESC_XLOCK(fdp);
 	cdir = fdp->fd_cdir;
 	fdp->fd_cdir = NULL;
 	rdir = fdp->fd_rdir;
 	fdp->fd_rdir = NULL;
 	jdir = fdp->fd_jdir;
 	fdp->fd_jdir = NULL;
 	FILEDESC_XUNLOCK(fdp);
 
 	if (cdir != NULL)
 		vrele(cdir);
 	if (rdir != NULL)
 		vrele(rdir);
 	if (jdir != NULL)
 		vrele(jdir);
 
 	fdescfree_fds(td, fdp, 1);
 }
 
 /*
  * Tear down a filedesc structure that is passed in directly rather than
  * taken from a process: release its directory vnodes, then free its
  * descriptors with fdescfree_fds() on behalf of curthread.  Files are
  * dereferenced with fdrop() rather than closed (needclose is 0).
  */
 void
 fdescfree_remapped(struct filedesc *fdp)
 {
 
 	if (fdp->fd_cdir != NULL)
 		vrele(fdp->fd_cdir);
 	if (fdp->fd_rdir != NULL)
 		vrele(fdp->fd_rdir);
 	if (fdp->fd_jdir != NULL)
 		vrele(fdp->fd_jdir);
 
 	fdescfree_fds(curthread, fdp, 0);
 }
 
 /*
  * For setugid programs, we don't want people to use that setugidness
  * to generate error messages which write to a file which would otherwise
  * be off-limits to the process.  We check for filesystems where the vnode
  * can change out from under us after execve (like [lin]procfs).
  *
  * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
  * sufficient.  We also don't check for setugidness since we know we are.
  *
  * Returns true only for a DTYPE_VNODE file whose vnode has VV_PROCDEP set.
  */
 static bool
 is_unsafe(struct file *fp)
 {
 
 	return (fp->f_type == DTYPE_VNODE &&
 	    (fp->f_vnode->v_vflag & VV_PROCDEP) != 0);
 }
 
 /*
  * Make this setugid thing safe, if at all possible.
  */
 void
 fdsetugidsafety(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file *file;
 	int fd;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 
 	/* Only descriptors 0, 1 and 2 are inspected. */
 	for (fd = 0; fd <= 2; fd++) {
 		file = fdp->fd_ofiles[fd].fde_file;
 		if (file == NULL || !is_unsafe(file))
 			continue;
 
 		FILEDESC_XLOCK(fdp);
 		knote_fdclose(td, fd);
 		/*
 		 * Clear the slot before closing so that nobody can find
 		 * the file through the table while closef() blocks.
 		 */
 		fdfree(fdp, fd);
 		FILEDESC_XUNLOCK(fdp);
 		(void) closef(file, td);
 	}
 }
 
 /*
  * If a specific file object occupies a specific file descriptor, close the
  * file descriptor entry and drop a reference on the file object.  This is a
  * convenience function to handle a subsequent error in a function that calls
  * falloc() that handles the race that another thread might have closed the
  * file descriptor out from under the thread creating the file object.
  */
 void
 fdclose(struct thread *td, struct file *fp, int idx)
 {
 	struct filedesc *fdp;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_ofiles[idx].fde_file != fp) {
 		/* The slot no longer holds our file; nothing to undo. */
 		FILEDESC_XUNLOCK(fdp);
 		return;
 	}
 	fdfree(fdp, idx);
 	FILEDESC_XUNLOCK(fdp);
 	fdrop(fp, td);
 }
 
 /*
  * Close any files on exec?
  */
 void
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedescent *entry;
 	struct file *file;
 	int fd;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 		entry = &fdp->fd_ofiles[fd];
 		file = entry->fde_file;
 		if (file == NULL)
 			continue;
 		/* Message queues and UF_EXCLOSE descriptors are closed. */
 		if (file->f_type != DTYPE_MQUEUE &&
 		    (entry->fde_flags & UF_EXCLOSE) == 0)
 			continue;
 
 		FILEDESC_XLOCK(fdp);
 		fdfree(fdp, fd);
 		(void) closefp(fdp, fd, file, td, 0);
 		/* closefp() is expected to return with the lock released. */
 		FILEDESC_UNLOCK_ASSERT(fdp);
 	}
 }
 
 /*
  * It is unsafe for set[ug]id processes to be started with file
  * descriptors 0..2 closed, as these descriptors are given implicit
  * significance in the Standard C library.  fdcheckstd() will create a
  * descriptor referencing /dev/null for each of stdin, stdout, and
  * stderr that is not already open.
  */
 int
 fdcheckstd(struct thread *td)
 {
 	struct filedesc *fdp;
 	register_t save;
 	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	devnull = -1;
 	for (i = 0; i <= 2; i++) {
 		if (fdp->fd_ofiles[i].fde_file != NULL)
 			continue;
 
 		save = td->td_retval[0];
 		if (devnull != -1) {
 			error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
 		} else {
 			error = kern_openat(td, AT_FDCWD, "/dev/null",
 			    UIO_SYSSPACE, O_RDWR, 0);
 			if (error == 0) {
 				devnull = td->td_retval[0];
 				KASSERT(devnull == i, ("we didn't get our fd"));
 			}
 		}
 		td->td_retval[0] = save;
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Internal form of close.  Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  *
  * XXXRW: Giant is not required for the caller, but often will be held; this
  * makes it moderately likely the Giant will be recursed in the VFS case.
  */
 int
 closef(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
 	 * locks owned by this process.  This is handled by setting
 	 * a flag in the unlock to free ONLY locks obeying POSIX
 	 * semantics, and not to free BSD-style file locks.
 	 * If the descriptor was in a message, POSIX-style locks
 	 * aren't passed with the descriptor, and the thread pointer
 	 * will be NULL.  Callers should be careful only to pass a
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
 	 */
 	if (fp->f_type == DTYPE_VNODE && td != NULL) {
 		vp = fp->f_vnode;
 		/*
 		 * Unlock on behalf of this process' leader, but only if
 		 * the leader has P_ADVLOCK set.
 		 */
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 			    F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
 			 * Handle special case where file descriptor table is
 			 * shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
 			FILEDESC_XLOCK(fdp);
 			/*
 			 * Walk the circular list starting after our own
 			 * entry and stopping when we get back to it.
 			 */
 			for (fdtol = fdtol->fdl_next;
 			    fdtol != td->td_proc->p_fdtol;
 			    fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
 				    P_ADVLOCK) == 0)
 					continue;
 				/*
 				 * fdl_holdcount is raised before the
 				 * filedesc lock is dropped around
 				 * VOP_ADVLOCK() and lowered again once the
 				 * lock has been retaken.
 				 */
 				fdtol->fdl_holdcount++;
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
 				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
 				    F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				/*
 				 * Hold count back at zero with a wakeup
 				 * requested: clear the request and wake
 				 * the sleepers on this entry.
 				 */
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
 					fdtol->fdl_wakeup = 0;
 					wakeup(fdtol);
 				}
 			}
 			FILEDESC_XUNLOCK(fdp);
 		}
 	}
 	/* The caller's reference is dropped whatever the file type. */
 	return (fdrop(fp, td));
 }
 
 /*
  * Initialize the file pointer with the specified properties.
  *
  * The ops are set with release semantics to be certain that the flags, type,
  * and data are visible when ops is.  This is to prevent ops methods from being
  * called with bad data.
  */
 void
 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
 {
 
 	/* Plain stores first ... */
 	fp->f_type = type;
 	fp->f_flag = flag;
 	fp->f_data = data;
 	/* ... then publish ops last, with release semantics. */
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 }
 
 /*
  * Look up fd in fdp with the filedesc lock held.  On success *fpp is set
  * to the file (no reference is taken here) and, when havecapsp is not
  * NULL, the entry's capabilities are copied into it.
  *
  * Returns EBADF when the descriptor has no entry, or the cap_check()
  * error when CAPABILITIES is configured and the rights are insufficient.
  */
 int
 fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp, struct filecaps *havecapsp)
 {
 	struct filedescent *entry;
 #ifdef CAPABILITIES
 	int error;
 #endif
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	entry = fdeget_locked(fdp, fd);
 	if (entry == NULL)
 		return (EBADF);
 
 #ifdef CAPABILITIES
 	error = cap_check(cap_rights_fde(entry), needrightsp);
 	if (error != 0)
 		return (error);
 #endif
 
 	if (havecapsp != NULL)
 		filecaps_copy(&entry->fde_caps, havecapsp, true);
 
 	*fpp = entry->fde_file;
 	return (0);
 }
 
 /*
  * Get a held reference to the file behind fd for the thread's process
  * and, when havecapsp is not NULL, a copy of the descriptor's
  * capabilities.  Returns 0 with *fpp set, or an errno value.
  */
 int
 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct file **fpp, struct filecaps *havecapsp)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 #ifndef CAPABILITIES
 	/* Without CAPABILITIES the caller gets filecaps_fill()'s contents. */
 	error = fget_unlocked(fdp, fd, needrightsp, fpp, NULL);
 	if (error == 0 && havecapsp != NULL)
 		filecaps_fill(havecapsp);
 #else
 	struct file *fp;
 	seq_t seq;
 
 	/*
 	 * Lockless attempt: fetch the file, copy the capabilities and
 	 * retry for as long as fd_modified() reports that the entry
 	 * changed since 'seq' was sampled.
 	 */
 	for (;;) {
 		error = fget_unlocked(fdp, fd, needrightsp, &fp, &seq);
 		if (error != 0)
 			return (error);
 
 		if (havecapsp != NULL) {
 			/*
 			 * The copy is requested with 'false' as its last
 			 * argument; when it reports failure, drop the
 			 * file and redo the lookup under the lock.
 			 */
 			if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
 			    havecapsp, false)) {
 				fdrop(fp, td);
 				goto get_locked;
 			}
 		}
 
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(fp, td);
 	}
 
 	*fpp = fp;
 	return (0);
 
 get_locked:
 	/* Locked fallback; fget_cap_locked() does not take a reference. */
 	FILEDESC_SLOCK(fdp);
 	error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
 	if (error == 0)
 		fhold(*fpp);
 	FILEDESC_SUNLOCK(fdp);
 #endif
 	return (error);
 }
 
 /*
  * Look up fd in fdp without taking the filedesc lock and return the file
  * with an additional reference in *fpp.
  *
  * Returns EBADF for an out-of-range or empty descriptor, and the
  * cap_check() error when CAPABILITIES is configured and the rights do not
  * suffice.  When seqp is not NULL and CAPABILITIES is configured, the
  * sequence value the entry was read under is stored in *seqp; without
  * CAPABILITIES *seqp is left untouched.
  */
 int
 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp, seq_t *seqp)
 {
 #ifdef CAPABILITIES
 	struct filedescent *fde;
 #endif
 	struct fdescenttbl *fdt;
 	struct file *fp;
 	u_int count;
 #ifdef CAPABILITIES
 	seq_t seq;
 	cap_rights_t haverights;
 	int error;
 #endif
 
 	fdt = fdp->fd_files;
 	/* The unsigned cast makes negative descriptors fail this test too. */
 	if ((u_int)fd >= fdt->fdt_nfiles)
 		return (EBADF);
 	/*
 	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
 	 * never raising a refcount above 0.  To accomplish this we have
 	 * to use a cmpset loop rather than an atomic_add.  The descriptor
 	 * must be re-verified once we acquire a reference to be certain
 	 * that the identity is still correct and we did not lose a race
 	 * due to preemption.
 	 */
 	for (;;) {
 #ifdef CAPABILITIES
 		/* Read rights and file pointer as one consistent snapshot. */
 		seq = seq_read(fd_seq(fdt, fd));
 		fde = &fdt->fdt_ofiles[fd];
 		haverights = *cap_rights_fde(fde);
 		fp = fde->fde_file;
 		if (!seq_consistent(fd_seq(fdt, fd), seq))
 			continue;
 #else
 		fp = fdt->fdt_ofiles[fd].fde_file;
 #endif
 		if (fp == NULL)
 			return (EBADF);
 #ifdef CAPABILITIES
 		error = cap_check(&haverights, needrightsp);
 		if (error != 0)
 			return (error);
 #endif
 		count = fp->f_count;
 	retry:
 		if (count == 0) {
 			/*
 			 * Force a reload. Another thread could reallocate the
 			 * table before this fd was closed, so it is possible
 			 * that there is a stale fp pointer in the cached
 			 * version.
 			 */
 			fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files);
 			continue;
 		}
 		/*
 		 * Use an acquire barrier to force re-reading of fdt so it is
 		 * refreshed for verification.
 		 */
 		if (atomic_fcmpset_acq_int(&fp->f_count, &count, count + 1) == 0)
 			goto retry;
 		fdt = fdp->fd_files;
 		/* Re-verify the slot now that the reference is held. */
 #ifdef	CAPABILITIES
 		if (seq_consistent_nomb(fd_seq(fdt, fd), seq))
 #else
 		if (fp == fdt->fdt_ofiles[fd].fde_file)
 #endif
 			break;
 		/* The slot changed under us: undo the reference and retry. */
 		fdrop(fp, curthread);
 	}
 	*fpp = fp;
 	if (seqp != NULL) {
 #ifdef CAPABILITIES
 		*seqp = seq;
 #endif
 	}
 	return (0);
 }
 
 /*
  * Extract the file pointer associated with the specified descriptor for the
  * current user process.
  *
  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
  * returned.
  *
  * File's rights will be checked against the capability rights mask.
  *
  * If an error occurred the non-zero error is returned and *fpp is set to
  * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
  * responsible for fdrop().
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags,
     cap_rights_t *needrightsp, seq_t *seqp)
 {
 	struct file *file;
 	int error;
 
 	*fpp = NULL;
 	error = fget_unlocked(td->td_proc->p_fd, fd, needrightsp, &file, seqp);
 	if (error != 0)
 		return (error);
 	if (file->f_ops == &badfileops) {
 		fdrop(file, td);
 		return (EBADF);
 	}
 
 	/*
 	 * Access mode mismatches are reported as EBADF, as per POSIX for
 	 * FREAD and FWRITE.  FEXEC needs FREAD or FEXEC and no FWRITE.
 	 */
 	error = 0;
 	switch (flags) {
 	case 0:
 		break;
 	case FREAD:
 	case FWRITE:
 		if ((file->f_flag & flags) == 0)
 			error = EBADF;
 		break;
 	case FEXEC:
 		if ((file->f_flag & (FREAD | FEXEC)) == 0 ||
 		    (file->f_flag & FWRITE) != 0)
 			error = EBADF;
 		break;
 	default:
 		KASSERT(0, ("wrong flags"));
 	}
 
 	if (error == 0) {
 		*fpp = file;
 		return (0);
 	}
 	fdrop(file, td);
 	return (error);
 }
 
 /* _fget() with no access mode requirement and no sequence output. */
 int
 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	int error;
 
 	error = _fget(td, fd, fpp, 0, rightsp, NULL);
 	return (error);
 }
 
 /*
  * Like fget(), and additionally report through *maxprotp (when it is not
  * NULL) the VM protection derived from the descriptor's rights.
  */
 int
 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, vm_prot_t *maxprotp,
     struct file **fpp)
 {
 	int error;
 #ifndef CAPABILITIES
 	/* Without CAPABILITIES *maxprotp is always set to VM_PROT_ALL. */
 	error = _fget(td, fd, fpp, 0, rightsp, NULL);
 	if (maxprotp != NULL)
 		*maxprotp = VM_PROT_ALL;
 #else
 	cap_rights_t fdrights;
 	struct filedesc *fdp = td->td_proc->p_fd;
 	seq_t seq;
 
 	MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
 	/*
 	 * Snapshot the descriptor's rights and retry while fd_modified()
 	 * says the entry changed since the file was looked up.
 	 */
 	for (;;) {
 		error = _fget(td, fd, fpp, 0, rightsp, &seq);
 		if (error != 0)
 			return (error);
 		if (maxprotp != NULL)
 			fdrights = *cap_rights(fdp, fd);
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(*fpp, td);
 	}
 
 	/*
 	 * If requested, convert capability rights to access flags.
 	 */
 	if (maxprotp != NULL)
 		*maxprotp = cap_rights_to_vmprot(&fdrights);
 #endif
 	return (error);
 }
 
 /* _fget() requiring FREAD on the file. */
 int
 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	int error;
 
 	error = _fget(td, fd, fpp, FREAD, rightsp, NULL);
 	return (error);
 }
 
 /* _fget() requiring FWRITE on the file. */
 int
 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 	int error;
 
 	error = _fget(td, fd, fpp, FWRITE, rightsp, NULL);
 	return (error);
 }
 
 /*
  * Get a held reference to the file behind fd.  With CAPABILITIES the
  * fcntl command 'needfcntl' is also checked against the descriptor via
  * cap_fcntl_check(); on failure of that check the reference is dropped
  * and *fpp is set to NULL.
  */
 int
 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
     struct file **fpp)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 #ifndef CAPABILITIES
 	/* needfcntl is not consulted without CAPABILITIES. */
 	return (fget_unlocked(fdp, fd, rightsp, fpp, NULL));
 #else
 	int error;
 	seq_t seq;
 
 	MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
 	/*
 	 * Repeat lookup and check until fd_modified() reports that the
 	 * entry did not change in between.
 	 */
 	for (;;) {
 		error = fget_unlocked(fdp, fd, rightsp, fpp, &seq);
 		if (error != 0)
 			return (error);
 		error = cap_fcntl_check(fdp, fd, needfcntl);
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(*fpp, td);
 	}
 	if (error != 0) {
 		fdrop(*fpp, td);
 		*fpp = NULL;
 	}
 	return (error);
 #endif
 }
 
 /*
  * Like fget() but loads the underlying vnode, or returns an error if the
  * descriptor does not represent a vnode.  Note that pipes use vnodes but
  * never have VM objects.  The returned vnode will be vref()'d.
  *
  * XXX: what about the unused flags ?
  */
 static __inline int
 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
     struct vnode **vpp)
 {
 	struct file *file;
 	int error;
 
 	*vpp = NULL;
 	error = _fget(td, fd, &file, flags, needrightsp, NULL);
 	if (error != 0)
 		return (error);
 
 	/* Hand out a referenced vnode, or EINVAL if the file has none. */
 	if (file->f_vnode != NULL) {
 		*vpp = file->f_vnode;
 		vrefact(*vpp);
 	} else {
 		error = EINVAL;
 	}
 
 	/* The file reference was only needed to reach the vnode. */
 	fdrop(file, td);
 	return (error);
 }
 
 /* _fgetvp() with no access mode requirement. */
 int
 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, 0, rightsp, vpp);
 	return (error);
 }
 
 /*
  * Look up the vnode behind fd together with the descriptor's
  * capabilities.  The lookup goes through fget_cap_locked(), which asserts
  * that the filedesc lock is held.  On success *vpp is a referenced vnode
  * and *havecaps receives the capabilities; on failure the temporary copy
  * of the capabilities is released with filecaps_free().
  */
 int
 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct filecaps *havecaps, struct vnode **vpp)
 {
 	struct filecaps caps;
 	struct file *file;
 	int error;
 
 	error = fget_cap_locked(td->td_proc->p_fd, fd, needrightsp, &file,
 	    &caps);
 	if (error != 0)
 		return (error);
 
 	if (file->f_ops == &badfileops)
 		error = EBADF;
 	else if (file->f_vnode == NULL)
 		error = EINVAL;
 	if (error != 0) {
 		filecaps_free(&caps);
 		return (error);
 	}
 
 	*havecaps = caps;
 	*vpp = file->f_vnode;
 	vrefact(*vpp);
 	return (0);
 }
 
 /* _fgetvp() requiring FREAD on the file. */
 int
 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, FREAD, rightsp, vpp);
 	return (error);
 }
 
 /* _fgetvp() with the FEXEC access check. */
 int
 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, FEXEC, rightsp, vpp);
 	return (error);
 }
 
 #ifdef notyet
 /* _fgetvp() requiring FWRITE on the file; compiled only with notyet. */
 int
 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, FWRITE, rightsp, vpp);
 	return (error);
 }
 #endif
 
 /*
  * Handle the last reference to a file being closed.
  */
 int
 _fdrop(struct file *fp, struct thread *td)
 {
 	int rv;
 
 	/* Must only be reached once the reference count is zero. */
 	if (fp->f_count != 0)
 		panic("fdrop: count %d", fp->f_count);
 
 	/* Close through the file's ops and keep the result for the caller. */
 	rv = fo_close(fp, td);
 
 	/* Undo the accounting and give back everything the file owned. */
 	atomic_subtract_int(&openfiles, 1);
 	crfree(fp->f_cred);
 	free(fp->f_advice, M_FADVISE);
 	uma_zfree(file_zone, fp);
 
 	return (rv);
 }
 
 /*
  * Apply an advisory lock on a file descriptor.
  *
  * Just attempt to get a record lock of the requested type on the entire file
  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
 	int	fd;
 	int	how;
 };
 #endif
 /* ARGSUSED */
 int
 sys_flock(struct thread *td, struct flock_args *uap)
 {
 	struct flock lf;
 	struct file *fp;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_VNODE) {
 		fdrop(fp, td);
 		return (EOPNOTSUPP);
 	}
 
 	/* The request always covers the whole file. */
 	vp = fp->f_vnode;
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 
 	if ((uap->how & LOCK_UN) != 0) {
 		/* LOCK_UN takes precedence over the other request bits. */
 		lf.l_type = F_UNLCK;
 		atomic_clear_int(&fp->f_flag, FHASLOCK);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 	} else if ((uap->how & (LOCK_EX | LOCK_SH)) == 0) {
 		/* Neither an exclusive nor a shared lock was asked for. */
 		error = EBADF;
 	} else {
 		/* LOCK_EX wins when both LOCK_EX and LOCK_SH are given. */
 		lf.l_type = (uap->how & LOCK_EX) != 0 ? F_WRLCK : F_RDLCK;
 		atomic_set_int(&fp->f_flag, FHASLOCK);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 		    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 	}
 
 	fdrop(fp, td);
 	return (error);
 }
 /*
  * Duplicate the specified descriptor to a free descriptor.
  */
 int
 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
     int openerror, int *indxp)
 {
 	struct filedescent *newfde, *oldfde;
 	struct file *fp;
 	int error, indx;
 
 	/* Only the two errors handled by the switch below are accepted. */
 	KASSERT(openerror == ENODEV || openerror == ENXIO,
 	    ("unexpected error %d in %s", openerror, __func__));
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/* Reserve the new descriptor; it is stored in *indxp on success. */
 	error = fdalloc(td, 0, &indx);
 	if (error != 0) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
 	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
 	 */
 	switch (openerror) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 			/* Give the freshly reserved slot back. */
 			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
 		/* The new slot gets its own reference on the file. */
 		fhold(fp);
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seq_write_begin(&newfde->fde_seq);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true);
 #ifdef CAPABILITIES
 		seq_write_end(&newfde->fde_seq);
 #endif
 		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seq_write_begin(&newfde->fde_seq);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		/* No fhold() here: the existing reference moves to indx. */
 		oldfde->fde_file = NULL;
 		fdunused(fdp, dfd);
 #ifdef CAPABILITIES
 		seq_write_end(&newfde->fde_seq);
 #endif
 		break;
 	}
 	FILEDESC_XUNLOCK(fdp);
 	*indxp = indx;
 	return (0);
 }
 
 /*
  * This sysctl determines if we will allow a process to chroot(2) if it
  * has a directory open:
  *	0: disallowed for all processes.
  *	1: allowed for processes that were not already chroot(2)'ed.
  *	2: allowed for all processes.
  */
 
 /* Policy value; initialised to 1 and writable through the sysctl below. */
 static int chroot_allow_open_directories = 1;
 
 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
 
 /*
  * Helper function for raised chroot(2) security function:  Refuse if
  * any filedescriptors are open directories.
  */
 static int
 chroot_refuse_vdir_fds(struct filedesc *fdp)
 {
 	struct file *file;
 	int fd;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 		file = fget_locked(fdp, fd);
 		/* Only vnode-backed descriptors can be directories. */
 		if (file == NULL || file->f_type != DTYPE_VNODE)
 			continue;
 		if (file->f_vnode->v_type == VDIR)
 			return (EPERM);
 	}
 	return (0);
 }
 
 /*
- * Common routine for kern_chroot() and jail_attach().  The caller is
- * responsible for invoking priv_check() and mac_vnode_check_chroot() to
- * authorize this operation.
- */
+ * The caller is responsible for invoking priv_check() and
+ * mac_vnode_check_chroot() to authorize this operation.
+ */
 int
 pwd_chroot(struct thread *td, struct vnode *vp)
 {
 	struct filedesc *fdp;
 	struct vnode *oldvp;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
 		if (error != 0) {
 			FILEDESC_XUNLOCK(fdp);
 			return (error);
 		}
 	}
 	oldvp = fdp->fd_rdir;
 	vrefact(vp);
 	fdp->fd_rdir = vp;
 	if (fdp->fd_jdir == NULL) {
 		vrefact(vp);
 		fdp->fd_jdir = vp;
 	}
 	FILEDESC_XUNLOCK(fdp);
 	vrele(oldvp);
 	return (0);
 }
 
 /*
  * Make vp the working directory of the thread's process.  No vrefact()
  * is done here: vp is stored as passed in, and the previous working
  * directory is vrele()'d.
  */
 void
 pwd_chdir(struct thread *td, struct vnode *vp)
 {
 	struct filedesc *fdp;
 	struct vnode *prevcwd;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	VNASSERT(vp->v_usecount > 0, vp,
 	    ("chdir to a vnode with zero usecount"));
 	prevcwd = fdp->fd_cdir;
 	fdp->fd_cdir = vp;
 	FILEDESC_XUNLOCK(fdp);
 
 	/* Released only after the lock has been dropped. */
 	vrele(prevcwd);
 }
 
+/*
+ * jail_attach(2) changes both root and working directories.
+ */
+int
+pwd_chroot_chdir(struct thread *td, struct vnode *vp)
+{
+	struct filedesc *fdp;
+	struct vnode *prevroot, *prevcwd;
+	int error;
+
+	fdp = td->td_proc->p_fd;
+	FILEDESC_XLOCK(fdp);
+
+	/* Open directories are refused unconditionally here. */
+	error = chroot_refuse_vdir_fds(fdp);
+	if (error != 0) {
+		FILEDESC_XUNLOCK(fdp);
+		return (error);
+	}
+
+	/* Root and working directory each get their own reference. */
+	prevroot = fdp->fd_rdir;
+	vrefact(vp);
+	fdp->fd_rdir = vp;
+	prevcwd = fdp->fd_cdir;
+	vrefact(vp);
+	fdp->fd_cdir = vp;
+	if (fdp->fd_jdir == NULL) {
+		vrefact(vp);
+		fdp->fd_jdir = vp;
+	}
+	FILEDESC_XUNLOCK(fdp);
+
+	/* The previous vnodes are released after the lock is dropped. */
+	vrele(prevroot);
+	vrele(prevcwd);
+	return (0);
+}
+
 /*
  * Scan all active processes and prisons to see if any of them have a current
  * or root directory of `olddp'. If so, replace them with the new mount point.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 {
 	struct filedesc *fdp;
 	struct prison *pr;
 	struct proc *p;
 	int nrele;
 
 	/* A reference count of one means there is nothing to replace. */
 	if (vrefcnt(olddp) == 1)
 		return;
 	/* nrele counts the olddp references to release at the very end. */
 	nrele = 0;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		fdp = fdhold(p);
 		PROC_UNLOCK(p);
 		/* Processes for which fdhold() yields no table are skipped. */
 		if (fdp == NULL)
 			continue;
 		FILEDESC_XLOCK(fdp);
 		if (fdp->fd_cdir == olddp) {
 			vrefact(newdp);
 			fdp->fd_cdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_rdir == olddp) {
 			vrefact(newdp);
 			fdp->fd_rdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_jdir == olddp) {
 			vrefact(newdp);
 			fdp->fd_jdir = newdp;
 			nrele++;
 		}
 		FILEDESC_XUNLOCK(fdp);
 		fddrop(fdp);
 	}
 	sx_sunlock(&allproc_lock);
 	/* The global root vnode gets the same treatment. */
 	if (rootvnode == olddp) {
 		vrefact(newdp);
 		rootvnode = newdp;
 		nrele++;
 	}
 	/* prison0 is handled on its own, then every prison on allprison. */
 	mtx_lock(&prison0.pr_mtx);
 	if (prison0.pr_root == olddp) {
 		vrefact(newdp);
 		prison0.pr_root = newdp;
 		nrele++;
 	}
 	mtx_unlock(&prison0.pr_mtx);
 	sx_slock(&allprison_lock);
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		mtx_lock(&pr->pr_mtx);
 		if (pr->pr_root == olddp) {
 			vrefact(newdp);
 			pr->pr_root = newdp;
 			nrele++;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 	sx_sunlock(&allprison_lock);
 	/* All locks are released by now; drop the replaced references. */
 	while (nrele--)
 		vrele(olddp);
 }
 
 /*
  * Allocate a filedesc_to_leader entry for 'leader'.  With 'old' given the
  * new entry is linked into old's circular list right after it, under the
  * filedesc lock of fdp; otherwise it forms a one-element ring of its own.
  */
 struct filedesc_to_leader *
 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp,
     struct proc *leader)
 {
 	struct filedesc_to_leader *entry;
 
 	entry = malloc(sizeof(struct filedesc_to_leader),
 	    M_FILEDESC_TO_LEADER, M_WAITOK);
 	entry->fdl_refcount = 1;
 	entry->fdl_holdcount = 0;
 	entry->fdl_wakeup = 0;
 	entry->fdl_leader = leader;
 
 	if (old == NULL) {
 		entry->fdl_next = entry;
 		entry->fdl_prev = entry;
 		return (entry);
 	}
 
 	FILEDESC_XLOCK(fdp);
 	entry->fdl_next = old->fdl_next;
 	entry->fdl_prev = old;
 	old->fdl_next = entry;
 	entry->fdl_next->fdl_prev = entry;
 	FILEDESC_XUNLOCK(fdp);
 	return (entry);
 }
 
 /*
  * Report the number of bits set in the current process' descriptor map,
  * counted over the map words up to fd_lastfile.  The request is rejected
  * with EINVAL unless the first name component is 0.
  */
 static int
 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
 {
 	struct filedesc *fdp;
 	int nfds, nwords, word;
 
 	if (*(int *)arg1 != 0)
 		return (EINVAL);
 
 	fdp = curproc->p_fd;
 	nfds = 0;
 	FILEDESC_SLOCK(fdp);
 	nwords = NDSLOTS(fdp->fd_lastfile + 1);
 	for (word = 0; word < nwords; word++) {
 		nfds += bitcountl(fdp->fd_map[word]);
 	}
 	FILEDESC_SUNLOCK(fdp);
 
 	return (SYSCTL_OUT(req, &nfds, sizeof(nfds)));
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
     CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
     "Number of open file descriptors");
 
 /*
  * Get file structures globally.
  */
 static int
 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 {
 	struct xfile xf;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int error, n;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	if (req->oldptr == NULL) {
 		/*
 		 * Size probe: report an upper bound on the number of
 		 * records without producing any.
 		 */
 		n = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			fdp = fdhold(p);
 			PROC_UNLOCK(p);
 			if (fdp == NULL)
 				continue;
 			/*
 			 * Overestimates sparse tables.  The export loop
 			 * below visits indices 0..fd_lastfile inclusive,
 			 * so a table can produce fd_lastfile + 1 records.
 			 */
 			if (fdp->fd_lastfile >= 0)
 				n += fdp->fd_lastfile + 1;
 			fddrop(fdp);
 		}
 		sx_sunlock(&allproc_lock);
 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 	}
 	error = 0;
 	/* One record is reused for every file; start from all zeroes. */
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/* Processes the requester may not see are left out. */
 		if (p_cansee(req->td, p) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		xf.xf_pid = p->p_pid;
 		xf.xf_uid = p->p_ucred->cr_uid;
 		fdp = fdhold(p);
 		PROC_UNLOCK(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_SLOCK(fdp);
 		/* The walk stops early once fd_refcnt has dropped to zero. */
 		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = fp;
 			xf.xf_data = fp->f_data;
 			xf.xf_vnode = fp->f_vnode;
 			xf.xf_type = fp->f_type;
 			xf.xf_count = fp->f_count;
 			xf.xf_msgcount = 0;
 			xf.xf_offset = foffset_get(fp);
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
 				break;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		fddrop(fdp);
 		if (error)
 			break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 #ifdef KINFO_FILE_SIZE
 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
 #endif
 
 /*
  * Translate file flags into their KF_FLAG_* counterparts.  Bits without
  * an entry in the table are not carried over.
  */
 static int
 xlate_fflags(int fflags)
 {
 	static const struct {
 		int	fflag;
 		int	kf_fflag;
 	} fflags_table[] = {
 		{ FAPPEND, KF_FLAG_APPEND },
 		{ FASYNC, KF_FLAG_ASYNC },
 		{ FFSYNC, KF_FLAG_FSYNC },
 		{ FHASLOCK, KF_FLAG_HASLOCK },
 		{ FNONBLOCK, KF_FLAG_NONBLOCK },
 		{ FREAD, KF_FLAG_READ },
 		{ FWRITE, KF_FLAG_WRITE },
 		{ O_CREAT, KF_FLAG_CREAT },
 		{ O_DIRECT, KF_FLAG_DIRECT },
 		{ O_EXCL, KF_FLAG_EXCL },
 		{ O_EXEC, KF_FLAG_EXEC },
 		{ O_EXLOCK, KF_FLAG_EXLOCK },
 		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
 		{ O_SHLOCK, KF_FLAG_SHLOCK },
 		{ O_TRUNC, KF_FLAG_TRUNC }
 	};
 	unsigned int idx;
 	int translated;
 
 	translated = 0;
 	for (idx = 0; idx < nitems(fflags_table); idx++) {
 		if ((fflags & fflags_table[idx].fflag) != 0)
 			translated |= fflags_table[idx].kf_fflag;
 	}
 	return (translated);
 }
 
 /* Trim unused data from kf_path by truncating the structure size. */
 static void
 pack_kinfo(struct kinfo_file *kif)
 {
 
 	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
 	    strlen(kif->kf_path) + 1;
 	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
 }
 
 /*
  * Fill *kif with the description of file fp installed at descriptor fd.
  * rightsp may be NULL, in which case an empty rights set is reported.
  * With KERN_FILEDESC_PACK_KINFO in flags the record is trimmed by
  * pack_kinfo(); otherwise it keeps its full, rounded-up size.
  */
 static void
 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
     struct kinfo_file *kif, struct filedesc *fdp, int flags)
 {
 
 	bzero(kif, sizeof(*kif));
 
 	/* Set a default type to allow for empty fill_kinfo() methods. */
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	kif->kf_flags = xlate_fflags(fp->f_flag);
 	if (rightsp == NULL)
 		cap_rights_init(&kif->kf_cap_rights);
 	else
 		kif->kf_cap_rights = *rightsp;
 	kif->kf_fd = fd;
 	kif->kf_ref_count = fp->f_count;
 	kif->kf_offset = foffset_get(fp);
 
 	/*
 	 * This may drop the filedesc lock, so the 'fp' cannot be
 	 * accessed after this call.
 	 */
 	if (fo_fill_kinfo(fp, kif, fdp) == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 
 	if ((flags & KERN_FILEDESC_PACK_KINFO) == 0)
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 	else
 		pack_kinfo(kif);
 }
 
 /*
  * Fill *kif with the description of vnode vp, reported under the pseudo
  * descriptor number fd.  The reference on vp is released with vrele()
  * before returning.
  */
 static void
 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
     struct kinfo_file *kif, int flags)
 {
 
 	bzero(kif, sizeof(*kif));
 
 	kif->kf_type = KF_TYPE_VNODE;
 	if (vn_fill_kinfo_vnode(vp, kif) == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	kif->kf_flags = xlate_fflags(fflags);
 	cap_rights_init(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	/* No file is involved, so count and offset are reported as -1. */
 	kif->kf_ref_count = -1;
 	kif->kf_offset = -1;
 
 	if ((flags & KERN_FILEDESC_PACK_KINFO) == 0)
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 	else
 		pack_kinfo(kif);
 	vrele(vp);
 }
 
 /* State shared by the export_*_to_sb() helpers during one export run. */
 struct export_fd_buf {
 	/* Table whose lock is dropped around sbuf writes; may be NULL. */
 	struct filedesc		*fdp;
 	/* Destination the records are appended to. */
 	struct sbuf 		*sb;
 	/* Bytes still allowed; -1 means no limit, 0 ends the export. */
 	ssize_t			remainder;
 	/* Scratch record, rebuilt for every exported entry. */
 	struct kinfo_file	kif;
 	/* Export flags, e.g. KERN_FILEDESC_PACK_KINFO. */
 	int			flags;
 };
 
 /*
  * Append the record held in efbuf->kif to the sbuf.  When a byte limit is
  * in force and the record does not fit, the export is terminated by
  * setting the remainder to 0 and 0 is returned.  Returns ENOMEM when the
  * sbuf refuses the data.
  */
 static int
 export_kinfo_to_sb(struct export_fd_buf *efbuf)
 {
 	int len;
 
 	len = efbuf->kif.kf_structsize;
 	if (efbuf->remainder != -1) {
 		if (efbuf->remainder < len) {
 			/* Terminate export. */
 			efbuf->remainder = 0;
 			return (0);
 		}
 		efbuf->remainder -= len;
 	}
 	if (sbuf_bcat(efbuf->sb, &efbuf->kif, len) != 0)
 		return (ENOMEM);
 	return (0);
 }
 
 /*
  * Describe file fp and append the record to the sbuf.  Called with the
  * filedesc lock of efbuf->fdp share-locked; the lock is released while
  * writing to the sbuf and taken again before returning.
  */
 static int
 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
     struct export_fd_buf *efbuf)
 {
 	struct filedesc *fdp;
 	int error;
 
 	/* The export was terminated earlier; nothing more to add. */
 	if (efbuf->remainder == 0)
 		return (0);
 
 	fdp = efbuf->fdp;
 	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, fdp, efbuf->flags);
 	FILEDESC_SUNLOCK(fdp);
 	error = export_kinfo_to_sb(efbuf);
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 static int
 export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	if (efbuf->fdp != NULL)
 		FILEDESC_SUNLOCK(efbuf->fdp);
 	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
 	error = export_kinfo_to_sb(efbuf);
 	if (efbuf->fdp != NULL)
 		FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
 
 /*
  * Store a process' file descriptor information in an sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen,
     int flags)
 {
 	struct file *fp;
 	struct filedesc *fdp;
 	struct export_fd_buf *efbuf;
 	struct vnode *cttyvp, *textvp, *tracevp;
 	int error, i;
 	cap_rights_t rights;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Take references on the per-process vnodes and on the descriptor
 	 * table while the proc lock is still held.
 	 */
 	/* ktrace vnode */
 	tracevp = p->p_tracevp;
 	if (tracevp != NULL)
 		vrefact(tracevp);
 	/* text vnode */
 	textvp = p->p_textvp;
 	if (textvp != NULL)
 		vrefact(textvp);
 	/* Controlling tty. */
 	cttyvp = NULL;
 	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
 		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
 		if (cttyvp != NULL)
 			vrefact(cttyvp);
 	}
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	/* fdp stays NULL in efbuf for now: no filedesc lock is held yet. */
 	efbuf->fdp = NULL;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 	efbuf->flags = flags;
 	/* The return values of these three exports are not checked. */
 	if (tracevp != NULL)
 		export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
 		    efbuf);
 	if (textvp != NULL)
 		export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
 	if (cttyvp != NULL)
 		export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
 		    efbuf);
 	error = 0;
 	/* No descriptor table to walk: done, with error still 0. */
 	if (fdp == NULL)
 		goto fail;
 	efbuf->fdp = fdp;
 	FILEDESC_SLOCK(fdp);
 	/* working directory */
 	if (fdp->fd_cdir != NULL) {
 		vrefact(fdp->fd_cdir);
 		export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
 	}
 	/* root directory */
 	if (fdp->fd_rdir != NULL) {
 		vrefact(fdp->fd_rdir);
 		export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
 	}
 	/* jail directory */
 	if (fdp->fd_jdir != NULL) {
 		vrefact(fdp->fd_jdir);
 		export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
 	}
 	/* The walk stops early once fd_refcnt has dropped to zero. */
 	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 #ifdef CAPABILITIES
 		rights = *cap_rights(fdp, i);
 #else /* !CAPABILITIES */
 		cap_rights_init(&rights);
 #endif
 		/*
 		 * Create sysctl entry.  It is OK to drop the filedesc
 		 * lock inside of export_file_to_sb() as we will
 		 * re-validate and re-evaluate its properties when the
 		 * loop continues.
 		 */
 		error = export_file_to_sb(fp, i, &rights, efbuf);
 		if (error != 0 || efbuf->remainder == 0)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 fail:
 	/* Reached on success too, despite the label's name. */
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 #define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_filedesc_out(p, &sb, maxlen,
 	    KERN_FILEDESC_PACK_KINFO);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 #ifdef KINFO_OFILE_SIZE
 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
 #endif
 
 #ifdef COMPAT_FREEBSD7
 /*
  * Fill an old-format kinfo_ofile record from a kinfo_file record.  Only the
  * fields assigned below are written; kf_flags is restricted to the KF_FLAG_*
  * bits listed in the mask and kf_path is copied with strlcpy().
  */
 static void
 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
 {
 
 	/* Record header and descriptor identity. */
 	okif->kf_structsize = sizeof(*okif);
 	okif->kf_fd = kif->kf_fd;
 	okif->kf_type = kif->kf_type;
 	okif->kf_vnode_type = kif->kf_vnode_type;
 
 	/* Open-file state. */
 	okif->kf_ref_count = kif->kf_ref_count;
 	okif->kf_offset = kif->kf_offset;
 	okif->kf_flags = kif->kf_flags &
 	    (KF_FLAG_READ | KF_FLAG_WRITE | KF_FLAG_APPEND | KF_FLAG_ASYNC |
 	    KF_FLAG_FSYNC | KF_FLAG_NONBLOCK | KF_FLAG_DIRECT |
 	    KF_FLAG_HASLOCK);
 
 	/* Socket description. */
 	okif->kf_sock_domain = kif->kf_sock_domain;
 	okif->kf_sock_type = kif->kf_sock_type;
 	okif->kf_sock_protocol = kif->kf_sock_protocol;
 	okif->kf_sa_local = kif->kf_sa_local;
 	okif->kf_sa_peer = kif->kf_sa_peer;
 
 	/* Path, truncated to the destination buffer if necessary. */
 	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
 }
 
 /*
  * Emit one old-format (kinfo_ofile) record describing vnode vp to the sysctl
  * request.  kif and okif are caller-supplied scratch buffers.
  *
  * The function releases the shared filedesc lock on fdp before exporting
  * and re-takes it before returning, so the caller must hold that lock on
  * entry and holds it again on return.  Returns the SYSCTL_OUT() result.
  */
 static int
 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
     struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req)
 {
 	int error;
 
 	/* Reference the vnode before the filedesc lock is dropped. */
 	vrefact(vp);
 	FILEDESC_SUNLOCK(fdp);
 	/*
 	 * NOTE(review): presumably export_vnode_to_kinfo() releases the
 	 * reference taken above -- confirm against its definition.
 	 */
 	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
 	kinfo_to_okinfo(kif, okif);
 	error = SYSCTL_OUT(req, okif, sizeof(*okif));
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  *
  * This is the COMPAT_FREEBSD7 variant: each entry is converted with
  * kinfo_to_okinfo() and copied out as a fixed-size struct kinfo_ofile.
  * The target pid is the first element of the OID name (arg1).
  */
 static int
 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_ofile *okif;
 	struct kinfo_file *kif;
 	struct filedesc *fdp;
 	int error, i, *name;
 	struct file *fp;
 	struct proc *p;
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0)
 		return (error);
 	/* Hold the descriptor table, then drop the proc lock from pget(). */
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL)
 		return (ENOENT);
 	/* Scratch records, reused for every entry. */
 	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
 	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
 	FILEDESC_SLOCK(fdp);
 	/*
 	 * Current, root and jail directories.  The return values of these
 	 * exports are not checked.
 	 */
 	if (fdp->fd_cdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
 		    okif, fdp, req);
 	if (fdp->fd_rdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
 		    okif, fdp, req);
 	if (fdp->fd_jdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
 		    okif, fdp, req);
 	/*
 	 * Open-file slots.  The filedesc lock is dropped around the copy-out
 	 * and the loop condition is re-evaluated after it is re-taken.
 	 */
 	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 		export_file_to_kinfo(fp, i, NULL, kif, fdp,
 		    KERN_FILEDESC_PACK_KINFO);
 		FILEDESC_SUNLOCK(fdp);
 		kinfo_to_okinfo(kif, okif);
 		error = SYSCTL_OUT(req, okif, sizeof(*okif));
 		FILEDESC_SLOCK(fdp);
 		if (error)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	free(kif, M_TEMP);
 	free(okif, M_TEMP);
 	/*
 	 * NOTE(review): a SYSCTL_OUT() failure stops the loop above but is
 	 * not propagated; the handler reports success regardless.  Confirm
 	 * whether callers of this compat interface rely on that.
 	 */
 	return (0);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
     "Process ofiledesc entries");
 #endif	/* COMPAT_FREEBSD7 */
 
 /*
  * Translate a vnode type (V* value) into the matching KF_VTYPE_* constant.
  * A type that has no entry in the table yields KF_VTYPE_UNKNOWN.
  */
 int
 vntype_to_kinfo(int vtype)
 {
 	/*
 	 * The mapping is constant, so keep it in static read-only storage
 	 * rather than rebuilding the array on the stack on every call.
 	 */
 	static const struct {
 		int	vtype;
 		int	kf_vtype;
 	} vtypes_table[] = {
 		{ VBAD, KF_VTYPE_VBAD },
 		{ VBLK, KF_VTYPE_VBLK },
 		{ VCHR, KF_VTYPE_VCHR },
 		{ VDIR, KF_VTYPE_VDIR },
 		{ VFIFO, KF_VTYPE_VFIFO },
 		{ VLNK, KF_VTYPE_VLNK },
 		{ VNON, KF_VTYPE_VNON },
 		{ VREG, KF_VTYPE_VREG },
 		{ VSOCK, KF_VTYPE_VSOCK }
 	};
 	unsigned int i;
 
 	/*
 	 * Perform vtype translation.
 	 */
 	for (i = 0; i < nitems(vtypes_table); i++)
 		if (vtypes_table[i].vtype == vtype)
 			return (vtypes_table[i].kf_vtype);
 
 	return (KF_VTYPE_UNKNOWN);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
     "Process filedesc entries");
 
 /*
  * Store a process's current working directory information in an sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  *
  * Returns EINVAL when no descriptor table could be held or the table has no
  * current directory; otherwise returns the result of export_vnode_to_sb().
  */
 int
 kern_proc_cwd_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 {
 	struct filedesc *fdp;
 	struct export_fd_buf *efbuf;
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL)
 		return (EINVAL);
 
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->fdp = fdp;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 	/*
 	 * The buffer is not zeroed by malloc() (no M_ZERO), so every field
 	 * must be set explicitly.  kern_proc_filedesc_out() initializes
 	 * flags from its argument; this function takes no flags argument,
 	 * so use 0 instead of leaving the field with stale heap contents.
 	 */
 	efbuf->flags = 0;
 
 	FILEDESC_SLOCK(fdp);
 	if (fdp->fd_cdir == NULL)
 		error = EINVAL;
 	else {
 		/* Reference the vnode before handing it to the exporter. */
 		vrefact(fdp->fd_cdir);
 		error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD,
 		    FREAD, efbuf);
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Get per-process current working directory.
  */
 static int
 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_cwd_out(p, &sb, maxlen);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
     sysctl_kern_proc_cwd, "Process current working directory");
 
 #ifdef DDB
 /*
  * Debugger helper: map a file type value to a short printable label.
  * Values without an entry in the table are reported as "unkn".
  */
 static const char *
 file_type_to_name(short type)
 {
 	static const struct {
 		int		dtype;
 		const char	*label;
 	} known_types[] = {
 		{ 0, "zero" },
 		{ DTYPE_VNODE, "vnod" },
 		{ DTYPE_SOCKET, "sock" },
 		{ DTYPE_PIPE, "pipe" },
 		{ DTYPE_FIFO, "fifo" },
 		{ DTYPE_KQUEUE, "kque" },
 		{ DTYPE_CRYPTO, "crpt" },
 		{ DTYPE_MQUEUE, "mque" },
 		{ DTYPE_SHM, "shm" },
 		{ DTYPE_SEM, "ksem" },
 	};
 	unsigned int idx;
 
 	for (idx = 0; idx < nitems(known_types); idx++) {
 		if (known_types[idx].dtype == type)
 			return (known_types[idx].label);
 	}
 	return ("unkn");
 }
 
 /*
  * Debugger helper: return the first process found whose descriptor array
  * contains the given file, or NULL when no process references it.  Several
  * processes may reference the file; only the first match is reported.
  */
 static struct proc *
 file_to_first_proc(struct file *fp)
 {
 	struct filedesc *table;
 	struct proc *cand;
 	int slot;
 
 	FOREACH_PROC_IN_SYSTEM(cand) {
 		/* Skip processes still being created or without a table. */
 		if (cand->p_state == PRS_NEW ||
 		    (table = cand->p_fd) == NULL)
 			continue;
 		slot = 0;
 		while (slot <= table->fd_lastfile) {
 			if (table->fd_ofiles[slot].fde_file == fp)
 				return (cand);
 			slot++;
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Debugger helper: print one line describing fp, preceded by the column
  * header when header is non-zero.  The FPID/FCmd columns show the first
  * process found that references the file, or -1 and "-" if there is none.
  */
 static void
 db_print_file(struct file *fp, int header)
 {
 	struct proc *owner;
 	const char *cmd;
 	int pid;
 
 	if (header)
 		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
 		    "File", "Type", "Data", "Flag", "GCFl", "Count",
 		    "MCount", "Vnode", "FPID", "FCmd");
 
 	owner = file_to_first_proc(fp);
 	if (owner != NULL) {
 		pid = owner->p_pid;
 		cmd = owner->p_comm;
 	} else {
 		pid = -1;
 		cmd = "-";
 	}
 
 	/* The GCFl and MCount columns are always printed as 0. */
 	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
 	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
 	    0, fp->f_count, 0, fp->f_vnode, pid, cmd);
 }
 
 /*
  * "show file <addr>": print the struct file at the given address, with the
  * column header.  Prints a usage line when no address was supplied.
  */
 DB_SHOW_COMMAND(file, db_show_file)
 {
 
 	if (have_addr)
 		db_print_file((struct file *)addr, 1);
 	else
 		db_printf("usage: show file <addr>\n");
 }
 
 /*
  * "show files": print every open file of every process that is past the
  * PRS_NEW state and has a descriptor table.  The column header is printed
  * once, before the first file.
  */
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct filedesc *table;
 	struct file *entry;
 	struct proc *cur;
 	int need_header, slot;
 
 	need_header = 1;
 	FOREACH_PROC_IN_SYSTEM(cur) {
 		if (cur->p_state == PRS_NEW)
 			continue;
 		table = cur->p_fd;
 		if (table == NULL)
 			continue;
 		for (slot = 0; slot <= table->fd_lastfile; slot++) {
 			entry = table->fd_ofiles[slot].fde_file;
 			if (entry != NULL) {
 				db_print_file(entry, need_header);
 				need_header = 0;
 			}
 		}
 	}
 }
 #endif
 
 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
     &maxfilesperproc, 0, "Maximum files allowed open per process");
 
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
     &maxfiles, 0, "Maximum number of files");
 
 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
     __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
 
 /*
  * Boot-time initialization: create the UMA zones "Files" (struct file,
  * UMA_ZONE_NOFREE) and "filedesc0" (struct filedesc0), and initialize the
  * sigio_lock mutex.  The argument is unused.
  */
 /* ARGSUSED*/
 static void
 filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 }
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
 
 /*-------------------------------------------------------------------*/
 
 /*
  * badfileops read/write stub (installed as both fo_read and fo_write):
  * always fails with EBADF.
  */
 static int
 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /*
  * badfileops truncate stub: always fails with EINVAL (not EBADF, unlike
  * most of the other badfo_* stubs).
  */
 static int
 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*
  * badfileops ioctl stub: always fails with EBADF.
  */
 static int
 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /*
  * badfileops poll stub: always returns 0, ignoring the requested events.
  */
 static int
 badfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (0);
 }
 
 /*
  * badfileops kqfilter stub: always fails with EBADF.
  */
 static int
 badfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EBADF);
 }
 
 /*
  * badfileops stat stub: always fails with EBADF; sb is not written.
  */
 static int
 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /*
  * badfileops close stub: does nothing and returns 0.
  */
 static int
 badfo_close(struct file *fp, struct thread *td)
 {
 
 	return (0);
 }
 
 /*
  * badfileops chmod stub: always fails with EBADF.
  */
 static int
 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /*
  * badfileops chown stub: always fails with EBADF.
  */
 static int
 badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /*
  * badfileops sendfile stub: always fails with EBADF; *sent is not written.
  */
 static int
 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 /*
  * badfileops fill_kinfo stub: leaves kif untouched and returns 0.
  */
 static int
 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	return (0);
 }
 
 /*
  * File operations vector built entirely from the badfo_* stubs above.
  * Every operation either fails (EBADF, or EINVAL for truncate) or is a
  * no-op returning 0 (poll, close, fill_kinfo).
  */
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_truncate = badfo_truncate,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = badfo_poll,
 	.fo_kqfilter = badfo_kqfilter,
 	.fo_stat = badfo_stat,
 	.fo_close = badfo_close,
 	.fo_chmod = badfo_chmod,
 	.fo_chown = badfo_chown,
 	.fo_sendfile = badfo_sendfile,
 	.fo_fill_kinfo = badfo_fill_kinfo,
 };
 
 /*
  * Generic "operation not supported" read/write method: always returns
  * EOPNOTSUPP.
  */
 int
 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Generic unsupported truncate method: always returns EINVAL.
  */
 int
 invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*
  * Generic unsupported ioctl method: always returns ENOTTY.
  */
 int
 invfo_ioctl(struct file *fp, u_long com, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 
 	return (ENOTTY);
 }
 
 /*
  * Generic unsupported poll method: returns whatever poll_no_poll() reports
  * for the requested events.
  */
 int
 invfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (poll_no_poll(events));
 }
 
 /*
  * Generic unsupported kqfilter method: always returns EINVAL.
  */
 int
 invfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EINVAL);
 }
 
 /*
  * Generic unsupported chmod method: always returns EINVAL.
  */
 int
 invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*
  * Generic unsupported chown method: always returns EINVAL.
  */
 int
 invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*
  * Generic unsupported sendfile method: always returns EINVAL; *sent is not
  * written.
  */
 int
 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*-------------------------------------------------------------------*/
 
 /*
  * File Descriptor pseudo-device driver (/dev/fd/).
  *
  * Opening minor device N dup()s the file (if any) connected to file
  * descriptor N belonging to the calling process.  Note that this driver
  * consists of only the ``open()'' routine, because all subsequent
  * references to this file will be direct to the other driver.
  *
  * XXX: we could give this one a cloning event handler if necessary.
  */
 
 /*
  * Open routine of the "FD" pseudo-device.  It never succeeds: it records
  * the device unit number in td->td_dupfd and returns ENODEV.  The mode and
  * type arguments are unused.
  */
 /* ARGSUSED */
 static int
 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 {
 
 	/*
 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
 	 * file descriptor being sought for duplication. The error
 	 * return ensures that the vnode for this device will be released
 	 * by vn_open. Open will detect this special error and take the
 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
 	 * will simply report the error.
 	 */
 	td->td_dupfd = dev2unit(dev);
 	return (ENODEV);
 }
 
 /*
  * Character device switch for the "FD" pseudo-device.  Only d_open is
  * provided; fdopen() always returns an error.
  */
 static struct cdevsw fildesc_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	fdopen,
 	.d_name =	"FD",
 };
 
 /*
  * Create the device nodes "fd/0", "fd/1" and "fd/2" (units 0-2, owner
  * root:wheel, mode 0666, MAKEDEV_ETERNAL) and give them the aliases
  * "stdin", "stdout" and "stderr" respectively.  The argument is unused.
  */
 static void
 fildesc_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
 	make_dev_alias(dev, "stdin");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
 	make_dev_alias(dev, "stdout");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
 	make_dev_alias(dev, "stderr");
 }
 
 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 0bb9c76a25c9..836bc4e4f8b7 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -1,4164 +1,4164 @@
 /*-
  * Copyright (c) 1999 Poul-Henning Kamp.
  * Copyright (c) 2008 Bjoern A. Zeeb.
  * Copyright (c) 2009 James Gritton.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/osd.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/taskqueue.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/racct.h>
 #include <sys/rctl.h>
 #include <sys/refcount.h>
 #include <sys/sx.h>
 #include <sys/sysent.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif /* DDB */
 
 #include <security/mac/mac_framework.h>
 
 #define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
 
 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
 
 /* Keep struct prison prison0 and some code in kern_jail_set() readable. */
 #ifdef INET
 #ifdef INET6
 #define	_PR_IP_SADDRSEL	PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
 #else
 #define	_PR_IP_SADDRSEL	PR_IP4_SADDRSEL
 #endif
 #else /* !INET */
 #ifdef INET6
 #define	_PR_IP_SADDRSEL	PR_IP6_SADDRSEL
 #else
 #define	_PR_IP_SADDRSEL	0
 #endif
 #endif
 
 /*
  * prison0 describes what is "real" about the system.
  *
  * Only fields with compile-time constant values are set here; pr_cpuset,
  * pr_osreldate and pr_osrelease are filled in by prison0_init().  The
  * flags include PR_VNET only when the kernel is built with VIMAGE.
  */
 struct prison prison0 = {
 	.pr_id		= 0,
 	.pr_name	= "0",
 	.pr_ref		= 1,
 	.pr_uref	= 1,
 	.pr_path	= "/",
 	.pr_securelevel	= -1,
 	.pr_devfs_rsnum = 0,
 	.pr_childmax	= JAIL_MAX,
 	.pr_hostuuid	= DEFAULT_HOSTUUID,
 	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
 #ifdef VIMAGE
 	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
 #else
 	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
 #endif
 	.pr_allow	= PR_ALLOW_ALL,
 };
 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
 
 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
 struct	sx allprison_lock;
 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
 struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
 LIST_HEAD(, prison_racct) allprison_racct;
 int	lastprid = 0;
 
 static int do_jail_attach(struct thread *td, struct prison *pr);
 static void prison_complete(void *context, int pending);
 static void prison_deref(struct prison *pr, int flags);
 static char *prison_path(struct prison *pr1, struct prison *pr2);
 static void prison_remove_one(struct prison *pr);
 #ifdef RACCT
 static void prison_racct_attach(struct prison *pr);
 static void prison_racct_modify(struct prison *pr);
 static void prison_racct_detach(struct prison *pr);
 #endif
 
 /* Flags for prison_deref */
 #define	PD_DEREF	0x01
 #define	PD_DEUREF	0x02
 #define	PD_LOCKED	0x04
 #define	PD_LIST_SLOCKED	0x08
 #define	PD_LIST_XLOCKED	0x10
 
 /*
  * Parameter names corresponding to PR_* flag values.  Size values are for kvm
  * as we cannot figure out the size of a sparse array, or an array without a
  * terminating entry.
  *
  * The array index is the bit position of the flag: kern_jail_set() pairs
  * entry [fi] with the mask (1 << fi) and skips NULL (unnamed) slots.
  */
 static char *pr_flag_names[] = {
 	[0] = "persist",
 #ifdef INET
 	[7] = "ip4.saddrsel",
 #endif
 #ifdef INET6
 	[8] = "ip6.saddrsel",
 #endif
 };
 const size_t pr_flag_names_size = sizeof(pr_flag_names);
 
 /*
  * Negated ("no...") spellings of the pr_flag_names entries, using the same
  * indices.  kern_jail_set() looks up entry [fi] here whenever
  * pr_flag_names[fi] is non-NULL, so the two arrays must stay in step.
  */
 static char *pr_flag_nonames[] = {
 	[0] = "nopersist",
 #ifdef INET
 	[7] = "ip4.nosaddrsel",
 #endif
 #ifdef INET6
 	[8] = "ip6.nosaddrsel",
 #endif
 };
 const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
 
 /*
  * Parameters that take a JAIL_SYS_* value rather than a boolean.  For each
  * name, kern_jail_set() sets the "disable" bits for JAIL_SYS_DISABLE and
  * the "new" bits for JAIL_SYS_NEW.  A "disable" mask of 0 makes
  * JAIL_SYS_DISABLE an error (EINVAL) for that parameter.
  */
 struct jailsys_flags {
 	const char	*name;
 	unsigned	 disable;
 	unsigned	 new;
 } pr_flag_jailsys[] = {
 	{ "host", 0, PR_HOST },
 #ifdef VIMAGE
 	{ "vnet", 0, PR_VNET },
 #endif
 #ifdef INET
 	{ "ip4", PR_IP4_USER, PR_IP4_USER },
 #endif
 #ifdef INET6
 	{ "ip6", PR_IP6_USER, PR_IP6_USER },
 #endif
 };
 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
 
 /*
  * Names of the "allow.*" permission parameters.  The array index is the
  * bit position in the allow mask: kern_jail() and kern_jail_set() pair
  * entry [fi] with (1 << fi), so the order must not be changed.
  */
 static char *pr_allow_names[] = {
 	"allow.set_hostname",
 	"allow.sysvipc",
 	"allow.raw_sockets",
 	"allow.chflags",
 	"allow.mount",
 	"allow.quotas",
 	"allow.socket_af",
 	"allow.mount.devfs",
 	"allow.mount.nullfs",
 	"allow.mount.zfs",
 	"allow.mount.procfs",
 	"allow.mount.tmpfs",
 	"allow.mount.fdescfs",
 	"allow.mount.linprocfs",
 	"allow.mount.linsysfs",
 	"allow.read_msgbuf",
 };
 const size_t pr_allow_names_size = sizeof(pr_allow_names);
 
 /*
  * Negated spellings of the pr_allow_names entries, in the same order.
  * Entry [fi] here is the "no" form of pr_allow_names[fi]; both arrays are
  * indexed together, so they must have the same length and order.
  */
 static char *pr_allow_nonames[] = {
 	"allow.noset_hostname",
 	"allow.nosysvipc",
 	"allow.noraw_sockets",
 	"allow.nochflags",
 	"allow.nomount",
 	"allow.noquotas",
 	"allow.nosocket_af",
 	"allow.mount.nodevfs",
 	"allow.mount.nonullfs",
 	"allow.mount.nozfs",
 	"allow.mount.noprocfs",
 	"allow.mount.notmpfs",
 	"allow.mount.nofdescfs",
 	"allow.mount.nolinprocfs",
 	"allow.mount.nolinsysfs",
 	"allow.noread_msgbuf",
 };
 const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
 
 #define	JAIL_DEFAULT_ALLOW		PR_ALLOW_SET_HOSTNAME
 #define	JAIL_DEFAULT_ENFORCE_STATFS	2
 #define	JAIL_DEFAULT_DEVFS_RSNUM	0
 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
 #if defined(INET) || defined(INET6)
 static unsigned jail_max_af_ips = 255;
 #endif
 
 /*
  * Finish setting up prison0 with the values that are only known at run
  * time: the OS release string and date, and a reference to thread0's
  * cpuset.  Called from proc0_init() once the thread0 cpuset exists.
  */
 void
 prison0_init(void)
 {
 
 	/* OS identification reported for the host "jail". */
 	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
 	prison0.pr_osreldate = osreldate;
 
 	/* Share thread0's cpuset, taking a reference of our own. */
 	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
 }
 
 /*
  * struct jail_args {
  *	struct jail *jail;
  * };
  */
 int
 sys_jail(struct thread *td, struct jail_args *uap)
 {
 	uint32_t version;
 	int error;
 	struct jail j;
 
 	error = copyin(uap->jail, &version, sizeof(uint32_t));
 	if (error)
 		return (error);
 
 	switch (version) {
 	case 0:
 	{
 		struct jail_v0 j0;
 
 		/* FreeBSD single IPv4 jails. */
 		bzero(&j, sizeof(struct jail));
 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
 		if (error)
 			return (error);
 		j.version = j0.version;
 		j.path = j0.path;
 		j.hostname = j0.hostname;
 		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
 		break;
 	}
 
 	case 1:
 		/*
 		 * Version 1 was used by multi-IPv4 jail implementations
 		 * that never made it into the official kernel.
 		 */
 		return (EINVAL);
 
 	case 2:	/* JAIL_API_VERSION */
 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
 		error = copyin(uap->jail, &j, sizeof(struct jail));
 		if (error)
 			return (error);
 		break;
 
 	default:
 		/* Sci-Fi jails are not supported, sorry. */
 		return (EINVAL);
 	}
 	return (kern_jail(td, &j));
 }
 
 /*
  * Back end of the legacy jail(2) call: translate a struct jail into a list
  * of name/value iovec pairs and pass it to kern_jail_set() with
  * JAIL_CREATE | JAIL_ATTACH.
  *
  * The string and address members of *j are user-space pointers; they are
  * copied into one temporary kernel buffer that is freed before returning.
  * Returns 0 or an errno value from the copy-in calls, the address-count
  * checks (EINVAL) or kern_jail_set().
  */
 int
 kern_jail(struct thread *td, struct jail *j)
 {
 	/*
 	 * Two iovecs per parameter: one per allow.* name, four for
 	 * enforce_statfs/path/host.hostname/name, plus one each for
 	 * ip4.addr and ip6.addr when the address family is compiled in.
 	 */
 	struct iovec optiov[2 * (4 + nitems(pr_allow_names)
 #ifdef INET
 			    + 1
 #endif
 #ifdef INET6
 			    + 1
 #endif
 			    )];
 	struct uio opt;
 	char *u_path, *u_hostname, *u_name;
 #ifdef INET
 	uint32_t ip4s;
 	struct in_addr *u_ip4;
 #endif
 #ifdef INET6
 	struct in6_addr *u_ip6;
 #endif
 	size_t tmplen;
 	int error, enforce_statfs, fi;
 
 	bzero(&optiov, sizeof(optiov));
 	opt.uio_iov = optiov;
 	opt.uio_iovcnt = 0;
 	opt.uio_offset = -1;
 	opt.uio_resid = -1;
 	opt.uio_segflg = UIO_SYSSPACE;
 	opt.uio_rw = UIO_READ;
 	opt.uio_td = td;
 
 	/* Set permissions for top-level jails from sysctls. */
 	if (!jailed(td->td_ucred)) {
 		for (fi = 0; fi < nitems(pr_allow_names); fi++) {
 			/*
 			 * Boolean parameter: only the name slot is filled
 			 * (positive or "no" spelling); the value slot stays
 			 * zeroed from the bzero() above.
 			 */
 			optiov[opt.uio_iovcnt].iov_base =
 			    (jail_default_allow & (1 << fi))
 			    ? pr_allow_names[fi] : pr_allow_nonames[fi];
 			optiov[opt.uio_iovcnt].iov_len =
 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
 			opt.uio_iovcnt += 2;
 		}
 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
 		opt.uio_iovcnt++;
 		enforce_statfs = jail_default_enforce_statfs;
 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
 		opt.uio_iovcnt++;
 	}
 
 	/*
 	 * Size one temporary buffer for path, hostname, jail name and the
 	 * address arrays.  Address counts above jail_max_af_ips, or any
 	 * addresses for a family that is not compiled in, are rejected.
 	 */
 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
 #ifdef INET
 	/* A version 0 request carries exactly one IPv4 address. */
 	ip4s = (j->version == 0) ? 1 : j->ip4s;
 	if (ip4s > jail_max_af_ips)
 		return (EINVAL);
 	tmplen += ip4s * sizeof(struct in_addr);
 #else
 	if (j->ip4s > 0)
 		return (EINVAL);
 #endif
 #ifdef INET6
 	if (j->ip6s > jail_max_af_ips)
 		return (EINVAL);
 	tmplen += j->ip6s * sizeof(struct in6_addr);
 #else
 	if (j->ip6s > 0)
 		return (EINVAL);
 #endif
 	/*
 	 * Carve the buffer up; u_path is the base of the allocation and is
 	 * the pointer that gets freed on every exit path below.
 	 */
 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
 	u_hostname = u_path + MAXPATHLEN;
 	u_name = u_hostname + MAXHOSTNAMELEN;
 #ifdef INET
 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
 #endif
 #ifdef INET6
 #ifdef INET
 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
 #else
 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
 #endif
 #endif
 	/* copyinstr() stores the copied length directly in the value iovec. */
 	optiov[opt.uio_iovcnt].iov_base = "path";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_path;
 	error = copyinstr(j->path, u_path, MAXPATHLEN,
 	    &optiov[opt.uio_iovcnt].iov_len);
 	if (error) {
 		free(u_path, M_TEMP);
 		return (error);
 	}
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
 	    &optiov[opt.uio_iovcnt].iov_len);
 	if (error) {
 		free(u_path, M_TEMP);
 		return (error);
 	}
 	opt.uio_iovcnt++;
 	/* The jail name is optional. */
 	if (j->jailname != NULL) {
 		optiov[opt.uio_iovcnt].iov_base = "name";
 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
 		opt.uio_iovcnt++;
 		optiov[opt.uio_iovcnt].iov_base = u_name;
 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
 		    &optiov[opt.uio_iovcnt].iov_len);
 		if (error) {
 			free(u_path, M_TEMP);
 			return (error);
 		}
 		opt.uio_iovcnt++;
 	}
 #ifdef INET
 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
 	/*
 	 * For version 0 the ip4s member holds the single address itself
 	 * (stored by sys_jail()); otherwise copy the array from user space.
 	 */
 	if (j->version == 0)
 		u_ip4->s_addr = j->ip4s;
 	else {
 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
 		if (error) {
 			free(u_path, M_TEMP);
 			return (error);
 		}
 	}
 	opt.uio_iovcnt++;
 #endif
 #ifdef INET6
 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
 	if (error) {
 		free(u_path, M_TEMP);
 		return (error);
 	}
 	opt.uio_iovcnt++;
 #endif
 	KASSERT(opt.uio_iovcnt <= nitems(optiov),
 		("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
 	free(u_path, M_TEMP);
 	return (error);
 }
 
 
 /*
  * struct jail_set_args {
  *	struct iovec *iovp;
  *	unsigned int iovcnt;
  *	int flags;
  * };
  *
  * jail_set(2) system call entry: copy in the user iovec array and hand it
  * to kern_jail_set().
  */
 int
 sys_jail_set(struct thread *td, struct jail_set_args *uap)
 {
 	struct uio *opts;
 	int rc;
 
 	/* Parameters come as name/value pairs, so the count must be even. */
 	if ((uap->iovcnt & 1) != 0)
 		return (EINVAL);
 
 	rc = copyinuio(uap->iovp, uap->iovcnt, &opts);
 	if (rc == 0) {
 		rc = kern_jail_set(td, opts, uap->flags);
 		free(opts, M_IOV);
 	}
 	return (rc);
 }
 
 int
 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 {
 	struct nameidata nd;
 #ifdef INET
 	struct in_addr *ip4;
 #endif
 #ifdef INET6
 	struct in6_addr *ip6;
 #endif
 	struct vfsopt *opt;
 	struct vfsoptlist *opts;
 	struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
 	struct vnode *root;
 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
 	char *g_path, *osrelstr;
 #if defined(INET) || defined(INET6)
 	struct prison *tppr;
 	void *op;
 #endif
 	unsigned long hid;
 	size_t namelen, onamelen, pnamelen;
 	int born, created, cuflags, descend, enforce;
 	int error, errmsg_len, errmsg_pos;
 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
 	int fi, jid, jsys, len, level;
 	int childmax, osreldt, rsnum, slevel;
 	int fullpath_disabled;
 #if defined(INET) || defined(INET6)
 	int ii, ij;
 #endif
 #ifdef INET
 	int ip4s, redo_ip4;
 #endif
 #ifdef INET6
 	int ip6s, redo_ip6;
 #endif
 	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
 	unsigned tallow;
 	char numbuf[12];
 
 	error = priv_check(td, PRIV_JAIL_SET);
 	if (!error && (flags & JAIL_ATTACH))
 		error = priv_check(td, PRIV_JAIL_ATTACH);
 	if (error)
 		return (error);
 	mypr = td->td_ucred->cr_prison;
 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
 		return (EPERM);
 	if (flags & ~JAIL_SET_MASK)
 		return (EINVAL);
 
 	/*
 	 * Check all the parameters before committing to anything.  Not all
 	 * errors can be caught early, but we may as well try.  Also, this
 	 * takes care of some expensive stuff (path lookup) before getting
 	 * the allprison lock.
 	 *
 	 * XXX Jails are not filesystems, and jail parameters are not mount
 	 *     options.  But it makes more sense to re-use the vfsopt code
 	 *     than duplicate it under a different name.
 	 */
 	error = vfs_buildopts(optuio, &opts);
 	if (error)
 		return (error);
 #ifdef INET
 	ip4 = NULL;
 #endif
 #ifdef INET6
 	ip6 = NULL;
 #endif
 	g_path = NULL;
 
 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
 	if (!cuflags) {
 		error = EINVAL;
 		vfs_opterror(opts, "no valid operation (create or update)");
 		goto done_errmsg;
 	}
 
 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
 	if (error == ENOENT)
 		jid = 0;
 	else if (error != 0)
 		goto done_free;
 
 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
 	if (error == ENOENT)
 		gotslevel = 0;
 	else if (error != 0)
 		goto done_free;
 	else
 		gotslevel = 1;
 
 	error =
 	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
 	if (error == ENOENT)
 		gotchildmax = 0;
 	else if (error != 0)
 		goto done_free;
 	else
 		gotchildmax = 1;
 
 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
 	if (error == ENOENT)
 		gotenforce = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (enforce < 0 || enforce > 2) {
 		error = EINVAL;
 		goto done_free;
 	} else
 		gotenforce = 1;
 
 	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
 	if (error == ENOENT)
 		gotrsnum = 0;
 	else if (error != 0)
 		goto done_free;
 	else
 		gotrsnum = 1;
 
 	pr_flags = ch_flags = 0;
 	for (fi = 0; fi < nitems(pr_flag_names); fi++) {
 		if (pr_flag_names[fi] == NULL)
 			continue;
 		vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
 		vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
 	}
 	ch_flags |= pr_flags;
 	for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) {
 		error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
 		    sizeof(jsys));
 		if (error == ENOENT)
 			continue;
 		if (error != 0)
 			goto done_free;
 		switch (jsys) {
 		case JAIL_SYS_DISABLE:
 			if (!pr_flag_jailsys[fi].disable) {
 				error = EINVAL;
 				goto done_free;
 			}
 			pr_flags |= pr_flag_jailsys[fi].disable;
 			break;
 		case JAIL_SYS_NEW:
 			pr_flags |= pr_flag_jailsys[fi].new;
 			break;
 		case JAIL_SYS_INHERIT:
 			break;
 		default:
 			error = EINVAL;
 			goto done_free;
 		}
 		ch_flags |=
 		    pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
 	}
 	if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
 	    && !(pr_flags & PR_PERSIST)) {
 		error = EINVAL;
 		vfs_opterror(opts, "new jail must persist or attach");
 		goto done_errmsg;
 	}
 #ifdef VIMAGE
 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
 		error = EINVAL;
 		vfs_opterror(opts, "vnet cannot be changed after creation");
 		goto done_errmsg;
 	}
 #endif
 #ifdef INET
 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
 		error = EINVAL;
 		vfs_opterror(opts, "ip4 cannot be changed after creation");
 		goto done_errmsg;
 	}
 #endif
 #ifdef INET6
 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
 		error = EINVAL;
 		vfs_opterror(opts, "ip6 cannot be changed after creation");
 		goto done_errmsg;
 	}
 #endif
 
 	pr_allow = ch_allow = 0;
 	for (fi = 0; fi < nitems(pr_allow_names); fi++) {
 		vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
 		vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
 	}
 	ch_allow |= pr_allow;
 
 	error = vfs_getopt(opts, "name", (void **)&name, &len);
 	if (error == ENOENT)
 		name = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (len == 0 || name[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > MAXHOSTNAMELEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
 	if (error == ENOENT)
 		host = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 		if (len == 0 || host[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > MAXHOSTNAMELEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
 	if (error == ENOENT)
 		domain = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 		if (len == 0 || domain[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > MAXHOSTNAMELEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
 	if (error == ENOENT)
 		uuid = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 		if (len == 0 || uuid[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > HOSTUUIDLEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		uint32_t hid32;
 
 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
 		hid = hid32;
 	} else
 #endif
 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
 	if (error == ENOENT)
 		gothid = 0;
 	else if (error != 0)
 		goto done_free;
 	else {
 		gothid = 1;
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 	}
 
 #ifdef INET
 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
 	if (error == ENOENT)
 		ip4s = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (ip4s & (sizeof(*ip4) - 1)) {
 		error = EINVAL;
 		goto done_free;
 	} else {
 		ch_flags |= PR_IP4_USER;
 		pr_flags |= PR_IP4_USER;
 		if (ip4s > 0) {
 			ip4s /= sizeof(*ip4);
 			if (ip4s > jail_max_af_ips) {
 				error = EINVAL;
 				vfs_opterror(opts, "too many IPv4 addresses");
 				goto done_errmsg;
 			}
 			ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
 			bcopy(op, ip4, ip4s * sizeof(*ip4));
 			/*
 			 * IP addresses are all sorted but ip[0] to preserve
 			 * the primary IP address as given from userland.
 			 * This special IP is used for unbound outgoing
 			 * connections as well as for "loopback" traffic in case
 			 * source address selection cannot find any more fitting
 			 * address to connect from.
 			 */
 			if (ip4s > 1)
 				qsort(ip4 + 1, ip4s - 1, sizeof(*ip4),
 				    prison_qcmp_v4);
 			/*
 			 * Check for duplicate addresses and do some simple
 			 * zero and broadcast checks. If users give other bogus
 			 * addresses it is their problem.
 			 *
 			 * We do not have to care about byte order for these
 			 * checks so we will do them in NBO.
 			 */
 			for (ii = 0; ii < ip4s; ii++) {
 				if (ip4[ii].s_addr == INADDR_ANY ||
 				    ip4[ii].s_addr == INADDR_BROADCAST) {
 					error = EINVAL;
 					goto done_free;
 				}
 				if ((ii+1) < ip4s &&
 				    (ip4[0].s_addr == ip4[ii+1].s_addr ||
 				     ip4[ii].s_addr == ip4[ii+1].s_addr)) {
 					error = EINVAL;
 					goto done_free;
 				}
 			}
 		}
 	}
 #endif
 
 #ifdef INET6
 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
 	if (error == ENOENT)
 		ip6s = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (ip6s & (sizeof(*ip6) - 1)) {
 		error = EINVAL;
 		goto done_free;
 	} else {
 		ch_flags |= PR_IP6_USER;
 		pr_flags |= PR_IP6_USER;
 		if (ip6s > 0) {
 			ip6s /= sizeof(*ip6);
 			if (ip6s > jail_max_af_ips) {
 				error = EINVAL;
 				vfs_opterror(opts, "too many IPv6 addresses");
 				goto done_errmsg;
 			}
 			ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
 			bcopy(op, ip6, ip6s * sizeof(*ip6));
 			if (ip6s > 1)
 				qsort(ip6 + 1, ip6s - 1, sizeof(*ip6),
 				    prison_qcmp_v6);
 			for (ii = 0; ii < ip6s; ii++) {
 				if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
 					error = EINVAL;
 					goto done_free;
 				}
 				if ((ii+1) < ip6s &&
 				    (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
 				     IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
 				{
 					error = EINVAL;
 					goto done_free;
 				}
 			}
 		}
 	}
 #endif
 
 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
 	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
 		error = EINVAL;
 		vfs_opterror(opts,
 		    "vnet jails cannot have IP address restrictions");
 		goto done_errmsg;
 	}
 #endif
 
 	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
 	if (error == ENOENT)
 		osrelstr = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (flags & JAIL_UPDATE) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "osrelease cannot be changed after creation");
 			goto done_errmsg;
 		}
 		if (len == 0 || osrelstr[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len >= OSRELEASELEN) {
 			error = ENAMETOOLONG;
 			vfs_opterror(opts,
 			    "osrelease string must be 1-%d bytes long",
 			    OSRELEASELEN - 1);
 			goto done_errmsg;
 		}
 	}
 
 	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
 	if (error == ENOENT)
 		osreldt = 0;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (flags & JAIL_UPDATE) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "osreldate cannot be changed after creation");
 			goto done_errmsg;
 		}
 		if (osreldt == 0) {
 			error = EINVAL;
 			vfs_opterror(opts, "osreldate cannot be 0");
 			goto done_errmsg;
 		}
 	}
 
 	fullpath_disabled = 0;
 	root = NULL;
 	error = vfs_getopt(opts, "path", (void **)&path, &len);
 	if (error == ENOENT)
 		path = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (flags & JAIL_UPDATE) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "path cannot be changed after creation");
 			goto done_errmsg;
 		}
 		if (len == 0 || path[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
 		    path, td);
 		error = namei(&nd);
 		if (error)
 			goto done_free;
 		root = nd.ni_vp;
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 		strlcpy(g_path, path, MAXPATHLEN);
 		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
 		if (error == 0)
 			path = g_path;
 		else if (error == ENODEV) {
 			/* proceed if sysctl debug.disablefullpath == 1 */
 			fullpath_disabled = 1;
 			if (len < 2 || (len == 2 && path[0] == '/'))
 				path = NULL;
 		} else {
 			/* exit on other errors */
 			goto done_free;
 		}
 		if (root->v_type != VDIR) {
 			error = ENOTDIR;
 			vput(root);
 			goto done_free;
 		}
 		VOP_UNLOCK(root, 0);
 		if (fullpath_disabled) {
 			/* Leave room for a real-root full pathname. */
 			if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
 			    ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
 				error = ENAMETOOLONG;
 				vrele(root);
 				goto done_free;
 			}
 		}
 	}
 
 	/*
 	 * Find the specified jail, or at least its parent.
 	 * This abuses the file error codes ENOENT and EEXIST.
 	 */
 	pr = NULL;
 	ppr = mypr;
 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
 		namelc = strrchr(name, '.');
 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
 		if (*p != '\0')
 			jid = 0;
 	}
 	sx_xlock(&allprison_lock);
 	if (jid != 0) {
 		/*
 		 * See if a requested jid already exists.  There is an
 		 * information leak here if the jid exists but is not within
 		 * the caller's jail hierarchy.  Jail creators will get EEXIST
 		 * even though they cannot see the jail, and CREATE | UPDATE
 		 * will return ENOENT which is not normally a valid error.
 		 */
 		if (jid < 0) {
 			error = EINVAL;
 			vfs_opterror(opts, "negative jid");
 			goto done_unlock_list;
 		}
 		pr = prison_find(jid);
 		if (pr != NULL) {
 			ppr = pr->pr_parent;
 			/* Create: jid must not exist. */
 			if (cuflags == JAIL_CREATE) {
 				mtx_unlock(&pr->pr_mtx);
 				error = EEXIST;
 				vfs_opterror(opts, "jail %d already exists",
 				    jid);
 				goto done_unlock_list;
 			}
 			if (!prison_ischild(mypr, pr)) {
 				mtx_unlock(&pr->pr_mtx);
 				pr = NULL;
 			} else if (pr->pr_uref == 0) {
 				if (!(flags & JAIL_DYING)) {
 					mtx_unlock(&pr->pr_mtx);
 					error = ENOENT;
 					vfs_opterror(opts, "jail %d is dying",
 					    jid);
 					goto done_unlock_list;
 				} else if ((flags & JAIL_ATTACH) ||
 				    (pr_flags & PR_PERSIST)) {
 					/*
 					 * A dying jail might be resurrected
 					 * (via attach or persist), but first
 					 * it must determine if another jail
 					 * has claimed its name.  Accomplish
 					 * this by implicitly re-setting the
 					 * name.
 					 */
 					if (name == NULL)
 						name = prison_name(mypr, pr);
 				}
 			}
 		}
 		if (pr == NULL) {
 			/* Update: jid must exist. */
 			if (cuflags == JAIL_UPDATE) {
 				error = ENOENT;
 				vfs_opterror(opts, "jail %d not found", jid);
 				goto done_unlock_list;
 			}
 		}
 	}
 	/*
 	 * If the caller provided a name, look for a jail by that name.
 	 * This has different semantics for creates and updates keyed by jid
 	 * (where the name must not already exist in a different jail),
 	 * and updates keyed by the name itself (where the name must exist
 	 * because that is the jail being updated).
 	 */
 	namelc = NULL;
 	if (name != NULL) {
 		namelc = strrchr(name, '.');
 		if (namelc == NULL)
 			namelc = name;
 		else {
 			/*
 			 * This is a hierarchical name.  Split it into the
 			 * parent and child names, and make sure the parent
 			 * exists or matches an already found jail.
 			 */
 			if (pr != NULL) {
 				if (strncmp(name, ppr->pr_name, namelc - name)
 				    || ppr->pr_name[namelc - name] != '\0') {
 					mtx_unlock(&pr->pr_mtx);
 					error = EINVAL;
 					vfs_opterror(opts,
 					    "cannot change jail's parent");
 					goto done_unlock_list;
 				}
 			} else {
 				*namelc = '\0';
 				ppr = prison_find_name(mypr, name);
 				if (ppr == NULL) {
 					error = ENOENT;
 					vfs_opterror(opts,
 					    "jail \"%s\" not found", name);
 					goto done_unlock_list;
 				}
 				mtx_unlock(&ppr->pr_mtx);
 				*namelc = '.';
 			}
 			namelc++;
 		}
 		if (namelc[0] != '\0') {
 			pnamelen =
 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
  name_again:
 			deadpr = NULL;
 			FOREACH_PRISON_CHILD(ppr, tpr) {
 				if (tpr != pr && tpr->pr_ref > 0 &&
 				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
 					if (pr == NULL &&
 					    cuflags != JAIL_CREATE) {
 						mtx_lock(&tpr->pr_mtx);
 						if (tpr->pr_ref > 0) {
 							/*
 							 * Use this jail
 							 * for updates.
 							 */
 							if (tpr->pr_uref > 0) {
 								pr = tpr;
 								break;
 							}
 							deadpr = tpr;
 						}
 						mtx_unlock(&tpr->pr_mtx);
 					} else if (tpr->pr_uref > 0) {
 						/*
 						 * Create, or update(jid):
 						 * name must not exist in an
 						 * active sibling jail.
 						 */
 						error = EEXIST;
 						if (pr != NULL)
 							mtx_unlock(&pr->pr_mtx);
 						vfs_opterror(opts,
 						   "jail \"%s\" already exists",
 						   name);
 						goto done_unlock_list;
 					}
 				}
 			}
 			/* If no active jail is found, use a dying one. */
 			if (deadpr != NULL && pr == NULL) {
 				if (flags & JAIL_DYING) {
 					mtx_lock(&deadpr->pr_mtx);
 					if (deadpr->pr_ref == 0) {
 						mtx_unlock(&deadpr->pr_mtx);
 						goto name_again;
 					}
 					pr = deadpr;
 				} else if (cuflags == JAIL_UPDATE) {
 					error = ENOENT;
 					vfs_opterror(opts,
 					    "jail \"%s\" is dying", name);
 					goto done_unlock_list;
 				}
 			}
 			/* Update: name must exist if no jid. */
 			else if (cuflags == JAIL_UPDATE && pr == NULL) {
 				error = ENOENT;
 				vfs_opterror(opts, "jail \"%s\" not found",
 				    name);
 				goto done_unlock_list;
 			}
 		}
 	}
 	/* Update: must provide a jid or name. */
 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
 		error = ENOENT;
 		vfs_opterror(opts, "update specified no jail");
 		goto done_unlock_list;
 	}
 
 	/* If there's no prison to update, create a new one and link it in. */
 	if (pr == NULL) {
 		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
 			if (tpr->pr_childcount >= tpr->pr_childmax) {
 				error = EPERM;
 				vfs_opterror(opts, "prison limit exceeded");
 				goto done_unlock_list;
 			}
 		created = 1;
 		mtx_lock(&ppr->pr_mtx);
 		if (ppr->pr_ref == 0) {
 			mtx_unlock(&ppr->pr_mtx);
 			error = ENOENT;
 			vfs_opterror(opts, "jail \"%s\" not found",
 			    prison_name(mypr, ppr));
 			goto done_unlock_list;
 		}
 		ppr->pr_ref++;
 		ppr->pr_uref++;
 		mtx_unlock(&ppr->pr_mtx);
 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
 		if (jid == 0) {
 			/* Find the next free jid. */
 			jid = lastprid + 1;
  findnext:
 			if (jid == JAIL_MAX)
 				jid = 1;
 			TAILQ_FOREACH(tpr, &allprison, pr_list) {
 				if (tpr->pr_id < jid)
 					continue;
 				if (tpr->pr_id > jid || tpr->pr_ref == 0) {
 					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
 					break;
 				}
 				if (jid == lastprid) {
 					error = EAGAIN;
 					vfs_opterror(opts,
 					    "no available jail IDs");
 					free(pr, M_PRISON);
 					prison_deref(ppr, PD_DEREF |
 					    PD_DEUREF | PD_LIST_XLOCKED);
 					goto done_releroot;
 				}
 				jid++;
 				goto findnext;
 			}
 			lastprid = jid;
 		} else {
 			/*
 			 * The jail already has a jid (that did not yet exist),
 			 * so just find where to insert it.
 			 */
 			TAILQ_FOREACH(tpr, &allprison, pr_list)
 				if (tpr->pr_id >= jid) {
 					TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
 					break;
 				}
 		}
 		if (tpr == NULL)
 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
 			tpr->pr_childcount++;
 
 		pr->pr_parent = ppr;
 		pr->pr_id = jid;
 
 		/* Set some default values, and inherit some from the parent. */
 		if (namelc == NULL)
 			namelc = "";
 		if (path == NULL) {
 			path = "/";
 			root = mypr->pr_root;
 			vref(root);
 		}
 		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
 		pr->pr_flags |= PR_HOST;
 #if defined(INET) || defined(INET6)
 #ifdef VIMAGE
 		if (!(pr_flags & PR_VNET))
 #endif
 		{
 #ifdef INET
 			if (!(ch_flags & PR_IP4_USER))
 				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
 			else if (!(pr_flags & PR_IP4_USER)) {
 				pr->pr_flags |= ppr->pr_flags & PR_IP4;
 				if (ppr->pr_ip4 != NULL) {
 					pr->pr_ip4s = ppr->pr_ip4s;
 					pr->pr_ip4 = malloc(pr->pr_ip4s *
 					    sizeof(struct in_addr), M_PRISON,
 					    M_WAITOK);
 					bcopy(ppr->pr_ip4, pr->pr_ip4,
 					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
 				}
 			}
 #endif
 #ifdef INET6
 			if (!(ch_flags & PR_IP6_USER))
 				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
 			else if (!(pr_flags & PR_IP6_USER)) {
 				pr->pr_flags |= ppr->pr_flags & PR_IP6;
 				if (ppr->pr_ip6 != NULL) {
 					pr->pr_ip6s = ppr->pr_ip6s;
 					pr->pr_ip6 = malloc(pr->pr_ip6s *
 					    sizeof(struct in6_addr), M_PRISON,
 					    M_WAITOK);
 					bcopy(ppr->pr_ip6, pr->pr_ip6,
 					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
 				}
 			}
 #endif
 		}
 #endif
 		/* Source address selection is always on by default. */
 		pr->pr_flags |= _PR_IP_SADDRSEL;
 
 		pr->pr_securelevel = ppr->pr_securelevel;
 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
 		pr->pr_enforce_statfs = jail_default_enforce_statfs;
 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
 
 		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
 		if (osrelstr == NULL)
 			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
 			    sizeof(pr->pr_osrelease));
 		else
 			strlcpy(pr->pr_osrelease, osrelstr,
 			    sizeof(pr->pr_osrelease));
 
 		LIST_INIT(&pr->pr_children);
 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 
 #ifdef VIMAGE
 		/* Allocate a new vnet if specified. */
 		pr->pr_vnet = (pr_flags & PR_VNET)
 		    ? vnet_alloc() : ppr->pr_vnet;
 #endif
 		/*
 		 * Allocate a dedicated cpuset for each jail.
 		 * Unlike other initial settings, this may return an error.
 		 */
 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
 		if (error) {
 			prison_deref(pr, PD_LIST_XLOCKED);
 			goto done_releroot;
 		}
 
 		mtx_lock(&pr->pr_mtx);
 		/*
 		 * New prisons do not yet have a reference, because we do not
 		 * want others to see the incomplete prison once the
 		 * allprison_lock is downgraded.
 		 */
 	} else {
 		created = 0;
 		/*
 		 * Grab a reference for existing prisons, to ensure they
 		 * continue to exist for the duration of the call.
 		 */
 		pr->pr_ref++;
 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
 		if ((pr->pr_flags & PR_VNET) &&
 		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "vnet jails cannot have IP address restrictions");
 			goto done_deref_locked;
 		}
 #endif
 #ifdef INET
 		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "ip4 cannot be changed after creation");
 			goto done_deref_locked;
 		}
 #endif
 #ifdef INET6
 		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "ip6 cannot be changed after creation");
 			goto done_deref_locked;
 		}
 #endif
 	}
 
 	/* Do final error checking before setting anything. */
 	if (gotslevel) {
 		if (slevel < ppr->pr_securelevel) {
 			error = EPERM;
 			goto done_deref_locked;
 		}
 	}
 	if (gotchildmax) {
 		if (childmax >= ppr->pr_childmax) {
 			error = EPERM;
 			goto done_deref_locked;
 		}
 	}
 	if (gotenforce) {
 		if (enforce < ppr->pr_enforce_statfs) {
 			error = EPERM;
 			goto done_deref_locked;
 		}
 	}
 	if (gotrsnum) {
 		/*
 		 * devfs_rsnum is a uint16_t
 		 */
 		if (rsnum < 0 || rsnum > 65535) {
 			error = EINVAL;
 			goto done_deref_locked;
 		}
 		/*
 		 * Nested jails always inherit parent's devfs ruleset
 		 */
 		if (jailed(td->td_ucred)) {
 			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
 				error = EPERM;
 				goto done_deref_locked;
 			} else
 				rsnum = ppr->pr_devfs_rsnum;
 		}
 	}
 #ifdef INET
 	if (ip4s > 0) {
 		if (ppr->pr_flags & PR_IP4) {
 			/*
 			 * Make sure the new set of IP addresses is a
 			 * subset of the parent's list.  Don't worry
 			 * about the parent being unlocked, as any
 			 * setting is done with allprison_lock held.
 			 */
 			for (ij = 0; ij < ppr->pr_ip4s; ij++)
 				if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
 					break;
 			if (ij == ppr->pr_ip4s) {
 				error = EPERM;
 				goto done_deref_locked;
 			}
 			if (ip4s > 1) {
 				for (ii = ij = 1; ii < ip4s; ii++) {
 					if (ip4[ii].s_addr ==
 					    ppr->pr_ip4[0].s_addr)
 						continue;
 					for (; ij < ppr->pr_ip4s; ij++)
 						if (ip4[ii].s_addr ==
 						    ppr->pr_ip4[ij].s_addr)
 							break;
 					if (ij == ppr->pr_ip4s)
 						break;
 				}
 				if (ij == ppr->pr_ip4s) {
 					error = EPERM;
 					goto done_deref_locked;
 				}
 			}
 		}
 		/*
 		 * Check for conflicting IP addresses.  We permit them
 		 * if there is no more than one IP on each jail.  If
 		 * there is a duplicate on a jail with more than one
 		 * IP stop checking and return error.
 		 */
 #ifdef VIMAGE
 		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
 			if (tppr->pr_flags & PR_VNET)
 				break;
 #else
 		tppr = &prison0;
 #endif
 		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
 			if (tpr == pr ||
 #ifdef VIMAGE
 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 #endif
 			    tpr->pr_uref == 0) {
 				descend = 0;
 				continue;
 			}
 			if (!(tpr->pr_flags & PR_IP4_USER))
 				continue;
 			descend = 0;
 			if (tpr->pr_ip4 == NULL ||
 			    (ip4s == 1 && tpr->pr_ip4s == 1))
 				continue;
 			for (ii = 0; ii < ip4s; ii++) {
 				if (prison_check_ip4_locked(tpr, &ip4[ii]) ==
 				    0) {
 					error = EADDRINUSE;
 					vfs_opterror(opts,
 					    "IPv4 addresses clash");
 					goto done_deref_locked;
 				}
 			}
 		}
 	}
 #endif
 #ifdef INET6
 	if (ip6s > 0) {
 		if (ppr->pr_flags & PR_IP6) {
 			/*
 			 * Make sure the new set of IP addresses is a
 			 * subset of the parent's list.
 			 */
 			for (ij = 0; ij < ppr->pr_ip6s; ij++)
 				if (IN6_ARE_ADDR_EQUAL(&ip6[0],
 				    &ppr->pr_ip6[ij]))
 					break;
 			if (ij == ppr->pr_ip6s) {
 				error = EPERM;
 				goto done_deref_locked;
 			}
 			if (ip6s > 1) {
 				for (ii = ij = 1; ii < ip6s; ii++) {
 					if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
 					     &ppr->pr_ip6[0]))
 						continue;
 					for (; ij < ppr->pr_ip6s; ij++)
 						if (IN6_ARE_ADDR_EQUAL(
 						    &ip6[ii], &ppr->pr_ip6[ij]))
 							break;
 					if (ij == ppr->pr_ip6s)
 						break;
 				}
 				if (ij == ppr->pr_ip6s) {
 					error = EPERM;
 					goto done_deref_locked;
 				}
 			}
 		}
 		/* Check for conflicting IP addresses. */
 #ifdef VIMAGE
 		for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
 			if (tppr->pr_flags & PR_VNET)
 				break;
 #else
 		tppr = &prison0;
 #endif
 		FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
 			if (tpr == pr ||
 #ifdef VIMAGE
 			    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 #endif
 			    tpr->pr_uref == 0) {
 				descend = 0;
 				continue;
 			}
 			if (!(tpr->pr_flags & PR_IP6_USER))
 				continue;
 			descend = 0;
 			if (tpr->pr_ip6 == NULL ||
 			    (ip6s == 1 && tpr->pr_ip6s == 1))
 				continue;
 			for (ii = 0; ii < ip6s; ii++) {
 				if (prison_check_ip6_locked(tpr, &ip6[ii]) ==
 				    0) {
 					error = EADDRINUSE;
 					vfs_opterror(opts,
 					    "IPv6 addresses clash");
 					goto done_deref_locked;
 				}
 			}
 		}
 	}
 #endif
 	onamelen = namelen = 0;
 	if (namelc != NULL) {
 		/* Give a default name of the jid.  Also allow the name to be
 		 * explicitly the jid - but not any other number, and only in
 		 * normal form (no leading zero/etc).
 		 */
 		if (namelc[0] == '\0')
 			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
 		else if ((strtoul(namelc, &p, 10) != jid ||
 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "name cannot be numeric (unless it is the jid)");
 			goto done_deref_locked;
 		}
 		/*
 		 * Make sure the name isn't too long for the prison or its
 		 * children.
 		 */
 		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
 		onamelen = strlen(pr->pr_name + pnamelen);
 		namelen = strlen(namelc);
 		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
 			error = ENAMETOOLONG;
 			goto done_deref_locked;
 		}
 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
 			    sizeof(pr->pr_name)) {
 				error = ENAMETOOLONG;
 				goto done_deref_locked;
 			}
 		}
 	}
 	if (pr_allow & ~ppr->pr_allow) {
 		error = EPERM;
 		goto done_deref_locked;
 	}
 
 	/*
 	 * Let modules check their parameters.  This requires unlocking and
 	 * then re-locking the prison, but this is still a valid state as long
 	 * as allprison_lock remains xlocked.
 	 */
 	mtx_unlock(&pr->pr_mtx);
 	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
 	if (error != 0) {
 		prison_deref(pr, created
 		    ? PD_LIST_XLOCKED
 		    : PD_DEREF | PD_LIST_XLOCKED);
 		goto done_releroot;
 	}
 	mtx_lock(&pr->pr_mtx);
 
 	/* At this point, all valid parameters should have been noted. */
 	TAILQ_FOREACH(opt, opts, link) {
 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
 			error = EINVAL;
 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
 			goto done_deref_locked;
 		}
 	}
 
 	/* Set the parameters of the prison. */
 #ifdef INET
 	redo_ip4 = 0;
 	if (pr_flags & PR_IP4_USER) {
 		pr->pr_flags |= PR_IP4;
 		free(pr->pr_ip4, M_PRISON);
 		pr->pr_ip4s = ip4s;
 		pr->pr_ip4 = ip4;
 		ip4 = NULL;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (prison_restrict_ip4(tpr, NULL)) {
 				redo_ip4 = 1;
 				descend = 0;
 			}
 		}
 	}
 #endif
 #ifdef INET6
 	redo_ip6 = 0;
 	if (pr_flags & PR_IP6_USER) {
 		pr->pr_flags |= PR_IP6;
 		free(pr->pr_ip6, M_PRISON);
 		pr->pr_ip6s = ip6s;
 		pr->pr_ip6 = ip6;
 		ip6 = NULL;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (prison_restrict_ip6(tpr, NULL)) {
 				redo_ip6 = 1;
 				descend = 0;
 			}
 		}
 	}
 #endif
 	if (gotslevel) {
 		pr->pr_securelevel = slevel;
 		/* Set all child jails to be at least this level. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			if (tpr->pr_securelevel < slevel)
 				tpr->pr_securelevel = slevel;
 	}
 	if (gotchildmax) {
 		pr->pr_childmax = childmax;
 		/* Set all child jails to under this limit. */
 		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
 			if (tpr->pr_childmax > childmax - level)
 				tpr->pr_childmax = childmax > level
 				    ? childmax - level : 0;
 	}
 	if (gotenforce) {
 		pr->pr_enforce_statfs = enforce;
 		/* Pass this restriction on to the children. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			if (tpr->pr_enforce_statfs < enforce)
 				tpr->pr_enforce_statfs = enforce;
 	}
 	if (gotrsnum) {
 		pr->pr_devfs_rsnum = rsnum;
 		/* Pass this restriction on to the children. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			tpr->pr_devfs_rsnum = rsnum;
 	}
 	if (namelc != NULL) {
 		if (ppr == &prison0)
 			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
 		else
 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
 			    ppr->pr_name, namelc);
 		/* Change this component of child names. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
 			    strlen(tpr->pr_name + onamelen) + 1);
 			bcopy(pr->pr_name, tpr->pr_name, namelen);
 		}
 	}
 	if (path != NULL) {
 		/* Try to keep a real-rooted full pathname. */
 		if (fullpath_disabled && path[0] == '/' &&
 		    strcmp(mypr->pr_path, "/"))
 			snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
 			    mypr->pr_path, path);
 		else
 			strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
 		pr->pr_root = root;
 	}
 	if (PR_HOST & ch_flags & ~pr_flags) {
 		if (pr->pr_flags & PR_HOST) {
 			/*
 			 * Copy the parent's host info.  As with pr_ip4 above,
 			 * the lack of a lock on the parent is not a problem;
 			 * it is always set with allprison_lock at least
 			 * shared, and is held exclusively here.
 			 */
 			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
 			    sizeof(pr->pr_hostname));
 			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
 			    sizeof(pr->pr_domainname));
 			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
 			    sizeof(pr->pr_hostuuid));
 			pr->pr_hostid = pr->pr_parent->pr_hostid;
 		}
 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
 		/* Set this prison, and any descendants without PR_HOST. */
 		if (host != NULL)
 			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
 		if (domain != NULL)
 			strlcpy(pr->pr_domainname, domain, 
 			    sizeof(pr->pr_domainname));
 		if (uuid != NULL)
 			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
 		if (gothid)
 			pr->pr_hostid = hid;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 			if (tpr->pr_flags & PR_HOST)
 				descend = 0;
 			else {
 				if (host != NULL)
 					strlcpy(tpr->pr_hostname,
 					    pr->pr_hostname,
 					    sizeof(tpr->pr_hostname));
 				if (domain != NULL)
 					strlcpy(tpr->pr_domainname, 
 					    pr->pr_domainname,
 					    sizeof(tpr->pr_domainname));
 				if (uuid != NULL)
 					strlcpy(tpr->pr_hostuuid,
 					    pr->pr_hostuuid,
 					    sizeof(tpr->pr_hostuuid));
 				if (gothid)
 					tpr->pr_hostid = hid;
 			}
 		}
 	}
 	if ((tallow = ch_allow & ~pr_allow)) {
 		/* Clear allow bits in all children. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			tpr->pr_allow &= ~tallow;
 	}
 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
 	/*
 	 * Persistent prisons get an extra reference, and prisons losing their
 	 * persist flag lose that reference.  Only do this for existing prisons
 	 * for now, so new ones will remain unseen until after the module
 	 * handlers have completed.
 	 */
 	born = pr->pr_uref == 0;
 	if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
 		if (pr_flags & PR_PERSIST) {
 			pr->pr_ref++;
 			pr->pr_uref++;
 		} else {
 			pr->pr_ref--;
 			pr->pr_uref--;
 		}
 	}
 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
 	pr->pr_flags &= ~PR_REMOVE;
 	mtx_unlock(&pr->pr_mtx);
 
 #ifdef RACCT
 	if (racct_enable && created)
 		prison_racct_attach(pr);
 #endif
 
 	/* Locks may have prevented a complete restriction of child IP
 	 * addresses.  If so, allocate some more memory and try again.
 	 */
 #ifdef INET
 	while (redo_ip4) {
 		ip4s = pr->pr_ip4s;
 		ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
 		mtx_lock(&pr->pr_mtx);
 		redo_ip4 = 0;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (prison_restrict_ip4(tpr, ip4)) {
 				if (ip4 != NULL)
 					ip4 = NULL;
 				else
 					redo_ip4 = 1;
 			}
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 #endif
 #ifdef INET6
 	while (redo_ip6) {
 		ip6s = pr->pr_ip6s;
 		ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
 		mtx_lock(&pr->pr_mtx);
 		redo_ip6 = 0;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (prison_restrict_ip6(tpr, ip6)) {
 				if (ip6 != NULL)
 					ip6 = NULL;
 				else
 					redo_ip6 = 1;
 			}
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 #endif
 
 	/* Let the modules do their work. */
 	sx_downgrade(&allprison_lock);
 	if (born) {
 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
 		if (error) {
 			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 			prison_deref(pr, created
 			    ? PD_LIST_SLOCKED
 			    : PD_DEREF | PD_LIST_SLOCKED);
 			goto done_errmsg;
 		}
 	}
 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
 	if (error) {
 		if (born)
 			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 		prison_deref(pr, created
 		    ? PD_LIST_SLOCKED
 		    : PD_DEREF | PD_LIST_SLOCKED);
 		goto done_errmsg;
 	}
 
 	/* Attach this process to the prison if requested. */
 	if (flags & JAIL_ATTACH) {
 		mtx_lock(&pr->pr_mtx);
 		error = do_jail_attach(td, pr);
 		if (error) {
 			vfs_opterror(opts, "attach failed");
 			if (!created)
 				prison_deref(pr, PD_DEREF);
 			goto done_errmsg;
 		}
 	}
 
 #ifdef RACCT
 	if (racct_enable && !created) {
 		if (!(flags & JAIL_ATTACH))
 			sx_sunlock(&allprison_lock);
 		prison_racct_modify(pr);
 		if (!(flags & JAIL_ATTACH))
 			sx_slock(&allprison_lock);
 	}
 #endif
 
 	td->td_retval[0] = pr->pr_id;
 
 	/*
 	 * Now that it is all there, drop the temporary reference from existing
 	 * prisons.  Or add a reference to newly created persistent prisons
 	 * (which was not done earlier so that the prison would not be publicly
 	 * visible).
 	 */
 	if (!created) {
 		prison_deref(pr, (flags & JAIL_ATTACH)
 		    ? PD_DEREF
 		    : PD_DEREF | PD_LIST_SLOCKED);
 	} else {
 		if (pr_flags & PR_PERSIST) {
 			mtx_lock(&pr->pr_mtx);
 			pr->pr_ref++;
 			pr->pr_uref++;
 			mtx_unlock(&pr->pr_mtx);
 		}
 		if (!(flags & JAIL_ATTACH))
 			sx_sunlock(&allprison_lock);
 	}
 
 	goto done_free;
 
  done_deref_locked:
 	prison_deref(pr, created
 	    ? PD_LOCKED | PD_LIST_XLOCKED
 	    : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
 	goto done_releroot;
  done_unlock_list:
 	sx_xunlock(&allprison_lock);
  done_releroot:
 	if (root != NULL)
 		vrele(root);
  done_errmsg:
 	if (error) {
 		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
 		    &errmsg_len) == 0 && errmsg_len > 0) {
 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
 			if (optuio->uio_segflg == UIO_SYSSPACE)
 				bcopy(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 			else
 				copyout(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 		}
 	}
  done_free:
 #ifdef INET
 	free(ip4, M_PRISON);
 #endif
 #ifdef INET6
 	free(ip6, M_PRISON);
 #endif
 	if (g_path != NULL)
 		free(g_path, M_TEMP);
 	vfs_freeopts(opts);
 	return (error);
 }
 
 
 /*
  * jail_get(2) system call entry point.
  *
  * struct jail_get_args {
  *	struct iovec *iovp;
  *	unsigned int iovcnt;
  *	int flags;
  * };
  *
  * Copies the caller's iovec array in, lets kern_jail_get() do the work and,
  * on success, copies the iovec array back out.  Returns 0 or an errno value.
  */
 int
 sys_jail_get(struct thread *td, struct jail_get_args *uap)
 {
 	struct uio *optuio;
 	int rc;
 
 	/* An odd iovec count is rejected outright. */
 	if ((uap->iovcnt & 1) != 0)
 		return (EINVAL);
 
 	rc = copyinuio(uap->iovp, uap->iovcnt, &optuio);
 	if (rc != 0)
 		return (rc);
 
 	rc = kern_jail_get(td, optuio, uap->flags);
 	if (rc == 0) {
 		/* Give the iovec array itself back to the caller. */
 		rc = copyout(optuio->uio_iov, uap->iovp,
 		    uap->iovcnt * sizeof (struct iovec));
 	}
 	free(optuio, M_IOV);
 	return (rc);
 }
 
 /*
  * Look up a single prison and copy its parameters into the option list
  * carried by optuio.
  *
  * The prison is selected, in order of precedence, by the "lastjid" option
  * (the first prison on allprison with a larger ID that is a child of the
  * caller's prison), by a non-zero "jid" option, or by the "name" option.
  * Prisons with no user references are only returned when JAIL_DYING is set
  * in flags.
  *
  * Once a prison has been found td->td_retval[0] is set to its ID.  Returns 0
  * or an errno value; on error any "errmsg" text recorded in the option list
  * is copied back into the caller's errmsg buffer.
  */
 int
 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
 {
 	struct prison *pr, *mypr;
 	struct vfsopt *opt;
 	struct vfsoptlist *opts;
 	char *errmsg, *name;
 	int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
 
 	if (flags & ~JAIL_GET_MASK)
 		return (EINVAL);
 
 	/* Get the parameter list. */
 	error = vfs_buildopts(optuio, &opts);
 	if (error)
 		return (error);
 	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
 	mypr = td->td_ucred->cr_prison;
 
 	/*
 	 * Find the prison specified by one of: lastjid, jid, name.
 	 */
 	sx_slock(&allprison_lock);
 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
 	if (error == 0) {
 		/* On a match the loop is left with pr's mutex still held. */
 		TAILQ_FOREACH(pr, &allprison, pr_list) {
 			if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
 				mtx_lock(&pr->pr_mtx);
 				if (pr->pr_ref > 0 &&
 				    (pr->pr_uref > 0 || (flags & JAIL_DYING)))
 					break;
 				mtx_unlock(&pr->pr_mtx);
 			}
 		}
 		if (pr != NULL)
 			goto found_prison;
 		error = ENOENT;
 		vfs_opterror(opts, "no jail after %d", jid);
 		goto done_unlock_list;
 	} else if (error != ENOENT)
 		goto done_unlock_list;
 
 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
 	if (error == 0) {
 		/* A jid of zero falls through to the lookup by name. */
 		if (jid != 0) {
 			pr = prison_find_child(mypr, jid);
 			if (pr != NULL) {
 				if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
 					mtx_unlock(&pr->pr_mtx);
 					error = ENOENT;
 					vfs_opterror(opts, "jail %d is dying",
 					    jid);
 					goto done_unlock_list;
 				}
 				goto found_prison;
 			}
 			error = ENOENT;
 			vfs_opterror(opts, "jail %d not found", jid);
 			goto done_unlock_list;
 		}
 	} else if (error != ENOENT)
 		goto done_unlock_list;
 
 	error = vfs_getopt(opts, "name", (void **)&name, &len);
 	if (error == 0) {
 		/* The name must be a non-empty, NUL-terminated buffer. */
 		if (len == 0 || name[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_unlock_list;
 		}
 		pr = prison_find_name(mypr, name);
 		if (pr != NULL) {
 			if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
 				mtx_unlock(&pr->pr_mtx);
 				error = ENOENT;
 				vfs_opterror(opts, "jail \"%s\" is dying",
 				    name);
 				goto done_unlock_list;
 			}
 			goto found_prison;
 		}
 		error = ENOENT;
 		vfs_opterror(opts, "jail \"%s\" not found", name);
 		goto done_unlock_list;
 	} else if (error != ENOENT)
 		goto done_unlock_list;
 
 	vfs_opterror(opts, "no jail specified");
 	error = ENOENT;
 	goto done_unlock_list;
 
  found_prison:
 	/* Get the parameters of the prison. */
 	/*
 	 * pr's mutex is held here.  Take a temporary reference and remember
 	 * the lock state in "locked" for the done_deref exit path.
 	 * NOTE(review): ENOENT from the vfs_setopt*() calls below is tolerated;
 	 * presumably it means the caller did not ask for that parameter.
 	 */
 	pr->pr_ref++;
 	locked = PD_LOCKED;
 	td->td_retval[0] = pr->pr_id;
 	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	/* A direct child of the caller's prison reports parent ID 0. */
 	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
 	error = vfs_setopt(opts, "parent", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
 	    sizeof(pr->pr_cpuset->cs_id));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 #ifdef INET
 	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
 	    pr->pr_ip4s * sizeof(*pr->pr_ip4));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 #endif
 #ifdef INET6
 	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
 	    pr->pr_ip6s * sizeof(*pr->pr_ip6));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 #endif
 	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
 	    sizeof(pr->pr_securelevel));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
 	    sizeof(pr->pr_childcount));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
 	    sizeof(pr->pr_childmax));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 #ifdef COMPAT_FREEBSD32
 	/* ILP32 processes get the host ID as a 32-bit value. */
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		uint32_t hid32 = pr->pr_hostid;
 
 		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
 	} else
 #endif
 	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
 	    sizeof(pr->pr_hostid));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
 	    sizeof(pr->pr_enforce_statfs));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
 	    sizeof(pr->pr_devfs_rsnum));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	/* Boolean flags: report each one under both its name and "no" name. */
 	for (fi = 0; fi < nitems(pr_flag_names); fi++) {
 		if (pr_flag_names[fi] == NULL)
 			continue;
 		i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
 		error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done_deref;
 		i = !i;
 		error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done_deref;
 	}
 	/* Tri-state "jailsys" flags: disable, new or inherit. */
 	for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) {
 		i = pr->pr_flags &
 		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
 		i = pr_flag_jailsys[fi].disable &&
 		      (i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
 		    : (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
 		    : JAIL_SYS_INHERIT;
 		error =
 		    vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done_deref;
 	}
 	/* Permission bits, again under both positive and negative names. */
 	for (fi = 0; fi < nitems(pr_allow_names); fi++) {
 		if (pr_allow_names[fi] == NULL)
 			continue;
 		i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
 		error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done_deref;
 		i = !i;
 		error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done_deref;
 	}
 	/* A prison with no user references is reported as dying. */
 	i = (pr->pr_uref == 0);
 	error = vfs_setopt(opts, "dying", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	i = !i;
 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
 	    sizeof(pr->pr_osreldate));
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
 	if (error != 0 && error != ENOENT)
 		goto done_deref;
 
 	/* Get the module parameters. */
 	/* The prison mutex is dropped before calling into the modules. */
 	mtx_unlock(&pr->pr_mtx);
 	locked = 0;
 	error = osd_jail_call(pr, PR_METHOD_GET, opts);
 	if (error)
 		goto done_deref;
 	/* Drops the temporary reference and releases allprison_lock. */
 	prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
 
 	/* By now, all parameters should have been noted. */
 	TAILQ_FOREACH(opt, opts, link) {
 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
 			error = EINVAL;
 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
 			goto done_errmsg;
 		}
 	}
 
 	/* Write the fetched parameters back to userspace. */
 	error = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
 			/* iovec 2 * pos + 1 is used as the value slot. */
 			pos = 2 * opt->pos + 1;
 			optuio->uio_iov[pos].iov_len = opt->len;
 			if (opt->value != NULL) {
 				if (optuio->uio_segflg == UIO_SYSSPACE) {
 					bcopy(opt->value,
 					    optuio->uio_iov[pos].iov_base,
 					    opt->len);
 				} else {
 					error = copyout(opt->value,
 					    optuio->uio_iov[pos].iov_base,
 					    opt->len);
 					if (error)
 						break;
 				}
 			}
 		}
 	}
 	goto done_errmsg;
 
  done_deref:
 	/* "locked" tells prison_deref whether pr's mutex is still held. */
 	prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
 	goto done_errmsg;
 
  done_unlock_list:
 	sx_sunlock(&allprison_lock);
  done_errmsg:
 	/* Copy any error message out, but only if the caller supplied "errmsg". */
 	if (error && errmsg_pos >= 0) {
 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
 		errmsg_pos = 2 * errmsg_pos + 1;
 		if (errmsg_len > 0) {
 			if (optuio->uio_segflg == UIO_SYSSPACE)
 				bcopy(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 			else
 				copyout(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 		}
 	}
 	vfs_freeopts(opts);
 	return (error);
 }
 
 
 /*
  * jail_remove(2) system call entry point.
  *
  * struct jail_remove_args {
  *	int jid;
  * };
  *
  * Requires PRIV_JAIL_REMOVE.  Returns EINVAL when no descendant prison of
  * the caller's prison has the given ID, otherwise removes that prison and
  * all of its descendants and returns 0.
  */
 int
 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
 {
 	struct prison *pr, *cpr, *lpr, *tpr;
 	int descend, error;
 
 	error = priv_check(td, PRIV_JAIL_REMOVE);
 	if (error)
 		return (error);
 
 	sx_xlock(&allprison_lock);
 	/* On success pr comes back with its mutex held. */
 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 	if (pr == NULL) {
 		sx_xunlock(&allprison_lock);
 		return (EINVAL);
 	}
 
 	/* Remove all descendants of this prison, then remove this prison. */
 	pr->pr_ref++;
 	if (!LIST_EMPTY(&pr->pr_children)) {
 		mtx_unlock(&pr->pr_mtx);
 		/*
 		 * Removal runs one step behind the walk: lpr is the prison
 		 * visited on the previous iteration and is only removed after
 		 * the current one has been referenced.
 		 * NOTE(review): presumably this keeps the iterator from
 		 * standing on a prison that has just been removed - confirm.
 		 */
 		lpr = NULL;
 		FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
 			mtx_lock(&cpr->pr_mtx);
 			if (cpr->pr_ref > 0) {
 				tpr = cpr;
 				cpr->pr_ref++;
 			} else {
 				/* Already removed - do not do it again. */
 				tpr = NULL;
 			}
 			mtx_unlock(&cpr->pr_mtx);
 			if (lpr != NULL) {
 				mtx_lock(&lpr->pr_mtx);
 				/*
 				 * prison_remove_one() returns with
 				 * allprison_lock released, so take it again.
 				 */
 				prison_remove_one(lpr);
 				sx_xlock(&allprison_lock);
 			}
 			lpr = tpr;
 		}
 		/* Remove the last descendant that was visited, if any. */
 		if (lpr != NULL) {
 			mtx_lock(&lpr->pr_mtx);
 			prison_remove_one(lpr);
 			sx_xlock(&allprison_lock);
 		}
 		mtx_lock(&pr->pr_mtx);
 	}
 	/* Releases both pr's mutex and allprison_lock. */
 	prison_remove_one(pr);
 	return (0);
 }
 
 /*
  * Remove a single prison on behalf of jail_remove.
  *
  * Called with pr's mutex held and allprison_lock held exclusively (see the
  * PD_LOCKED | PD_LIST_XLOCKED flags and the explicit unlocks below); both
  * are released by the time this returns.  The caller must also have added
  * a reference to pr, which is dropped here.
  */
 static void
 prison_remove_one(struct prison *pr)
 {
 	struct proc *p;
 	int deuref;
 
 	/*
 	 * Mark the prison as doomed, so it doesn't accidentally come back
 	 * to life.  It may still be explicitly brought back by jail_set(2).
 	 */
 	pr->pr_flags |= PR_REMOVE;
 
 	/* If the prison was persistent, it is not anymore. */
 	/*
 	 * The reference is dropped here directly; the matching user
 	 * reference is dropped later through PD_DEUREF in prison_deref().
 	 */
 	deuref = 0;
 	if (pr->pr_flags & PR_PERSIST) {
 		pr->pr_ref--;
 		deuref = PD_DEUREF;
 		pr->pr_flags &= ~PR_PERSIST;
 	}
 
 	/*
 	 * jail_remove added a reference.  If that's the only one, remove
 	 * the prison now.
 	 */
 	KASSERT(pr->pr_ref > 0,
 	    ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
 	if (pr->pr_ref == 1) {
 		prison_deref(pr,
 		    deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
 		return;
 	}
 
 	/* Both locks are dropped before walking the process list. */
 	mtx_unlock(&pr->pr_mtx);
 	sx_xunlock(&allprison_lock);
 	/*
 	 * Kill all processes unfortunate enough to be attached to this prison.
 	 */
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(p, &allproc, p_list) {
 		PROC_LOCK(p);
 		if (p->p_state != PRS_NEW && p->p_ucred &&
 		    p->p_ucred->cr_prison == pr)
 			kern_psignal(p, SIGKILL);
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 	/* Remove the temporary reference added by jail_remove. */
 	prison_deref(pr, deuref | PD_DEREF);
 }
 
 
 /*
  * jail_attach(2) system call entry point.
  *
  * struct jail_attach_args {
  *	int jid;
  * };
  *
  * Requires PRIV_JAIL_ATTACH.  Returns EINVAL if the jail cannot be found
  * below the caller's prison or if it has no user references left; otherwise
  * returns whatever do_jail_attach() returns.
  */
 int
 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
 {
 	struct prison *target;
 	int rc;
 
 	rc = priv_check(td, PRIV_JAIL_ATTACH);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Take allprison_lock exclusively first so that a possible
 	 * PR_METHOD_REMOVE call cannot run concurrently with jail_set or
 	 * jail_remove, then downgrade straight away because readers do not
 	 * need to be kept out.
 	 */
 	sx_xlock(&allprison_lock);
 	sx_downgrade(&allprison_lock);
 
 	/*
 	 * Only a prison that exists and is still "alive" (has user
 	 * references) may be attached to.  do_jail_attach() takes over the
 	 * prison mutex and the list lock.
 	 */
 	target = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 	if (target != NULL && target->pr_uref != 0)
 		return (do_jail_attach(td, target));
 
 	/* Failure: undo whatever locking was done above. */
 	if (target != NULL)
 		mtx_unlock(&target->pr_mtx);
 	sx_sunlock(&allprison_lock);
 	return (EINVAL);
 }
 
 /*
  * Attach the calling thread's process to the prison pr.
  *
  * Entered with pr's mutex held (it is released right after the reference
  * counts are bumped) and with allprison_lock held shared (released either
  * through PD_LIST_SLOCKED on the first error path or by sx_sunlock()).
  * Returns 0 on success or an errno value, in which case the references
  * taken here have been dropped again.
  */
 static int
 do_jail_attach(struct thread *td, struct prison *pr)
 {
 	struct proc *p;
 	struct ucred *newcred, *oldcred;
 	int error;
 
 	/*
 	 * XXX: Note that there is a slight race here if two threads
 	 * in the same privileged process attempt to attach to two
 	 * different jails at the same time.  It is important for
 	 * user processes not to do this, or they might end up with
 	 * a process root from one prison, but attached to the jail
 	 * of another.
 	 */
 	pr->pr_ref++;
 	pr->pr_uref++;
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Let modules do whatever they need to prepare for attaching. */
 	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
 	if (error) {
 		prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
 		return (error);
 	}
 	sx_sunlock(&allprison_lock);
 
 	/*
 	 * Reparent the newly attached process to this jail.
 	 */
 	p = td->td_proc;
 	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
 	if (error)
 		goto e_revert_osd;
 
 	/* The checks on the jail root run with its vnode locked. */
 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
 	if ((error = change_dir(pr->pr_root, td)) != 0)
 		goto e_unlock;
 #ifdef MAC
 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
 		goto e_unlock;
 #endif
 	VOP_UNLOCK(pr->pr_root, 0);
-	if ((error = pwd_chroot(td, pr->pr_root)))
+	if ((error = pwd_chroot_chdir(td, pr->pr_root)))
 		goto e_revert_osd;
 
 	/* Switch the process to a copy of its credential that points at pr. */
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = crcopysafe(p, newcred);
 	newcred->cr_prison = pr;
 	proc_set_cred(p, newcred);
 	setsugid(p);
 #ifdef RACCT
 	racct_proc_ucred_changed(p, oldcred, newcred);
 	crhold(newcred);
 #endif
 	PROC_UNLOCK(p);
 #ifdef RCTL
 	rctl_proc_ucred_changed(p, newcred);
 	crfree(newcred);
 #endif
 	/* The old prison loses the reference and user reference it held. */
 	prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
 	crfree(oldcred);
 
 	/*
 	 * If the prison was killed while changing credentials, die along
 	 * with it.
 	 */
 	if (pr->pr_flags & PR_REMOVE) {
 		PROC_LOCK(p);
 		kern_psignal(p, SIGKILL);
 		PROC_UNLOCK(p);
 	}
 
 	return (0);
 
  e_unlock:
 	VOP_UNLOCK(pr->pr_root, 0);
  e_revert_osd:
 	/* Tell modules this thread is still in its old jail after all. */
 	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
 	prison_deref(pr, PD_DEREF | PD_DEUREF);
 	return (error);
 }
 
 
 /*
  * Search the global prison list for a referenced prison with the given ID.
  * The caller must hold allprison_lock.  On success the prison is returned
  * with its mutex held; NULL is returned when nothing matches.
  */
 struct prison *
 prison_find(int prid)
 {
 	struct prison *cand;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	TAILQ_FOREACH(cand, &allprison, pr_list) {
 		if (cand->pr_id != prid)
 			continue;
 		mtx_lock(&cand->pr_mtx);
 		if (cand->pr_ref > 0)
 			return (cand);
 		/* No references left: ignore it and keep looking. */
 		mtx_unlock(&cand->pr_mtx);
 	}
 	return (NULL);
 }
 
 /*
  * Search the descendants of mypr for a referenced prison with the given ID.
  * The caller must hold allprison_lock.  On success the prison is returned
  * with its mutex held; NULL is returned when nothing matches.
  */
 struct prison *
 prison_find_child(struct prison *mypr, int prid)
 {
 	struct prison *cand;
 	int descend;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	FOREACH_PRISON_DESCENDANT(mypr, cand, descend) {
 		if (cand->pr_id == prid) {
 			mtx_lock(&cand->pr_mtx);
 			if (cand->pr_ref <= 0) {
 				/* No references left: keep looking. */
 				mtx_unlock(&cand->pr_mtx);
 			} else
 				return (cand);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Look for the name relative to mypr.  Returns a locked prison or NULL.
  *
  * A prison that still has user references is preferred.  If none is found,
  * a matching prison that has references but no user references (a dying
  * one) is returned instead.  The caller must hold allprison_lock.
  */
 struct prison *
 prison_find_name(struct prison *mypr, const char *name)
 {
 	struct prison *pr, *deadpr;
 	size_t mylen;
 	int descend;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	/*
 	 * Number of leading characters of pr_name to skip before comparing:
 	 * nothing for prison0, otherwise mypr's own name plus one character.
 	 * NOTE(review): the extra character is presumably the separator
 	 * between name components - confirm.
 	 */
 	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
  again:
 	deadpr = NULL;
 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
 		if (!strcmp(pr->pr_name + mylen, name)) {
 			mtx_lock(&pr->pr_mtx);
 			if (pr->pr_ref > 0) {
 				if (pr->pr_uref > 0)
 					return (pr);
 				/* Remember it, but keep looking for a live one. */
 				deadpr = pr;
 			}
 			mtx_unlock(&pr->pr_mtx);
 		}
 	}
 	/* There was no valid prison - perhaps there was a dying one. */
 	if (deadpr != NULL) {
 		/*
 		 * The mutex was dropped after deadpr was recorded, so check
 		 * the reference count again and rescan if it reached zero.
 		 */
 		mtx_lock(&deadpr->pr_mtx);
 		if (deadpr->pr_ref == 0) {
 			mtx_unlock(&deadpr->pr_mtx);
 			goto again;
 		}
 	}
 	return (deadpr);
 }
 
 /*
  * See if a prison has the specific flag set.
  */
 int
 prison_flag(struct ucred *cred, unsigned flag)
 {
 
 	/* This is an atomic read, so no locking is necessary. */
 	return (cred->cr_prison->pr_flags & flag);
 }
 
 int
 prison_allow(struct ucred *cred, unsigned flag)
 {
 
 	/* This is an atomic read, so no locking is necessary. */
 	return (cred->cr_prison->pr_allow & flag);
 }
 
 /*
  * Drop one reference on a prison whose mutex the caller holds; the mutex is
  * released before returning.  If that was the last reference, the actual
  * removal is handed to the prison's task on taskqueue_thread rather than
  * done in this context, in case there are locks held.
  */
 void
 prison_free_locked(struct prison *pr)
 {
 	int remaining;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 	pr->pr_ref--;
 	remaining = pr->pr_ref;
 	mtx_unlock(&pr->pr_mtx);
 	if (remaining != 0)
 		return;
 	taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 }
 
 /*
  * Drop one reference on a prison whose mutex is not held by the caller.
  */
 void
 prison_free(struct prison *pr)
 {
 
 	/* prison_free_locked() releases the mutex taken here. */
 	mtx_lock(&pr->pr_mtx);
 	prison_free_locked(pr);
 }
 
 /*
  * Task handler that finishes a call to either prison_free or
  * prison_proc_free.  "context" is the prison; "pending" is not used.
  */
 static void
 prison_complete(void *context, int pending)
 {
 	struct prison *pr = context;
 	int derefflags;
 
 	sx_xlock(&allprison_lock);
 	mtx_lock(&pr->pr_mtx);
 
 	/*
 	 * Both locks are handed to prison_deref().  If the prison still has
 	 * user references, also drop one reference and one user reference.
 	 */
 	derefflags = PD_LOCKED | PD_LIST_XLOCKED;
 	if (pr->pr_uref)
 		derefflags |= PD_DEREF | PD_DEUREF;
 	prison_deref(pr, derefflags);
 }
 
 /*
  * Remove a prison reference (usually).  This internal version assumes no
  * mutexes are held, except perhaps the prison itself.  If there are no more
  * references, release and delist the prison.  On completion, the prison lock
  * and the allprison lock are both unlocked.
  *
  * flags is a combination of:
  *	PD_DEREF	drop one reference (pr_ref)
  *	PD_DEUREF	drop one user reference (pr_uref)
  *	PD_LOCKED	the caller already holds pr's mutex
  *	PD_LIST_SLOCKED	the caller holds allprison_lock shared
  *	PD_LIST_XLOCKED	the caller holds allprison_lock exclusively
  *
  * Freeing a prison drops a reference and a user reference on its parent,
  * so the loop may continue up the tree.
  */
 static void
 prison_deref(struct prison *pr, int flags)
 {
 	struct prison *ppr, *tpr;
 	int ref, lasturef;
 
 	if (!(flags & PD_LOCKED))
 		mtx_lock(&pr->pr_mtx);
 	for (;;) {
 		if (flags & PD_DEUREF) {
 			KASSERT(pr->pr_uref > 0,
 			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
 			     pr->pr_id));
 			pr->pr_uref--;
 			lasturef = pr->pr_uref == 0;
 			/*
 			 * Hold a temporary reference across the module
 			 * callback below; it is dropped again afterwards.
 			 */
 			if (lasturef)
 				pr->pr_ref++;
 			KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
 		} else
 			lasturef = 0;
 		if (flags & PD_DEREF) {
 			KASSERT(pr->pr_ref > 0,
 			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
 			     pr->pr_id));
 			pr->pr_ref--;
 		}
 		ref = pr->pr_ref;
 		mtx_unlock(&pr->pr_mtx);
 
 		/*
 		 * Tell the modules if the last user reference was removed
 		 * (even it sticks around in dying state).
 		 */
 		if (lasturef) {
 			/* Take the list lock if the caller did not hold it. */
 			if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
 				sx_xlock(&allprison_lock);
 				flags |= PD_LIST_XLOCKED;
 			}
 			(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 			mtx_lock(&pr->pr_mtx);
 			ref = --pr->pr_ref;
 			mtx_unlock(&pr->pr_mtx);
 		}
 
 		/* If the prison still has references, nothing else to do. */
 		if (ref > 0) {
 			if (flags & PD_LIST_SLOCKED)
 				sx_sunlock(&allprison_lock);
 			else if (flags & PD_LIST_XLOCKED)
 				sx_xunlock(&allprison_lock);
 			return;
 		}
 
 		/* Delisting needs allprison_lock held exclusively. */
 		if (flags & PD_LIST_SLOCKED) {
 			if (!sx_try_upgrade(&allprison_lock)) {
 				sx_sunlock(&allprison_lock);
 				sx_xlock(&allprison_lock);
 			}
 		} else if (!(flags & PD_LIST_XLOCKED))
 			sx_xlock(&allprison_lock);
 
 		TAILQ_REMOVE(&allprison, pr, pr_list);
 		LIST_REMOVE(pr, pr_sibling);
 		ppr = pr->pr_parent;
 		/* Every ancestor has one descendant fewer. */
 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
 			tpr->pr_childcount--;
 		sx_xunlock(&allprison_lock);
 
 		/* Release everything the prison owned, then the prison. */
 #ifdef VIMAGE
 		if (pr->pr_vnet != ppr->pr_vnet)
 			vnet_destroy(pr->pr_vnet);
 #endif
 		if (pr->pr_root != NULL)
 			vrele(pr->pr_root);
 		mtx_destroy(&pr->pr_mtx);
 #ifdef INET
 		free(pr->pr_ip4, M_PRISON);
 #endif
 #ifdef INET6
 		free(pr->pr_ip6, M_PRISON);
 #endif
 		if (pr->pr_cpuset != NULL)
 			cpuset_rel(pr->pr_cpuset);
 		osd_jail_exit(pr);
 #ifdef RACCT
 		if (racct_enable)
 			prison_racct_detach(pr);
 #endif
 		free(pr, M_PRISON);
 
 		/* Removing a prison frees a reference on its parent. */
 		/* No list lock is held at this point, hence the fresh flags. */
 		pr = ppr;
 		mtx_lock(&pr->pr_mtx);
 		flags = PD_DEREF | PD_DEUREF;
 	}
 }
 
 /*
  * Add a reference to a prison whose mutex the caller already holds.  The
  * prison must still have at least one reference.
  */
 void
 prison_hold_locked(struct prison *pr)
 {
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 	KASSERT(pr->pr_ref > 0,
 	    ("Trying to hold dead prison (jid=%d).", pr->pr_id));
 	pr->pr_ref += 1;
 }
 
 /*
  * Add a reference to a prison, taking and releasing its mutex around the
  * update.
  */
 void
 prison_hold(struct prison *pr)
 {
 
 	mtx_lock(&pr->pr_mtx);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /*
  * Add a user reference to a prison.  The prison must still have at least
  * one user reference.
  */
 void
 prison_proc_hold(struct prison *pr)
 {
 
 	mtx_lock(&pr->pr_mtx);
 	KASSERT(pr->pr_uref > 0,
 	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
 	pr->pr_uref += 1;
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /*
  * Drop a user reference on a prison.  The last user reference is never
  * removed here; that case is handed to the prison's task instead.
  */
 void
 prison_proc_free(struct prison *pr)
 {
 
 	mtx_lock(&pr->pr_mtx);
 	KASSERT(pr->pr_uref > 0,
 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
 	if (pr->pr_uref <= 1) {
 		/*
 		 * Don't remove the last user reference in this context, which
 		 * is expected to be a process that is not only locked, but
 		 * also half dead.  Take a reference and let the task finish
 		 * the job.
 		 */
 		pr->pr_ref++;
 		mtx_unlock(&pr->pr_mtx);
 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 		return;
 	}
 	pr->pr_uref--;
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /*
  * Check if a jail supports the given address family.
  *
  * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
  * if not.
  */
 int
 prison_check_af(struct ucred *cred, int af)
 {
 	struct prison *pr;
 	int error;
 
 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
 
 	pr = cred->cr_prison;
 #ifdef VIMAGE
 	/* Prisons with their own network stack are not limited. */
 	if (prison_owns_vnet(cred))
 		return (0);
 #endif
 
 	error = 0;
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		/* Unlocked pre-check, repeated below with the mutex held. */
 		if ((pr->pr_flags & PR_IP4) == 0)
 			break;
 		mtx_lock(&pr->pr_mtx);
 		if ((pr->pr_flags & PR_IP4) != 0 && pr->pr_ip4 == NULL)
 			error = EAFNOSUPPORT;
 		mtx_unlock(&pr->pr_mtx);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		/* Unlocked pre-check, repeated below with the mutex held. */
 		if ((pr->pr_flags & PR_IP6) == 0)
 			break;
 		mtx_lock(&pr->pr_mtx);
 		if ((pr->pr_flags & PR_IP6) != 0 && pr->pr_ip6 == NULL)
 			error = EAFNOSUPPORT;
 		mtx_unlock(&pr->pr_mtx);
 		break;
 #endif
 	case AF_LOCAL:
 	case AF_ROUTE:
 		/* Always permitted. */
 		break;
 	default:
 		/* Anything else needs PR_ALLOW_SOCKET_AF. */
 		if ((pr->pr_allow & PR_ALLOW_SOCKET_AF) == 0)
 			error = EAFNOSUPPORT;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Check if given address belongs to the jail referenced by cred (wrapper to
  * prison_check_ip[46]).
  *
  * Returns 0 if jail doesn't restrict the address family or if address belongs
  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
  * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
  */
 int
 prison_if(struct ucred *cred, struct sockaddr *sa)
 {
 
 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
 
 #ifdef VIMAGE
 	if (prison_owns_vnet(cred))
 		return (0);
 #endif
 
 	/* IPv4 and IPv6 addresses are handed to the per-family checks. */
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		return (prison_check_ip4(cred,
 		    &((struct sockaddr_in *)sa)->sin_addr));
 #endif
 #ifdef INET6
 	case AF_INET6:
 		return (prison_check_ip6(cred,
 		    &((struct sockaddr_in6 *)sa)->sin6_addr));
 #endif
 	default:
 		break;
 	}
 
 	/* Any other family needs PR_ALLOW_SOCKET_AF. */
 	if ((cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF) == 0)
 		return (EAFNOSUPPORT);
 	return (0);
 }
 
 /*
  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.  That is the case
  * when both credentials are in the same prison, or when cred2's prison is a
  * descendant of cred1's prison.
  */
 int
 prison_check(struct ucred *cred1, struct ucred *cred2)
 {
 
 	if (cred1->cr_prison == cred2->cr_prison)
 		return (0);
 	if (prison_ischild(cred1->cr_prison, cred2->cr_prison))
 		return (0);
 	return (ESRCH);
 }
 
 /*
  * Return 1 if pr2 is a descendant of pr1 (pr1 appears somewhere on pr2's
  * chain of parents), otherwise 0.  A prison is not its own child.
  */
 int
 prison_ischild(struct prison *pr1, struct prison *pr2)
 {
 	struct prison *ancestor;
 
 	ancestor = pr2->pr_parent;
 	while (ancestor != NULL) {
 		if (ancestor == pr1)
 			return (1);
 		ancestor = ancestor->pr_parent;
 	}
 	return (0);
 }
 
 /*
  * Return 1 if the passed credential is in a jail (its prison is anything
  * other than prison0), otherwise 0.
  */
 int
 jailed(struct ucred *cred)
 {
 
 	return (cred->cr_prison == &prison0 ? 0 : 1);
 }
 
 /*
  * Return 1 if the passed credential is in a jail and that jail does not
  * have its own virtual network stack, otherwise 0.
  */
 int
 jailed_without_vnet(struct ucred *cred)
 {
 	int confined;
 
 	confined = jailed(cred);
 #ifdef VIMAGE
 	/* A jail that owns its vnet does not count. */
 	if (confined && prison_owns_vnet(cred))
 		confined = 0;
 #endif
 
 	return (confined ? 1 : 0);
 }
 
 /*
  * Return the correct hostname (domainname, et al) for the passed credential.
  * The name is copied into buf, truncated to size bytes by strlcpy().
  */
 void
 getcredhostname(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *pr;
 
 	/*
 	 * A NULL credential can be used to shortcut to the physical
 	 * system's hostname.
 	 */
 	if (cred == NULL)
 		pr = &prison0;
 	else
 		pr = cred->cr_prison;
 
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(buf, pr->pr_hostname, size);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /*
  * Copy the domain name of the credential's prison into buf, truncated to
  * size bytes by strlcpy().  Unlike getcredhostname(), cred must not be NULL.
  */
 void
 getcreddomainname(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *pr;
 
 	pr = cred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(buf, pr->pr_domainname, size);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /*
  * Copy the host UUID string of the credential's prison into buf, truncated
  * to size bytes by strlcpy().  cred must not be NULL.
  */
 void
 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *pr;
 
 	pr = cred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(buf, pr->pr_hostuuid, size);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 void
 getcredhostid(struct ucred *cred, unsigned long *hostid)
 {
 
 	mtx_lock(&cred->cr_prison->pr_mtx);
 	*hostid = cred->cr_prison->pr_hostid;
 	mtx_unlock(&cred->cr_prison->pr_mtx);
 }
 
 /*
  * Copy the name of the credential's prison into name, truncated to len
  * bytes by strlcpy().  cred must not be NULL.
  */
 void
 getjailname(struct ucred *cred, char *name, size_t len)
 {
 	struct prison *pr;
 
 	pr = cred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(name, pr->pr_name, len);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 #ifdef VIMAGE
 /*
  * Determine whether the prison represented by cred owns
  * its vnet rather than having it inherited.
  *
  * Returns 1 in case the prison owns the vnet (PR_VNET is set in its flags),
  * 0 otherwise.
  */
 int
 prison_owns_vnet(struct ucred *cred)
 {
 
 	/*
 	 * vnets cannot be added/removed after jail creation,
 	 * so no need to lock here.
 	 */
 	return ((cred->cr_prison->pr_flags & PR_VNET) != 0);
 }
 #endif
 
 /*
  * Determine whether the subject represented by cred can "see"
  * status of a mount point.
  * Returns: 0 for permitted, ENOENT otherwise.
  * XXX: This function should be called cr_canseemount() and should be
  *      placed in kern_prot.c.
  */
 int
 prison_canseemount(struct ucred *cred, struct mount *mp)
 {
 	struct prison *pr;
 	const char *mnton;
 	size_t plen;
 
 	pr = cred->cr_prison;
 
 	/* No enforcement at all, or the jail's own root file system. */
 	if (pr->pr_enforce_statfs == 0 || pr->pr_root->v_mount == mp)
 		return (0);
 
 	/* Level 2 shows nothing but the root file system. */
 	if (pr->pr_enforce_statfs == 2)
 		return (ENOENT);
 
 	/*
 	 * If jail's chroot directory is set to "/" we should be able to see
 	 * all mount-points from inside a jail.
 	 * This is ugly check, but this is the only situation when jail's
 	 * directory ends with '/'.
 	 */
 	if (strcmp(pr->pr_path, "/") == 0)
 		return (0);
 
 	/* The mount point has to live underneath the jail's path. */
 	plen = strlen(pr->pr_path);
 	mnton = mp->mnt_stat.f_mntonname;
 	if (strncmp(pr->pr_path, mnton, plen) != 0)
 		return (ENOENT);
 
 	/*
 	 * Be sure that we don't have situation where jail's root directory
 	 * is "/some/path" and mount point is "/some/pathpath".
 	 */
 	if (mnton[plen] == '\0' || mnton[plen] == '/')
 		return (0);
 	return (ENOENT);
 }
 
 /*
  * Rewrite sp->f_mntonname to what the jail referenced by cred is allowed to
  * see: "[restricted]" for mounts it cannot see, "/" for its own root file
  * system, and otherwise the mount point with the jail's path stripped from
  * the front.  Nothing is changed when enforce_statfs is 0 or the jail's
  * path is "/".
  */
 void
 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
 {
 	char tail[MAXPATHLEN];
 	struct prison *pr;
 	size_t plen;
 
 	pr = cred->cr_prison;
 	if (pr->pr_enforce_statfs == 0)
 		return;
 
 	if (prison_canseemount(cred, mp) != 0) {
 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 		strlcpy(sp->f_mntonname, "[restricted]",
 		    sizeof(sp->f_mntonname));
 		return;
 	}
 
 	if (pr->pr_root->v_mount == mp) {
 		/*
 		 * Clear current buffer data, so we are sure nothing from
 		 * the valid path left there.
 		 */
 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 		sp->f_mntonname[0] = '/';
 		return;
 	}
 
 	/*
 	 * If jail's chroot directory is set to "/" we should be able to see
 	 * all mount-points from inside a jail.
 	 */
 	if (strcmp(pr->pr_path, "/") == 0)
 		return;
 
 	/* Keep only what follows the jail's path. */
 	plen = strlen(pr->pr_path);
 	strlcpy(tail, sp->f_mntonname + plen, sizeof(tail));
 
 	/*
 	 * Clear current buffer data, so we are sure nothing from
 	 * the valid path left there.
 	 */
 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 	if (tail[0] != '\0') {
 		strlcpy(sp->f_mntonname, tail, sizeof(sp->f_mntonname));
 	} else {
 		/* Should never happen. */
 		sp->f_mntonname[0] = '/';
 	}
 }
 
 /*
  * Check with permission for a specific privilege is granted within jail.  We
  * have a specific list of accepted privileges; the rest are denied.
  */
 int
 prison_priv_check(struct ucred *cred, int priv)
 {
 
 	if (!jailed(cred))
 		return (0);
 
 #ifdef VIMAGE
 	/*
 	 * Privileges specific to prisons with a virtual network stack.
 	 * There might be a duplicate entry here in case the privilege
 	 * is only granted conditionally in the legacy jail case.
 	 */
 	switch (priv) {
 #ifdef notyet
 		/*
 		 * NFS-specific privileges.
 		 */
 	case PRIV_NFS_DAEMON:
 	case PRIV_NFS_LOCKD:
 #endif
 		/*
 		 * Network stack privileges.
 		 */
 	case PRIV_NET_BRIDGE:
 	case PRIV_NET_GRE:
 	case PRIV_NET_BPF:
 	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
 	case PRIV_NET_ROUTE:
 	case PRIV_NET_TAP:
 	case PRIV_NET_SETIFMTU:
 	case PRIV_NET_SETIFFLAGS:
 	case PRIV_NET_SETIFCAP:
 	case PRIV_NET_SETIFDESCR:
 	case PRIV_NET_SETIFNAME	:
 	case PRIV_NET_SETIFMETRIC:
 	case PRIV_NET_SETIFPHYS:
 	case PRIV_NET_SETIFMAC:
 	case PRIV_NET_ADDMULTI:
 	case PRIV_NET_DELMULTI:
 	case PRIV_NET_HWIOCTL:
 	case PRIV_NET_SETLLADDR:
 	case PRIV_NET_ADDIFGROUP:
 	case PRIV_NET_DELIFGROUP:
 	case PRIV_NET_IFCREATE:
 	case PRIV_NET_IFDESTROY:
 	case PRIV_NET_ADDIFADDR:
 	case PRIV_NET_DELIFADDR:
 	case PRIV_NET_LAGG:
 	case PRIV_NET_GIF:
 	case PRIV_NET_SETIFVNET:
 	case PRIV_NET_SETIFFIB:
 
 		/*
 		 * 802.11-related privileges.
 		 */
 	case PRIV_NET80211_GETKEY:
 #ifdef notyet
 	case PRIV_NET80211_MANAGE:		/* XXX-BZ discuss with sam@ */
 #endif
 
 #ifdef notyet
 		/*
 		 * ATM privileges.
 		 */
 	case PRIV_NETATM_CFG:
 	case PRIV_NETATM_ADD:
 	case PRIV_NETATM_DEL:
 	case PRIV_NETATM_SET:
 
 		/*
 		 * Bluetooth privileges.
 		 */
 	case PRIV_NETBLUETOOTH_RAW:
 #endif
 
 		/*
 		 * Netgraph and netgraph module privileges.
 		 */
 	case PRIV_NETGRAPH_CONTROL:
 #ifdef notyet
 	case PRIV_NETGRAPH_TTY:
 #endif
 
 		/*
 		 * IPv4 and IPv6 privileges.
 		 */
 	case PRIV_NETINET_IPFW:
 	case PRIV_NETINET_DIVERT:
 	case PRIV_NETINET_PF:
 	case PRIV_NETINET_DUMMYNET:
 	case PRIV_NETINET_CARP:
 	case PRIV_NETINET_MROUTE:
 	case PRIV_NETINET_RAW:
 	case PRIV_NETINET_ADDRCTRL6:
 	case PRIV_NETINET_ND6:
 	case PRIV_NETINET_SCOPE6:
 	case PRIV_NETINET_ALIFETIME6:
 	case PRIV_NETINET_IPSEC:
 	case PRIV_NETINET_BINDANY:
 
 #ifdef notyet
 		/*
 		 * NCP privileges.
 		 */
 	case PRIV_NETNCP:
 
 		/*
 		 * SMB privileges.
 		 */
 	case PRIV_NETSMB:
 #endif
 
 	/*
 	 * No default: or deny here.
 	 * In case of no permit fall through to next switch().
 	 */
 		if (cred->cr_prison->pr_flags & PR_VNET)
 			return (0);
 	}
 #endif /* VIMAGE */
 
 	switch (priv) {
 
 		/*
 		 * Allow ktrace privileges for root in jail.
 		 */
 	case PRIV_KTRACE:
 
 #if 0
 		/*
 		 * Allow jailed processes to configure audit identity and
 		 * submit audit records (login, etc).  In the future we may
 		 * want to further refine the relationship between audit and
 		 * jail.
 		 */
 	case PRIV_AUDIT_GETAUDIT:
 	case PRIV_AUDIT_SETAUDIT:
 	case PRIV_AUDIT_SUBMIT:
 #endif
 
 		/*
 		 * Allow jailed processes to manipulate process UNIX
 		 * credentials in any way they see fit.
 		 */
 	case PRIV_CRED_SETUID:
 	case PRIV_CRED_SETEUID:
 	case PRIV_CRED_SETGID:
 	case PRIV_CRED_SETEGID:
 	case PRIV_CRED_SETGROUPS:
 	case PRIV_CRED_SETREUID:
 	case PRIV_CRED_SETREGID:
 	case PRIV_CRED_SETRESUID:
 	case PRIV_CRED_SETRESGID:
 
 		/*
 		 * Jail implements visibility constraints already, so allow
 		 * jailed root to override uid/gid-based constraints.
 		 */
 	case PRIV_SEEOTHERGIDS:
 	case PRIV_SEEOTHERUIDS:
 
 		/*
 		 * Jail implements inter-process debugging limits already, so
 		 * allow jailed root various debugging privileges.
 		 */
 	case PRIV_DEBUG_DIFFCRED:
 	case PRIV_DEBUG_SUGID:
 	case PRIV_DEBUG_UNPRIV:
 
 		/*
 		 * Allow jail to set various resource limits and login
 		 * properties, and for now, exceed process resource limits.
 		 */
 	case PRIV_PROC_LIMIT:
 	case PRIV_PROC_SETLOGIN:
 	case PRIV_PROC_SETRLIMIT:
 
 		/*
 		 * System V and POSIX IPC privileges are granted in jail.
 		 */
 	case PRIV_IPC_READ:
 	case PRIV_IPC_WRITE:
 	case PRIV_IPC_ADMIN:
 	case PRIV_IPC_MSGSIZE:
 	case PRIV_MQ_ADMIN:
 
 		/*
 		 * Jail operations within a jail work on child jails.
 		 */
 	case PRIV_JAIL_ATTACH:
 	case PRIV_JAIL_SET:
 	case PRIV_JAIL_REMOVE:
 
 		/*
 		 * Jail implements its own inter-process limits, so allow
 		 * root processes in jail to change scheduling on other
 		 * processes in the same jail.  Likewise for signalling.
 		 */
 	case PRIV_SCHED_DIFFCRED:
 	case PRIV_SCHED_CPUSET:
 	case PRIV_SIGNAL_DIFFCRED:
 	case PRIV_SIGNAL_SUGID:
 
 		/*
 		 * Allow jailed processes to write to sysctls marked as jail
 		 * writable.
 		 */
 	case PRIV_SYSCTL_WRITEJAIL:
 
 		/*
 		 * Allow root in jail to manage a variety of quota
 		 * properties.  These should likely be conditional on a
 		 * configuration option.
 		 */
 	case PRIV_VFS_GETQUOTA:
 	case PRIV_VFS_SETQUOTA:
 
 		/*
 		 * Since Jail relies on chroot() to implement file system
 		 * protections, grant many VFS privileges to root in jail.
 		 * Be careful to exclude mount-related and NFS-related
 		 * privileges.
 		 */
 	case PRIV_VFS_READ:
 	case PRIV_VFS_WRITE:
 	case PRIV_VFS_ADMIN:
 	case PRIV_VFS_EXEC:
 	case PRIV_VFS_LOOKUP:
 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
 	case PRIV_VFS_CHFLAGS_DEV:
 	case PRIV_VFS_CHOWN:
 	case PRIV_VFS_CHROOT:
 	case PRIV_VFS_RETAINSUGID:
 	case PRIV_VFS_FCHROOT:
 	case PRIV_VFS_LINK:
 	case PRIV_VFS_SETGID:
 	case PRIV_VFS_STAT:
 	case PRIV_VFS_STICKYFILE:
 
 		/*
 		 * As in the non-jail case, non-root users are expected to be
 		 * able to read kernel/physical memory (provided /dev/[k]mem
 		 * exists in the jail and they have permission to access it).
 		 */
 	case PRIV_KMEM_READ:
 		return (0);
 
 		/*
 		 * Depending on the global setting, allow privilege of
 		 * setting system flags.
 		 */
 	case PRIV_VFS_SYSFLAGS:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Depending on the global setting, allow privilege of
 		 * mounting/unmounting file systems.
 		 */
 	case PRIV_VFS_MOUNT:
 	case PRIV_VFS_UNMOUNT:
 	case PRIV_VFS_MOUNT_NONUSER:
 	case PRIV_VFS_MOUNT_OWNER:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
 		    cred->cr_prison->pr_enforce_statfs < 2)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Allow jailed root to bind reserved ports and reuse in-use
 		 * ports.
 		 */
 	case PRIV_NETINET_RESERVEDPORT:
 	case PRIV_NETINET_REUSEPORT:
 		return (0);
 
 		/*
 		 * Allow jailed root to set certain IPv4/6 (option) headers.
 		 */
 	case PRIV_NETINET_SETHDROPTS:
 		return (0);
 
 		/*
 		 * Conditionally allow creating raw sockets in jail.
 		 */
 	case PRIV_NETINET_RAW:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Since jail implements its own visibility limits on netstat
 		 * sysctls, allow getcred.  This allows identd to work in
 		 * jail.
 		 */
 	case PRIV_NETINET_GETCRED:
 		return (0);
 
 		/*
 		 * Allow jailed root to set loginclass.
 		 */
 	case PRIV_PROC_SETLOGINCLASS:
 		return (0);
 
 		/*
 		 * Do not allow a process inside a jail to read the kernel
 		 * message buffer unless explicitly permitted.
 		 */
 	case PRIV_MSGBUF:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
 			return (0);
 		return (EPERM);
 
 	default:
 		/*
 		 * In all remaining cases, deny the privilege request.  This
 		 * includes almost all network privileges, many system
 		 * configuration privileges.
 		 */
 		return (EPERM);
 	}
 }
 
 /*
  * Give pr2's name as seen from pr1.  A prison asking about itself gets "0".
  * When prison_ischild(pr1, pr2) holds, the leading components of pr2's name
  * that belong to pr1 are dropped; otherwise the complete name is returned.
  */
 
 char *
 prison_name(struct prison *pr1, struct prison *pr2)
 {
 	struct prison *up;
 	char *relname;
 
 	/* A jail refers to itself as "0" (when it can see itself at all). */
 	if (pr1 == pr2)
 		return "0";
 	relname = pr2->pr_name;
 	if (!prison_ischild(pr1, pr2))
 		return (relname);
 	/*
 	 * pr1 is not locked here (and allprison_lock may not be held
 	 * either), so the length of its name is not dependable.  Its depth
 	 * is: skip one dot-separated component of pr2's name for every
 	 * level between pr1 and prison0.
 	 */
 	up = pr1;
 	while (up != &prison0) {
 		relname = strchr(relname, '.') + 1;
 		up = up->pr_parent;
 	}
 	return (relname);
 }
 
 /*
  * Give pr2's path as seen from pr1.  When pr1's path is a whole-component
  * prefix of pr2's path, the remainder of pr2's path is returned ("/" if the
  * two paths are equal); in every other case pr2's complete path is returned.
  */
 static char *
 prison_path(struct prison *pr1, struct prison *pr2)
 {
 	char *base, *full;
 	int baselen;
 
 	base = pr1->pr_path;
 	full = pr2->pr_path;
 	/* Nothing to strip when pr1 lives at the root directory. */
 	if (strcmp(base, "/") == 0)
 		return (full);
 	baselen = strlen(base);
 	if (strncmp(base, full, baselen) != 0)
 		return (full);
 	/* The prefix matched; the character after it decides the result. */
 	switch (full[baselen]) {
 	case '\0':
 		return "/";
 	case '/':
 		return (full + baselen);
 	default:
 		/* Only part of a path component matched. */
 		return (full);
 	}
 }
 
 
 /*
  * Jail-related sysctls.
  *
  * This declares the "jail" node under _security; the sysctls that follow
  * attach to it through the _security_jail parent.
  */
 static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
     "Jails");
 
 /*
  * Handler for the security.jail "list" sysctl.  Walks the descendants of
  * the requesting thread's prison with FOREACH_PRISON_DESCENDANT() and, for
  * each one that still has references, copies out a struct xprison followed
  * by the prison's IPv4 addresses and then its IPv6 addresses (each only
  * when the address family is compiled in and the prison has any).
  * Returns 0, or the first error returned by SYSCTL_OUT().
  */
 static int
 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
 {
 	struct xprison *xp;
 	struct prison *pr, *cpr;
 #ifdef INET
 	/* Scratch copy of the current prison's IPv4 list; grown on demand. */
 	struct in_addr *ip4 = NULL;
 	int ip4s = 0;
 #endif
 #ifdef INET6
 	/* Scratch copy of the current prison's IPv6 list; grown on demand. */
 	struct in6_addr *ip6 = NULL;
 	int ip6s = 0;
 #endif
 	int descend, error;
 
 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
 	pr = req->td->td_ucred->cr_prison;
 	error = 0;
 	sx_slock(&allprison_lock);
 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
 #if defined(INET) || defined(INET6)
  again:
 #endif
 		mtx_lock(&cpr->pr_mtx);
 #ifdef INET
 		if (cpr->pr_ip4s > 0) {
 			/*
 			 * Scratch buffer too small: drop the prison mutex,
 			 * enlarge the buffer, and start over with this
 			 * prison (presumably because an M_WAITOK allocation
 			 * may sleep -- TODO confirm).
 			 */
 			if (ip4s < cpr->pr_ip4s) {
 				ip4s = cpr->pr_ip4s;
 				mtx_unlock(&cpr->pr_mtx);
 				ip4 = realloc(ip4, ip4s *
 				    sizeof(struct in_addr), M_TEMP, M_WAITOK);
 				goto again;
 			}
 			bcopy(cpr->pr_ip4, ip4,
 			    cpr->pr_ip4s * sizeof(struct in_addr));
 		}
 #endif
 #ifdef INET6
 		if (cpr->pr_ip6s > 0) {
 			/* Same grow-and-retry scheme as for IPv4 above. */
 			if (ip6s < cpr->pr_ip6s) {
 				ip6s = cpr->pr_ip6s;
 				mtx_unlock(&cpr->pr_mtx);
 				ip6 = realloc(ip6, ip6s *
 				    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
 				goto again;
 			}
 			bcopy(cpr->pr_ip6, ip6,
 			    cpr->pr_ip6s * sizeof(struct in6_addr));
 		}
 #endif
 		/* Prisons without any reference are not reported. */
 		if (cpr->pr_ref == 0) {
 			mtx_unlock(&cpr->pr_mtx);
 			continue;
 		}
 		/* Fill in the fixed-size record while pr_mtx is held. */
 		bzero(xp, sizeof(*xp));
 		xp->pr_version = XPRISON_VERSION;
 		xp->pr_id = cpr->pr_id;
 		/* A prison with no user references left is reported as dying. */
 		xp->pr_state = cpr->pr_uref > 0
 		    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
 		/* Path and name are given relative to the requesting prison. */
 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
 		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
 #ifdef INET
 		xp->pr_ip4s = cpr->pr_ip4s;
 #endif
 #ifdef INET6
 		xp->pr_ip6s = cpr->pr_ip6s;
 #endif
 		mtx_unlock(&cpr->pr_mtx);
 		/*
 		 * Copy out from the private copies after pr_mtx has been
 		 * released; the address counts used are the ones saved in xp.
 		 */
 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
 		if (error)
 			break;
 #ifdef INET
 		if (xp->pr_ip4s > 0) {
 			error = SYSCTL_OUT(req, ip4,
 			    xp->pr_ip4s * sizeof(struct in_addr));
 			if (error)
 				break;
 		}
 #endif
 #ifdef INET6
 		if (xp->pr_ip6s > 0) {
 			error = SYSCTL_OUT(req, ip6,
 			    xp->pr_ip6s * sizeof(struct in6_addr));
 			if (error)
 				break;
 		}
 #endif
 	}
 	sx_sunlock(&allprison_lock);
 	/* Release the scratch buffers on every exit path. */
 	free(xp, M_TEMP);
 #ifdef INET
 	free(ip4, M_TEMP);
 #endif
 #ifdef INET6
 	free(ip6, M_TEMP);
 #endif
 	return (error);
 }
 
 /* Read-only structured sysctl backed by sysctl_jail_list(). */
 SYSCTL_OID(_security_jail, OID_AUTO, list,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_jail_list, "S", "List of active jails");
 
 /*
  * Handler for the security.jail "jailed" sysctl: copies out, as an int, the
  * result of jailed() for the requesting thread's credential.
  */
 static int
 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
 {
 	int injail;
 
 	injail = jailed(req->td->td_ucred);
 	return (SYSCTL_OUT(req, &injail, sizeof(injail)));
 }
 
 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_jail_jailed, "I", "Process in jail?");
 
 /*
  * Handler for the security.jail "vnet" sysctl: copies out 1 when the kernel
  * is built with VIMAGE and the requesting credential is both jailed and
  * reported by prison_owns_vnet(); copies out 0 in every other case.
  */
 static int
 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
 {
 	int havevnet;
 #ifdef VIMAGE
 	struct ucred *cred;
 #endif
 
 	havevnet = 0;
 #ifdef VIMAGE
 	cred = req->td->td_ucred;
 	if (jailed(cred) && prison_owns_vnet(cred))
 		havevnet = 1;
 #endif
 	return (SYSCTL_OUT(req, &havevnet, sizeof(havevnet)));
 }
 
 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_jail_vnet, "I", "Jail owns VNET?");
 
 /*
  * Read-write view of the jail_max_af_ips variable; only present when at
  * least one of INET and INET6 is compiled in.
  */
 #if defined(INET) || defined(INET6)
 SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
     &jail_max_af_ips, 0,
     "Number of IP addresses a jail may have at most per address family (deprecated)");
 #endif
 
 /*
  * Default parameters for jail(2) compatibility.  For historical reasons,
  * the sysctl names have varying similarity to the parameter names.  Prisons
  * just see their own parameters, and can't change them.
  */
 static int
 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
 {
 	struct prison *pr;
 	int allow, error, i;
 
 	pr = req->td->td_ucred->cr_prison;
 	allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
 
 	/* Get the current flag value, and convert it to a boolean. */
 	i = (allow & arg2) ? 1 : 0;
 	if (arg1 != NULL)
 		i = !i;
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	i = i ? arg2 : 0;
 	if (arg1 != NULL)
 		i ^= arg2;
 	/*
 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
 	 * for writing.
 	 */
 	mtx_lock(&prison0.pr_mtx);
 	jail_default_allow = (jail_default_allow & ~arg2) | i;
 	mtx_unlock(&prison0.pr_mtx);
 	return (0);
 }
 
 /*
  * Legacy boolean permission sysctls, all handled by
  * sysctl_jail_default_allow().  The fourth argument (the handler's arg1) is
  * NULL for a sysctl that shows its PR_ALLOW_* bit directly and non-NULL for
  * one that shows it inverted; the fifth argument (arg2) is the PR_ALLOW_*
  * bit itself.
  */
 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
     "Processes in jail can set their hostnames (deprecated)");
 /* Inverted: reads 1 while PR_ALLOW_SOCKET_AF is clear. */
 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
     "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
     "Processes in jail can use System V IPC primitives (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
     "Prison root can create raw sockets (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
     "Processes in jail can alter system file flags (deprecated)");
 /* General mount permission, followed by one sysctl per file system type. */
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
     "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
     "Processes in jail can mount the devfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I",
     "Processes in jail can mount the fdescfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
     "Processes in jail can mount the nullfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
     "Processes in jail can mount the procfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I",
     "Processes in jail can mount the linprocfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I",
     "Processes in jail can mount the linsysfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
     "Processes in jail can mount the tmpfs file system (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
     "Processes in jail can mount the zfs file system (deprecated)");
 
 /*
  * Handler shared by the legacy integer-valued jail sysctls.  arg1 points at
  * the global default and arg2 is the byte offset of the matching int field
  * inside struct prison.  prison0 reads the global default, any other prison
  * reads its own field.  A new value, when one is supplied, is stored in the
  * global default.
  */
 static int
 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
 {
 	struct prison *pr;
 	int *defaultp;
 	int level, error;
 
 	defaultp = (int *)arg1;
 	pr = req->td->td_ucred->cr_prison;
 	if (pr == &prison0)
 		level = *defaultp;
 	else
 		level = *(int *)((char *)pr + arg2);
 	error = sysctl_handle_int(oidp, &level, 0, req);
 	/* Only a successful request that carries new data updates the default. */
 	if (error == 0 && req->newptr != NULL)
 		*defaultp = level;
 	return (error);
 }
 
 /*
  * Legacy integer sysctls handled by sysctl_jail_default_level(): the fourth
  * argument points at the global default and the fifth is the offset of the
  * corresponding field in struct prison.
  */
 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
     sysctl_jail_default_level, "I",
     "Processes in jail cannot see all mounted file systems (deprecated)");
 
 /* Unlike enforce_statfs above, this one is declared read-only (CTLFLAG_RD). */
 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
     sysctl_jail_default_level, "I",
     "Ruleset for the devfs filesystem in jail (deprecated)");
 
 /*
  * Nodes to describe jail parameters.  Maximum length of string parameters
  * is returned in the string itself, and the other parameters exist merely
  * to make themselves and their types known.
  */
 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
     "Jail parameters");
 
 /*
  * Handler for the parameter-description sysctls.  What is copied out
  * depends only on the OID's type:
  *   long / unsigned long	a long zero (an int zero instead when
  *				SCTL_MASK32 is defined and set in req->flags)
  *   int / unsigned int		an int zero
  *   string			arg2 printed as a decimal string
  *   struct			arg2 as a size_t
  * Any other type produces no output and returns 0.
  */
 int
 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
 {
 	int i;
 	long l;
 	size_t s;
 	char numbuf[12];
 
 	switch (oidp->oid_kind & CTLTYPE)
 	{
 	case CTLTYPE_LONG:
 	case CTLTYPE_ULONG:
 		l = 0;
 #ifdef SCTL_MASK32
 		if (!(req->flags & SCTL_MASK32))
 #endif
 			return (SYSCTL_OUT(req, &l, sizeof(l)));
 		/*
 		 * FALLTHROUGH: only reachable when SCTL_MASK32 is defined
 		 * and set in req->flags; such requests get an int.
 		 */
 	case CTLTYPE_INT:
 	case CTLTYPE_UINT:
 		i = 0;
 		return (SYSCTL_OUT(req, &i, sizeof(i)));
 	case CTLTYPE_STRING:
 		/* snprintf() bounds the output to sizeof(numbuf). */
 		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
 		return
 		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
 	case CTLTYPE_STRUCT:
 		s = (size_t)arg2;
 		return (SYSCTL_OUT(req, &s, sizeof(s)));
 	}
 	return (0);
 }
 
 /*
  * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
  * jail creation time but cannot be changed in an existing jail.
  */
 /* Top-level parameters (declared with an empty parent suffix). */
 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Jail secure level");
 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", 
     "Jail value for kern.osreldate and uname -K");
 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, 
     "Jail value for kern.osrelease and uname -r");
 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Jail cannot see all mounted file systems");
 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Ruleset for in-jail devfs mounts");
 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail persistence");
 /* The vnet parameter is only described on kernels built with VIMAGE. */
 #ifdef VIMAGE
 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
     "E,jailsys", "Virtual network stack");
 #endif
 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
     "B", "Jail is in the process of shutting down");
 
 /* children.*: current count (read-only) and limit of child jails. */
 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
     "I", "Current number of child jails");
 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Maximum number of child jails");
 
 /* host.*: host identity strings and the numeric host ID. */
 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
     "Jail hostname");
 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
     "Jail NIS domainname");
 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
     "Jail host UUID");
 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
     "LU", "Jail host ID");
 
 /* cpuset.*: the ID of the jail's cpuset, read-only. */
 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
 
 /*
  * ip4.* and ip6.*: per-address-family parameters, each group present only
  * when its address family is compiled in.  The addr entries are described
  * with the size of a single address.
  */
 #ifdef INET
 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
     "Jail IPv4 address virtualization");
 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
     "S,in_addr,a", "Jail IPv4 addresses");
 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Do (not) use IPv4 source address selection rather than the "
     "primary jail IPv4 address.");
 #endif
 #ifdef INET6
 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
     "Jail IPv6 address virtualization");
 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
     "S,in6_addr,a", "Jail IPv6 addresses");
 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Do (not) use IPv6 source address selection rather than the "
     "primary jail IPv6 address.");
 #endif
 
 /* allow.*: per-jail permission flags. */
 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may set hostname");
 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may use SYSV IPC");
 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may create raw sockets");
 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may alter system file flags");
 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may set file quotas");
 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may read the kernel message buffer");
 
 /*
  * allow.mount.*: mount permissions, one per file system type.  The entry
  * declared with an empty name presumably describes allow.mount itself --
  * TODO confirm against the SYSCTL_JAIL_PARAM definition.
  */
 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount/unmount jail-friendly file systems in general");
 SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the devfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the fdescfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the nullfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the procfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the linprocfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the linsysfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the tmpfs file system");
 SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount the zfs file system");
 
 #ifdef RACCT
 void
 prison_racct_foreach(void (*callback)(struct racct *racct,
     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
     void *arg2, void *arg3)
 {
 	struct prison_racct *prr;
 
 	ASSERT_RACCT_ENABLED();
 
 	sx_slock(&allprison_lock);
 	if (pre != NULL)
 		(pre)();
 	LIST_FOREACH(prr, &allprison_racct, prr_next)
 		(callback)(prr->prr_racct, arg2, arg3);
 	if (post != NULL)
 		(post)();
 	sx_sunlock(&allprison_lock);
 }
 
 /*
  * Look up the prison_racct called name, creating it when none exists yet.
  * An existing entry is returned with one more reference held; a new entry
  * starts out with a reference count of 1.  Returns NULL for an empty name
  * or one of MAXHOSTNAMELEN characters or more.  The caller must hold
  * allprison_lock exclusively.
  */
 static struct prison_racct *
 prison_racct_find_locked(const char *name)
 {
 	struct prison_racct *entry;
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
 		return (NULL);
 
 	/* Reuse an existing entry with this name, if there is one. */
 	LIST_FOREACH(entry, &allprison_racct, prr_next) {
 		if (strcmp(name, entry->prr_name) == 0) {
 			prison_racct_hold(entry);
 			return (entry);
 		}
 	}
 
 	/* No match: allocate, initialise and link a fresh entry. */
 	entry = malloc(sizeof(*entry), M_PRISON_RACCT, M_ZERO | M_WAITOK);
 	racct_create(&entry->prr_racct);
 
 	/* The length of name was checked on entry. */
 	strcpy(entry->prr_name, name);
 	refcount_init(&entry->prr_refcount, 1);
 	LIST_INSERT_HEAD(&allprison_racct, entry, prr_next);
 
 	return (entry);
 }
 
 /*
  * Wrapper around prison_racct_find_locked() that takes allprison_lock
  * exclusively for the duration of the lookup.
  */
 struct prison_racct *
 prison_racct_find(const char *name)
 {
 	struct prison_racct *found;
 
 	ASSERT_RACCT_ENABLED();
 
 	sx_xlock(&allprison_lock);
 	found = prison_racct_find_locked(name);
 	sx_xunlock(&allprison_lock);
 
 	return (found);
 }
 
 /*
  * Take one additional reference on prr.
  */
 void
 prison_racct_hold(struct prison_racct *prr)
 {
 	ASSERT_RACCT_ENABLED();
 	refcount_acquire(&prr->prr_refcount);
 }
 
 /*
  * Drop one reference on prr.  When refcount_release() reports that it was
  * the last one, destroy the racct, unlink prr from the list and free it.
  * The caller must hold allprison_lock exclusively.
  */
 static void
 prison_racct_free_locked(struct prison_racct *prr)
 {
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	/* Other references remain: nothing more to do. */
 	if (!refcount_release(&prr->prr_refcount))
 		return;
 
 	racct_destroy(&prr->prr_racct);
 	LIST_REMOVE(prr, prr_next);
 	free(prr, M_PRISON_RACCT);
 }
 
 /*
  * Drop one reference on prr.  As long as the observed count is above 1, a
  * single compare-and-set is tried to decrement it without taking any lock.
  * If the count is 1 or lower, or the compare-and-set does not succeed, the
  * release is done through prison_racct_free_locked() under an exclusive
  * allprison_lock.  Must be called without allprison_lock held.
  */
 void
 prison_racct_free(struct prison_racct *prr)
 {
 	int seen;
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_UNLOCKED);
 
 	seen = prr->prr_refcount;
 	if (seen <= 1 ||
 	    !atomic_cmpset_int(&prr->prr_refcount, seen, seen - 1)) {
 		sx_xlock(&allprison_lock);
 		prison_racct_free_locked(prr);
 		sx_xunlock(&allprison_lock);
 	}
 }
 
 static void
 prison_racct_attach(struct prison *pr)
 {
 	struct prison_racct *prr;
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	prr = prison_racct_find_locked(pr->pr_name);
 	KASSERT(prr != NULL, ("cannot find prison_racct"));
 
 	pr->pr_prison_racct = prr;
 }
 
 /*
  * Handle jail renaming.  From the racct point of view, renaming means
  * moving from one prison_racct to another.
  *
  * Does nothing when pr's name still equals the name of its current
  * prison_racct.  Otherwise pr is attached to the prison_racct for its new
  * name, racct_move() is called with the new and the old racct, and the
  * reference on the old prison_racct is dropped.
  */
 static void
 prison_racct_modify(struct prison *pr)
 {
 #ifdef RCTL
 	struct proc *p;
 	struct ucred *cred;
 #endif
 	struct prison_racct *oldprr;
 
 	ASSERT_RACCT_ENABLED();
 
 	/* allproc_lock (shared) is taken before allprison_lock (exclusive). */
 	sx_slock(&allproc_lock);
 	sx_xlock(&allprison_lock);
 
 	/* Name unchanged: release both locks and leave. */
 	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
 		sx_xunlock(&allprison_lock);
 		sx_sunlock(&allproc_lock);
 		return;
 	}
 
 	/* Remember the old container; prison_racct_attach() installs the new. */
 	oldprr = pr->pr_prison_racct;
 	pr->pr_prison_racct = NULL;
 
 	prison_racct_attach(pr);
 
 	/*
 	 * Move resource utilisation records.
 	 */
 	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
 
 #ifdef RCTL
 	/*
 	 * Force rctl to reattach rules to processes.
 	 */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/* Hold the credential so it can be used after PROC_UNLOCK(). */
 		PROC_LOCK(p);
 		cred = crhold(p->p_ucred);
 		PROC_UNLOCK(p);
 		rctl_proc_ucred_changed(p, cred);
 		crfree(cred);
 	}
 #endif
 
 	/*
 	 * allproc_lock is released first; the old prison_racct is then
 	 * released while allprison_lock is still held exclusively, as
 	 * prison_racct_free_locked() asserts.
 	 */
 	sx_sunlock(&allproc_lock);
 	prison_racct_free_locked(oldprr);
 	sx_xunlock(&allprison_lock);
 }
 
 /*
  * Release pr's reference on its prison_racct, if it has one, and clear the
  * pointer.  Must be called without allprison_lock held.
  */
 static void
 prison_racct_detach(struct prison *pr)
 {
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_UNLOCKED);
 
 	if (pr->pr_prison_racct != NULL) {
 		prison_racct_free(pr->pr_prison_racct);
 		pr->pr_prison_racct = NULL;
 	}
 }
 #endif /* RACCT */
 
 #ifdef DDB
 
 /*
  * Print the fields of one prison through db_printf(), one field per line:
  * identity, reference counts, path, cpuset, root, limits, list links, the
  * flag word with the names of the set flags, one line per jailsys setting,
  * the allow word with the names of the set permissions, the host strings
  * and, when compiled in, the IPv4 and IPv6 address lists.
  */
 static void
 db_show_prison(struct prison *pr)
 {
 	int fi;
 #if defined(INET) || defined(INET6)
 	int ii;
 #endif
 	unsigned jsf;
 #ifdef INET
 	char ip4buf[INET_ADDRSTRLEN];
 #endif
 #ifdef INET6
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	db_printf("prison %p:\n", pr);
 	db_printf(" jid             = %d\n", pr->pr_id);
 	db_printf(" name            = %s\n", pr->pr_name);
 	db_printf(" parent          = %p\n", pr->pr_parent);
 	db_printf(" ref             = %d\n", pr->pr_ref);
 	db_printf(" uref            = %d\n", pr->pr_uref);
 	db_printf(" path            = %s\n", pr->pr_path);
 	/* A prison without a cpuset is shown with ID -1. */
 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
 	    ? pr->pr_cpuset->cs_id : -1);
 #ifdef VIMAGE
 	db_printf(" vnet            = %p\n", pr->pr_vnet);
 #endif
 	db_printf(" root            = %p\n", pr->pr_root);
 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
 	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
 	db_printf(" children.max    = %d\n", pr->pr_childmax);
 	db_printf(" children.cur    = %d\n", pr->pr_childcount);
 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
 	/* Raw flag word followed by the name of every flag that is set. */
 	db_printf(" flags           = 0x%x", pr->pr_flags);
 	for (fi = 0; fi < nitems(pr_flag_names); fi++)
 		if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1U << fi)))
 			db_printf(" %s", pr_flag_names[fi]);
 	/*
 	 * Terminate the flags line, as is done for the allow line below;
 	 * each jailsys entry that follows is a line of its own.
 	 */
 	db_printf("\n");
 	for (fi = 0; fi < nitems(pr_flag_jailsys); fi++) {
 		/* Only the disable/new bits of this jailsys matter. */
 		jsf = pr->pr_flags &
 		    (pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
 		db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
 		    pr_flag_jailsys[fi].disable &&
 		      (jsf == pr_flag_jailsys[fi].disable) ? "disable"
 		    : (jsf == pr_flag_jailsys[fi].new) ? "new"
 		    : "inherit");
 	}
 	/* Raw allow word followed by the name of every permission set. */
 	db_printf(" allow           = 0x%x", pr->pr_allow);
 	for (fi = 0; fi < nitems(pr_allow_names); fi++)
 		if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1U << fi)))
 			db_printf(" %s", pr_allow_names[fi]);
 	db_printf("\n");
 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
 	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
 	db_printf(" host.domainname = %s\n", pr->pr_domainname);
 	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
 #ifdef INET
 	/* Only the first address line carries the field label. */
 	db_printf(" ip4s            = %d\n", pr->pr_ip4s);
 	for (ii = 0; ii < pr->pr_ip4s; ii++)
 		db_printf(" %s %s\n",
 		    ii == 0 ? "ip4.addr        =" : "                 ",
 		    inet_ntoa_r(pr->pr_ip4[ii], ip4buf));
 #endif
 #ifdef INET6
 	db_printf(" ip6s            = %d\n", pr->pr_ip6s);
 	for (ii = 0; ii < pr->pr_ip6s; ii++)
 		db_printf(" %s %s\n",
 		    ii == 0 ? "ip6.addr        =" : "                 ",
 		    ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
 #endif
 }
 
 /*
  * DDB command that dumps prisons with db_show_prison().  Without an
  * address it shows prison0 and then every prison on the allprison list,
  * stopping once db_pager_quit is set.  With an address, 0 selects prison0;
  * any other value is first tried as a jail ID and, if no prison has that
  * ID, used as a pointer to a struct prison.
  */
 DB_SHOW_COMMAND(prison, db_show_prison_command)
 {
 	struct prison *match;
 
 	if (have_addr) {
 		if (addr == 0) {
 			db_show_prison(&prison0);
 			return;
 		}
 		/* Look for a prison with the ID and with references. */
 		TAILQ_FOREACH(match, &allprison, pr_list) {
 			if (match->pr_id == addr && match->pr_ref > 0)
 				break;
 		}
 		if (match == NULL) {
 			/* Look again, without requiring a reference. */
 			TAILQ_FOREACH(match, &allprison, pr_list) {
 				if (match->pr_id == addr)
 					break;
 			}
 		}
 		/* Assume address points to a valid prison. */
 		if (match == NULL)
 			match = (struct prison *)addr;
 		db_show_prison(match);
 		return;
 	}
 
 	/*
 	 * Show all prisons in the list, and prison0 which is not
 	 * listed.
 	 */
 	db_show_prison(&prison0);
 	if (db_pager_quit)
 		return;
 	TAILQ_FOREACH(match, &allprison, pr_list) {
 		db_show_prison(match);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 #endif /* DDB */
diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h
index b9ad9fe363e9..5bbc5a3e6fa8 100644
--- a/sys/sys/filedesc.h
+++ b/sys/sys/filedesc.h
@@ -1,246 +1,247 @@
 /*-
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)filedesc.h	8.1 (Berkeley) 6/2/93
  * $FreeBSD$
  */
 
 #ifndef _SYS_FILEDESC_H_
 #define	_SYS_FILEDESC_H_
 
 #include <sys/caprights.h>
 #include <sys/queue.h>
 #include <sys/event.h>
 #include <sys/lock.h>
 #include <sys/priority.h>
 #include <sys/seq.h>
 #include <sys/sx.h>
 
 #include <machine/_limits.h>
 
 /*
  * Capability state kept for one descriptor: its rights plus the lists of
  * ioctls and fcntls it is allowed to use.
  */
 struct filecaps {
 	cap_rights_t	 fc_rights;	/* per-descriptor capability rights */
 	u_long		*fc_ioctls;	/* per-descriptor allowed ioctls */
 	int16_t		 fc_nioctls;	/* fc_ioctls array size */
 	uint32_t	 fc_fcntls;	/* per-descriptor allowed fcntls */
 };
 
 /*
  * One entry of the open file table: the file pointer together with the
  * capabilities, flags and sequence counter of the descriptor.
  */
 struct filedescent {
 	struct file	*fde_file;	/* file structure for open file */
 	struct filecaps	 fde_caps;	/* per-descriptor rights */
 	uint8_t		 fde_flags;	/* per-process open file flags */
 	seq_t		 fde_seq;	/* keep file and caps in sync */
 };
 /* Shorthand for the members of the embedded fde_caps. */
 #define	fde_rights	fde_caps.fc_rights
 #define	fde_fcntls	fde_caps.fc_fcntls
 #define	fde_ioctls	fde_caps.fc_ioctls
 #define	fde_nioctls	fde_caps.fc_nioctls
 /* Offset of fde_seq, i.e. the size of the members laid out before it. */
 #define	fde_change_size	(offsetof(struct filedescent, fde_seq))
 
 /*
  * Open file table: a count followed by the entries themselves.  fdt_ofiles
  * is declared with zero length, so the entries are expected to occupy
  * storage directly after this header (the allocation is not shown here).
  */
 struct fdescenttbl {
 	int	fdt_nfiles;		/* number of open files allocated */
 	struct	filedescent fdt_ofiles[0];	/* open files */
 };
 /* Pointer to the fde_seq of descriptor "fd" in table "fdt". */
 #define	fd_seq(fdt, fd)	(&(fdt)->fdt_ofiles[(fd)].fde_seq)
 
 /*
  * This structure is used for the management of descriptors.  It may be
  * shared by multiple processes.
  */
 /* Element type of the fd_map bitmap in struct filedesc. */
 #define NDSLOTTYPE	u_long

 struct filedesc {
 	struct	fdescenttbl *fd_files;	/* open files table */
 	struct	vnode *fd_cdir;		/* current directory */
 	struct	vnode *fd_rdir;		/* root directory */
 	struct	vnode *fd_jdir;		/* jail root directory */
 	NDSLOTTYPE *fd_map;		/* bitmap of free fds */
 	int	fd_lastfile;		/* high-water mark of fd_ofiles */
 	int	fd_freefile;		/* approx. next free file */
 	u_short	fd_cmask;		/* mask for file creation */
 	int	fd_refcnt;		/* thread reference count */
 	int	fd_holdcnt;		/* hold count on structure + mutex */
 	struct	sx fd_sx;		/* protects members of this struct */
 	struct	kqlist fd_kqlist;	/* list of kqueues on this filedesc */
 	int	fd_holdleaderscount;	/* block fdfree() for shared close() */
 	int	fd_holdleaderswakeup;	/* fdfree() needs wakeup */
 };
 
 /*
  * Structure to keep track of (process leader, struct filedesc) tuples.
  * Each process has a pointer to such a structure when detailed tracking
  * is needed, e.g., when rfork(RFPROC | RFMEM) causes a file descriptor
  * table to be shared by processes having different "p_leader" pointers
  * and thus distinct POSIX style locks.
  *
  * fdl_refcount and fdl_holdcount are protected by struct filedesc mtx.
  * NOTE(review): the only lock visible in struct filedesc is the sx lock
  * fd_sx, so "mtx" above presumably refers to it -- confirm.
  */
 struct filedesc_to_leader {
 	int		fdl_refcount;	/* references from struct proc */
 	int		fdl_holdcount;	/* temporary hold during closef */
 	int		fdl_wakeup;	/* fdfree() waits on closef() */
 	struct proc	*fdl_leader;	/* owner of POSIX locks */
 	/* Circular list: */
 	struct filedesc_to_leader *fdl_prev;
 	struct filedesc_to_leader *fdl_next;
 };
 /*
  * Shorthand for reaching the table members through the fd_files pointer
  * of struct filedesc.
  */
 #define	fd_nfiles	fd_files->fdt_nfiles
 #define	fd_ofiles	fd_files->fdt_ofiles
 
 /*
  * Per-process open flags (cf. the fde_flags member of struct filedescent,
  * which is described as holding the per-process open file flags).
  */
 #define	UF_EXCLOSE	0x01		/* auto-close on exec */
 
 #ifdef _KERNEL
 
 /*
  * Lock a file descriptor table.  Every macro below operates on the fd_sx
  * lock embedded in the struct filedesc passed as "fdp".
  */
 #define	FILEDESC_LOCK_INIT(fdp)	sx_init(&(fdp)->fd_sx, "filedesc structure")
 #define	FILEDESC_LOCK_DESTROY(fdp)	sx_destroy(&(fdp)->fd_sx)
 /* Evaluates to a pointer to the lock itself; it does not acquire it. */
 #define	FILEDESC_LOCK(fdp)	(&(fdp)->fd_sx)
 /* Exclusive (X) and shared (S) acquire and release. */
 #define	FILEDESC_XLOCK(fdp)	sx_xlock(&(fdp)->fd_sx)
 #define	FILEDESC_XUNLOCK(fdp)	sx_xunlock(&(fdp)->fd_sx)
 #define	FILEDESC_SLOCK(fdp)	sx_slock(&(fdp)->fd_sx)
 #define	FILEDESC_SUNLOCK(fdp)	sx_sunlock(&(fdp)->fd_sx)

 /*
  * Lock-state assertions, in order: locked (SX_LOCKED), exclusively locked
  * (SX_XLOCKED), and unlocked (SX_UNLOCKED).  The two "locked" forms also
  * pass SX_NOTRECURSED.
  */
 #define	FILEDESC_LOCK_ASSERT(fdp)	sx_assert(&(fdp)->fd_sx, SX_LOCKED | \
 					    SX_NOTRECURSED)
 #define	FILEDESC_XLOCK_ASSERT(fdp)	sx_assert(&(fdp)->fd_sx, SX_XLOCKED | \
 					    SX_NOTRECURSED)
 #define	FILEDESC_UNLOCK_ASSERT(fdp)	sx_assert(&(fdp)->fd_sx, SX_UNLOCKED)
 
 /* Operation types for kern_dup(). */
 enum {
 	FDDUP_NORMAL,		/* dup() behavior. */
 	FDDUP_FCNTL,		/* fcntl()-style errors. */
 	FDDUP_FIXED,		/* Force fixed allocation. */
 	FDDUP_MUSTREPLACE,	/* Target must exist. */
 	FDDUP_LASTMODE,		/* Presumably an end marker; confirm. */
 };

 /* Flags for kern_dup(). */
 #define	FDDUP_FLAG_CLOEXEC	0x1	/* Atomically set UF_EXCLOSE. */
 
 /*
  * For backward compatibility: falloc() forwards its four arguments to
  * falloc_caps() and passes NULL for the fcaps argument.
  */
 #define	falloc(td, resultfp, resultfd, flags) \
 	falloc_caps(td, resultfp, resultfd, flags, NULL)
 
 /* Forward declaration; the prototypes below use struct thread by pointer. */
 struct thread;

 /* Operations on struct filecaps. */
 void	filecaps_init(struct filecaps *fcaps);
 bool	filecaps_copy(const struct filecaps *src, struct filecaps *dst,
 	    bool locked);
 void	filecaps_move(struct filecaps *src, struct filecaps *dst);
 void	filecaps_free(struct filecaps *fcaps);

 /*
  * Descriptor and descriptor-table routines.  Only the prototypes appear in
  * this header.
  */
 int	closef(struct file *fp, struct thread *td);
 int	dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
 	    int openerror, int *indxp);
 int	falloc_caps(struct thread *td, struct file **resultfp, int *resultfd,
 	    int flags, struct filecaps *fcaps);
 int	falloc_noinstall(struct thread *td, struct file **resultfp);
 void	_finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
 	    struct filecaps *fcaps);
 int	finstall(struct thread *td, struct file *fp, int *resultfd, int flags,
 	    struct filecaps *fcaps);
 int	fdalloc(struct thread *td, int minfd, int *result);
 int	fdallocn(struct thread *td, int minfd, int *fds, int n);
 int	fdcheckstd(struct thread *td);
 void	fdclose(struct thread *td, struct file *fp, int idx);
 void	fdcloseexec(struct thread *td);
 void	fdsetugidsafety(struct thread *td);
 struct	filedesc *fdcopy(struct filedesc *fdp);
 int	fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
 	    struct filedesc **newfdp);
 void	fdinstall_remapped(struct thread *td, struct filedesc *fdp);
 void	fdunshare(struct thread *td);
 void	fdescfree(struct thread *td);
 void	fdescfree_remapped(struct filedesc *fdp);
 struct	filedesc *fdinit(struct filedesc *fdp, bool prepfiles);
 struct	filedesc *fdshare(struct filedesc *fdp);
 struct filedesc_to_leader *
 	filedesc_to_leader_alloc(struct filedesc_to_leader *old,
 	    struct filedesc *fdp, struct proc *leader);
 int	getvnode(struct thread *td, int fd, cap_rights_t *rightsp,
 	    struct file **fpp);
 void	mountcheckdirs(struct vnode *olddp, struct vnode *newdp);

 int	fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 	    struct file **fpp, struct filecaps *havecapsp);
 int	fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
 	    struct file **fpp, struct filecaps *havecapsp);

 /* Return a referenced file from an unlocked descriptor. */
 int	fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 	    struct file **fpp, seq_t *seqp);
 
 /*
  * Fetch the file pointer stored for descriptor fd.  The caller must hold
  * the FILEDESC_{S,X}LOCK; no reference is taken on the returned file.
  * Yields NULL when fd lies outside 0..fd_lastfile.
  */
 static __inline struct file *
 fget_locked(struct filedesc *fdp, int fd)
 {

 	FILEDESC_LOCK_ASSERT(fdp);

 	if (fd >= 0 && fd <= fdp->fd_lastfile)
 		return (fdp->fd_ofiles[fd].fde_file);

 	return (NULL);
 }
 
 /*
  * Fetch the table entry for descriptor fd, with the filedesc lock held.
  * Yields NULL when fd lies outside 0..fd_lastfile or when the entry has
  * no file attached.
  */
 static __inline struct filedescent *
 fdeget_locked(struct filedesc *fdp, int fd)
 {
 	struct filedescent *entry;

 	FILEDESC_LOCK_ASSERT(fdp);

 	if (fd >= 0 && fd <= fdp->fd_lastfile) {
 		entry = &fdp->fd_ofiles[fd];
 		if (entry->fde_file != NULL)
 			return (entry);
 	}

 	return (NULL);
 }
 
 /*
  * Report whether the sequence counter of descriptor fd is no longer
  * consistent with the previously sampled value seq.
  */
 static __inline bool
 fd_modified(struct filedesc *fdp, int fd, seq_t seq)
 {
 	seq_t *curseq;

 	curseq = fd_seq(fdp->fd_files, fd);
 	return (!seq_consistent(curseq, seq));
 }
 
 /*
  * cdir/rdir/jdir manipulation functions (cf. the fd_cdir, fd_rdir and
  * fd_jdir members of struct filedesc: current, root and jail root
  * directory).
  */
 void	pwd_chdir(struct thread *td, struct vnode *vp);
 int	pwd_chroot(struct thread *td, struct vnode *vp);
+int	pwd_chroot_chdir(struct thread *td, struct vnode *vp);
 void	pwd_ensure_dirs(void);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_FILEDESC_H_ */