diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 14e6e735f8e7..7c9a15ae18f3 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -1,5162 +1,5172 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 1999 Poul-Henning Kamp.
  * Copyright (c) 2008 Bjoern A. Zeeb.
  * Copyright (c) 2009 James Gritton.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_nfs.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/osd.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/epoch.h>
 #include <sys/taskqueue.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/racct.h>
 #include <sys/rctl.h>
 #include <sys/refcount.h>
 #include <sys/sx.h>
 #include <sys/sysent.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/uuid.h>
 #include <sys/vnode.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif /* DDB */
 
 #include <security/mac/mac_framework.h>
 
 #define	PRISON0_HOSTUUID_MODULE	"hostuuid"
 
 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
 
 /* Keep struct prison prison0 and some code in kern_jail_set() readable. */
 #ifdef INET
 #ifdef INET6
 #define	_PR_IP_SADDRSEL	PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
 #else
 #define	_PR_IP_SADDRSEL	PR_IP4_SADDRSEL
 #endif
 #else /* !INET */
 #ifdef INET6
 #define	_PR_IP_SADDRSEL	PR_IP6_SADDRSEL
 #else
 #define	_PR_IP_SADDRSEL	0
 #endif
 #endif
 
 /*
  * prison0 describes what is "real" about the system.  It is the statically
  * allocated prison with ID 0, name "0" and path "/".  Fields that cannot be
  * given constant initializers are filled in later by prison0_init().
  */
 struct prison prison0 = {
 	.pr_id		= 0,
 	.pr_name	= "0",
 	.pr_ref		= 1,
 	.pr_uref	= 1,
 	.pr_path	= "/",
 	.pr_securelevel	= -1,
 	.pr_devfs_rsnum = 0,
 	.pr_state	= PRISON_STATE_ALIVE,
 	.pr_childmax	= JAIL_MAX,
 	/* May be replaced by prison0_init() from preloaded data. */
 	.pr_hostuuid	= DEFAULT_HOSTUUID,
 	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
 #ifdef VIMAGE
 	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
 #else
 	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
 #endif
 	.pr_allow	= PR_ALLOW_PRISON0,
 };
 /* Every permission granted to prison0 must be one of the static bits. */
 _Static_assert((PR_ALLOW_PRISON0 & ~PR_ALLOW_ALL_STATIC) == 0,
     "Bits enabled in PR_ALLOW_PRISON0 that are not statically reserved");
 
 /* prison0's pr_mtx is set up through MTX_SYSINIT rather than at run time. */
 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
 
 /*
  * Ties a pair of boolean jail parameter names to a flag value: a request
  * naming "name" asks for the flag to be on, one naming "noname" for it to
  * be off.
  */
 struct bool_flags {
 	const char	*name;		/* parameter that turns the flag on */
 	const char	*noname;	/* parameter that turns the flag off */
 	/*
 	 * Flag bit(s).  The pr_flag_allow walkers in this file read it with
 	 * atomic_load_int() and stop at the first entry whose flag is zero.
 	 */
 	volatile u_int	 flag;
 };
 /*
  * Describes a "jailsys" parameter, whose integer value is one of
  * JAIL_SYS_DISABLE, JAIL_SYS_NEW or JAIL_SYS_INHERIT.
  */
 struct jailsys_flags {
 	const char	*name;		/* parameter name */
 	/*
 	 * Flags set for JAIL_SYS_DISABLE; kern_jail_set() rejects that value
 	 * with EINVAL when this is zero.
 	 */
 	unsigned	 disable;
 	unsigned	 new;		/* flags set for JAIL_SYS_NEW */
 };
 
 /*
  * Handle jail teardown in a dedicated thread to avoid deadlocks from
  * vnet_destroy().
  */
 TASKQUEUE_DEFINE_THREAD(jail_remove);
 
 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
 struct	sx allprison_lock;
 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
 struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
 LIST_HEAD(, prison_racct) allprison_racct;
 int	lastprid = 0;
 int	lastdeadid = 0;
 
 static int get_next_prid(struct prison **insprp);
 static int get_next_deadid(struct prison **insprp);
 static int do_jail_attach(struct thread *td, struct prison *pr, int drflags);
 static void prison_complete(void *context, int pending);
 static void prison_deref(struct prison *pr, int flags);
 static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
 static int prison_lock_xlock(struct prison *pr, int flags);
 static void prison_cleanup(struct prison *pr);
 static void prison_free_not_last(struct prison *pr);
 static void prison_proc_free_not_last(struct prison *pr);
 static void prison_proc_relink(struct prison *opr, struct prison *npr,
     struct proc *p);
 static void prison_set_allow_locked(struct prison *pr, unsigned flag,
     int enable);
 static char *prison_path(struct prison *pr1, struct prison *pr2);
 #ifdef RACCT
 static void prison_racct_attach(struct prison *pr);
 static void prison_racct_modify(struct prison *pr);
 static void prison_racct_detach(struct prison *pr);
 #endif
 
 /* Flags for prison_deref */
 #define	PD_DEREF	0x01	/* Decrement pr_ref */
 #define	PD_DEUREF	0x02	/* Decrement pr_uref */
 #define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
 #define	PD_LOCKED	0x10	/* pr_mtx is held */
 #define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
 #define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */
 #define PD_OP_FLAGS	0x07	/* Operation flags */
 #define PD_LOCK_FLAGS	0x70	/* Lock status flags */
 
 /*
  * Parameter names corresponding to PR_* flag values.  Size values are for kvm
  * as we cannot figure out the size of a sparse array, or an array without a
  * terminating entry.
  *
  * pr_flag_bool holds the plain on/off parameters; the address-family
  * entries are only present when the matching INET/INET6 option is compiled
  * in.
  */
 static struct bool_flags pr_flag_bool[] = {
 	{"persist", "nopersist", PR_PERSIST},
 #ifdef INET
 	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
 #endif
 #ifdef INET6
 	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
 #endif
 };
 /* Size of the table in bytes (not the number of entries). */
 const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
 
 /*
  * Jailsys parameters and the PR_* flags they map to.  "host" and "vnet" have
  * no disable flag, so JAIL_SYS_DISABLE is not accepted for them.
  */
 static struct jailsys_flags pr_flag_jailsys[] = {
 	{"host", 0, PR_HOST},
 #ifdef VIMAGE
 	{"vnet", 0, PR_VNET},
 #endif
 #ifdef INET
 	{"ip4", PR_IP4_USER, PR_IP4_USER},
 #endif
 #ifdef INET6
 	{"ip6", PR_IP6_USER, PR_IP6_USER},
 #endif
 };
 /* Size of the table in bytes (not the number of entries). */
 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
 
 /*
  * Make this array full-size so dynamic parameters can be added.
  * It is protected by prison0.pr_mtx, but lockless reading is allowed
  * with an atomic check of the flag values.
  *
  * Readers walk the array from the start and stop at the first entry whose
  * flag is zero, so the used entries must stay contiguous at the front.
  */
 static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
 	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
 	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
 	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
 	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
 	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
 	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
 	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
 	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
 	{"allow.reserved_ports", "allow.noreserved_ports",
 	 PR_ALLOW_RESERVED_PORTS},
 	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
 	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
 	 PR_ALLOW_UNPRIV_DEBUG},
 	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
 #ifdef VIMAGE
 	{"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
 #endif
 	{"allow.extattr", "allow.noextattr", PR_ALLOW_EXTATTR},
 	{"allow.adjtime", "allow.noadjtime", PR_ALLOW_ADJTIME},
 	{"allow.settime", "allow.nosettime", PR_ALLOW_SETTIME},
 	{"allow.routing", "allow.norouting", PR_ALLOW_ROUTING},
+	{"allow.unprivileged_parent_tampering",
+	    "allow.nounprivileged_parent_tampering",
+	    PR_ALLOW_UNPRIV_PARENT_TAMPER},
 };
 /* Starts out as the statically reserved permission bits. */
 static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
 /* Size of the whole (full-size) table in bytes. */
 const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
 
 #define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
 					 PR_ALLOW_RESERVED_PORTS | \
 					 PR_ALLOW_UNPRIV_DEBUG | \
 					 PR_ALLOW_SUSER)
 #define	JAIL_DEFAULT_ENFORCE_STATFS	2
 #define	JAIL_DEFAULT_DEVFS_RSNUM	0
 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
 #if defined(INET) || defined(INET6)
 static unsigned jail_max_af_ips = 255;
 #endif
 
 /*
  * Initialize the parts of prison0 that can't be static-initialized with
  * constants.  This is called from proc0_init() after creating thread0 cpuset.
  *
  * Sets prison0's cpuset, osreldate and osrelease, and replaces the default
  * host UUID with a preloaded one when such data exists and validates.
  */
 void
 prison0_init(void)
 {
 	uint8_t *file, *data;
 	size_t size;
 	char buf[sizeof(prison0.pr_hostuuid)];
 	bool valid;
 
 	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
 	prison0.pr_osreldate = osreldate;
 	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
 
 	/* If we have a preloaded hostuuid, use it. */
 	file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
 	if (file != NULL) {
 		data = preload_fetch_addr(file);
 		size = preload_fetch_size(file);
 		if (data != NULL) {
 			/*
 			 * The preloaded data may include trailing whitespace, almost
 			 * certainly a newline; skip over any whitespace or
 			 * non-printable characters to be safe.
 			 */
 			while (size > 0 && data[size - 1] <= 0x20) {
 				size--;
 			}
 
 			valid = false;
 
 			/*
 			 * Not NUL-terminated when passed from loader, but
 			 * validate_uuid requires that due to using sscanf (as
 			 * does the subsequent strlcpy, since it still reads
 			 * past the given size to return the true length);
 			 * bounce to a temporary buffer to fix.
 			 */
 			/* Too long to fit in buf along with its terminator. */
 			if (size >= sizeof(buf))
 				goto done;
 
 			memcpy(buf, data, size);
 			buf[size] = '\0';
 
 			if (validate_uuid(buf, size, NULL, 0) != 0)
 				goto done;
 
 			valid = true;
 			(void)strlcpy(prison0.pr_hostuuid, buf,
 			    sizeof(prison0.pr_hostuuid));
 
 done:
 			/* On rejection the default host UUID stays in place. */
 			if (bootverbose && !valid) {
 				printf("hostuuid: preload data malformed: '%.*s'\n",
 				    (int)size, data);
 			}
 		}
 	}
 	if (bootverbose)
 		printf("hostuuid: using %s\n", prison0.pr_hostuuid);
 }
 
 /*
  * jail(2) entry point.
  *
  * struct jail_args {
  *	struct jail *jail;
  * };
  *
  * Reads the leading 32-bit version word of the user structure, converts the
  * structure to the current struct jail layout and hands it to kern_jail().
  * Returns a copyin() error, EINVAL for an unsupported version, or whatever
  * kern_jail() returns.
  */
 int
 sys_jail(struct thread *td, struct jail_args *uap)
 {
 	struct jail j;
 	struct jail_v0 j0;
 	uint32_t version;
 	int error;
 
 	error = copyin(uap->jail, &version, sizeof(uint32_t));
 	if (error)
 		return (error);
 
 	if (version == 0) {
 		/* FreeBSD single IPv4 jails. */
 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
 		if (error)
 			return (error);
 		bzero(&j, sizeof(struct jail));
 		j.version = j0.version;
 		j.path = j0.path;
 		j.hostname = j0.hostname;
 		/* jail_v0 carries the address in host order. */
 		j.ip4s = htonl(j0.ip_number);
 	} else if (version == 2) {
 		/* JAIL_API_VERSION: FreeBSD multi-IPv4/IPv6,noIP jails. */
 		error = copyin(uap->jail, &j, sizeof(struct jail));
 		if (error)
 			return (error);
 	} else {
 		/*
 		 * Version 1 was used by multi-IPv4 jail implementations
 		 * that never made it into the official kernel, and
 		 * Sci-Fi jails are not supported either, sorry.
 		 */
 		return (EINVAL);
 	}
 
 	return (kern_jail(td, &j));
 }
 
 /*
  * Create a jail from a legacy struct jail and attach the calling thread to
  * it.  The structure is translated into the name/value iovec pairs that
  * kern_jail_set() takes, which is then called with JAIL_CREATE | JAIL_ATTACH.
  *
  * td is the calling thread; j is a kernel copy of the structure whose
  * path, hostname, jailname, ip4 and ip6 members are still user pointers.
  * Returns 0 or an errno value (EINVAL for too many addresses, or for
  * addresses of a family that is not compiled in; copyin/copyinstr errors;
  * otherwise the result of kern_jail_set()).
  */
 int
 kern_jail(struct thread *td, struct jail *j)
 {
 	/*
 	 * Two iovecs per parameter: every allow flag, plus enforce_statfs,
 	 * path, host.hostname and name, plus one per compiled-in address
 	 * family.
 	 */
 	struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
 #ifdef INET
 			    + 1
 #endif
 #ifdef INET6
 			    + 1
 #endif
 			    )];
 	struct uio opt;
 	char *u_path, *u_hostname, *u_name;
 	struct bool_flags *bf;
 #ifdef INET
 	uint32_t ip4s;
 	struct in_addr *u_ip4;
 #endif
 #ifdef INET6
 	struct in6_addr *u_ip6;
 #endif
 	size_t tmplen;
 	int error, enforce_statfs;
 
 	bzero(&optiov, sizeof(optiov));
 	opt.uio_iov = optiov;
 	opt.uio_iovcnt = 0;
 	opt.uio_offset = -1;
 	opt.uio_resid = -1;
 	opt.uio_segflg = UIO_SYSSPACE;
 	opt.uio_rw = UIO_READ;
 	opt.uio_td = td;
 
 	/* Set permissions for top-level jails from sysctls. */
 	if (!jailed(td->td_ucred)) {
 		/*
 		 * Each allow flag is passed as its "allow.x" or "allow.nox"
 		 * name followed by a value slot left zeroed by the bzero()
 		 * above.
 		 */
 		for (bf = pr_flag_allow;
 		     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 			atomic_load_int(&bf->flag) != 0;
 		     bf++) {
 			optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
 			    (jail_default_allow & bf->flag)
 			    ? bf->name : bf->noname);
 			optiov[opt.uio_iovcnt].iov_len =
 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
 			opt.uio_iovcnt += 2;
 		}
 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
 		opt.uio_iovcnt++;
 		enforce_statfs = jail_default_enforce_statfs;
 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
 		opt.uio_iovcnt++;
 	}
 
 	/*
 	 * A single temporary allocation holds, in order, the path, the
 	 * hostname, the jail name and then the address arrays.
 	 */
 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
 #ifdef INET
 	/* For version 0, ip4s holds the single address, not a count. */
 	ip4s = (j->version == 0) ? 1 : j->ip4s;
 	if (ip4s > jail_max_af_ips)
 		return (EINVAL);
 	tmplen += ip4s * sizeof(struct in_addr);
 #else
 	if (j->ip4s > 0)
 		return (EINVAL);
 #endif
 #ifdef INET6
 	if (j->ip6s > jail_max_af_ips)
 		return (EINVAL);
 	tmplen += j->ip6s * sizeof(struct in6_addr);
 #else
 	if (j->ip6s > 0)
 		return (EINVAL);
 #endif
 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
 	u_hostname = u_path + MAXPATHLEN;
 	u_name = u_hostname + MAXHOSTNAMELEN;
 #ifdef INET
 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
 #endif
 #ifdef INET6
 #ifdef INET
 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
 #else
 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
 #endif
 #endif
 	/* From here on every error path must free u_path. */
 	optiov[opt.uio_iovcnt].iov_base = "path";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_path;
 	/* copyinstr() stores the copied length straight into iov_len. */
 	error = copyinstr(j->path, u_path, MAXPATHLEN,
 	    &optiov[opt.uio_iovcnt].iov_len);
 	if (error) {
 		free(u_path, M_TEMP);
 		return (error);
 	}
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
 	    &optiov[opt.uio_iovcnt].iov_len);
 	if (error) {
 		free(u_path, M_TEMP);
 		return (error);
 	}
 	opt.uio_iovcnt++;
 	/* The jail name is optional. */
 	if (j->jailname != NULL) {
 		optiov[opt.uio_iovcnt].iov_base = "name";
 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
 		opt.uio_iovcnt++;
 		optiov[opt.uio_iovcnt].iov_base = u_name;
 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
 		    &optiov[opt.uio_iovcnt].iov_len);
 		if (error) {
 			free(u_path, M_TEMP);
 			return (error);
 		}
 		opt.uio_iovcnt++;
 	}
 #ifdef INET
 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
 	if (j->version == 0)
 		u_ip4->s_addr = j->ip4s;
 	else {
 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
 		if (error) {
 			free(u_path, M_TEMP);
 			return (error);
 		}
 	}
 	opt.uio_iovcnt++;
 #endif
 #ifdef INET6
 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
 	opt.uio_iovcnt++;
 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
 	if (error) {
 		free(u_path, M_TEMP);
 		return (error);
 	}
 	opt.uio_iovcnt++;
 #endif
 	KASSERT(opt.uio_iovcnt <= nitems(optiov),
 		("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
 	free(u_path, M_TEMP);
 	return (error);
 }
 
 /*
  * jail_set(2) entry point.
  *
  * struct jail_set_args {
  *	struct iovec *iovp;
  *	unsigned int iovcnt;
  *	int flags;
  * };
  *
  * Copies the user iovec array in and passes it to kern_jail_set().  Returns
  * EINVAL for an odd iovec count, a copyinuio() error, or the result of
  * kern_jail_set().
  */
 int
 sys_jail_set(struct thread *td, struct jail_set_args *uap)
 {
 	struct uio *auio;
 	int error;
 
 	/* Parameters come as name/value pairs, so the count must be even. */
 	if ((uap->iovcnt & 1) != 0)
 		return (EINVAL);
 
 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 	if (error == 0) {
 		error = kern_jail_set(td, auio, uap->flags);
 		freeuio(auio);
 	}
 	return (error);
 }
 
 #if defined(INET) || defined(INET6)
 /* qsort()-style comparison of two addresses of one family. */
 typedef int prison_addr_cmp_t(const void *, const void *);
 /* Returns whether a single address is acceptable for a jail. */
 typedef bool prison_addr_valid_t(const void *);
 /*
  * Per-address-family operations and constants, indexed by pr_family_t, so
  * the address list code below can be written once for IPv4 and IPv6.
  */
 static const struct pr_family {
 	size_t			size;		/* size of one address */
 	prison_addr_cmp_t	*cmp;		/* ordering/equality test */
 	prison_addr_valid_t	*valid;		/* per-address sanity check */
 	int			ip_flag;	/* PR_IP*_USER flag of family */
 } pr_families[PR_FAMILY_MAX] = {
 #ifdef INET
 	[PR_INET] = {
 		.size = sizeof(struct in_addr),
 		.cmp = prison_qcmp_v4,
 		.valid = prison_valid_v4,
 		.ip_flag = PR_IP4_USER,
 	 },
 #endif
 #ifdef INET6
 	[PR_INET6] = {
 		.size = sizeof(struct in6_addr),
 		.cmp = prison_qcmp_v6,
 		.valid = prison_valid_v6,
 		.ip_flag = PR_IP6_USER,
 	},
 #endif
 };
 
 /*
  * Network address lists (pr_addrs) allocation for jails.  The addresses
  * are accessed locklessly by the network stack, thus need to be protected by
  * the network epoch.
  *
  * A list is a header followed by "ips" addresses of a single family stored
  * back to back; individual entries are reached through PR_IP().
  */
 struct prison_ip {
 	/* Must stay the first member; see prison_ip_free_deferred(). */
 	struct epoch_context ctx;
 	uint32_t	ips;		/* number of addresses that follow */
 #ifdef FUTURE_C
 	/*
 	 * XXX Variable-length automatic arrays in union may be
 	 * supported in future C.
 	 */
 	union {
 		char pr_ip[];
 		struct in_addr pr_ip4[];
 		struct in6_addr pr_ip6[];
 	};
 #else /* No future C :( */
 	char pr_ip[];			/* raw storage for the addresses */
 #endif
 };
 
 static char *
 PR_IP(struct prison_ip *pip, const pr_family_t af, int idx)
 {
 	MPASS(pip);
 	MPASS(af < PR_FAMILY_MAX);
 	MPASS(idx >= 0 && idx < pip->ips);
 
 	return (pip->pr_ip + pr_families[af].size * idx);
 }
 
 /*
  * Allocate an address list with room for cnt addresses of family af.
  * "flags" are passed to malloc(); the result may be NULL if the allocation
  * fails.  Only the count is initialized, the addresses are not.
  */
 static struct prison_ip *
 prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
 {
 	struct prison_ip *list;
 	size_t bytes;
 
 	bytes = sizeof(struct prison_ip) + cnt * pr_families[af].size;
 	list = malloc(bytes, M_PRISON, flags);
 	if (list == NULL)
 		return (NULL);
 	list->ips = cnt;
 	return (list);
 }
 
 /*
  * Build an address list from cnt addresses found at op, sorting and
  * validating them.  Returns the new list, or NULL (with nothing left
  * allocated) if an address is invalid or appears twice.
  * kern_jail_set() helper.
  */
 static struct prison_ip *
 prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
 {
 	const struct pr_family *const fam = &pr_families[af];
 	struct prison_ip *list;
 
 	list = prison_ip_alloc(af, cnt, M_WAITOK);
 	bcopy(op, list->pr_ip, cnt * fam->size);
 
 	/*
 	 * Entry 0 keeps its place: it is the primary address as given from
 	 * userland, used for unbound outgoing connections and for "loopback"
 	 * traffic when source address selection finds nothing better.  All
 	 * the entries after it are sorted.
 	 */
 	if (cnt > 1)
 		qsort(PR_IP(list, af, 1), cnt - 1, fam->size, fam->cmp);
 
 	/*
 	 * Simple zero and broadcast checks plus duplicate detection.  Other
 	 * bogus addresses are the user's problem.
 	 */
 	for (int i = 0; i < cnt; i++) {
 		if (!fam->valid(PR_IP(list, af, i)))
 			goto bad;
 		if (i + 1 >= cnt)
 			continue;
 		/* The next entry may repeat the primary or its neighbour. */
 		if (fam->cmp(PR_IP(list, af, 0), PR_IP(list, af, i + 1)) == 0)
 			goto bad;
 		if (fam->cmp(PR_IP(list, af, i), PR_IP(list, af, i + 1)) == 0)
 			goto bad;
 	}
 
 	return (list);
 
 bad:
 	free(list, M_PRISON);
 	return (NULL);
 }
 
 /*
  * Give pr its own copy of the parent prison's (ppr) address list for family
  * af.  Does nothing when the parent has no list.
  * kern_jail_set() helper.
  */
 static void
 prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
 {
 	const struct prison_ip *src = ppr->pr_addrs[af];
 	struct prison_ip *copy;
 
 	if (src == NULL)
 		return;
 
 	copy = prison_ip_alloc(af, src->ips, M_WAITOK);
 	bcopy(src->pr_ip, copy->pr_ip, copy->ips * pr_families[af].size);
 	pr->pr_addrs[af] = copy;
 }
 
 /*
  * Make sure the new set of IP addresses is a subset of the parent's list.
  * Don't worry about the parent being unlocked, as any setting is done with
  * allprison_lock held.
  * kern_jail_set() helper.
  *
  * ppip is the parent's list (may be NULL) and pip the proposed list.
  * Returns true if every address in pip is also in ppip, false otherwise
  * (including when the parent has no list at all).
  */
 static bool
 prison_ip_parent_match(struct prison_ip *ppip, struct prison_ip *pip,
     const pr_family_t af)
 {
 	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
 	int i, j;
 
 	if (ppip == NULL)
 		return (false);
 
 	/* The primary address may sit anywhere in the parent's list. */
 	for (i = 0; i < ppip->ips; i++)
 		if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, i)) == 0)
 			break;
 
 	if (i == ppip->ips)
 		/* Main address not present in parent. */
 		return (false);
 
 	if (pip->ips > 1) {
 		/*
 		 * j is not reset between iterations: the search through the
 		 * parent resumes where the previous match was found.  This
 		 * presumably relies on both lists being sorted past index 0,
 		 * as prison_ip_copyin() arranges.
 		 */
 		for (i = j = 1; i < pip->ips; i++) {
 			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0)
 				/* Equals to parent primary address. */
 				continue;
 			for (; j < ppip->ips; j++)
 				if (cmp(PR_IP(pip, af, i),
 				    PR_IP(ppip, af, j)) == 0)
 					break;
 			/* Ran off the end of the parent's list: no match. */
 			if (j == ppip->ips)
 				break;
 		}
 		if (j == ppip->ips)
 			/* Address not present in parent. */
 			return (false);
 	}
 	return (true);
 }
 
 /*
  * Check for conflicting IP addresses.  We permit them if there is no more
  * than one IP on each jail.  If there is a duplicate on a jail with more
  * than one IP stop checking and return error.
  * kern_jail_set() helper.
  *
  * ppr is the parent of pr, and pip the address list proposed for pr.
  * Returns true when no conflict was found, false on the first conflict.
  */
 static bool
 prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
     struct prison_ip *pip, pr_family_t af)
 {
 	const struct prison *tppr, *tpr;
 	int descend;
 
 	/*
 	 * Pick the prison whose descendants are searched: with VIMAGE the
 	 * nearest of ppr and its ancestors that has PR_VNET set (falling
 	 * back to prison0), otherwise always prison0.
 	 */
 #ifdef VIMAGE
 	for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
 		if (tppr->pr_flags & PR_VNET)
 			break;
 #else
 	tppr = &prison0;
 #endif
 	FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
 		/*
 		 * Skip, along with everything below it, the jail being set
 		 * itself, any jail that is not alive and (with VIMAGE) any
 		 * other jail that has PR_VNET set.
 		 */
 		if (tpr == pr ||
 #ifdef VIMAGE
 		    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 #endif
 		    !prison_isalive(tpr)) {
 			descend = 0;
 			continue;
 		}
 		/* Without the family's ip_flag, look at its children instead. */
 		if (!(tpr->pr_flags & pr_families[af].ip_flag))
 			continue;
 		/* This jail is checked itself; do not go below it. */
 		descend = 0;
 		/* Sharing is tolerated between two single-address jails. */
 		if (tpr->pr_addrs[af] == NULL ||
 		    (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1))
 			continue;
 		for (int i = 0; i < pip->ips; i++)
 			if (prison_ip_check(tpr, af, PR_IP(pip, af, i)) == 0)
 				return (false);
 	}
 
 	return (true);
 }
 
 /*
  * Callback passed to NET_EPOCH_CALL() by prison_ip_free().  The epoch
  * context is the first member of struct prison_ip (asserted here), so the
  * context pointer is also the address of the allocation to release.
  */
 _Static_assert(offsetof(struct prison_ip, ctx) == 0,
     "prison must start with epoch context");
 static void
 prison_ip_free_deferred(epoch_context_t ctx)
 {
 	free(ctx, M_PRISON);
 }
 
 /*
  * Release an address list through NET_EPOCH_CALL() rather than freeing it
  * immediately.  A NULL list is ignored.
  */
 static void
 prison_ip_free(struct prison_ip *pip)
 {
 	if (pip == NULL)
 		return;
 
 	NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
 }
 
 /*
  * Install "new" (which may be NULL) as pr's address list for family af and
  * release the list it replaces.  The prison's mutex must be held.
  */
 static void
 prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
 {
 	struct prison_ip *prev;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 
 	/* Remember the outgoing list before the slot is overwritten. */
 	prev = pr->pr_addrs[af];
 	atomic_store_ptr(&pr->pr_addrs[af], new);
 	prison_ip_free(prev);
 }
 
 /*
  * Restrict a prison's IP address list with its parent's, possibly replacing
  * it.  Return true if succeed, otherwise should redo.
  * kern_jail_set() helper.
  *
  * pr's mutex must be held.  newp optionally points at a preallocated list
  * to use for the result; when it is NULL or *newp is NULL a list is
  * allocated here with M_NOWAIT, and false is returned if that fails.
  * Whenever the preallocated list is consumed, *newp is set to NULL.
  */
 static bool
 prison_ip_restrict(struct prison *pr, const pr_family_t af,
     struct prison_ip **newp)
 {
 	struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
 	struct prison_ip *pip = pr->pr_addrs[af];
 	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
 	const size_t size = pr_families[af].size;
 	struct prison_ip *new = newp != NULL ? *newp : NULL;
 	uint32_t ips;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 
 	/*
 	 * Due to epoch-synchronized access to the IP address lists we always
 	 * allocate a new list even if the old one has enough space.  We could
 	 * atomically update an IPv4 address inside a list, but that would
 	 * screw up sorting, and in case of IPv6 we can't even atomically write
 	 * one.
 	 */
 	if (ppip == NULL) {
 		/* The parent has no addresses, so neither may this prison. */
 		if (pip != NULL)
 			prison_ip_set(pr, af, NULL);
 		return (true);
 	}
 
 	if (!(pr->pr_flags & pr_families[af].ip_flag)) {
 		if (new == NULL) {
 			new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
 			if (new == NULL)
 				return (false); /* Redo */
 		}
 		/* This has no user settings, so just copy the parent's list. */
 		MPASS(new->ips == ppip->ips);
 		bcopy(ppip->pr_ip, new->pr_ip, ppip->ips * size);
 		prison_ip_set(pr, af, new);
 		if (newp != NULL)
 			*newp = NULL; /* Used */
 	} else if (pip != NULL) {
 		/* Remove addresses that aren't in the parent. */
 		int i;
 
 		i = 0; /* index in pip */
 		ips = 0; /* index in new */
 
 		if (new == NULL) {
 			new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
 			if (new == NULL)
 				return (false); /* Redo */
 		}
 
 		/* Keep our primary address only if the parent has it too. */
 		for (int pi = 0; pi < ppip->ips; pi++)
 			if (cmp(PR_IP(pip, af, 0), PR_IP(ppip, af, pi)) == 0) {
 				/* Found our primary address in parent. */
 				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
 				    size);
 				i++;
 				ips++;
 				break;
 			}
 		/*
 		 * Walk the remaining entries of pip (index i) against the
 		 * parent's entries past its primary (index pi), copying the
 		 * ones found in both.  If the primary was not kept above, i
 		 * is still 0 and the primary is walked like any other entry.
 		 */
 		for (int pi = 1; i < pip->ips; ) {
 			/* Check against primary, which is unsorted. */
 			if (cmp(PR_IP(pip, af, i), PR_IP(ppip, af, 0)) == 0) {
 				/* Matches parent's primary address. */
 				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
 				    size);
 				i++;
 				ips++;
 				continue;
 			}
 			/* The rest are sorted. */
 			switch (pi >= ppip->ips ? -1 :
 				cmp(PR_IP(pip, af, i), PR_IP(ppip, af, pi))) {
 			case -1:
 				/* Not in the parent (or parent exhausted). */
 				i++;
 				break;
 			case 0:
 				bcopy(PR_IP(pip, af, i), PR_IP(new, af, ips),
 				    size);
 				i++;
 				pi++;
 				ips++;
 				break;
 			case 1:
 				pi++;
 				break;
 			}
 		}
 		if (ips == 0) {
 			/*
 			 * Nothing survived, so the prison ends up with no
 			 * list.  Release the unused list whether it was
 			 * allocated above or handed in by the caller: *newp
 			 * is cleared below, so a caller's list left alone
 			 * here would have no remaining reference and leak.
 			 */
 			prison_ip_free(new);
 			new = NULL;
 		} else {
 			/* Shrink to real size */
 			KASSERT((new->ips >= ips),
 			    ("Out-of-bounds write to prison_ip %p", new));
 			new->ips = ips;
 		}
 		prison_ip_set(pr, af, new);
 		if (newp != NULL)
 			*newp = NULL; /* Used */
 	}
 	return (true);
 }
 
 /*
  * Fast-path check if an address belongs to a prison.
  */
 int
 prison_ip_check(const struct prison *pr, const pr_family_t af,
     const void *addr)
 {
 	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
 	struct prison_ip *pip;
 	int i, a, z, d;
 
 	MPASS(mtx_owned(&pr->pr_mtx) ||
 	    in_epoch(net_epoch_preempt) ||
 	    sx_xlocked(&allprison_lock));
 
 	pip = atomic_load_ptr(&pr->pr_addrs[af]);
 	if (__predict_false(pip == NULL))
 		return (EAFNOSUPPORT);
 
 	/* Check the primary IP. */
 	if (cmp(PR_IP(pip, af, 0), addr) == 0)
 		return (0);
 
 	/*
 	 * All the other IPs are sorted so we can do a binary search.
 	 */
 	a = 0;
 	z = pip->ips - 2;
 	while (a <= z) {
 		i = (a + z) / 2;
 		d = cmp(PR_IP(pip, af, i + 1), addr);
 		if (d > 0)
 			z = i - 1;
 		else if (d < 0)
 			a = i + 1;
 		else
 			return (0);
 	}
 
 	return (EADDRNOTAVAIL);
 }
 
 /*
  * Return a pointer to the primary (first) address of pr's list for family
  * af.  Historically this required the prison mutex, and that is still
  * asserted, although nothing would prevent supporting epoch-protected
  * access.  Is it used in fast path?  The list must exist.
  * in{6}_jail.c helper
  */
 const void *
 prison_ip_get0(const struct prison *pr, const pr_family_t af)
 {
 	const struct prison_ip *list;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 
 	list = pr->pr_addrs[af];
 	MPASS(list);
 
 	return (list->pr_ip);
 }
 
 u_int
 prison_ip_cnt(const struct prison *pr, const pr_family_t af)
 {
 
 	return (pr->pr_addrs[af]->ips);
 }
 #endif	/* defined(INET) || defined(INET6) */
 
 int
 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 {
 	struct nameidata nd;
 #ifdef INET
 	struct prison_ip *ip4;
 #endif
 #ifdef INET6
 	struct prison_ip *ip6;
 #endif
 	struct vfsopt *opt;
 	struct vfsoptlist *opts;
 	struct prison *pr, *deadpr, *dinspr, *inspr, *mypr, *ppr, *tpr;
 	struct vnode *root;
 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
 	char *g_path, *osrelstr;
 	struct bool_flags *bf;
 	struct jailsys_flags *jsf;
 #if defined(INET) || defined(INET6)
 	void *op;
 #endif
 	unsigned long hid;
 	size_t namelen, onamelen, pnamelen;
 	int created, cuflags, descend, drflags, enforce;
 	int error, errmsg_len, errmsg_pos;
 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
 	int deadid, jid, jsys, len, level;
 	int childmax, osreldt, rsnum, slevel;
 #ifdef INET
 	int ip4s;
 	bool redo_ip4;
 #endif
 #ifdef INET6
 	int ip6s;
 	bool redo_ip6;
 #endif
 	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
 	uint64_t pr_allow_diff;
 	unsigned tallow;
 	char numbuf[12];
 
 	error = priv_check(td, PRIV_JAIL_SET);
 	if (!error && (flags & JAIL_ATTACH))
 		error = priv_check(td, PRIV_JAIL_ATTACH);
 	if (error)
 		return (error);
 	mypr = td->td_ucred->cr_prison;
 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
 		return (EPERM);
 	if (flags & ~JAIL_SET_MASK)
 		return (EINVAL);
 
 	/*
 	 * Check all the parameters before committing to anything.  Not all
 	 * errors can be caught early, but we may as well try.  Also, this
 	 * takes care of some expensive stuff (path lookup) before getting
 	 * the allprison lock.
 	 *
 	 * XXX Jails are not filesystems, and jail parameters are not mount
 	 *     options.  But it makes more sense to re-use the vfsopt code
 	 *     than duplicate it under a different name.
 	 */
 	error = vfs_buildopts(optuio, &opts);
 	if (error)
 		return (error);
 #ifdef INET
 	ip4 = NULL;
 #endif
 #ifdef INET6
 	ip6 = NULL;
 #endif
 	g_path = NULL;
 
 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
 	if (!cuflags) {
 		error = EINVAL;
 		vfs_opterror(opts, "no valid operation (create or update)");
 		goto done_errmsg;
 	}
 
 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
 	if (error == ENOENT)
 		jid = 0;
 	else if (error != 0)
 		goto done_free;
 
 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
 	if (error == ENOENT)
 		gotslevel = 0;
 	else if (error != 0)
 		goto done_free;
 	else
 		gotslevel = 1;
 
 	error =
 	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
 	if (error == ENOENT)
 		gotchildmax = 0;
 	else if (error != 0)
 		goto done_free;
 	else
 		gotchildmax = 1;
 
 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
 	if (error == ENOENT)
 		gotenforce = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (enforce < 0 || enforce > 2) {
 		error = EINVAL;
 		goto done_free;
 	} else
 		gotenforce = 1;
 
 	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
 	if (error == ENOENT)
 		gotrsnum = 0;
 	else if (error != 0)
 		goto done_free;
 	else
 		gotrsnum = 1;
 
 	pr_flags = ch_flags = 0;
 	for (bf = pr_flag_bool;
 	     bf < pr_flag_bool + nitems(pr_flag_bool);
 	     bf++) {
 		vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
 		vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
 	}
 	ch_flags |= pr_flags;
 	for (jsf = pr_flag_jailsys;
 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
 	     jsf++) {
 		error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
 		if (error == ENOENT)
 			continue;
 		if (error != 0)
 			goto done_free;
 		switch (jsys) {
 		case JAIL_SYS_DISABLE:
 			if (!jsf->disable) {
 				error = EINVAL;
 				goto done_free;
 			}
 			pr_flags |= jsf->disable;
 			break;
 		case JAIL_SYS_NEW:
 			pr_flags |= jsf->new;
 			break;
 		case JAIL_SYS_INHERIT:
 			break;
 		default:
 			error = EINVAL;
 			goto done_free;
 		}
 		ch_flags |= jsf->new | jsf->disable;
 	}
 	if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
 	    && !(pr_flags & PR_PERSIST)) {
 		error = EINVAL;
 		vfs_opterror(opts, "new jail must persist or attach");
 		goto done_errmsg;
 	}
 #ifdef VIMAGE
 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
 		error = EINVAL;
 		vfs_opterror(opts, "vnet cannot be changed after creation");
 		goto done_errmsg;
 	}
 #endif
 #ifdef INET
 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
 		error = EINVAL;
 		vfs_opterror(opts, "ip4 cannot be changed after creation");
 		goto done_errmsg;
 	}
 #endif
 #ifdef INET6
 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
 		error = EINVAL;
 		vfs_opterror(opts, "ip6 cannot be changed after creation");
 		goto done_errmsg;
 	}
 #endif
 
 	pr_allow = ch_allow = 0;
 	for (bf = pr_flag_allow;
 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 		atomic_load_int(&bf->flag) != 0;
 	     bf++) {
 		vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
 		vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
 	}
 	ch_allow |= pr_allow;
 
 	error = vfs_getopt(opts, "name", (void **)&name, &len);
 	if (error == ENOENT)
 		name = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (len == 0 || name[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > MAXHOSTNAMELEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
 	if (error == ENOENT)
 		host = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 		if (len == 0 || host[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > MAXHOSTNAMELEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
 	if (error == ENOENT)
 		domain = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 		if (len == 0 || domain[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > MAXHOSTNAMELEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
 	if (error == ENOENT)
 		uuid = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 		if (len == 0 || uuid[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len > HOSTUUIDLEN) {
 			error = ENAMETOOLONG;
 			goto done_free;
 		}
 	}
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		uint32_t hid32;
 
 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
 		hid = hid32;
 	} else
 #endif
 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
 	if (error == ENOENT)
 		gothid = 0;
 	else if (error != 0)
 		goto done_free;
 	else {
 		gothid = 1;
 		ch_flags |= PR_HOST;
 		pr_flags |= PR_HOST;
 	}
 
 #ifdef INET
 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
 	if (error == ENOENT)
 		ip4s = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (ip4s & (sizeof(struct in_addr) - 1)) {
 		error = EINVAL;
 		goto done_free;
 	} else {
 		ch_flags |= PR_IP4_USER;
 		pr_flags |= PR_IP4_USER;
 		if (ip4s > 0) {
 			ip4s /= sizeof(struct in_addr);
 			if (ip4s > jail_max_af_ips) {
 				error = EINVAL;
 				vfs_opterror(opts, "too many IPv4 addresses");
 				goto done_errmsg;
 			}
 			ip4 = prison_ip_copyin(PR_INET, op, ip4s);
 			if (ip4 == NULL) {
 				error = EINVAL;
 				goto done_free;
 			}
 		}
 	}
 #endif
 
 #ifdef INET6
 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
 	if (error == ENOENT)
 		ip6s = 0;
 	else if (error != 0)
 		goto done_free;
 	else if (ip6s & (sizeof(struct in6_addr) - 1)) {
 		error = EINVAL;
 		goto done_free;
 	} else {
 		ch_flags |= PR_IP6_USER;
 		pr_flags |= PR_IP6_USER;
 		if (ip6s > 0) {
 			ip6s /= sizeof(struct in6_addr);
 			if (ip6s > jail_max_af_ips) {
 				error = EINVAL;
 				vfs_opterror(opts, "too many IPv6 addresses");
 				goto done_errmsg;
 			}
 			ip6 = prison_ip_copyin(PR_INET6, op, ip6s);
 			if (ip6 == NULL) {
 				error = EINVAL;
 				goto done_free;
 			}
 		}
 	}
 #endif
 
 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
 	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
 		error = EINVAL;
 		vfs_opterror(opts,
 		    "vnet jails cannot have IP address restrictions");
 		goto done_errmsg;
 	}
 #endif
 
 	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
 	if (error == ENOENT)
 		osrelstr = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (flags & JAIL_UPDATE) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "osrelease cannot be changed after creation");
 			goto done_errmsg;
 		}
 		if (len == 0 || osrelstr[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		if (len >= OSRELEASELEN) {
 			error = ENAMETOOLONG;
 			vfs_opterror(opts,
 			    "osrelease string must be 1-%d bytes long",
 			    OSRELEASELEN - 1);
 			goto done_errmsg;
 		}
 	}
 
 	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
 	if (error == ENOENT)
 		osreldt = 0;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (flags & JAIL_UPDATE) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "osreldate cannot be changed after creation");
 			goto done_errmsg;
 		}
 		if (osreldt == 0) {
 			error = EINVAL;
 			vfs_opterror(opts, "osreldate cannot be 0");
 			goto done_errmsg;
 		}
 	}
 
 	root = NULL;
 	error = vfs_getopt(opts, "path", (void **)&path, &len);
 	if (error == ENOENT)
 		path = NULL;
 	else if (error != 0)
 		goto done_free;
 	else {
 		if (flags & JAIL_UPDATE) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "path cannot be changed after creation");
 			goto done_errmsg;
 		}
 		if (len == 0 || path[len - 1] != '\0') {
 			error = EINVAL;
 			goto done_free;
 		}
 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path);
 		error = namei(&nd);
 		if (error)
 			goto done_free;
 		root = nd.ni_vp;
 		NDFREE_PNBUF(&nd);
 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 		strlcpy(g_path, path, MAXPATHLEN);
 		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
 		if (error == 0) {
 			path = g_path;
 		} else {
 			/* exit on other errors */
 			goto done_free;
 		}
 		if (root->v_type != VDIR) {
 			error = ENOTDIR;
 			vput(root);
 			goto done_free;
 		}
 		VOP_UNLOCK(root);
 	}
 
 	/*
 	 * Find the specified jail, or at least its parent.
 	 * This abuses the file error codes ENOENT and EEXIST.
 	 */
 	pr = NULL;
 	inspr = NULL;
 	deadpr = NULL;
 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
 		namelc = strrchr(name, '.');
 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
 		if (*p != '\0')
 			jid = 0;
 	}
 	sx_xlock(&allprison_lock);
 	drflags = PD_LIST_XLOCKED;
 	ppr = mypr;
 	if (!prison_isalive(ppr)) {
 		/* This jail is dying.  This process will surely follow. */
 		error = EAGAIN;
 		goto done_deref;
 	}
 	if (jid != 0) {
 		if (jid < 0) {
 			error = EINVAL;
 			vfs_opterror(opts, "negative jid");
 			goto done_deref;
 		}
 		/*
 		 * See if a requested jid already exists.  Keep track of
 		 * where it can be inserted later.
 		 */
 		TAILQ_FOREACH(inspr, &allprison, pr_list) {
 			if (inspr->pr_id < jid)
 				continue;
 			if (inspr->pr_id > jid)
 				break;
 			if (prison_isalive(inspr)) {
 				pr = inspr;
 				mtx_lock(&pr->pr_mtx);
 				drflags |= PD_LOCKED;
 			} else {
 				/* Note a dying jail to handle later. */
 				deadpr = inspr;
 			}
 			inspr = NULL;
 			break;
 		}
 		if (cuflags == JAIL_CREATE && pr != NULL) {
 			/*
 			 * Even creators that cannot see the jail will
 			 * get EEXIST.
 			 */
 			error = EEXIST;
 			vfs_opterror(opts, "jail %d already exists", jid);
 			goto done_deref;
 		}
 		if ((pr == NULL)
 		    ? cuflags == JAIL_UPDATE
 		    : !prison_ischild(mypr, pr)) {
 			/*
 			 * Updaters get ENOENT for nonexistent jails,
 			 * or for jails they cannot see.  The latter
 			 * case is true even for CREATE | UPDATE,
 			 * which normally cannot give this error.
 			 */
 			error = ENOENT;
 			vfs_opterror(opts, "jail %d not found", jid);
 			goto done_deref;
 		}
 	}
 	/*
 	 * If the caller provided a name, look for a jail by that name.
 	 * This has different semantics for creates and updates keyed by jid
 	 * (where the name must not already exist in a different jail),
 	 * and updates keyed by the name itself (where the name must exist
 	 * because that is the jail being updated).
 	 */
 	namelc = NULL;
 	if (name != NULL) {
 		namelc = strrchr(name, '.');
 		if (namelc == NULL)
 			namelc = name;
 		else {
 			/*
 			 * This is a hierarchical name.  Split it into the
 			 * parent and child names, and make sure the parent
 			 * exists or matches an already found jail.
 			 */
 			if (pr != NULL) {
 				if (strncmp(name, ppr->pr_name, namelc - name)
 				    || ppr->pr_name[namelc - name] != '\0') {
 					error = EINVAL;
 					vfs_opterror(opts,
 					    "cannot change jail's parent");
 					goto done_deref;
 				}
 			} else {
 				*namelc = '\0';
 				ppr = prison_find_name(mypr, name);
 				if (ppr == NULL) {
 					error = ENOENT;
 					vfs_opterror(opts,
 					    "jail \"%s\" not found", name);
 					goto done_deref;
 				}
 				mtx_unlock(&ppr->pr_mtx);
 				if (!prison_isalive(ppr)) {
 					error = ENOENT;
 					vfs_opterror(opts,
 					    "jail \"%s\" is dying", name);
 					goto done_deref;
 				}
 				*namelc = '.';
 			}
 			namelc++;
 		}
 		if (namelc[0] != '\0') {
 			pnamelen =
 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
 			FOREACH_PRISON_CHILD(ppr, tpr) {
 				if (tpr == pr || !prison_isalive(tpr) ||
 				    strcmp(tpr->pr_name + pnamelen, namelc))
 					continue;
 				if (cuflags == JAIL_CREATE || pr != NULL) {
 					/*
 					 * Create, or update(jid): name must
 					 * not exist in an active sibling jail.
 					 */
 					error = EEXIST;
 					vfs_opterror(opts,
 					    "jail \"%s\" already exists", name);
 					goto done_deref;
 				}
 				/* Use this jail for updates. */
 				pr = tpr;
 				mtx_lock(&pr->pr_mtx);
 				drflags |= PD_LOCKED;
 				break;
 			}
 			/*
 			 * Update: name must exist if no jid is specified.
 			 * As with the jid case, the jail must be currently
 			 * visible, or else even CREATE | UPDATE will get
 			 * an error.
 			 */
 			if ((pr == NULL)
 			    ? cuflags == JAIL_UPDATE
 			    : !prison_isalive(pr)) {
 				error = ENOENT;
 				vfs_opterror(opts, "jail \"%s\" not found",
 				    name);
 				goto done_deref;
 			}
 		}
 	}
 	/* Update: must provide a jid or name. */
 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
 		error = ENOENT;
 		vfs_opterror(opts, "update specified no jail");
 		goto done_deref;
 	}
 
 	/* If there's no prison to update, create a new one and link it in. */
 	created = pr == NULL;
 	if (created) {
 		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
 			if (tpr->pr_childcount >= tpr->pr_childmax) {
 				error = EPERM;
 				vfs_opterror(opts, "prison limit exceeded");
 				goto done_deref;
 			}
 
 		if (deadpr != NULL) {
 			/*
 			 * The prison being created has the same ID as a dying
 			 * one.  Handle this by giving the dying jail a new ID.
 			 * This may cause some confusion to user space, but
 			 * only to those listing dying jails.
 			 */
 			deadid = get_next_deadid(&dinspr);
 			if (deadid == 0) {
 				error = EAGAIN;
 				vfs_opterror(opts, "no available jail IDs");
 				goto done_deref;
 			}
 			mtx_lock(&deadpr->pr_mtx);
 			deadpr->pr_id = deadid;
 			mtx_unlock(&deadpr->pr_mtx);
 			if (dinspr == deadpr)
 				inspr = deadpr;
 			else {
 				inspr = TAILQ_NEXT(deadpr, pr_list);
 				TAILQ_REMOVE(&allprison, deadpr, pr_list);
 				if (dinspr != NULL)
 					TAILQ_INSERT_AFTER(&allprison, dinspr,
 					    deadpr, pr_list);
 				else
 					TAILQ_INSERT_HEAD(&allprison, deadpr,
 					    pr_list);
 			}
 		}
 		if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
 			error = EAGAIN;
 			vfs_opterror(opts, "no available jail IDs");
 			goto done_deref;
 		}
 
 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
 		pr->pr_state = PRISON_STATE_INVALID;
 		refcount_init(&pr->pr_ref, 1);
 		refcount_init(&pr->pr_uref, 0);
 		drflags |= PD_DEREF;
 		LIST_INIT(&pr->pr_children);
 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 
 		pr->pr_id = jid;
 		if (inspr != NULL)
 			TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
 		else
 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
 
 		pr->pr_parent = ppr;
 		prison_hold(ppr);
 		prison_proc_hold(ppr);
 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
 			tpr->pr_childcount++;
 
 		/* Set some default values, and inherit some from the parent. */
 		if (namelc == NULL)
 			namelc = "";
 		if (path == NULL) {
 			path = "/";
 			root = mypr->pr_root;
 			vref(root);
 		}
 		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
 		pr->pr_flags |= PR_HOST;
 #if defined(INET) || defined(INET6)
 #ifdef VIMAGE
 		if (!(pr_flags & PR_VNET))
 #endif
 		{
 #ifdef INET
 			if (!(ch_flags & PR_IP4_USER))
 				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
 			else if (!(pr_flags & PR_IP4_USER)) {
 				pr->pr_flags |= ppr->pr_flags & PR_IP4;
 				prison_ip_dup(ppr, pr, PR_INET);
 			}
 #endif
 #ifdef INET6
 			if (!(ch_flags & PR_IP6_USER))
 				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
 			else if (!(pr_flags & PR_IP6_USER)) {
 				pr->pr_flags |= ppr->pr_flags & PR_IP6;
 				prison_ip_dup(ppr, pr, PR_INET6);
 			}
 #endif
 		}
 #endif
 		/* Source address selection is always on by default. */
 		pr->pr_flags |= _PR_IP_SADDRSEL;
 
 		pr->pr_securelevel = ppr->pr_securelevel;
 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
 		pr->pr_enforce_statfs = jail_default_enforce_statfs;
 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
 
 		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
 		if (osrelstr == NULL)
 			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
 			    sizeof(pr->pr_osrelease));
 		else
 			strlcpy(pr->pr_osrelease, osrelstr,
 			    sizeof(pr->pr_osrelease));
 
 #ifdef VIMAGE
 		/*
 		 * Allocate a new vnet if specified.
 		 *
 		 * Set PR_VNET now if so, so that the vnet is disposed of
 		 * properly when the jail is destroyed.
 		 */
 		if (pr_flags & PR_VNET) {
 			pr->pr_flags |= PR_VNET;
 			pr->pr_vnet = vnet_alloc();
 		} else {
 			pr->pr_vnet = ppr->pr_vnet;
 		}
 #endif
 		/*
 		 * Allocate a dedicated cpuset for each jail.
 		 * Unlike other initial settings, this may return an error.
 		 */
 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
 		if (error)
 			goto done_deref;
 
 		mtx_lock(&pr->pr_mtx);
 		drflags |= PD_LOCKED;
 	} else {
 		/*
 		 * Grab a reference for existing prisons, to ensure they
 		 * continue to exist for the duration of the call.
 		 */
 		prison_hold(pr);
 		drflags |= PD_DEREF;
 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
 		if ((pr->pr_flags & PR_VNET) &&
 		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "vnet jails cannot have IP address restrictions");
 			goto done_deref;
 		}
 #endif
 #ifdef INET
 		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "ip4 cannot be changed after creation");
 			goto done_deref;
 		}
 #endif
 #ifdef INET6
 		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "ip6 cannot be changed after creation");
 			goto done_deref;
 		}
 #endif
 	}
 
 	/* Do final error checking before setting anything. */
 	if (gotslevel) {
 		if (slevel < ppr->pr_securelevel) {
 			error = EPERM;
 			goto done_deref;
 		}
 	}
 	if (gotchildmax) {
 		if (childmax >= ppr->pr_childmax) {
 			error = EPERM;
 			goto done_deref;
 		}
 	}
 	if (gotenforce) {
 		if (enforce < ppr->pr_enforce_statfs) {
 			error = EPERM;
 			goto done_deref;
 		}
 	}
 	if (gotrsnum) {
 		/*
 		 * devfs_rsnum is a uint16_t
 		 */
 		if (rsnum < 0 || rsnum > 65535) {
 			error = EINVAL;
 			goto done_deref;
 		}
 		/*
 		 * Nested jails always inherit parent's devfs ruleset
 		 */
 		if (jailed(td->td_ucred)) {
 			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
 				error = EPERM;
 				goto done_deref;
 			} else
 				rsnum = ppr->pr_devfs_rsnum;
 		}
 	}
 #ifdef INET
 	if (ip4s > 0) {
 		if ((ppr->pr_flags & PR_IP4) &&
 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4,
 		    PR_INET)) {
 			error = EPERM;
 			goto done_deref;
 		}
 		if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) {
 			error = EADDRINUSE;
 			vfs_opterror(opts, "IPv4 addresses clash");
 			goto done_deref;
 		}
 	}
 #endif
 #ifdef INET6
 	if (ip6s > 0) {
 		if ((ppr->pr_flags & PR_IP6) &&
 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6,
 		    PR_INET6)) {
 			error = EPERM;
 			goto done_deref;
 		}
 		if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) {
 			error = EADDRINUSE;
 			vfs_opterror(opts, "IPv6 addresses clash");
 			goto done_deref;
 		}
 	}
 #endif
 	onamelen = namelen = 0;
 	if (namelc != NULL) {
 		/* Give a default name of the jid.  Also allow the name to be
 		 * explicitly the jid - but not any other number, and only in
 		 * normal form (no leading zero/etc).
 		 */
 		if (namelc[0] == '\0')
 			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
 		else if ((strtoul(namelc, &p, 10) != jid ||
 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
 			error = EINVAL;
 			vfs_opterror(opts,
 			    "name cannot be numeric (unless it is the jid)");
 			goto done_deref;
 		}
 		/*
 		 * Make sure the name isn't too long for the prison or its
 		 * children.
 		 */
 		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
 		onamelen = strlen(pr->pr_name + pnamelen);
 		namelen = strlen(namelc);
 		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
 			error = ENAMETOOLONG;
 			goto done_deref;
 		}
 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
 			    sizeof(pr->pr_name)) {
 				error = ENAMETOOLONG;
 				goto done_deref;
 			}
 		}
 	}
 	pr_allow_diff = pr_allow & ~ppr->pr_allow;
 	if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
 		error = EPERM;
 		goto done_deref;
 	}
 
 	/*
 	 * Let modules check their parameters.  This requires unlocking and
 	 * then re-locking the prison, but this is still a valid state as long
 	 * as allprison_lock remains xlocked.
 	 */
 	mtx_unlock(&pr->pr_mtx);
 	drflags &= ~PD_LOCKED;
 	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
 	if (error != 0)
 		goto done_deref;
 	mtx_lock(&pr->pr_mtx);
 	drflags |= PD_LOCKED;
 
 	/* At this point, all valid parameters should have been noted. */
 	TAILQ_FOREACH(opt, opts, link) {
 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
 			error = EINVAL;
 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
 			goto done_deref;
 		}
 	}
 
 	/* Set the parameters of the prison. */
 #ifdef INET
 	redo_ip4 = false;
 	if (pr_flags & PR_IP4_USER) {
 		pr->pr_flags |= PR_IP4;
 		prison_ip_set(pr, PR_INET, ip4);
 		ip4 = NULL;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (!prison_ip_restrict(tpr, PR_INET, NULL)) {
 				redo_ip4 = true;
 				descend = 0;
 			}
 		}
 	}
 #endif
 #ifdef INET6
 	redo_ip6 = false;
 	if (pr_flags & PR_IP6_USER) {
 		pr->pr_flags |= PR_IP6;
 		prison_ip_set(pr, PR_INET6, ip6);
 		ip6 = NULL;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (!prison_ip_restrict(tpr, PR_INET6, NULL)) {
 				redo_ip6 = true;
 				descend = 0;
 			}
 		}
 	}
 #endif
 	if (gotslevel) {
 		pr->pr_securelevel = slevel;
 		/* Set all child jails to be at least this level. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			if (tpr->pr_securelevel < slevel)
 				tpr->pr_securelevel = slevel;
 	}
 	if (gotchildmax) {
 		pr->pr_childmax = childmax;
 		/* Set all child jails to under this limit. */
 		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
 			if (tpr->pr_childmax > childmax - level)
 				tpr->pr_childmax = childmax > level
 				    ? childmax - level : 0;
 	}
 	if (gotenforce) {
 		pr->pr_enforce_statfs = enforce;
 		/* Pass this restriction on to the children. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			if (tpr->pr_enforce_statfs < enforce)
 				tpr->pr_enforce_statfs = enforce;
 	}
 	if (gotrsnum) {
 		pr->pr_devfs_rsnum = rsnum;
 		/* Pass this restriction on to the children. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 			tpr->pr_devfs_rsnum = rsnum;
 	}
 	if (namelc != NULL) {
 		if (ppr == &prison0)
 			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
 		else
 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
 			    ppr->pr_name, namelc);
 		/* Change this component of child names. */
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
 			    strlen(tpr->pr_name + onamelen) + 1);
 			bcopy(pr->pr_name, tpr->pr_name, namelen);
 		}
 	}
 	if (path != NULL) {
 		/* Try to keep a real-rooted full pathname. */
 		strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
 		pr->pr_root = root;
 		root = NULL;
 	}
 	if (PR_HOST & ch_flags & ~pr_flags) {
 		if (pr->pr_flags & PR_HOST) {
 			/*
 			 * Copy the parent's host info.  As with pr_ip4 above,
 			 * the lack of a lock on the parent is not a problem;
 			 * it is always set with allprison_lock at least
 			 * shared, and is held exclusively here.
 			 */
 			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
 			    sizeof(pr->pr_hostname));
 			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
 			    sizeof(pr->pr_domainname));
 			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
 			    sizeof(pr->pr_hostuuid));
 			pr->pr_hostid = pr->pr_parent->pr_hostid;
 		}
 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
 		/* Set this prison, and any descendants without PR_HOST. */
 		if (host != NULL)
 			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
 		if (domain != NULL)
 			strlcpy(pr->pr_domainname, domain, 
 			    sizeof(pr->pr_domainname));
 		if (uuid != NULL)
 			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
 		if (gothid)
 			pr->pr_hostid = hid;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 			if (tpr->pr_flags & PR_HOST)
 				descend = 0;
 			else {
 				if (host != NULL)
 					strlcpy(tpr->pr_hostname,
 					    pr->pr_hostname,
 					    sizeof(tpr->pr_hostname));
 				if (domain != NULL)
 					strlcpy(tpr->pr_domainname, 
 					    pr->pr_domainname,
 					    sizeof(tpr->pr_domainname));
 				if (uuid != NULL)
 					strlcpy(tpr->pr_hostuuid,
 					    pr->pr_hostuuid,
 					    sizeof(tpr->pr_hostuuid));
 				if (gothid)
 					tpr->pr_hostid = hid;
 			}
 		}
 	}
 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
 	if ((tallow = ch_allow & ~pr_allow))
 		prison_set_allow_locked(pr, tallow, 0);
 	/*
 	 * Persistent prisons get an extra reference, and prisons losing their
 	 * persist flag lose that reference.
 	 */
 	if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
 		if (pr_flags & PR_PERSIST) {
 			prison_hold(pr);
 			/*
 			 * This may be a new prison's first user reference,
 			 * but wait to call it alive until after OSD calls
 			 * have had a chance to run (and perhaps to fail).
 			 */
 			refcount_acquire(&pr->pr_uref);
 		} else {
 			drflags |= PD_DEUREF;
 			prison_free_not_last(pr);
 		}
 	}
 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
 	mtx_unlock(&pr->pr_mtx);
 	drflags &= ~PD_LOCKED;
 	/*
 	 * Any errors past this point will need to de-persist newly created
 	 * prisons, as well as call remove methods.
 	 */
 	if (created)
 		drflags |= PD_KILL;
 
 #ifdef RACCT
 	if (racct_enable && created)
 		prison_racct_attach(pr);
 #endif
 
 	/* Locks may have prevented a complete restriction of child IP
 	 * addresses.  If so, allocate some more memory and try again.
 	 */
 #ifdef INET
 	while (redo_ip4) {
 		ip4s = pr->pr_addrs[PR_INET]->ips;
 		MPASS(ip4 == NULL);
 		ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK);
 		mtx_lock(&pr->pr_mtx);
 		redo_ip4 = false;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (!prison_ip_restrict(tpr, PR_INET, &ip4))
 				redo_ip4 = true;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 #endif
 #ifdef INET6
 	while (redo_ip6) {
 		ip6s = pr->pr_addrs[PR_INET6]->ips;
 		MPASS(ip6 == NULL);
 		ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK);
 		mtx_lock(&pr->pr_mtx);
 		redo_ip6 = false;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 #ifdef VIMAGE
 			if (tpr->pr_flags & PR_VNET) {
 				descend = 0;
 				continue;
 			}
 #endif
 			if (!prison_ip_restrict(tpr, PR_INET6, &ip6))
 				redo_ip6 = true;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 #endif
 
 	/* Let the modules do their work. */
 	if (created) {
 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
 		if (error)
 			goto done_deref;
 	}
 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
 	if (error)
 		goto done_deref;
 
 	/*
 	 * A new prison is now ready to be seen; either it has gained a user
 	 * reference via persistence, or is about to gain one via attachment.
 	 */
 	if (created) {
 		drflags = prison_lock_xlock(pr, drflags);
 		pr->pr_state = PRISON_STATE_ALIVE;
 	}
 
 	/* Attach this process to the prison if requested. */
 	if (flags & JAIL_ATTACH) {
 		error = do_jail_attach(td, pr,
 		    prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
 		drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
 		if (error) {
 			vfs_opterror(opts, "attach failed");
 			goto done_deref;
 		}
 	}
 
 #ifdef RACCT
 	if (racct_enable && !created) {
 		if (drflags & PD_LOCKED) {
 			mtx_unlock(&pr->pr_mtx);
 			drflags &= ~PD_LOCKED;
 		}
 		if (drflags & PD_LIST_XLOCKED) {
 			sx_xunlock(&allprison_lock);
 			drflags &= ~PD_LIST_XLOCKED;
 		}
 		prison_racct_modify(pr);
 	}
 #endif
 
 	if (created && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 &&
 	    (pr->pr_root->v_vflag & VV_ROOT) == 0)
 		printf("Warning jail jid=%d: mountd/nfsd requires a separate"
 		   " file system\n", pr->pr_id);
 
 	drflags &= ~PD_KILL;
 	td->td_retval[0] = pr->pr_id;
 
  done_deref:
 	/* Release any temporary prison holds and/or locks. */
 	if (pr != NULL)
 		prison_deref(pr, drflags);
 	else if (drflags & PD_LIST_SLOCKED)
 		sx_sunlock(&allprison_lock);
 	else if (drflags & PD_LIST_XLOCKED)
 		sx_xunlock(&allprison_lock);
 	if (root != NULL)
 		vrele(root);
  done_errmsg:
 	if (error) {
 		/* Write the error message back to userspace. */
 		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
 		    &errmsg_len) == 0 && errmsg_len > 0) {
 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
 			if (optuio->uio_segflg == UIO_SYSSPACE)
 				bcopy(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 			else
 				(void)copyout(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 		}
 	}
  done_free:
 #ifdef INET
 	prison_ip_free(ip4);
 #endif
 #ifdef INET6
 	prison_ip_free(ip6);
 #endif
 	if (g_path != NULL)
 		free(g_path, M_TEMP);
 	vfs_freeopts(opts);
 	return (error);
 }
 
 /*
  * Choose an unused prison ID for a new jail.
  *
  * The search begins at the ID following the most recently assigned one
  * (lastprid), walks upward to JAIL_MAX, and then wraps around to 1 to cover
  * the IDs below the starting point.  The walk assumes allprison is kept in
  * ascending pr_id order.
  *
  * On success the chosen ID is returned and remembered in lastprid, and
  * *insprp is set to the allprison entry the new prison should be inserted
  * before (NULL means it belongs at the tail).  If every ID is taken, zero is
  * returned and neither *insprp nor lastprid is modified.
  */
 static int
 get_next_prid(struct prison **insprp)
 {
 	struct prison *slot;
 	int candidate, limit;
 
 	slot = NULL;
 	candidate = lastprid % JAIL_MAX + 1;
 	/*
 	 * When the list is empty or its highest ID is still below the
 	 * candidate, the new prison simply goes on the end and no search is
 	 * needed.  Otherwise scan for a gap.
 	 */
 	if (!TAILQ_EMPTY(&allprison) &&
 	    TAILQ_LAST(&allprison, prisonlist)->pr_id >= candidate) {
 		/*
 		 * Up to two passes: the first runs from the candidate up to
 		 * JAIL_MAX, the second from 1 up to lastprid.  A limit of
 		 * zero signals that a gap has been found.
 		 */
 		limit = JAIL_MAX;
 		while (limit != 0) {
 			TAILQ_FOREACH(slot, &allprison, pr_list) {
 				if (slot->pr_id > candidate) {
 					/* The candidate fits before slot. */
 					limit = 0;
 					break;
 				}
 				if (slot->pr_id == candidate &&
 				    ++candidate > limit) {
 					/*
 					 * Ran off the top of this pass.  If
 					 * this pass already covered the whole
 					 * range, there is nothing left.
 					 */
 					if (lastprid == limit || lastprid == 0)
 						return (0);
 					/* Wrap around for the second pass. */
 					candidate = 1;
 					limit = lastprid;
 					break;
 				}
 			}
 			/* Walked off the list: the candidate goes at the end. */
 			if (slot == NULL)
 				break;
 		}
 	}
 	*insprp = slot;
 	lastprid = candidate;
 	return (candidate);
 }
 
 /*
  * Find the next available ID for a renumbered dead prison.  This mirrors
  * get_next_prid, but counts backward from the end of the range: the search
  * begins just below the most recently assigned dead ID (lastdeadid), walks
  * down to 1, and then wraps around to JAIL_MAX.
  *
  * On success the chosen ID is returned and remembered in lastdeadid, and
  * *dinsprp is set to the allprison entry the renumbered prison should be
  * placed after (NULL means it belongs at the head of the list).  If every ID
  * is taken, zero is returned and neither *dinsprp nor lastdeadid is modified.
  */
 static int
 get_next_deadid(struct prison **dinsprp)
 {
 	struct prison *dinspr;
 	int deadid, minid;
 
 	/*
 	 * Zero is this function's failure value and so must never be proposed
 	 * as an ID.  Wrap to the top of the range both when no dead ID has
 	 * been handed out yet (lastdeadid == 0) and when the previous one was
 	 * the lowest legal ID (lastdeadid == 1); decrementing in the latter
 	 * case would yield 0 and be reported to the caller as exhaustion.
 	 */
 	deadid = lastdeadid > 1 ? lastdeadid - 1 : JAIL_MAX;
 	/*
 	 * Take two reverse passes through the allprison list: first
 	 * starting with the proposed deadid, then ending with it.
 	 * A minid of zero signals that an opening has been found.
 	 */
 	for (minid = 1; minid != 0; ) {
 		TAILQ_FOREACH_REVERSE(dinspr, &allprison, prisonlist, pr_list) {
 			if (dinspr->pr_id > deadid)
 				continue;
 			if (dinspr->pr_id < deadid) {
 				/* Found an opening. */
 				minid = 0;
 				break;
 			}
 			/* This ID is in use; try the one below it. */
 			if (--deadid < minid) {
 				if (lastdeadid == minid || lastdeadid == 0)
 				{
 					/*
 					 * The entire legal range
 					 * has been traversed
 					 */
 					return (0);
 				}
 				/* Try again from the end. */
 				deadid = JAIL_MAX;
 				minid = lastdeadid;
 				break;
 			}
 		}
 		if (dinspr == NULL) {
 			/* Found room at the beginning of the list. */
 			break;
 		}
 	}
 	*dinsprp = dinspr;
 	lastdeadid = deadid;
 	return (deadid);
 }
 
 /*
  * jail_get system call entry point.
  *
  * struct jail_get_args {
  *	struct iovec *iovp;
  *	unsigned int iovcnt;
  *	int flags;
  * };
  *
  * Copies the caller's iovec array into the kernel, hands it to
  * kern_jail_get(), and on success copies the iovec array back out to the
  * caller.  Returns zero or an errno value.
  */
 int
 sys_jail_get(struct thread *td, struct jail_get_args *uap)
 {
 	struct uio *optuio;
 	int rv;
 
 	/* An odd number of iovecs is rejected outright. */
 	if ((uap->iovcnt & 1) != 0)
 		return (EINVAL);
 
 	rv = copyinuio(uap->iovp, uap->iovcnt, &optuio);
 	if (rv != 0)
 		return (rv);
 
 	rv = kern_jail_get(td, optuio, uap->flags);
 	if (rv == 0) {
 		/* Hand the iovec array back to the caller. */
 		rv = copyout(optuio->uio_iov, uap->iovp,
 		    uap->iovcnt * sizeof(struct iovec));
 	}
 	/* The uio from copyinuio() is released on every path past here. */
 	freeuio(optuio);
 	return (rv);
 }
 
 int
 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
 {
 	struct bool_flags *bf;
 	struct jailsys_flags *jsf;
 	struct prison *pr, *mypr;
 	struct vfsopt *opt;
 	struct vfsoptlist *opts;
 	char *errmsg, *name;
 	int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
 	unsigned f;
 
 	if (flags & ~JAIL_GET_MASK)
 		return (EINVAL);
 
 	/* Get the parameter list. */
 	error = vfs_buildopts(optuio, &opts);
 	if (error)
 		return (error);
 	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
 	mypr = td->td_ucred->cr_prison;
 	pr = NULL;
 
 	/*
 	 * Find the prison specified by one of: lastjid, jid, name.
 	 */
 	sx_slock(&allprison_lock);
 	drflags = PD_LIST_SLOCKED;
 	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
 	if (error == 0) {
 		TAILQ_FOREACH(pr, &allprison, pr_list) {
 			if (pr->pr_id > jid &&
 			    ((flags & JAIL_DYING) || prison_isalive(pr)) &&
 			    prison_ischild(mypr, pr)) {
 				mtx_lock(&pr->pr_mtx);
 				drflags |= PD_LOCKED;
 				goto found_prison;
 			}
 		}
 		error = ENOENT;
 		vfs_opterror(opts, "no jail after %d", jid);
 		goto done;
 	} else if (error != ENOENT)
 		goto done;
 
 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
 	if (error == 0) {
 		if (jid != 0) {
 			pr = prison_find_child(mypr, jid);
 			if (pr != NULL) {
 				drflags |= PD_LOCKED;
 				if (!(prison_isalive(pr) ||
 				    (flags & JAIL_DYING))) {
 					error = ENOENT;
 					vfs_opterror(opts, "jail %d is dying",
 					    jid);
 					goto done;
 				}
 				goto found_prison;
 			}
 			error = ENOENT;
 			vfs_opterror(opts, "jail %d not found", jid);
 			goto done;
 		}
 	} else if (error != ENOENT)
 		goto done;
 
 	error = vfs_getopt(opts, "name", (void **)&name, &len);
 	if (error == 0) {
 		if (len == 0 || name[len - 1] != '\0') {
 			error = EINVAL;
 			goto done;
 		}
 		pr = prison_find_name(mypr, name);
 		if (pr != NULL) {
 			drflags |= PD_LOCKED;
 			if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
 				error = ENOENT;
 				vfs_opterror(opts, "jail \"%s\" is dying",
 				    name);
 				goto done;
 			}
 			goto found_prison;
 		}
 		error = ENOENT;
 		vfs_opterror(opts, "jail \"%s\" not found", name);
 		goto done;
 	} else if (error != ENOENT)
 		goto done;
 
 	vfs_opterror(opts, "no jail specified");
 	error = ENOENT;
 	goto done;
 
  found_prison:
 	/* Get the parameters of the prison. */
 	prison_hold(pr);
 	drflags |= PD_DEREF;
 	td->td_retval[0] = pr->pr_id;
 	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
 	error = vfs_setopt(opts, "parent", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
 	    sizeof(pr->pr_cpuset->cs_id));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
 	if (error != 0 && error != ENOENT)
 		goto done;
 #ifdef INET
 	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET]->pr_ip,
 	    pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips *
 	    pr_families[PR_INET].size : 0 );
 	if (error != 0 && error != ENOENT)
 		goto done;
 #endif
 #ifdef INET6
 	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6]->pr_ip,
 	    pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips *
 	    pr_families[PR_INET6].size : 0 );
 	if (error != 0 && error != ENOENT)
 		goto done;
 #endif
 	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
 	    sizeof(pr->pr_securelevel));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
 	    sizeof(pr->pr_childcount));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
 	    sizeof(pr->pr_childmax));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
 	if (error != 0 && error != ENOENT)
 		goto done;
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		uint32_t hid32 = pr->pr_hostid;
 
 		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
 	} else
 #endif
 	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
 	    sizeof(pr->pr_hostid));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
 	    sizeof(pr->pr_enforce_statfs));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
 	    sizeof(pr->pr_devfs_rsnum));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	for (bf = pr_flag_bool;
 	     bf < pr_flag_bool + nitems(pr_flag_bool);
 	     bf++) {
 		i = (pr->pr_flags & bf->flag) ? 1 : 0;
 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 		i = !i;
 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 	}
 	for (jsf = pr_flag_jailsys;
 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
 	     jsf++) {
 		f = pr->pr_flags & (jsf->disable | jsf->new);
 		i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
 		    : (f == jsf->new) ? JAIL_SYS_NEW
 		    : JAIL_SYS_INHERIT;
 		error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 	}
 	for (bf = pr_flag_allow;
 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 		atomic_load_int(&bf->flag) != 0;
 	     bf++) {
 		i = (pr->pr_allow & bf->flag) ? 1 : 0;
 		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 		i = !i;
 		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
 		if (error != 0 && error != ENOENT)
 			goto done;
 	}
 	i = !prison_isalive(pr);
 	error = vfs_setopt(opts, "dying", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	i = !i;
 	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
 	    sizeof(pr->pr_osreldate));
 	if (error != 0 && error != ENOENT)
 		goto done;
 	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
 	if (error != 0 && error != ENOENT)
 		goto done;
 
 	/* Get the module parameters. */
 	mtx_unlock(&pr->pr_mtx);
 	drflags &= ~PD_LOCKED;
 	error = osd_jail_call(pr, PR_METHOD_GET, opts);
 	if (error)
 		goto done;
 	prison_deref(pr, drflags);
 	pr = NULL;
 	drflags = 0;
 
 	/* By now, all parameters should have been noted. */
 	TAILQ_FOREACH(opt, opts, link) {
 		if (!opt->seen &&
 		    (strstr(opt->name, JAIL_META_PRIVATE ".") == opt->name ||
 		    strstr(opt->name, JAIL_META_SHARED ".") == opt->name)) {
 			/* Communicate back a missing key. */
 			free(opt->value, M_MOUNT);
 			opt->value = NULL;
 			opt->len = 0;
 			continue;
 		}
 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
 			error = EINVAL;
 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
 			goto done;
 		}
 	}
 
 	/* Write the fetched parameters back to userspace. */
 	error = 0;
 	TAILQ_FOREACH(opt, opts, link) {
 		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
 			pos = 2 * opt->pos + 1;
 			optuio->uio_iov[pos].iov_len = opt->len;
 			if (opt->value != NULL) {
 				if (optuio->uio_segflg == UIO_SYSSPACE) {
 					bcopy(opt->value,
 					    optuio->uio_iov[pos].iov_base,
 					    opt->len);
 				} else {
 					error = copyout(opt->value,
 					    optuio->uio_iov[pos].iov_base,
 					    opt->len);
 					if (error)
 						break;
 				}
 			}
 		}
 	}
 
  done:
 	/* Release any temporary prison holds and/or locks. */
 	if (pr != NULL)
 		prison_deref(pr, drflags);
 	else if (drflags & PD_LIST_SLOCKED)
 		sx_sunlock(&allprison_lock);
 	if (error && errmsg_pos >= 0) {
 		/* Write the error message back to userspace. */
 		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
 		errmsg_pos = 2 * errmsg_pos + 1;
 		if (errmsg_len > 0) {
 			if (optuio->uio_segflg == UIO_SYSSPACE)
 				bcopy(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 			else
 				(void)copyout(errmsg,
 				    optuio->uio_iov[errmsg_pos].iov_base,
 				    errmsg_len);
 		}
 	}
 	vfs_freeopts(opts);
 	return (error);
 }
 
 /*
  * struct jail_remove_args {
  *	int jid;
  * };
  *
  * Kill the jail with ID uap->jid, which must be a descendant of the
  * caller's own prison.  Requires PRIV_JAIL_REMOVE.  Returns EINVAL when
  * no such jail is found, and 0 otherwise (including when the jail is
  * already dying, in which case nothing more is done).
  */
 int
 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
 {
 	struct prison *target;
 	int rv;
 
 	rv = priv_check(td, PRIV_JAIL_REMOVE);
 	if (rv != 0)
 		return (rv);
 
 	sx_xlock(&allprison_lock);
 	target = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 	if (target == NULL) {
 		sx_xunlock(&allprison_lock);
 		return (EINVAL);
 	}
 	if (prison_isalive(target)) {
 		/* prison_deref() releases both the prison and list locks. */
 		prison_deref(target, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
 	} else {
 		/* An already-dying prison is quietly left alone. */
 		mtx_unlock(&target->pr_mtx);
 		sx_xunlock(&allprison_lock);
 	}
 	return (0);
 }
 
 /*
  * struct jail_attach_args {
  *	int jid;
  * };
  *
  * Attach the calling process to the jail with ID uap->jid, which must be
  * a live descendant of the caller's own prison.  Requires
  * PRIV_JAIL_ATTACH.  Returns EINVAL when the jail is not found or is not
  * alive; otherwise returns whatever do_jail_attach() returns.
  */
 int
 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
 {
 	struct prison *target;
 	int rv;
 
 	rv = priv_check(td, PRIV_JAIL_ATTACH);
 	if (rv != 0)
 		return (rv);
 
 	sx_slock(&allprison_lock);
 	target = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 	if (target == NULL) {
 		sx_sunlock(&allprison_lock);
 		return (EINVAL);
 	}
 
 	/* Hand over the locked prison and the shared list lock. */
 	if (prison_isalive(target))
 		return (do_jail_attach(td, target,
 		    PD_LOCKED | PD_LIST_SLOCKED));
 
 	/* A process may not attach to a prison that is not alive. */
 	mtx_unlock(&target->pr_mtx);
 	sx_sunlock(&allprison_lock);
 	return (EINVAL);
 }
 
 /*
  * Move the calling thread's process into prison pr.
  *
  * Entered with pr's mutex owned and allprison_lock held; drflags describes
  * that lock state and is masked down to the PD_LOCK_FLAGS bits.  A
  * reference and a user reference are taken on pr for the new credential.
  * Both locks are released before returning, on success and on failure.
  *
  * Returns 0 on success, or the error from osd_jail_call(),
  * cpuset_setproc_update_set(), change_dir(), mac_vnode_check_chroot() or
  * pwd_chroot_chdir().  On failure the references taken here are dropped
  * again through prison_deref().
  */
 static int
 do_jail_attach(struct thread *td, struct prison *pr, int drflags)
 {
 	struct proc *p;
 	struct ucred *newcred, *oldcred;
 	int error;
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 	sx_assert(&allprison_lock, SX_LOCKED);
 	drflags &= PD_LOCK_FLAGS;
 	/*
 	 * XXX: Note that there is a slight race here if two threads
 	 * in the same privileged process attempt to attach to two
 	 * different jails at the same time.  It is important for
 	 * user processes not to do this, or they might end up with
 	 * a process root from one prison, but attached to the jail
 	 * of another.
 	 */
 	/* Take the references first, then drop the prison mutex. */
 	prison_hold(pr);
 	refcount_acquire(&pr->pr_uref);
 	drflags |= PD_DEREF | PD_DEUREF;
 	mtx_unlock(&pr->pr_mtx);
 	drflags &= ~PD_LOCKED;
 
 	/* Let modules do whatever they need to prepare for attaching. */
 	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
 	if (error) {
 		/* Drops both references and the allprison lock. */
 		prison_deref(pr, drflags);
 		return (error);
 	}
 	sx_unlock(&allprison_lock);
 	drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);
 
 	/*
 	 * Reparent the newly attached process to this jail.
 	 */
 	p = td->td_proc;
 	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
 	if (error)
 		goto e_revert_osd;
 
 	/* The directory and MAC checks run with the root vnode locked. */
 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
 	if ((error = change_dir(pr->pr_root, td)) != 0)
 		goto e_unlock;
 #ifdef MAC
 	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
 		goto e_unlock;
 #endif
 	VOP_UNLOCK(pr->pr_root);
 	if ((error = pwd_chroot_chdir(td, pr->pr_root)))
 		goto e_revert_osd;
 
 	/* Install a copy of the credential that points at the new prison. */
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = crcopysafe(p, newcred);
 	newcred->cr_prison = pr;
 	proc_set_cred(p, newcred);
 	setsugid(p);
 	/*
 	 * NOTE(review): crhold() below is compiled under RACCT while the
 	 * matching crfree() is compiled under RCTL; this looks balanced only
 	 * when both options are enabled -- confirm for RACCT-only kernels.
 	 */
 #ifdef RACCT
 	racct_proc_ucred_changed(p, oldcred, newcred);
 	crhold(newcred);
 #endif
 	PROC_UNLOCK(p);
 #ifdef RCTL
 	rctl_proc_ucred_changed(p, newcred);
 	crfree(newcred);
 #endif
 	/*
 	 * Move the process to the new prison's list, then release the
 	 * reference and user reference on the old prison (drflags still
 	 * carries PD_DEREF | PD_DEUREF, and no lock bits).
 	 */
 	prison_proc_relink(oldcred->cr_prison, pr, p);
 	prison_deref(oldcred->cr_prison, drflags);
 	crfree(oldcred);
 
 	/*
 	 * If the prison was killed while changing credentials, die along
 	 * with it.
 	 */
 	if (!prison_isalive(pr)) {
 		PROC_LOCK(p);
 		kern_psignal(p, SIGKILL);
 		PROC_UNLOCK(p);
 	}
 
 	return (0);
 
  e_unlock:
 	VOP_UNLOCK(pr->pr_root);
  e_revert_osd:
 	/* Tell modules this thread is still in its old jail after all. */
 	sx_slock(&allprison_lock);
 	drflags |= PD_LIST_SLOCKED;
 	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
 	/* Drop the references taken above and release allprison_lock. */
 	prison_deref(pr, drflags);
 	return (error);
 }
 
 /*
  * Returns a locked prison instance, or NULL on failure.
  */
 struct prison *
 prison_find(int prid)
 {
 	struct prison *pr;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		if (pr->pr_id < prid)
 			continue;
 		if (pr->pr_id > prid)
 			break;
 		KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
 		mtx_lock(&pr->pr_mtx);
 		return (pr);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a prison by jail ID among the descendants of mypr.  The caller
  * must hold allprison_lock.  Returns the prison with its mutex locked,
  * or NULL if no descendant has that ID.
  */
 struct prison *
 prison_find_child(struct prison *mypr, int prid)
 {
 	struct prison *cand;
 	int descend;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	FOREACH_PRISON_DESCENDANT(mypr, cand, descend) {
 		if (cand->pr_id != prid)
 			continue;
 		KASSERT(prison_isvalid(cand),
 		    ("Found invalid prison %p", cand));
 		mtx_lock(&cand->pr_mtx);
 		return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * Look up a prison by name among the descendants of mypr, where the name
  * is taken relative to mypr.  The caller must hold allprison_lock.
  * A live match is returned immediately; failing that, the last dying
  * match seen is returned.  The result is locked, or NULL if nothing
  * matched.
  */
 struct prison *
 prison_find_name(struct prison *mypr, const char *name)
 {
 	struct prison *cand, *dying;
 	size_t skip;
 	int descend;
 
 	sx_assert(&allprison_lock, SX_LOCKED);
 	/* Number of leading bytes of pr_name to skip before comparing. */
 	if (mypr == &prison0)
 		skip = 0;
 	else
 		skip = strlen(mypr->pr_name) + 1;
 	dying = NULL;
 	FOREACH_PRISON_DESCENDANT(mypr, cand, descend) {
 		if (strcmp(cand->pr_name + skip, name) != 0)
 			continue;
 		KASSERT(prison_isvalid(cand),
 		    ("Found invalid prison %p", cand));
 		if (prison_isalive(cand)) {
 			mtx_lock(&cand->pr_mtx);
 			return (cand);
 		}
 		/* Remember it, but keep looking for a live one. */
 		dying = cand;
 	}
 	/* No live prison matched; fall back to a dying one, if any. */
 	if (dying != NULL)
 		mtx_lock(&dying->pr_mtx);
 	return (dying);
 }
 
 /*
  * See if a prison has the specific flag set.  The prison should be locked,
  * unless checking for flags that are only set at jail creation (such as
  * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
  * to any other prison data.
  */
 bool
 prison_flag(struct ucred *cred, unsigned flag)
 {
 
 	return ((cred->cr_prison->pr_flags & flag) != 0);
 }
 
 /*
  * See if a prison has the specific allow flag set.
  * The prison *should* be locked, or only a single bit is examined, without
  * regard to any other prison data.
  */
 bool
 prison_allow(struct ucred *cred, unsigned flag)
 {
 
 	return ((cred->cr_prison->pr_allow & flag) != 0);
 }
 
 /*
  * Hold a prison reference, by incrementing pr_ref.  It is generally
  * an error to hold a prison that does not already have a reference.
  * A prison record will remain valid as long as it has at least one
  * reference, and will not be removed as long as either the prison
  * mutex or the allprison lock is held (allprison_lock may be shared).
  *
  * This variant is a plain wrapper around prison_hold(); it neither
  * checks nor changes the state of the prison mutex.
  */
 void
 prison_hold_locked(struct prison *pr)
 {
 
 	/* Locking is no longer required. */
 	prison_hold(pr);
 }
 
 /*
  * Take a reference on a prison by incrementing pr_ref.  With INVARIANTS,
  * the increment only happens if the count is non-zero, and holding a
  * prison whose count was already zero triggers an assertion.
  */
 void
 prison_hold(struct prison *pr)
 {
 #ifndef INVARIANTS
 	refcount_acquire(&pr->pr_ref);
 #else
 	int was_valid;
 
 	was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
 	KASSERT(was_valid,
 	    ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
 #endif
 }
 
 /*
  * Remove a prison reference.  If that was the last reference, the
  * prison will be removed (at a later time).
  *
  * The caller must own the prison mutex; it is released here before the
  * reference is dropped through prison_free().
  */
 void
 prison_free_locked(struct prison *pr)
 {
 
 	mtx_assert(&pr->pr_mtx, MA_OWNED);
 	/*
 	 * Locking is no longer required, but unlock because the caller
 	 * expects it.
 	 */
 	mtx_unlock(&pr->pr_mtx);
 	prison_free(pr);
 }
 
 /*
  * Drop a prison reference.  Any reference but the last is released
  * right here; the last one is left in place and the prison's task is
  * queued on taskqueue_jail_remove to finish the job.
  */
 void
 prison_free(struct prison *pr)
 {
 
 	KASSERT(refcount_load(&pr->pr_ref) > 0,
 	    ("Trying to free dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
 	if (refcount_release_if_not_last(&pr->pr_ref))
 		return;
 
 	/*
 	 * This is the last reference.  Don't remove it in this context,
 	 * in case there are locks held.
 	 */
 	taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task);
 }
 
 /*
  * Drop a prison reference that the caller knows is not the last one.
  * With INVARIANTS, it is asserted that the count was positive and that
  * this release did not take it to zero.
  */
 static void
 prison_free_not_last(struct prison *pr)
 {
 #ifndef INVARIANTS
 	refcount_release(&pr->pr_ref);
 #else
 	int lastref;
 
 	KASSERT(refcount_load(&pr->pr_ref) > 0,
 	    ("Trying to free dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
 	lastref = refcount_release(&pr->pr_ref);
 	KASSERT(!lastref,
 	    ("prison_free_not_last freed last ref on prison %p (jid=%d).",
 	     pr, pr->pr_id));
 #endif
 }
 
 /*
  * Take a user reference on a prison by incrementing pr_uref.
  * It is generally an error to hold a prison that isn't already
  * user-visible, except through the jail system calls.  It is also
  * an error to hold an invalid prison.  A prison record will remain
  * alive as long as it has at least one user reference, and will not
  * be set to the dying state until the prison mutex and allprison_lock
  * are both freed.
  *
  * With INVARIANTS, the increment only happens if the count is non-zero,
  * and a zero count triggers an assertion.
  */
 void
 prison_proc_hold(struct prison *pr)
 {
 #ifndef INVARIANTS
 	refcount_acquire(&pr->pr_uref);
 #else
 	int was_alive;
 
 	was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
 	KASSERT(was_alive,
 	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
 #endif
 }
 
 /*
  * Drop a prison user reference.  If it was the last one, the prison
  * will be considered "dying", and may be removed once all of its
  * references are dropped.  The last user reference is not released
  * here; instead the prison is flagged PR_COMPLETE_PROC and its task is
  * queued so that prison_complete() can finish the release.
  */
 void
 prison_proc_free(struct prison *pr)
 {
 
 	/*
 	 * Locking is only required when releasing the last reference.
 	 * This allows assurance that a locked prison will remain alive
 	 * until it is unlocked.
 	 */
 	KASSERT(refcount_load(&pr->pr_uref) > 0,
 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
 	if (refcount_release_if_not_last(&pr->pr_uref))
 		return;
 
 	/*
 	 * This is the last user reference.  Don't remove it in this
 	 * context, which is expected to be a process that is not only
 	 * locked, but also half dead.  Add a reference so any calls to
 	 * prison_free() won't re-submit the task.
 	 */
 	prison_hold(pr);
 	mtx_lock(&pr->pr_mtx);
 	KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
 	    ("Redundant last reference in prison_proc_free (jid=%d)",
 	     pr->pr_id));
 	pr->pr_flags |= PR_COMPLETE_PROC;
 	mtx_unlock(&pr->pr_mtx);
 	taskqueue_enqueue(taskqueue_jail_remove, &pr->pr_task);
 }
 
 /*
  * Drop a prison user reference that the caller knows is not the last
  * one.  With INVARIANTS, it is asserted that the count was positive and
  * that this release did not take it to zero.
  */
 static void
 prison_proc_free_not_last(struct prison *pr)
 {
 #ifndef INVARIANTS
 	refcount_release(&pr->pr_uref);
 #else
 	int lastref;
 
 	KASSERT(refcount_load(&pr->pr_uref) > 0,
 	    ("Trying to free dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
 	lastref = refcount_release(&pr->pr_uref);
 	KASSERT(!lastref,
 	    ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
 	     pr, pr->pr_id));
 #endif
 }
 
 /*
  * Insert process p at the head of prison pr's process list.  The caller
  * must hold allproc_lock exclusively.
  */
 void
 prison_proc_link(struct prison *pr, struct proc *p)
 {
 
 	sx_assert(&allproc_lock, SA_XLOCKED);
 	LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist);
 }
 
 /*
  * Remove process p from the prison process list it is linked on.  The
  * caller must hold allproc_lock exclusively.  The pr argument is not
  * used by the removal itself.
  */
 void
 prison_proc_unlink(struct prison *pr, struct proc *p)
 {
 
 	sx_assert(&allproc_lock, SA_XLOCKED);
 	LIST_REMOVE(p, p_jaillist);
 }
 
 /*
  * Move process p from the process list of prison opr to that of prison
  * npr.  allproc_lock is taken exclusively around the two list updates.
  */
 static void
 prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p)
 {
 
 	sx_xlock(&allproc_lock);
 	prison_proc_unlink(opr, p);
 	prison_proc_link(npr, p);
 	sx_xunlock(&allproc_lock);
 }
 
 /*
  * Task handler that completes a call to either prison_free or
  * prison_proc_free.  The context argument is the prison; pending is
  * not used.
  */
 static void
 prison_complete(void *context, int pending)
 {
 	struct prison *pr;
 	int drflags;
 
 	pr = context;
 
 	/*
 	 * This could be called to release the last reference, or the last
 	 * user reference (plus the reference held in prison_proc_free).
 	 */
 	drflags = prison_lock_xlock(pr, PD_DEREF);
 	if ((pr->pr_flags & PR_COMPLETE_PROC) != 0) {
 		/* prison_proc_free() left a user reference to drop as well. */
 		drflags |= PD_DEUREF;
 		pr->pr_flags &= ~PR_COMPLETE_PROC;
 	}
 	prison_deref(pr, drflags);
 }
 
 /*
  * prison_proc_iterate() callback: send SIGKILL to process p.  The second
  * argument is unused.
  */
 static void
 prison_kill_processes_cb(struct proc *p, void *arg __unused)
 {
 
 	kern_psignal(p, SIGKILL);
 }
 
 /*
  * Invoke cb(p, cbarg), with the process locked, on processes belonging
  * to prison pr or to any of its descendants.  Processes still in state
  * PRS_NEW are skipped.
  *
  * Note the iteration does not guarantee acting on all processes.
  * Most notably there may be fork or jail_attach in progress.
  */
 void
 prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *),
     void *cbarg)
 {
 	struct prison *anc;
 	struct proc *p;
 
 	/* Without child jails, the prison's own process list suffices. */
 	if (atomic_load_int(&pr->pr_childcount) == 0) {
 		sx_slock(&allproc_lock);
 		LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) {
 			if (p->p_state != PRS_NEW) {
 				PROC_LOCK(p);
 				cb(p, cbarg);
 				PROC_UNLOCK(p);
 			}
 		}
 		sx_sunlock(&allproc_lock);
 		if (atomic_load_int(&pr->pr_childcount) == 0)
 			return;
 		/*
 		 * Some jails popped up during the iteration, fall through to a
 		 * system-wide search.
 		 */
 	}
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW || p->p_ucred == NULL) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/* Climb from the process's prison looking for pr. */
 		anc = p->p_ucred->cr_prison;
 		while (anc != NULL && anc != pr)
 			anc = anc->pr_parent;
 		if (anc != NULL)
 			cb(p, cbarg);
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 }
 
 /*
  * Remove a prison reference and/or user reference (usually).
  * This assumes context that allows sleeping (for allprison_lock),
  * with no non-sleeping locks held, except perhaps the prison itself.
  * If there are no more references, release and delist the prison.
  * On completion, the prison lock and the allprison lock are both
  * unlocked.
  *
  * The flags argument combines the requested actions (PD_DEREF to drop a
  * reference, PD_DEUREF to drop a user reference, PD_KILL to kill the
  * prison) with the caller's current lock state (PD_LOCKED,
  * PD_LIST_SLOCKED, PD_LIST_XLOCKED).
  */
 static void
 prison_deref(struct prison *pr, int flags)
 {
 	struct prisonlist freeprison;
 	struct prison *killpr, *rpr, *ppr, *tpr;
 
 	killpr = NULL;
 	/* Prisons that lose their last reference are collected here. */
 	TAILQ_INIT(&freeprison);
 	/*
 	 * Release this prison as requested, which may cause its parent
 	 * to be released, and then maybe its grandparent, etc.
 	 */
 	for (;;) {
 		if (flags & PD_KILL) {
 			/* Kill the prison and its descendants. */
 			KASSERT(pr != &prison0,
 			    ("prison_deref trying to kill prison0"));
 			/* Make sure a reference is held across the kill. */
 			if (!(flags & PD_DEREF)) {
 				prison_hold(pr);
 				flags |= PD_DEREF;
 			}
 			flags = prison_lock_xlock(pr, flags);
 			prison_deref_kill(pr, &freeprison);
 		}
 		if (flags & PD_DEUREF) {
 			/* Drop a user reference. */
 			KASSERT(refcount_load(&pr->pr_uref) > 0,
 			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
 			     pr->pr_id));
 			if (!refcount_release_if_not_last(&pr->pr_uref)) {
 				/*
 				 * Possibly the last user reference: release
 				 * it only with both locks held exclusively.
 				 */
 				if (!(flags & PD_DEREF)) {
 					prison_hold(pr);
 					flags |= PD_DEREF;
 				}
 				flags = prison_lock_xlock(pr, flags);
 				if (refcount_release(&pr->pr_uref) &&
 				    pr->pr_state == PRISON_STATE_ALIVE) {
 					/*
 					 * When the last user reference goes,
 					 * this becomes a dying prison.
 					 */
 					KASSERT(
 					    refcount_load(&prison0.pr_uref) > 0,
 					    ("prison0 pr_uref=0"));
 					pr->pr_state = PRISON_STATE_DYING;
 					mtx_unlock(&pr->pr_mtx);
 					flags &= ~PD_LOCKED;
 					prison_cleanup(pr);
 				}
 			}
 		}
 		if (flags & PD_KILL) {
 			/*
 			 * Any remaining user references are probably processes
 			 * that need to be killed, either in this prison or its
 			 * descendants.
 			 */
 			if (refcount_load(&pr->pr_uref) > 0)
 				killpr = pr;
 			/* Make sure the parent prison doesn't get killed. */
 			flags &= ~PD_KILL;
 		}
 		if (flags & PD_DEREF) {
 			/* Drop a reference. */
 			KASSERT(refcount_load(&pr->pr_ref) > 0,
 			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
 			     pr->pr_id));
 			if (!refcount_release_if_not_last(&pr->pr_ref)) {
 				flags = prison_lock_xlock(pr, flags);
 				if (refcount_release(&pr->pr_ref)) {
 					/*
 					 * When the last reference goes,
 					 * unlink the prison and set it aside.
 					 */
 					KASSERT(
 					    refcount_load(&pr->pr_uref) == 0,
 					    ("prison_deref: last ref, "
 					     "but still has %d urefs (jid=%d)",
 					     pr->pr_uref, pr->pr_id));
 					KASSERT(
 					    refcount_load(&prison0.pr_ref) != 0,
 					    ("prison0 pr_ref=0"));
 					pr->pr_state = PRISON_STATE_INVALID;
 					TAILQ_REMOVE(&allprison, pr, pr_list);
 					LIST_REMOVE(pr, pr_sibling);
 					TAILQ_INSERT_TAIL(&freeprison, pr,
 					    pr_list);
 					/* Every ancestor loses one child. */
 					for (ppr = pr->pr_parent;
 					     ppr != NULL;
 					     ppr = ppr->pr_parent)
 						ppr->pr_childcount--;
 					/*
 					 * Removing a prison frees references
 					 * from its parent.
 					 */
 					ppr = pr->pr_parent;
 					pr->pr_parent = NULL;
 					mtx_unlock(&pr->pr_mtx);
 
 					/* Repeat the loop on the parent. */
 					pr = ppr;
 					flags &= ~PD_LOCKED;
 					flags |= PD_DEREF | PD_DEUREF;
 					continue;
 				}
 			}
 		}
 		break;
 	}
 
 	/* Release all the prison locks. */
 	if (flags & PD_LOCKED)
 		mtx_unlock(&pr->pr_mtx);
 	if (flags & PD_LIST_SLOCKED)
 		sx_sunlock(&allprison_lock);
 	else if (flags & PD_LIST_XLOCKED)
 		sx_xunlock(&allprison_lock);
 
 	/* Kill any processes attached to a killed prison. */
 	if (killpr != NULL)
 		prison_proc_iterate(killpr, prison_kill_processes_cb, NULL);
 
 	/*
 	 * Finish removing any unreferenced prisons, which couldn't happen
 	 * while allprison_lock was held (to avoid a LOR on vrele).
 	 */
 	TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
 #ifdef VIMAGE
 		if (rpr->pr_flags & PR_VNET)
 			vnet_destroy(rpr->pr_vnet);
 #endif
 		if (rpr->pr_root != NULL)
 			vrele(rpr->pr_root);
 		mtx_destroy(&rpr->pr_mtx);
 #ifdef INET
 		prison_ip_free(rpr->pr_addrs[PR_INET]);
 #endif
 #ifdef INET6
 		prison_ip_free(rpr->pr_addrs[PR_INET6]);
 #endif
 		if (rpr->pr_cpuset != NULL)
 			cpuset_rel(rpr->pr_cpuset);
 		osd_jail_exit(rpr);
 #ifdef RACCT
 		if (racct_enable)
 			prison_racct_detach(rpr);
 #endif
 		TAILQ_REMOVE(&freeprison, rpr, pr_list);
 		free(rpr, M_PRISON);
 	}
 }
 
 /*
  * Kill the prison and its descendants.  Mark them as dying, clear the
  * persist flag, and call module remove methods.
  *
  * Called from prison_deref() after prison_lock_xlock(), i.e. with
  * allprison_lock held exclusively and pr locked.  The prison mutex is
  * dropped while the descendants are processed and is held again on
  * return.  Descendants that lose their last reference are moved from
  * allprison onto the freeprison list for the caller to dispose of.
  */
 static void
 prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
 {
 	struct prison *cpr, *ppr, *rpr;
 	bool descend;
 
 	/*
 	 * Unlike the descendants, the target prison can be killed
 	 * even if it is currently dying.  This is useful for failed
 	 * creation in jail_set(2).
 	 */
 	KASSERT(refcount_load(&pr->pr_ref) > 0,
 	    ("Trying to kill dead prison %p (jid=%d).",
 	     pr, pr->pr_id));
 	/* Temporary user reference; released at the end of the function. */
 	refcount_acquire(&pr->pr_uref);
 	pr->pr_state = PRISON_STATE_DYING;
 	mtx_unlock(&pr->pr_mtx);
 
 	/* rpr tracks a removed prison whose sibling unlink is still pending. */
 	rpr = NULL;
 	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
 		if (descend) {
 			/*
 			 * First visit: skip prisons that are not alive
 			 * (without descending into them); hold and mark the
 			 * rest for removal.
 			 */
 			if (!prison_isalive(cpr)) {
 				descend = false;
 				continue;
 			}
 			prison_hold(cpr);
 			prison_proc_hold(cpr);
 			mtx_lock(&cpr->pr_mtx);
 			cpr->pr_state = PRISON_STATE_DYING;
 			cpr->pr_flags |= PR_REMOVE;
 			mtx_unlock(&cpr->pr_mtx);
 			continue;
 		}
 		/* Second visit: only act on prisons marked above. */
 		if (!(cpr->pr_flags & PR_REMOVE))
 			continue;
 		prison_cleanup(cpr);
 		mtx_lock(&cpr->pr_mtx);
 		cpr->pr_flags &= ~PR_REMOVE;
 		if (cpr->pr_flags & PR_PERSIST) {
 			/* Drop the references that went with PR_PERSIST. */
 			cpr->pr_flags &= ~PR_PERSIST;
 			prison_proc_free_not_last(cpr);
 			prison_free_not_last(cpr);
 		}
 		/* Release the holds taken on the first visit. */
 		(void)refcount_release(&cpr->pr_uref);
 		if (refcount_release(&cpr->pr_ref)) {
 			/*
 			 * When the last reference goes, unlink the prison
 			 * and set it aside for prison_deref() to handle.
 			 * Delay unlinking the sibling list to keep the loop
 			 * safe.
 			 */
 			if (rpr != NULL)
 				LIST_REMOVE(rpr, pr_sibling);
 			rpr = cpr;
 			rpr->pr_state = PRISON_STATE_INVALID;
 			TAILQ_REMOVE(&allprison, rpr, pr_list);
 			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
 			/*
 			 * Removing a prison frees references from its parent.
 			 */
 			ppr = rpr->pr_parent;
 			prison_proc_free_not_last(ppr);
 			prison_free_not_last(ppr);
 			for (; ppr != NULL; ppr = ppr->pr_parent)
 				ppr->pr_childcount--;
 		}
 		mtx_unlock(&cpr->pr_mtx);
 	}
 	/* Finish the sibling unlink that was deferred inside the loop. */
 	if (rpr != NULL)
 		LIST_REMOVE(rpr, pr_sibling);
 
 	/* Finally give the target prison itself the same treatment. */
 	prison_cleanup(pr);
 	mtx_lock(&pr->pr_mtx);
 	if (pr->pr_flags & PR_PERSIST) {
 		pr->pr_flags &= ~PR_PERSIST;
 		prison_proc_free_not_last(pr);
 		prison_free_not_last(pr);
 	}
 	(void)refcount_release(&pr->pr_uref);
 }
 
 /*
  * Starting from the lock state described by flags, end up with
  * allprison_lock held exclusively and the prison mutex held.  The prison
  * mutex is dropped first when the list lock has to be acquired or
  * upgraded.  Returns the flags updated to reflect the new lock state.
  */
 static int
 prison_lock_xlock(struct prison *pr, int flags)
 {
 
 	if ((flags & PD_LIST_XLOCKED) == 0) {
 		/*
 		 * Get allprison_lock, which may be an upgrade,
 		 * and may require unlocking the prison.
 		 */
 		if ((flags & PD_LOCKED) != 0) {
 			mtx_unlock(&pr->pr_mtx);
 			flags &= ~PD_LOCKED;
 		}
 		if ((flags & PD_LIST_SLOCKED) == 0) {
 			sx_xlock(&allprison_lock);
 		} else {
 			/* Upgrade in place, else drop and re-take. */
 			if (!sx_try_upgrade(&allprison_lock)) {
 				sx_sunlock(&allprison_lock);
 				sx_xlock(&allprison_lock);
 			}
 			flags &= ~PD_LIST_SLOCKED;
 		}
 		flags |= PD_LIST_XLOCKED;
 	}
 	if ((flags & PD_LOCKED) == 0) {
 		/* Lock the prison mutex. */
 		mtx_lock(&pr->pr_mtx);
 		flags |= PD_LOCKED;
 	}
 	return (flags);
 }
 
 /*
  * Release a prison's resources when it starts dying (when the last user
  * reference is dropped, or when it is killed).
  *
  * Must be called with allprison_lock held exclusively and without the
  * prison mutex owned.  The result of the modules' PR_METHOD_REMOVE call
  * is ignored.
  */
 static void
 prison_cleanup(struct prison *pr)
 {
 	sx_assert(&allprison_lock, SA_XLOCKED);
 	mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
 	vfs_exjail_delete(pr);
 	shm_remove_prison(pr);
 	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 }
 
 /*
  * Set or clear a permission bit in the pr_allow field of the
  * credential's prison, passing restrictions (cleared permission) down
  * to child jails.  Takes allprison_lock shared and the prison mutex
  * around the update.
  */
 void
 prison_set_allow(struct ucred *cred, unsigned flag, int enable)
 {
 	struct prison *pr = cred->cr_prison;
 
 	sx_slock(&allprison_lock);
 	mtx_lock(&pr->pr_mtx);
 	prison_set_allow_locked(pr, flag, enable);
 	mtx_unlock(&pr->pr_mtx);
 	sx_sunlock(&allprison_lock);
 }
 
 static void
 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
 {
 	struct prison *cpr;
 	int descend;
 
 	if (enable != 0)
 		pr->pr_allow |= flag;
 	else {
 		pr->pr_allow &= ~flag;
 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
 			cpr->pr_allow &= ~flag;
 	}
 }
 
 /*
  * Check if a jail supports the given address family.
  *
  * Returns 0 if the address family is supported, EAFNOSUPPORT if not.
  * AF_LOCAL, AF_ROUTE and AF_NETLINK are always accepted; families
  * without a dedicated case require PR_ALLOW_SOCKET_AF.
  */
 int
 prison_check_af(struct ucred *cred, int af)
 {
 	struct prison *pr;
 	int error;
 
 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
 
 	pr = cred->cr_prison;
 #ifdef VIMAGE
 	/* Prisons with their own network stack are not limited. */
 	if (prison_owns_vnet(pr))
 		return (0);
 #endif
 
 	error = 0;
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		/* Unlocked peek first; re-check under the prison mutex. */
 		if ((pr->pr_flags & PR_IP4) == 0)
 			break;
 		mtx_lock(&pr->pr_mtx);
 		if ((pr->pr_flags & PR_IP4) &&
 		    pr->pr_addrs[PR_INET] == NULL)
 			error = EAFNOSUPPORT;
 		mtx_unlock(&pr->pr_mtx);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		/* Unlocked peek first; re-check under the prison mutex. */
 		if ((pr->pr_flags & PR_IP6) == 0)
 			break;
 		mtx_lock(&pr->pr_mtx);
 		if ((pr->pr_flags & PR_IP6) &&
 		    pr->pr_addrs[PR_INET6] == NULL)
 			error = EAFNOSUPPORT;
 		mtx_unlock(&pr->pr_mtx);
 		break;
 #endif
 	case AF_LOCAL:
 	case AF_ROUTE:
 	case AF_NETLINK:
 		break;
 	default:
 		if ((pr->pr_allow & PR_ALLOW_SOCKET_AF) == 0)
 			error = EAFNOSUPPORT;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Check if given address belongs to the jail referenced by cred (wrapper to
  * prison_check_ip[46]).
  *
  * Returns 0 if jail doesn't restrict the address family or if address belongs
  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
  * the jail doesn't allow the address family.  The IPv4 address is passed in
  * network byte order.
  */
 int
 prison_if(struct ucred *cred, const struct sockaddr *sa)
 {
 	int error;
 
 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
 
 #ifdef VIMAGE
 	if (prison_owns_vnet(cred->cr_prison))
 		return (0);
 #endif
 
 	error = 0;
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		error = prison_check_ip4(cred,
 		    &((const struct sockaddr_in *)sa)->sin_addr);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		error = prison_check_ip6(cred,
 		    &((const struct sockaddr_in6 *)sa)->sin6_addr);
 		break;
 #endif
 	default:
 		if ((cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF) == 0)
 			error = EAFNOSUPPORT;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Return 0 if jails permit a subject with cred1 to act on an object with
  * cred2, otherwise ESRCH.  This is permitted when both credentials are
  * in the same prison, or when cred2's prison is a descendant of cred1's.
  */
 int
 prison_check(struct ucred *cred1, struct ucred *cred2)
 {
 
 	if (cred1->cr_prison == cred2->cr_prison)
 		return (0);
 	if (prison_ischild(cred1->cr_prison, cred2->cr_prison))
 		return (0);
 	return (ESRCH);
 }
 
 /*
  * For mountd/nfsd to run within a prison, it must be:
  * - A vnet prison.
  * - PR_ALLOW_NFSD must be set on it.
  * - The root directory (pr_root) of the prison must be
  *   a file system mount point, so the mountd can hang
  *   export information on it.
  * - The prison's enforce_statfs cannot be 0, so that
  *   mountd(8) can do exports.
  *
  * The conditions are tested in the order listed, stopping at the first
  * one that fails.
  */
 bool
 prison_check_nfsd(struct ucred *cred)
 {
 
 	return (!jailed_without_vnet(cred) &&
 	    prison_allow(cred, PR_ALLOW_NFSD) &&
 	    (cred->cr_prison->pr_root->v_vflag & VV_ROOT) != 0 &&
 	    cred->cr_prison->pr_enforce_statfs != 0);
 }
 
 /*
  * Return true if pr2 is a descendant of pr1 (pr1 appears somewhere on
  * pr2's chain of parents), otherwise false.  A prison is not considered
  * a descendant of itself.
  */
 bool
 prison_ischild(struct prison *pr1, struct prison *pr2)
 {
 	struct prison *anc;
 
 	anc = pr2->pr_parent;
 	while (anc != NULL) {
 		if (anc == pr1)
 			return (true);
 		anc = anc->pr_parent;
 	}
 	return (false);
 }
 
 /*
  * Return true if the prison is currently alive, i.e. its state is
  * PRISON_STATE_ALIVE.  That is the expected (likely) case.
  */
 bool
 prison_isalive(const struct prison *pr)
 {
 
 	return (!__predict_false(pr->pr_state != PRISON_STATE_ALIVE));
 }
 
 /*
  * Return true if the prison is currently valid.  A prison is valid if it has
  * been fully created, and is not being destroyed; concretely, its state is
  * not PRISON_STATE_INVALID and it still has a reference.  Note that dying
  * prisons are still considered valid.  Invalid prisons won't be found under
  * normal circumstances, as they're only put in that state by functions that
  * have an exclusive hold on allprison_lock.
  */
 bool
 prison_isvalid(struct prison *pr)
 {
 
 	if (__predict_false(pr->pr_state == PRISON_STATE_INVALID) ||
 	    __predict_false(refcount_load(&pr->pr_ref) == 0))
 		return (false);
 	return (true);
 }
 
 /*
  * Return true if the credential is jailed and its prison does not own a
  * virtual network stack; false otherwise.  On kernels built without VIMAGE
  * every jailed credential yields true.
  */
 bool
 jailed_without_vnet(struct ucred *cred)
 {
 	bool novnet;
 
 	novnet = jailed(cred);
 #ifdef VIMAGE
 	if (novnet && prison_owns_vnet(cred->cr_prison))
 		novnet = false;
 #endif
 	return (novnet);
 }
 
 /*
  * Copy the hostname that applies to the passed credential into buf (at most
  * size bytes, always NUL-terminated by strlcpy).  The prison mutex is held
  * across the copy.
  */
 void
 getcredhostname(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *source;
 
 	/*
 	 * Callers may pass a NULL credential as a shortcut for the physical
 	 * system's hostname, which lives in prison0.
 	 */
 	if (cred == NULL)
 		source = &prison0;
 	else
 		source = cred->cr_prison;
 
 	mtx_lock(&source->pr_mtx);
 	strlcpy(buf, source->pr_hostname, size);
 	mtx_unlock(&source->pr_mtx);
 }
 
 /*
  * Copy the domain name of the credential's prison into buf (bounded by
  * size), holding the prison mutex across the copy.  Unlike
  * getcredhostname(), cred must not be NULL.
  */
 void
 getcreddomainname(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *pr;
 
 	pr = cred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(buf, pr->pr_domainname, size);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /*
  * Copy the host UUID string of the credential's prison into buf (bounded by
  * size), holding the prison mutex across the copy.  cred must not be NULL.
  */
 void
 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
 {
 	struct prison *pr;
 
 	pr = cred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(buf, pr->pr_hostuuid, size);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 void
 getcredhostid(struct ucred *cred, unsigned long *hostid)
 {
 
 	mtx_lock(&cred->cr_prison->pr_mtx);
 	*hostid = cred->cr_prison->pr_hostid;
 	mtx_unlock(&cred->cr_prison->pr_mtx);
 }
 
 /*
  * Copy the name of the credential's prison into name (bounded by len),
  * holding the prison mutex across the copy.  cred must not be NULL.
  */
 void
 getjailname(struct ucred *cred, char *name, size_t len)
 {
 	struct prison *pr;
 
 	pr = cred->cr_prison;
 	mtx_lock(&pr->pr_mtx);
 	strlcpy(name, pr->pr_name, len);
 	mtx_unlock(&pr->pr_mtx);
 }
 
 #ifdef VIMAGE
 /*
  * Determine whether the prison owns its VNET.
  */
 bool
 prison_owns_vnet(struct prison *pr)
 {
 
 	/*
 	 * vnets cannot be added/removed after jail creation,
 	 * so no need to lock here.
 	 */
 	return ((pr->pr_flags & PR_VNET) != 0);
 }
 #endif
 
 /*
  * Determine whether the subject represented by cred can "see" the status of
  * mount point mp.
  *
  * The outcome depends on the prison's enforce_statfs level:
  *   0 - every mount is visible;
  *   2 - only the mount holding the prison's root is visible;
  *   otherwise - the root mount plus any mount whose mount-on name lies at
  *       or below the prison's path.
  *
  * Returns: 0 for permitted, ENOENT otherwise.
  * XXX: This function should be called cr_canseemount() and should be
  *      placed in kern_prot.c.
  */
 int
 prison_canseemount(struct ucred *cred, struct mount *mp)
 {
 	struct prison *jail;
 	const char *jroot, *mnton;
 	size_t rootlen;
 
 	jail = cred->cr_prison;
 	if (jail->pr_enforce_statfs == 0)
 		return (0);
 	if (jail->pr_root->v_mount == mp)
 		return (0);
 	if (jail->pr_enforce_statfs == 2)
 		return (ENOENT);
 
 	/*
 	 * A jail whose chroot directory is "/" sees every mount point.  This
 	 * is the only situation in which the jail's path ends with '/'.
 	 */
 	jroot = jail->pr_path;
 	if (strcmp(jroot, "/") == 0)
 		return (0);
 
 	/*
 	 * The mount-on name must start with the jail's path, and the match
 	 * must end on a path component boundary so that a jail rooted at
 	 * "/some/path" does not see a mount at "/some/pathpath".
 	 */
 	rootlen = strlen(jroot);
 	mnton = mp->mnt_stat.f_mntonname;
 	if (strncmp(jroot, mnton, rootlen) == 0 &&
 	    (mnton[rootlen] == '\0' || mnton[rootlen] == '/'))
 		return (0);
 	return (ENOENT);
 }
 
 /*
  * Rewrite sp->f_mntonname so that it is expressed from the point of view of
  * the prison that cred belongs to:
  * - enforce_statfs == 0: left untouched;
  * - mount not visible (per prison_canseemount()): "[restricted]";
  * - mount holding the prison's root: "/";
  * - prison path is "/": left untouched;
  * - otherwise: the prison's path prefix is stripped.
  * Whenever the name is rewritten the whole buffer is zeroed first so that
  * nothing of the original path is left behind.
  */
 void
 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
 {
 	char relpath[MAXPATHLEN];
 	struct prison *jail;
 	char *mnton;
 	size_t rootlen;
 
 	jail = cred->cr_prison;
 	mnton = sp->f_mntonname;
 
 	if (jail->pr_enforce_statfs == 0)
 		return;
 
 	if (prison_canseemount(cred, mp) != 0) {
 		bzero(mnton, sizeof(sp->f_mntonname));
 		strlcpy(mnton, "[restricted]", sizeof(sp->f_mntonname));
 		return;
 	}
 
 	if (jail->pr_root->v_mount == mp) {
 		/* The prison's own root file system is always shown as "/". */
 		bzero(mnton, sizeof(sp->f_mntonname));
 		mnton[0] = '/';
 		return;
 	}
 
 	/* A jail rooted at "/" sees the real mount-on names. */
 	if (strcmp(jail->pr_path, "/") == 0)
 		return;
 
 	/* Save the part past the prison's path before wiping the buffer. */
 	rootlen = strlen(jail->pr_path);
 	strlcpy(relpath, mnton + rootlen, sizeof(relpath));
 	bzero(mnton, sizeof(sp->f_mntonname));
 	if (relpath[0] != '\0')
 		strlcpy(mnton, relpath, sizeof(sp->f_mntonname));
 	else
 		mnton[0] = '/';		/* Should never happen. */
 }
 
 /*
  * Check whether permission for a specific privilege is granted within jail.
  * We have a specific list of accepted privileges; the rest are denied.
  *
  * cred - credential requesting the privilege.
  * priv - the PRIV_* value being requested.
  *
  * Returns 0 if the privilege is granted (always the case for a credential
  * that is not jailed), otherwise EPERM.
  */
 int
 prison_priv_check(struct ucred *cred, int priv)
 {
 	struct prison *pr;
 	int error;
 
 	/*
 	 * Some policies have custom handlers. This routine should not be
 	 * called for them. See priv_check_cred().
 	 */
 	switch (priv) {
 	case PRIV_VFS_LOOKUP:
 	case PRIV_VFS_GENERATION:
 		KASSERT(0, ("prison_priv_check instead of a custom handler "
 		    "called for %d\n", priv));
 	}
 
 	if (!jailed(cred))
 		return (0);
 
 #ifdef VIMAGE
 	/*
 	 * Privileges specific to prisons with a virtual network stack.
 	 * There might be a duplicate entry here in case the privilege
 	 * is only granted conditionally in the legacy jail case.
 	 */
 	switch (priv) {
 		/*
 		 * NFS-specific privileges.
 		 */
 	case PRIV_NFS_DAEMON:
 	case PRIV_VFS_GETFH:
 	case PRIV_VFS_MOUNT_EXPORTED:
 		if (!prison_check_nfsd(cred))
 			return (EPERM);
 		/*
 		 * FALLTHROUGH: a prison that passes the nfsd check is then
 		 * subject to the PR_VNET test at the end of this switch.
 		 */
 #ifdef notyet
 	case PRIV_NFS_LOCKD:
 #endif
 		/*
 		 * Network stack privileges.
 		 */
 	case PRIV_NET_BRIDGE:
 	case PRIV_NET_GRE:
 	case PRIV_NET_BPF:
 	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
 	case PRIV_NET_ROUTE:
 	case PRIV_NET_TAP:
 	case PRIV_NET_SETIFMTU:
 	case PRIV_NET_SETIFFLAGS:
 	case PRIV_NET_SETIFCAP:
 	case PRIV_NET_SETIFDESCR:
 	case PRIV_NET_SETIFNAME	:
 	case PRIV_NET_SETIFMETRIC:
 	case PRIV_NET_SETIFPHYS:
 	case PRIV_NET_SETIFMAC:
 	case PRIV_NET_SETLANPCP:
 	case PRIV_NET_ADDMULTI:
 	case PRIV_NET_DELMULTI:
 	case PRIV_NET_HWIOCTL:
 	case PRIV_NET_SETLLADDR:
 	case PRIV_NET_ADDIFGROUP:
 	case PRIV_NET_DELIFGROUP:
 	case PRIV_NET_IFCREATE:
 	case PRIV_NET_IFDESTROY:
 	case PRIV_NET_ADDIFADDR:
 	case PRIV_NET_DELIFADDR:
 	case PRIV_NET_LAGG:
 	case PRIV_NET_GIF:
 	case PRIV_NET_SETIFVNET:
 	case PRIV_NET_SETIFFIB:
 	case PRIV_NET_OVPN:
 	case PRIV_NET_ME:
 	case PRIV_NET_WG:
 
 		/*
 		 * 802.11-related privileges.
 		 */
 	case PRIV_NET80211_VAP_GETKEY:
 	case PRIV_NET80211_VAP_MANAGE:
 
 #ifdef notyet
 		/*
 		 * ATM privileges.
 		 */
 	case PRIV_NETATM_CFG:
 	case PRIV_NETATM_ADD:
 	case PRIV_NETATM_DEL:
 	case PRIV_NETATM_SET:
 
 		/*
 		 * Bluetooth privileges.
 		 */
 	case PRIV_NETBLUETOOTH_RAW:
 #endif
 
 		/*
 		 * Netgraph and netgraph module privileges.
 		 */
 	case PRIV_NETGRAPH_CONTROL:
 #ifdef notyet
 	case PRIV_NETGRAPH_TTY:
 #endif
 
 		/*
 		 * IPv4 and IPv6 privileges.
 		 */
 	case PRIV_NETINET_IPFW:
 	case PRIV_NETINET_DIVERT:
 	case PRIV_NETINET_PF:
 	case PRIV_NETINET_DUMMYNET:
 	case PRIV_NETINET_CARP:
 	case PRIV_NETINET_MROUTE:
 	case PRIV_NETINET_RAW:
 	case PRIV_NETINET_ADDRCTRL6:
 	case PRIV_NETINET_ND6:
 	case PRIV_NETINET_SCOPE6:
 	case PRIV_NETINET_ALIFETIME6:
 	case PRIV_NETINET_IPSEC:
 	case PRIV_NETINET_BINDANY:
 
 #ifdef notyet
 		/*
 		 * NCP privileges.
 		 */
 	case PRIV_NETNCP:
 
 		/*
 		 * SMB privileges.
 		 */
 	case PRIV_NETSMB:
 #endif
 
 	/*
 	 * No default: or deny here.
 	 * In case of no permit fall through to next switch().
 	 */
 		if (cred->cr_prison->pr_flags & PR_VNET)
 			return (0);
 	}
 #endif /* VIMAGE */
 
 	/*
 	 * Privileges handled the same way for every jail.  Reached for all
 	 * requests that were not granted by the VIMAGE switch above.
 	 */
 	switch (priv) {
 		/*
 		 * Allow ktrace privileges for root in jail.
 		 */
 	case PRIV_KTRACE:
 
 #if 0
 		/*
 		 * Allow jailed processes to configure audit identity and
 		 * submit audit records (login, etc).  In the future we may
 		 * want to further refine the relationship between audit and
 		 * jail.
 		 */
 	case PRIV_AUDIT_GETAUDIT:
 	case PRIV_AUDIT_SETAUDIT:
 	case PRIV_AUDIT_SUBMIT:
 #endif
 
 		/*
 		 * Allow jailed processes to manipulate process UNIX
 		 * credentials in any way they see fit.
 		 */
 	case PRIV_CRED_SETCRED:
 	case PRIV_CRED_SETUID:
 	case PRIV_CRED_SETEUID:
 	case PRIV_CRED_SETGID:
 	case PRIV_CRED_SETEGID:
 	case PRIV_CRED_SETGROUPS:
 	case PRIV_CRED_SETREUID:
 	case PRIV_CRED_SETREGID:
 	case PRIV_CRED_SETRESUID:
 	case PRIV_CRED_SETRESGID:
 
 		/*
 		 * Jail implements visibility constraints already, so allow
 		 * jailed root to override uid/gid-based constraints.
 		 */
 	case PRIV_SEEOTHERGIDS:
 	case PRIV_SEEOTHERUIDS:
 	case PRIV_SEEJAILPROC:
 
 		/*
 		 * Jail implements inter-process debugging limits already, so
 		 * allow jailed root various debugging privileges.
 		 */
 	case PRIV_DEBUG_DIFFCRED:
 	case PRIV_DEBUG_SUGID:
 	case PRIV_DEBUG_UNPRIV:
+	case PRIV_DEBUG_DIFFJAIL:
 
 		/*
 		 * Allow jail to set various resource limits and login
 		 * properties, and for now, exceed process resource limits.
 		 */
 	case PRIV_PROC_LIMIT:
 	case PRIV_PROC_SETLOGIN:
 	case PRIV_PROC_SETRLIMIT:
 
 		/*
 		 * Debuggers should work in jails.
 		 */
 	case PRIV_PROC_MEM_WRITE:
 
 		/*
 		 * System V and POSIX IPC privileges are granted in jail.
 		 */
 	case PRIV_IPC_READ:
 	case PRIV_IPC_WRITE:
 	case PRIV_IPC_ADMIN:
 	case PRIV_IPC_MSGSIZE:
 	case PRIV_MQ_ADMIN:
 
 		/*
 		 * Jail operations within a jail work on child jails.
 		 */
 	case PRIV_JAIL_ATTACH:
 	case PRIV_JAIL_SET:
 	case PRIV_JAIL_REMOVE:
 
 		/*
 		 * Jail implements its own inter-process limits, so allow
 		 * root processes in jail to change scheduling on other
 		 * processes in the same jail.  Likewise for signalling.
 		 */
 	case PRIV_SCHED_DIFFCRED:
 	case PRIV_SCHED_CPUSET:
+	case PRIV_SCHED_DIFFJAIL:
 	case PRIV_SIGNAL_DIFFCRED:
 	case PRIV_SIGNAL_SUGID:
+	case PRIV_SIGNAL_DIFFJAIL:
 
 		/*
 		 * Allow jailed processes to write to sysctls marked as jail
 		 * writable.
 		 */
 	case PRIV_SYSCTL_WRITEJAIL:
 
 		/*
 		 * Allow root in jail to manage a variety of quota
 		 * properties.  These should likely be conditional on a
 		 * configuration option.
 		 */
 	case PRIV_VFS_GETQUOTA:
 	case PRIV_VFS_SETQUOTA:
 
 		/*
 		 * Since Jail relies on chroot() to implement file system
 		 * protections, grant many VFS privileges to root in jail.
 		 * Be careful to exclude mount-related and NFS-related
 		 * privileges.
 		 */
 	case PRIV_VFS_READ:
 	case PRIV_VFS_WRITE:
 	case PRIV_VFS_ADMIN:
 	case PRIV_VFS_EXEC:
 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
 	case PRIV_VFS_CHFLAGS_DEV:
 	case PRIV_VFS_CHOWN:
 	case PRIV_VFS_CHROOT:
 	case PRIV_VFS_RETAINSUGID:
 	case PRIV_VFS_FCHROOT:
 	case PRIV_VFS_LINK:
 	case PRIV_VFS_SETGID:
 	case PRIV_VFS_STAT:
 	case PRIV_VFS_STICKYFILE:
 
 		/*
 		 * As in the non-jail case, non-root users are expected to be
 		 * able to read kernel/physical memory (provided /dev/[k]mem
 		 * exists in the jail and they have permission to access it).
 		 */
 	case PRIV_KMEM_READ:
 		/* Every case label since PRIV_KTRACE lands here. */
 		return (0);
 
 		/*
 		 * Depending on the global setting, allow privilege of
 		 * setting system flags.
 		 */
 	case PRIV_VFS_SYSFLAGS:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Depending on the global setting, allow privilege of
 		 * mounting/unmounting file systems.
 		 */
 	case PRIV_VFS_MOUNT:
 	case PRIV_VFS_UNMOUNT:
 	case PRIV_VFS_MOUNT_NONUSER:
 	case PRIV_VFS_MOUNT_OWNER:
 		/*
 		 * pr_allow and pr_enforce_statfs are sampled together under
 		 * the prison lock: mounting needs PR_ALLOW_MOUNT and an
 		 * enforce_statfs level below 2.
 		 */
 		pr = cred->cr_prison;
 		prison_lock(pr);
 		if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
 			error = 0;
 		else
 			error = EPERM;
 		prison_unlock(pr);
 		return (error);
 
 		/*
 		 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
 		 * policy.  priv_check_cred will not specifically allow it, and
 		 * we may want a MAC policy to allow it.
 		 */
 	case PRIV_VFS_READ_DIR:
 		return (0);
 
 		/*
 		 * Conditionally allow privileged process in the jail to
 		 * manipulate filesystem extended attributes in the system
 		 * namespace.
 		 */
 	case PRIV_VFS_EXTATTR_SYSTEM:
 		if ((cred->cr_prison->pr_allow & PR_ALLOW_EXTATTR) != 0)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Conditionally allow locking (unlocking) physical pages
 		 * in memory.
 		 */
 	case PRIV_VM_MLOCK:
 	case PRIV_VM_MUNLOCK:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Conditionally allow jailed root to bind reserved ports.
 		 */
 	case PRIV_NETINET_RESERVEDPORT:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Allow jailed root to reuse in-use ports.
 		 */
 	case PRIV_NETINET_REUSEPORT:
 		return (0);
 
 		/*
 		 * Allow jailed root to set certain IPv4/6 (option) headers.
 		 */
 	case PRIV_NETINET_SETHDROPTS:
 		return (0);
 
 		/*
 		 * Conditionally allow creating raw sockets in jail.
 		 */
 	case PRIV_NETINET_RAW:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Since jail implements its own visibility limits on netstat
 		 * sysctls, allow getcred.  This allows identd to work in
 		 * jail.
 		 */
 	case PRIV_NETINET_GETCRED:
 		return (0);
 
 		/*
 		 * Allow jailed root to set loginclass.
 		 */
 	case PRIV_PROC_SETLOGINCLASS:
 		return (0);
 
 		/*
 		 * Do not allow a process inside a jail to read the kernel
 		 * message buffer unless explicitly permitted.
 		 */
 	case PRIV_MSGBUF:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
 			return (0);
 		return (EPERM);
 
 		/*
 		 * Conditionally allow privileged process in the jail adjust
 		 * machine time.  Either PR_ALLOW_ADJTIME or PR_ALLOW_SETTIME
 		 * is sufficient.
 		 */
 	case PRIV_ADJTIME:
 	case PRIV_NTP_ADJTIME:
 		if (cred->cr_prison->pr_allow &
 		    (PR_ALLOW_ADJTIME | PR_ALLOW_SETTIME)) {
 			return (0);
 		}
 		return (EPERM);
 
 		/*
 		 * Conditionally allow privileged process in the jail set
 		 * machine time.
 		 */
 	case PRIV_SETTIMEOFDAY:
 	case PRIV_CLOCK_SETTIME:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_SETTIME)
 			return (0);
 		else
 			return (EPERM);
 
 		/*
 		 * Conditionally allow privileged process in the jail to modify
 		 * the routing table.
 		 */
 	case PRIV_NET_ROUTE:
 		if (cred->cr_prison->pr_allow & PR_ALLOW_ROUTING)
 			return (0);
 		else
 			return (EPERM);
 
 	default:
 		/*
 		 * In all remaining cases, deny the privilege request.  This
 		 * includes almost all network privileges, many system
 		 * configuration privileges.
 		 */
 		return (EPERM);
 	}
 }
 
 /*
  * Return the part of pr2's name that is relative to pr1, or the whole name
  * if pr2 is not a descendant of pr1.  A prison asking about itself gets "0".
  * The returned pointer refers into pr2->pr_name (or to a string constant);
  * nothing is allocated.
  */
 char *
 prison_name(struct prison *pr1, struct prison *pr2)
 {
 	struct prison *up;
 	char *relname;
 
 	/* Jails see themselves as "0" (if they see themselves at all). */
 	if (pr1 == pr2)
 		return ("0");
 
 	relname = pr2->pr_name;
 	if (!prison_ischild(pr1, pr2))
 		return (relname);
 
 	/*
 	 * pr1 isn't locked (and allprison_lock may not be either), so the
 	 * length of its name can't be relied upon.  Its depth below prison0
 	 * can, so skip one dot-separated component of pr2's name for each
 	 * level between pr1 and prison0.
 	 */
 	up = pr1;
 	while (up != &prison0) {
 		relname = strchr(relname, '.') + 1;
 		up = up->pr_parent;
 	}
 	return (relname);
 }
 
 /*
  * Return the part of pr2's path that is relative to pr1's path, or the whole
  * path if pr1's path is "/" or is not a component-wise prefix of pr2's.
  * When both paths are equal, "/" is returned.
  */
 static char *
 prison_path(struct prison *pr1, struct prison *pr2)
 {
 	char *base, *full;
 	int baselen;
 
 	base = pr1->pr_path;
 	full = pr2->pr_path;
 	if (strcmp(base, "/") == 0)
 		return (full);
 	baselen = strlen(base);
 	if (strncmp(base, full, baselen) != 0)
 		return (full);
 
 	/* The prefix matched; it only counts if it ends on a boundary. */
 	switch (full[baselen]) {
 	case '\0':
 		return ("/");
 	case '/':
 		return (full + baselen);
 	default:
 		return (full);
 	}
 }
 
 /*
  * Jail-related sysctls.
  *
  * This declares the "jail" node under the _security parent; the sysctls
  * below that name _security_jail as their parent hang off it.
  */
 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Jails");
 
 #if defined(INET) || defined(INET6)
 /*
  * Copy a prison's address array for one family into memory that will then be
  * SYSCTL_OUT-ed.  sysctl_jail_list() helper.
  *
  * The prison mutex must be held on entry and is held again on return.  If
  * the buffer at *out holds fewer than the prison's address count (*len), the
  * mutex is dropped while the buffer is grown with M_WAITOK, *len is raised to
  * the new capacity, and the check is repeated since the address list may
  * have changed meanwhile.  *len is never lowered, and nothing is copied when
  * the prison has no address list for the family.
  */
 static void
 prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len)
 {
 	const struct prison_ip *addrs;
 	const size_t elemsz = pr_families[af].size;
 
 	for (;;) {
 		mtx_assert(&pr->pr_mtx, MA_OWNED);
 		addrs = pr->pr_addrs[af];
 		if (addrs == NULL)
 			return;
 		if (*len < addrs->ips) {
 			/* Too small: grow unlocked, then look again. */
 			*len = addrs->ips;
 			mtx_unlock(&pr->pr_mtx);
 			*out = realloc(*out, *len * elemsz, M_TEMP, M_WAITOK);
 			mtx_lock(&pr->pr_mtx);
 			continue;
 		}
 		bcopy(addrs->pr_ip, *out, addrs->ips * elemsz);
 		return;
 	}
 }
 #endif
 
 /*
  * Handler for the security.jail.list sysctl.
  *
  * For every descendant of the caller's prison, emit one struct xprison
  * followed by that prison's IPv4 and then IPv6 addresses (when the respective
  * family is compiled in).  allprison_lock is held shared for the whole walk,
  * and each prison's mutex is held while its data is snapshotted.
  *
  * Returns 0 on success or the first error reported by SYSCTL_OUT.
  */
 static int
 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
 {
 	struct xprison *xp;
 	struct prison *pr, *cpr;
 #ifdef INET
 	struct in_addr *ip4 = NULL;
 	int ip4s = 0;		/* Capacity of ip4[]; only ever grows. */
 	int ip4n;		/* Addresses held by the current prison. */
 #endif
 #ifdef INET6
 	struct in6_addr *ip6 = NULL;
 	int ip6s = 0;		/* Capacity of ip6[]; only ever grows. */
 	int ip6n;		/* Addresses held by the current prison. */
 #endif
 	int descend, error;
 
 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
 	pr = req->td->td_ucred->cr_prison;
 	error = 0;
 	sx_slock(&allprison_lock);
 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
 		mtx_lock(&cpr->pr_mtx);
 		/*
 		 * prison_ip_copyout() leaves the buffer capacity, not this
 		 * prison's address count, in its length argument; the
 		 * capacity is whatever the largest prison seen so far needed.
 		 * Reporting it would make a prison with fewer addresses
 		 * appear to own stale addresses left in the buffer by an
 		 * earlier prison.  Take the real count straight from the
 		 * prison instead.  The mutex is held continuously from the
 		 * copy to the read, so count and copied data agree even if a
 		 * later copyout call drops the mutex to grow its own buffer.
 		 */
 #ifdef INET
 		prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s);
 		ip4n = (cpr->pr_addrs[PR_INET] != NULL) ?
 		    cpr->pr_addrs[PR_INET]->ips : 0;
 #endif
 #ifdef INET6
 		prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s);
 		ip6n = (cpr->pr_addrs[PR_INET6] != NULL) ?
 		    cpr->pr_addrs[PR_INET6]->ips : 0;
 #endif
 		bzero(xp, sizeof(*xp));
 		xp->pr_version = XPRISON_VERSION;
 		xp->pr_id = cpr->pr_id;
 		xp->pr_state = cpr->pr_state;
 		/* Path and name are reported relative to the caller's jail. */
 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
 		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
 #ifdef INET
 		xp->pr_ip4s = ip4n;
 #endif
 #ifdef INET6
 		xp->pr_ip6s = ip6n;
 #endif
 		mtx_unlock(&cpr->pr_mtx);
 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
 		if (error)
 			break;
 #ifdef INET
 		if (xp->pr_ip4s > 0) {
 			error = SYSCTL_OUT(req, ip4,
 			    xp->pr_ip4s * sizeof(struct in_addr));
 			if (error)
 				break;
 		}
 #endif
 #ifdef INET6
 		if (xp->pr_ip6s > 0) {
 			error = SYSCTL_OUT(req, ip6,
 			    xp->pr_ip6s * sizeof(struct in6_addr));
 			if (error)
 				break;
 		}
 #endif
 	}
 	sx_sunlock(&allprison_lock);
 	free(xp, M_TEMP);
 #ifdef INET
 	free(ip4, M_TEMP);
 #endif
 #ifdef INET6
 	free(ip6, M_TEMP);
 #endif
 	return (error);
 }
 
 /*
  * "list" under _security_jail: read-only, structure-typed, served by
  * sysctl_jail_list().
  */
 SYSCTL_OID(_security_jail, OID_AUTO, list,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_jail_list, "S", "List of active jails");
 
 /*
  * Sysctl handler reporting, as an int, whether the requesting thread's
  * credential is jailed.
  */
 static int
 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
 {
 	int injail;
 
 	injail = jailed(req->td->td_ucred);
 	return (SYSCTL_OUT(req, &injail, sizeof(injail)));
 }
 
 /*
  * "jailed" under _security_jail: read-only int, served by
  * sysctl_jail_jailed().
  */
 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_jail_jailed, "I", "Process in jail?");
 
 /*
  * Sysctl handler reporting, as an int, whether the requesting thread is in a
  * jail that owns its own vnet.  Always 0 on kernels built without VIMAGE.
  */
 static int
 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
 {
 	int havevnet;
 
 	havevnet = 0;
 #ifdef VIMAGE
 	if (jailed(req->td->td_ucred) &&
 	    prison_owns_vnet(req->td->td_ucred->cr_prison))
 		havevnet = 1;
 #endif
 	return (SYSCTL_OUT(req, &havevnet, sizeof(havevnet)));
 }
 
 /*
  * "vnet" under _security_jail: read-only int, served by sysctl_jail_vnet().
  */
 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     sysctl_jail_vnet, "I", "Jail owns vnet?");
 
 #if defined(INET) || defined(INET6)
 /*
  * Read-write unsigned knob backed directly by the jail_max_af_ips variable;
  * only present when at least one IP family is compiled in.
  */
 SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
     &jail_max_af_ips, 0,
     "Number of IP addresses a jail may have at most per address family (deprecated)");
 #endif
 
 /*
  * Default parameters for jail(2) compatibility.  For historical reasons,
  * the sysctl names have varying similarity to the parameter names.  Prisons
  * just see their own parameters, and can't change them.
  */
 static int
 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
 {
 	int error, i;
 
 	/* Get the current flag value, and convert it to a boolean. */
 	if (req->td->td_ucred->cr_prison == &prison0) {
 		mtx_lock(&prison0.pr_mtx);
 		i = (jail_default_allow & arg2) != 0;
 		mtx_unlock(&prison0.pr_mtx);
 	} else
 		i = prison_allow(req->td->td_ucred, arg2);
 
 	if (arg1 != NULL)
 		i = !i;
 	error = sysctl_handle_int(oidp, &i, 0, req);
 	if (error || !req->newptr)
 		return (error);
 	i = i ? arg2 : 0;
 	if (arg1 != NULL)
 		i ^= arg2;
 	/*
 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
 	 * for writing.
 	 */
 	mtx_lock(&prison0.pr_mtx);
 	jail_default_allow = (jail_default_allow & ~arg2) | i;
 	mtx_unlock(&prison0.pr_mtx);
 	return (0);
 }
 
 /*
  * Global permission knobs, all served by sysctl_jail_default_allow().  The
  * second handler argument is the PR_ALLOW_* bit a knob controls.  The first
  * is NULL except for socket_unixiproute_only, whose non-NULL value makes the
  * handler invert the boolean (the knob reads 1 when PR_ALLOW_SOCKET_AF is
  * clear).
  */
 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
     "Processes in jail can set their hostnames (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
     "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
     "Processes in jail can use System V IPC primitives (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
     "Prison root can create raw sockets (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
     "Processes in jail can alter system file flags (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
     "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
 SYSCTL_PROC(_security_jail, OID_AUTO, mlock_allowed,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, PR_ALLOW_MLOCK, sysctl_jail_default_allow, "I",
     "Processes in jail can lock/unlock physical pages in memory");
 
 /*
  * Sysctl handler for integer defaults that also exist as a per-prison field.
  * arg1 points at the global default; arg2 is the byte offset of the matching
  * int inside struct prison.  prison0 reads the global default, any other
  * prison reads its own field.  A write stores the new value in the global
  * default.
  */
 static int
 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
 {
 	struct prison *pr;
 	int error, level;
 
 	pr = req->td->td_ucred->cr_prison;
 	if (pr == &prison0)
 		level = *(int *)arg1;
 	else
 		level = *(int *)((char *)pr + arg2);
 
 	error = sysctl_handle_int(oidp, &level, 0, req);
 	if (error || !req->newptr)
 		return (error);
 
 	*(int *)arg1 = level;
 	return (0);
 }
 
 /*
  * Integer defaults served by sysctl_jail_default_level(): each entry passes
  * a pointer to the global default and the offset of the corresponding field
  * in struct prison.  enforce_statfs is read-write, devfs_ruleset read-only.
  */
 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
     sysctl_jail_default_level, "I",
     "Processes in jail cannot see all mounted file systems (deprecated)");
 
 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
     sysctl_jail_default_level, "I",
     "Ruleset for the devfs filesystem in jail (deprecated)");
 
 /* Parent node for the child-jail limit and count sysctls. */
 SYSCTL_NODE(_security_jail, OID_AUTO, children, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Limits and stats of child jails");
 
 /*
  * Sysctl handler that reports a field of the requesting thread's prison.
  * arg2 is the byte offset of the field inside struct prison.  Only integer
  * typed OIDs produce output; any other type returns 0 without copying
  * anything out.
  */
 static int
 sysctl_jail_children(SYSCTL_HANDLER_ARGS)
 {
 	struct prison *pr;
 	int value;
 
 	pr = req->td->td_ucred->cr_prison;
 
 	if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_INT) {
 		value = *(int *)((char *)pr + arg2);
 		return (SYSCTL_OUT(req, &value, sizeof(value)));
 	}
 
 	return (0);
 }
 
 /*
  * Read-only views of the caller's prison, served by sysctl_jail_children():
  * "max" reads pr_childmax and "cur" reads pr_childcount.
  */
 SYSCTL_PROC(_security_jail_children, OID_AUTO, max,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, offsetof(struct prison, pr_childmax), sysctl_jail_children,
     "I", "Maximum number of child jails");
 SYSCTL_PROC(_security_jail_children, OID_AUTO, cur,
     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, offsetof(struct prison, pr_childcount), sysctl_jail_children,
     "I", "Current number of child jails");
 
 /*
  * Nodes to describe jail parameters.  The maximum length of a string
  * parameter is returned as the string value itself; the other parameters
  * exist merely to make themselves and their types known.  The values are
  * produced by sysctl_jail_param().
  */
 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Jail parameters");
 
 /*
  * Sysctl handler for the jail parameter description nodes.  What is
  * returned depends only on the OID's type:
  * - long/ulong: a zero long (a zero int instead when SCTL_MASK32 is defined
  *   and set in the request flags);
  * - int/uint: a zero int;
  * - string: arg2 formatted as a decimal string;
  * - struct: arg2 as a size_t.
  * Any other type returns 0 without producing output.
  */
 int
 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
 {
 	char lenstr[12];
 	size_t structsz;
 	long zero_l;
 	int zero_i;
 
 	switch (oidp->oid_kind & CTLTYPE) {
 	case CTLTYPE_LONG:
 	case CTLTYPE_ULONG:
 		zero_l = 0;
 #ifdef SCTL_MASK32
 		if ((req->flags & SCTL_MASK32) == 0)
 #endif
 		{
 			return (SYSCTL_OUT(req, &zero_l, sizeof(zero_l)));
 		}
 		/* FALLTHROUGH: SCTL_MASK32 requests get an int instead. */
 	case CTLTYPE_INT:
 	case CTLTYPE_UINT:
 		zero_i = 0;
 		return (SYSCTL_OUT(req, &zero_i, sizeof(zero_i)));
 	case CTLTYPE_STRING:
 		snprintf(lenstr, sizeof(lenstr), "%jd", (intmax_t)arg2);
 		return (sysctl_handle_string(oidp, lenstr, sizeof(lenstr),
 		    req));
 	case CTLTYPE_STRUCT:
 		structsz = (size_t)arg2;
 		return (SYSCTL_OUT(req, &structsz, sizeof(structsz)));
 	}
 	return (0);
 }
 
 /*
  * Declarations of the jail parameters.
  *
  * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
  * jail creation time but cannot be changed in an existing jail.
  */
 
 /* Top-level parameters. */
 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Jail secure level");
 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
     "Jail value for kern.osreldate and uname -K");
 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
     "Jail value for kern.osrelease and uname -r");
 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Jail cannot see all mounted file systems");
 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Ruleset for in-jail devfs mounts");
 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail persistence");
 #ifdef VIMAGE
 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
     "E,jailsys", "Virtual network stack");
 #endif
 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
     "B", "Jail is in the process of shutting down");
 
 /* children.*: child jail accounting. */
 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
     "I", "Current number of child jails");
 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
     "I", "Maximum number of child jails");
 
 /* host.*: host identity as seen from inside the jail. */
 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
     "Jail hostname");
 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
     "Jail NIS domainname");
 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
     "Jail host UUID");
 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
     "LU", "Jail host ID");
 
 /* cpuset.*: the jail's cpuset. */
 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
 
 /* ip4.* and ip6.*: only present when the address family is compiled in. */
 #ifdef INET
 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
     "Jail IPv4 address virtualization");
 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
     "S,in_addr,a", "Jail IPv4 addresses");
 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Do (not) use IPv4 source address selection rather than the "
     "primary jail IPv4 address.");
 #endif
 #ifdef INET6
 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
     "Jail IPv6 address virtualization");
 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
     "S,in6_addr,a", "Jail IPv6 addresses");
 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Do (not) use IPv6 source address selection rather than the "
     "primary jail IPv6 address.");
 #endif
 
 /*
  * allow.*: statically declared permission flags.  Further allow.* entries
  * are created at run time by prison_add_allow().
  */
 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may set hostname");
 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may use SYSV IPC");
 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may create raw sockets");
 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may alter system file flags");
 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may set file quotas");
 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may lock (unlock) physical pages in memory");
 SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may bind sockets to reserved ports");
 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may read the kernel message buffer");
 SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Unprivileged processes may use process debugging facilities");
+SYSCTL_JAIL_PARAM(_allow, unprivileged_parent_tampering,
+    CTLTYPE_INT | CTLFLAG_RW, "B",
+    "Unprivileged parent jail processes may tamper with same-uid processes"
+    " (signal/debug/cpuset)");
 SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Processes in jail with uid 0 have privilege");
 #ifdef VIMAGE
 SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Mountd/nfsd may run in the jail");
 #endif
 SYSCTL_JAIL_PARAM(_allow, extattr, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may set system-level filesystem extended attributes");
 SYSCTL_JAIL_PARAM(_allow, adjtime, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may adjust system time");
 SYSCTL_JAIL_PARAM(_allow, settime, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may set system time");
 SYSCTL_JAIL_PARAM(_allow, routing, CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may modify routing table");
 
 /* allow.mount and allow.mount.*: mount permissions. */
 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
     "B", "Jail may mount/unmount jail-friendly file systems in general");
 
 /*
  * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
  * its associated bit in the pr_allow bitmask, or zero if the parameter was
  * not created.
  *
  * prefix       - optional sub-node name; NULL places the parameter directly
  *                under allow.
  * name         - parameter name.
  * prefix_descr - description used for the prefix node.
  * descr        - description used for the parameter itself.
  *
  * If a parameter of the same name is already registered, its existing bit is
  * returned and nothing new is created.
  */
 unsigned
 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
     const char *descr)
 {
 	struct bool_flags *bf;
 	struct sysctl_oid *parent;
 	char *allow_name, *allow_noname, *allowed;
 #ifndef NO_SYSCTL_DESCR
 	char *descr_deprecated;
 #endif
 	u_int allow_flag;
 
 	/*
 	 * Build both spellings of the parameter: "allow[.prefix].name" and
 	 * its negated "no" form.  These strings are kept for as long as the
 	 * parameter exists, hence M_PRISON.
 	 */
 	if (prefix
 	    ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
 		< 0 ||
 	      asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
 		< 0
 	    : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
 	      asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
 		free(allow_name, M_PRISON);
 		return 0;
 	}
 
 	/*
 	 * See if this parameter has already been added, i.e. a module was
 	 * previously loaded/unloaded.
 	 */
 	mtx_lock(&prison0.pr_mtx);
 	for (bf = pr_flag_allow;
 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 		atomic_load_int(&bf->flag) != 0;
 	     bf++) {
 		if (strcmp(bf->name, allow_name) == 0) {
 			allow_flag = bf->flag;
 			goto no_add;
 		}
 	}
 
 	/*
 	 * Find a free bit in pr_allow_all, failing if there are none
 	 * (which shouldn't happen as long as we keep track of how many
 	 * potential dynamic flags exist).  Running out of bits shifts
 	 * allow_flag to zero, which is then also the value returned.
 	 */
 	for (allow_flag = 1;; allow_flag <<= 1) {
 		if (allow_flag == 0)
 			goto no_add;
 		if ((pr_allow_all & allow_flag) == 0)
 			break;
 	}
 
 	/* Note the parameter in the next open slot in pr_flag_allow. */
 	for (bf = pr_flag_allow; ; bf++) {
 		if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
 			/* This should never happen, but is not fatal. */
 			allow_flag = 0;
 			goto no_add;
 		}
 		if (atomic_load_int(&bf->flag) == 0)
 			break;
 	}
 	bf->name = allow_name;
 	bf->noname = allow_noname;
 	pr_allow_all |= allow_flag;
 	/*
 	 * prison0 always has permission for the new parameter.
 	 * Other jails must have it granted to them.
 	 */
 	prison0.pr_allow |= allow_flag;
 	/* The flag indicates a valid entry, so make sure it is set last. */
 	atomic_store_rel_int(&bf->flag, allow_flag);
 	mtx_unlock(&prison0.pr_mtx);
 
 	/*
 	 * Create sysctls for the parameter, and the back-compat global
 	 * permission.
 	 */
 	parent = prefix
 	    ? SYSCTL_ADD_NODE(NULL,
 		  SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
 		  OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
 	    : &sysctl___security_jail_param_allow;
 	(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
 	    name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 	    NULL, 0, sysctl_jail_param, "B", descr);
 	/*
 	 * The global knob is named "[prefix_]name_allowed".  Its name and
 	 * description strings are only needed while it is being registered,
 	 * hence M_TEMP.  Failing to build the name skips the knob but does
 	 * not undo the parameter itself.
 	 */
 	if ((prefix
 	     ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
 	     : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
 #ifndef NO_SYSCTL_DESCR
 		(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
 		    descr);
 #endif
 		(void)SYSCTL_ADD_PROC(NULL,
 		    SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
 		    sysctl_jail_default_allow, "I", descr_deprecated);
 #ifndef NO_SYSCTL_DESCR
 		free(descr_deprecated, M_TEMP);
 #endif
 		free(allowed, M_TEMP);
 	}
 	return allow_flag;
 
 	/*
 	 * Common exit for "already registered" and "no room": drop the lock
 	 * and release the name strings, which were not stored anywhere.
 	 */
  no_add:
 	mtx_unlock(&prison0.pr_mtx);
 	free(allow_name, M_PRISON);
 	free(allow_noname, M_PRISON);
 	return allow_flag;
 }
 
 /*
  * Registration hook for jail-aware filesystems.  Each one is given an
  * allow.mount.<vfc_name> parameter through prison_add_allow(), and the
  * resulting permission bit is stored in vfc_prison_flag so that it can be
  * checked when a jailed user attempts to mount.
  */
 void
 prison_add_vfs(struct vfsconf *vfsp)
 {
 #ifndef NO_SYSCTL_DESCR
 	char *descr;
 
 	/* The description is only needed while the sysctls are created. */
 	(void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
 	    vfsp->vfc_name);
 	vfsp->vfc_prison_flag =
 	    prison_add_allow("mount", vfsp->vfc_name, NULL, descr);
 	free(descr, M_TEMP);
 #else
 
 	/* Built without sysctl descriptions: nothing to format. */
 	vfsp->vfc_prison_flag =
 	    prison_add_allow("mount", vfsp->vfc_name, NULL, NULL);
 #endif
 }
 
 #ifdef RACCT
 void
 prison_racct_foreach(void (*callback)(struct racct *racct,
     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
     void *arg2, void *arg3)
 {
 	struct prison_racct *prr;
 
 	ASSERT_RACCT_ENABLED();
 
 	sx_slock(&allprison_lock);
 	if (pre != NULL)
 		(pre)();
 	LIST_FOREACH(prr, &allprison_racct, prr_next)
 		(callback)(prr->prr_racct, arg2, arg3);
 	if (post != NULL)
 		(post)();
 	sx_sunlock(&allprison_lock);
 }
 
 /*
  * Look up the prison_racct called 'name', creating it when it does not
  * exist yet.  The caller must hold allprison_lock exclusively.
  *
  * Returns the entry with a reference taken on behalf of the caller, or NULL
  * when 'name' is empty or does not fit in MAXHOSTNAMELEN bytes including
  * its terminator.
  */
 static struct prison_racct *
 prison_racct_find_locked(const char *name)
 {
 	struct prison_racct *entry;
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
 		return (NULL);
 
 	/* An existing entry only needs one more reference. */
 	LIST_FOREACH(entry, &allprison_racct, prr_next) {
 		if (strcmp(name, entry->prr_name) == 0) {
 			prison_racct_hold(entry);
 			return (entry);
 		}
 	}
 
 	/*
 	 * No match: allocate a new entry holding the caller's reference.
 	 * The length check above bounds the strcpy().
 	 */
 	entry = malloc(sizeof(*entry), M_PRISON_RACCT, M_ZERO | M_WAITOK);
 	racct_create(&entry->prr_racct);
 	strcpy(entry->prr_name, name);
 	refcount_init(&entry->prr_refcount, 1);
 	LIST_INSERT_HEAD(&allprison_racct, entry, prr_next);
 
 	return (entry);
 }
 
 /*
  * Find or create the prison_racct called 'name', taking allprison_lock
  * exclusively around the lookup.  Returns whatever
  * prison_racct_find_locked() returns.
  */
 struct prison_racct *
 prison_racct_find(const char *name)
 {
 	struct prison_racct *found;
 
 	ASSERT_RACCT_ENABLED();
 
 	sx_xlock(&allprison_lock);
 	found = prison_racct_find_locked(name);
 	sx_xunlock(&allprison_lock);
 
 	return (found);
 }
 
 /*
  * Take an additional reference on 'prr'.
  */
 void
 prison_racct_hold(struct prison_racct *prr)
 {
 	ASSERT_RACCT_ENABLED();
 	refcount_acquire(&prr->prr_refcount);
 }
 
 /*
  * Drop one reference on 'prr'; when it was the last one, destroy its racct,
  * unlink it from allprison_racct and free it.  The caller must hold
  * allprison_lock exclusively.
  */
 static void
 prison_racct_free_locked(struct prison_racct *prr)
 {
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	/* Other references remain: nothing more to do. */
 	if (!refcount_release(&prr->prr_refcount))
 		return;
 
 	racct_destroy(&prr->prr_racct);
 	LIST_REMOVE(prr, prr_next);
 	free(prr, M_PRISON_RACCT);
 }
 
 /*
  * Drop one reference on 'prr'.  allprison_lock must not be held by the
  * caller; it is only taken (exclusively) when this may be the last
  * reference, in which case prison_racct_free_locked() does the release.
  */
 void
 prison_racct_free(struct prison_racct *prr)
 {
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_UNLOCKED);
 
 	if (!refcount_release_if_not_last(&prr->prr_refcount)) {
 		/* Possibly the final reference: finish under the lock. */
 		sx_xlock(&allprison_lock);
 		prison_racct_free_locked(prr);
 		sx_xunlock(&allprison_lock);
 	}
 }
 
 static void
 prison_racct_attach(struct prison *pr)
 {
 	struct prison_racct *prr;
 
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_XLOCKED);
 
 	prr = prison_racct_find_locked(pr->pr_name);
 	KASSERT(prr != NULL, ("cannot find prison_racct"));
 
 	pr->pr_prison_racct = prr;
 }
 
 /*
  * Handle jail renaming.  From the racct point of view, renaming means
  * moving from one prison_racct to another.
  *
  * Does nothing when the prison's name still matches the name of its
  * current prison_racct.  Otherwise the prison is attached to the
  * prison_racct for the new name, accumulated usage is moved over, and the
  * reference on the old prison_racct is dropped.
  */
 static void
 prison_racct_modify(struct prison *pr)
 {
 #ifdef RCTL
 	struct proc *p;
 	struct ucred *cred;
 #endif
 	struct prison_racct *oldprr;
 
 	ASSERT_RACCT_ENABLED();
 
 	/*
 	 * allproc_lock is taken (shared) before allprison_lock (exclusive);
 	 * it is held across the process walk in the RCTL section below.
 	 */
 	sx_slock(&allproc_lock);
 	sx_xlock(&allprison_lock);
 
 	/* Name unchanged: nothing to move. */
 	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
 		sx_xunlock(&allprison_lock);
 		sx_sunlock(&allproc_lock);
 		return;
 	}
 
 	/* Remember the old entry; its reference is dropped at the end. */
 	oldprr = pr->pr_prison_racct;
 	pr->pr_prison_racct = NULL;
 
 	prison_racct_attach(pr);
 
 	/*
 	 * Move resource utilisation records.
 	 */
 	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
 
 #ifdef RCTL
 	/*
 	 * Force rctl to reattach rules to processes.
 	 */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/*
 		 * Take a reference on the credentials under the process
 		 * lock, then call into rctl with that lock released.
 		 */
 		PROC_LOCK(p);
 		cred = crhold(p->p_ucred);
 		PROC_UNLOCK(p);
 		rctl_proc_ucred_changed(p, cred);
 		crfree(cred);
 	}
 #endif
 
 	sx_sunlock(&allproc_lock);
 	prison_racct_free_locked(oldprr);
 	sx_xunlock(&allprison_lock);
 }
 
 /*
  * Release the prison's reference on its prison_racct, if it has one, and
  * clear the pointer.  allprison_lock must not be held by the caller.
  */
 static void
 prison_racct_detach(struct prison *pr)
 {
 	ASSERT_RACCT_ENABLED();
 	sx_assert(&allprison_lock, SA_UNLOCKED);
 
 	if (pr->pr_prison_racct != NULL) {
 		prison_racct_free(pr->pr_prison_racct);
 		pr->pr_prison_racct = NULL;
 	}
 }
 #endif /* RACCT */
 
 #ifdef DDB
 
 /*
  * Print the fields of one prison to the debugger console, one
  * "name = value" line per field.  Flag words are printed in hex followed by
  * the names of the bits that are set; IP addresses are listed one per line.
  */
 static void
 db_show_prison(struct prison *pr)
 {
 	struct bool_flags *bf;
 	struct jailsys_flags *jsf;
 #if defined(INET) || defined(INET6)
 	int ii;
 	struct prison_ip *pip;
 #endif
 	unsigned f;
 #ifdef INET
 	char ip4buf[INET_ADDRSTRLEN];
 #endif
 #ifdef INET6
 	char ip6buf[INET6_ADDRSTRLEN];
 #endif
 
 	db_printf("prison %p:\n", pr);
 	db_printf(" jid             = %d\n", pr->pr_id);
 	db_printf(" name            = %s\n", pr->pr_name);
 	db_printf(" parent          = %p\n", pr->pr_parent);
 	db_printf(" ref             = %d\n", pr->pr_ref);
 	db_printf(" uref            = %d\n", pr->pr_uref);
 	db_printf(" state           = %s\n",
 	    pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
 	    pr->pr_state == PRISON_STATE_DYING ? "dying" :
 	    "invalid");
 	db_printf(" path            = %s\n", pr->pr_path);
 	/* A prison without a cpuset is reported with id -1. */
 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
 	    ? pr->pr_cpuset->cs_id : -1);
 #ifdef VIMAGE
 	db_printf(" vnet            = %p\n", pr->pr_vnet);
 #endif
 	db_printf(" root            = %p\n", pr->pr_root);
 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
 	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
 	db_printf(" children.max    = %d\n", pr->pr_childmax);
 	db_printf(" children.cur    = %d\n", pr->pr_childcount);
 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
 	db_printf(" flags           = 0x%x", pr->pr_flags);
 	for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
 		if (pr->pr_flags & bf->flag)
 			db_printf(" %s", bf->name);
 	/*
 	 * Terminate the "flags" line; without this the first jailsys line
 	 * below would be appended to it.
 	 */
 	db_printf("\n");
 	/* Each jailsys parameter gets its own line. */
 	for (jsf = pr_flag_jailsys;
 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
 	     jsf++) {
 		f = pr->pr_flags & (jsf->disable | jsf->new);
 		db_printf(" %-16s= %s\n", jsf->name,
 		    (f != 0 && f == jsf->disable) ? "disable"
 		    : (f == jsf->new) ? "new"
 		    : "inherit");
 	}
 	db_printf(" allow           = 0x%x", pr->pr_allow);
 	/* A zero flag marks the end of the used pr_flag_allow entries. */
 	for (bf = pr_flag_allow;
 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
 		atomic_load_int(&bf->flag) != 0;
 	     bf++)
 		if (pr->pr_allow & bf->flag)
 			db_printf(" %s", bf->name);
 	db_printf("\n");
 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
 	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
 	db_printf(" host.domainname = %s\n", pr->pr_domainname);
 	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
 #ifdef INET
 	if ((pip = pr->pr_addrs[PR_INET]) != NULL) {
 		db_printf(" ip4s            = %d\n", pip->ips);
 		/* Only the first address carries the "ip4.addr" label. */
 		for (ii = 0; ii < pip->ips; ii++)
 			db_printf(" %s %s\n",
 			    ii == 0 ? "ip4.addr        =" : "                 ",
 			    inet_ntoa_r(
 			    *(const struct in_addr *)PR_IP(pip, PR_INET, ii),
 			    ip4buf));
 	}
 #endif
 #ifdef INET6
 	if ((pip = pr->pr_addrs[PR_INET6]) != NULL) {
 		db_printf(" ip6s            = %d\n", pip->ips);
 		/* Only the first address carries the "ip6.addr" label. */
 		for (ii = 0; ii < pip->ips; ii++)
 			db_printf(" %s %s\n",
 			    ii == 0 ? "ip6.addr        =" : "                 ",
 			    ip6_sprintf(ip6buf,
 			    (const struct in6_addr *)PR_IP(pip, PR_INET6, ii)));
 	}
 #endif
 }
 
 /*
  * "show prison" debugger command.  Without an argument, print prison0 and
  * then every prison on the allprison list.  With an argument, treat it as a
  * jail ID (0 meaning prison0) and, failing any match, as the address of a
  * struct prison.
  */
 DB_SHOW_COMMAND(prison, db_show_prison_command)
 {
 	struct prison *pr;
 
 	if (have_addr) {
 		if (addr == 0) {
 			pr = &prison0;
 		} else {
 			/* Look for a prison with the ID and with references. */
 			TAILQ_FOREACH(pr, &allprison, pr_list) {
 				if (pr->pr_id == addr && pr->pr_ref > 0)
 					break;
 			}
 			if (pr == NULL) {
 				/* Look again, without requiring a reference. */
 				TAILQ_FOREACH(pr, &allprison, pr_list) {
 					if (pr->pr_id == addr)
 						break;
 				}
 			}
 			if (pr == NULL) {
 				/* Assume address points to a valid prison. */
 				pr = (struct prison *)addr;
 			}
 		}
 		db_show_prison(pr);
 		return;
 	}
 
 	/*
 	 * Show all prisons in the list, and prison0 which is not
 	 * listed.  Stop as soon as the pager has been quit.
 	 */
 	db_show_prison(&prison0);
 	if (db_pager_quit)
 		return;
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		db_show_prison(pr);
 		if (db_pager_quit)
 			break;
 	}
 }
 
 #endif /* DDB */
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index bbb622547598..2cd5b7069023 100644
--- a/sys/kern/kern_prot.c
+++ b/sys/kern/kern_prot.c
@@ -1,3114 +1,3170 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
  *	The Regents of the University of California.
  * (c) UNIX System Laboratories, Inc.
  * Copyright (c) 2000-2001 Robert N. M. Watson.
  * All rights reserved.
  *
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * System calls related to processes and protection
  */
 
 #include <sys/cdefs.h>
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/abi_compat.h>
 #include <sys/acct.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/libkern.h>
 #include <sys/lock.h>
 #include <sys/loginclass.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/ptrace.h>
 #include <sys/refcount.h>
 #include <sys/sx.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #ifdef COMPAT_43
 #include <sys/sysent.h>
 #endif
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 #include <sys/racct.h>
 #include <sys/rctl.h>
 #include <sys/resourcevar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 
 #ifdef MAC
 #include <security/mac/mac_syscalls.h>
 #endif
 
 #include <vm/uma.h>
 
 #ifdef REGRESSION
 FEATURE(regression,
     "Kernel support for interfaces necessary for regression testing (SECURITY RISK!)");
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 /* File-local malloc type, described as "credentials". */
 static MALLOC_DEFINE(M_CRED, "cred", "credentials");
 
 /* Sysctl node "bsd", declared under the _security parent. */
 SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "BSD security policy");
 
 /* Forward declaration; the definition is elsewhere in this file. */
 static void crfree_final(struct ucred *cr);
 
 /*
  * Assert (via MPASS2) that a group count is not negative.
  */
 static inline void
 groups_check_positive_len(int ngrp)
 {
 	MPASS2(ngrp >= 0, "negative number of groups");
 }
 /*
  * Assert (via MPASS2) that a group count does not exceed ngroups_max.
  */
 static inline void
 groups_check_max_len(int ngrp)
 {
 	MPASS2(ngrp <= ngroups_max, "too many supplementary groups");
 }
 
 /*
  * Forward declarations of static helpers defined elsewhere in this file.
  */
 
 /* Group-array helpers. */
 static void groups_normalize(int *ngrp, gid_t *groups);
 static void crsetgroups_internal(struct ucred *cr, int ngrp,
     const gid_t *groups);
 
 /* Credential visibility checks between two credentials. */
 static int cr_canseeotheruids(struct ucred *u1, struct ucred *u2);
 static int cr_canseeothergids(struct ucred *u1, struct ucred *u2);
 static int cr_canseejailproc(struct ucred *u1, struct ucred *u2);
 
 #ifndef _SYS_SYSPROTO_H_
 struct getpid_args {
 	int	dummy;
 };
 #endif
 /*
  * getpid(2): return the calling process's ID.  With COMPAT_43, a process
  * flagged SV_AOUT additionally gets its parent's ID in the second return
  * slot.
  */
 /* ARGSUSED */
 int
 sys_getpid(struct thread *td, struct getpid_args *uap)
 {
 	struct proc *self;
 
 	self = td->td_proc;
 	td->td_retval[0] = self->p_pid;
 #ifdef COMPAT_43
 	if (SV_PROC_FLAG(self, SV_AOUT)) {
 		td->td_retval[1] = kern_getppid(td);
 	}
 #endif
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getppid_args {
 	int	dummy;
 };
 #endif
 /*
  * getppid(2): return the value computed by kern_getppid().
  */
 /* ARGSUSED */
 int
 sys_getppid(struct thread *td, struct getppid_args *uap)
 {
 	int parent;
 
 	parent = kern_getppid(td);
 	td->td_retval[0] = parent;
 	return (0);
 }
 
 /*
  * Return the p_oppid field of the calling thread's process.
  */
 int
 kern_getppid(struct thread *td)
 {
 	return (td->td_proc->p_oppid);
 }
 
 /*
  * getpgrp(2): return the caller's process group ID.  Unlike getpgid(2),
  * the POSIX form of this call takes no argument.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getpgrp_args {
 	int	dummy;
 };
 #endif
 int
 sys_getpgrp(struct thread *td, struct getpgrp_args *uap)
 {
 	struct proc *self;
 
 	self = td->td_proc;
 
 	/* p_pgrp is read under the process lock. */
 	PROC_LOCK(self);
 	td->td_retval[0] = self->p_pgrp->pg_id;
 	PROC_UNLOCK(self);
 
 	return (0);
 }
 
 /*
  * getpgid(2): return the process group ID of the process named by 'pid',
  * where a pid of 0 designates the caller.  Fails with ESRCH when no such
  * process is found, or with the error from p_cansee() when the caller may
  * not see it.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getpgid_args {
 	pid_t	pid;
 };
 #endif
 int
 sys_getpgid(struct thread *td, struct getpgid_args *uap)
 {
 	struct proc *target;
 	int rv;
 
 	if (uap->pid != 0) {
 		/*
 		 * The process found here is unlocked on every path below
 		 * without being locked in this function.
 		 */
 		target = pfind(uap->pid);
 		if (target == NULL)
 			return (ESRCH);
 		rv = p_cansee(td, target);
 		if (rv != 0) {
 			PROC_UNLOCK(target);
 			return (rv);
 		}
 	} else {
 		target = td->td_proc;
 		PROC_LOCK(target);
 	}
 
 	td->td_retval[0] = target->p_pgrp->pg_id;
 	PROC_UNLOCK(target);
 	return (0);
 }
 
 /*
  * getsid(2): syscall entry point; the work is done by kern_getsid().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getsid_args {
 	pid_t	pid;
 };
 #endif
 int
 sys_getsid(struct thread *td, struct getsid_args *uap)
 {
 	pid_t pid;
 
 	pid = uap->pid;
 	return (kern_getsid(td, pid));
 }
 
 /*
  * Store in td_retval[0] the session ID of the process named by 'pid',
  * where a pid of 0 designates the caller.  Returns ESRCH when no such
  * process is found, the error from p_cansee() when the caller may not see
  * it, and 0 otherwise.
  */
 int
 kern_getsid(struct thread *td, pid_t pid)
 {
 	struct proc *target;
 	int rv;
 
 	if (pid != 0) {
 		/*
 		 * The process found here is unlocked on every path below
 		 * without being locked in this function.
 		 */
 		target = pfind(pid);
 		if (target == NULL)
 			return (ESRCH);
 		rv = p_cansee(td, target);
 		if (rv != 0) {
 			PROC_UNLOCK(target);
 			return (rv);
 		}
 	} else {
 		target = td->td_proc;
 		PROC_LOCK(target);
 	}
 
 	td->td_retval[0] = target->p_session->s_sid;
 	PROC_UNLOCK(target);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getuid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getuid(struct thread *td, struct getuid_args *uap)
 {
 
 	td->td_retval[0] = td->td_ucred->cr_ruid;
 #if defined(COMPAT_43)
 	td->td_retval[1] = td->td_ucred->cr_uid;
 #endif
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct geteuid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_geteuid(struct thread *td, struct geteuid_args *uap)
 {
 
 	td->td_retval[0] = td->td_ucred->cr_uid;
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getgid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getgid(struct thread *td, struct getgid_args *uap)
 {
 
 	td->td_retval[0] = td->td_ucred->cr_rgid;
 #if defined(COMPAT_43)
 	td->td_retval[1] = td->td_ucred->cr_gid;
 #endif
 	return (0);
 }
 
 /*
  * Get effective group ID.  The "egid" is groups[0], and could be obtained
  * via getgroups.  This syscall exists because it is somewhat painful to do
  * correctly in a library function.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getegid_args {
         int     dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getegid(struct thread *td, struct getegid_args *uap)
 {
 
 	td->td_retval[0] = td->td_ucred->cr_gid;
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getgroups_args {
 	int	gidsetsize;
 	gid_t	*gidset;
 };
 #endif
 /*
  * getgroups(2): copy the caller's groups out to 'gidset' and return their
  * count.  The exported list is the effective GID followed by the
  * supplementary groups.
  *
  * A 'gidsetsize' of 0 only queries the count.  A non-zero 'gidsetsize'
  * smaller than the count fails with EINVAL.  On a copyout() failure the
  * error is returned, with the count still stored in td_retval[0].
  */
 int
 sys_getgroups(struct thread *td, struct getgroups_args *uap)
 {
 	struct ucred *cr;
 	gid_t *dst;
 	int total, rv;
 
 	cr = td->td_ucred;
 	rv = 0;
 
 	/*
 	 * cr_gid has been moved out of cr_groups, but we'll continue exporting
 	 * the egid as groups[0] for the time being until we audit userland for
 	 * any surprises.
 	 */
 	total = cr->cr_ngroups + 1;
 
 	if (uap->gidsetsize != 0) {
 		if (uap->gidsetsize < total)
 			return (EINVAL);
 
 		dst = uap->gidset;
 		/* Effective GID first, then the supplementary groups. */
 		rv = copyout(&cr->cr_gid, dst, sizeof(*dst));
 		if (rv == 0 && total > 1) {
 			rv = copyout(cr->cr_groups, dst + 1,
 			    (total - 1) * sizeof(*dst));
 		}
 	}
 
 	td->td_retval[0] = total;
 	return (rv);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setsid_args {
         int     dummy;
 };
 #endif
 /*
  * setsid(2): make the calling process the leader of a new session and of a
  * new process group whose ID is the process's PID.
  *
  * Fails with EPERM when the caller is already a process group leader or
  * when a process group with the caller's PID already exists.  On success
  * the PID is returned in td_retval[0].
  */
 /* ARGSUSED */
 int
 sys_setsid(struct thread *td, struct setsid_args *uap)
 {
 	struct pgrp *pgrp;
 	int error;
 	struct proc *p = td->td_proc;
 	struct pgrp *newpgrp;
 	struct session *newsess;
 
 	pgrp = NULL;
 
 	/* Allocate up front, before proctree_lock is taken. */
 	newpgrp = uma_zalloc(pgrp_zone, M_WAITOK);
 	newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
 
 again:
 	error = 0;
 	sx_xlock(&proctree_lock);
 
 	if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
 		/* A group returned by pgfind() is unlocked here. */
 		if (pgrp != NULL)
 			PGRP_UNLOCK(pgrp);
 		error = EPERM;
 	} else {
 		error = enterpgrp(p, p->p_pid, newpgrp, newsess);
 		/*
 		 * NOTE(review): the retry path re-acquires proctree_lock, so
 		 * enterpgrp() is presumably expected to have dropped it when
 		 * returning ERESTART -- confirm against enterpgrp().
 		 */
 		if (error == ERESTART)
 			goto again;
 		MPASS(error == 0);
 		td->td_retval[0] = p->p_pid;
 		/* Both objects were consumed; do not free them below. */
 		newpgrp = NULL;
 		newsess = NULL;
 	}
 
 	sx_xunlock(&proctree_lock);
 
 	/* On success these are called with NULL pointers. */
 	uma_zfree(pgrp_zone, newpgrp);
 	free(newsess, M_SESSION);
 
 	return (error);
 }
 
 /*
  * set process group (setpgid/old setpgrp)
  *
  * caller does setpgid(targpid, targpgid)
  *
  * pid must be caller or child of caller (ESRCH)
  * if a child
  *	pid must be in same session (EPERM)
  *	pid can't have done an exec (EACCES)
  * if pgid != pid
  * 	there must exist some pid in same session having pgid (EPERM)
  * pid must not be session leader (EPERM)
  *
  * A negative pgid is rejected with EINVAL; a pgid of 0 stands for the
  * target's own PID.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct setpgid_args {
 	int	pid;		/* target process id */
 	int	pgid;		/* target pgrp id */
 };
 #endif
 /* ARGSUSED */
 int
 sys_setpgid(struct thread *td, struct setpgid_args *uap)
 {
 	struct proc *curp = td->td_proc;
 	struct proc *targp;	/* target process */
 	struct pgrp *pgrp;	/* target pgrp */
 	int error;
 	struct pgrp *newpgrp;
 
 	if (uap->pgid < 0)
 		return (EINVAL);
 
 	/* Allocate up front, before proctree_lock is taken. */
 	newpgrp = uma_zalloc(pgrp_zone, M_WAITOK);
 
 again:
 	error = 0;
 
 	sx_xlock(&proctree_lock);
 	if (uap->pid != 0 && uap->pid != curp->p_pid) {
 		/*
 		 * Target is another process.  It is unlocked on every path
 		 * of this branch without being locked in this function.
 		 */
 		if ((targp = pfind(uap->pid)) == NULL) {
 			error = ESRCH;
 			goto done;
 		}
 		if (!inferior(targp)) {
 			PROC_UNLOCK(targp);
 			error = ESRCH;
 			goto done;
 		}
 		if ((error = p_cansee(td, targp))) {
 			PROC_UNLOCK(targp);
 			goto done;
 		}
 		if (targp->p_pgrp == NULL ||
 		    targp->p_session != curp->p_session) {
 			PROC_UNLOCK(targp);
 			error = EPERM;
 			goto done;
 		}
 		if (targp->p_flag & P_EXEC) {
 			PROC_UNLOCK(targp);
 			error = EACCES;
 			goto done;
 		}
 		PROC_UNLOCK(targp);
 	} else
 		targp = curp;
 	if (SESS_LEADER(targp)) {
 		error = EPERM;
 		goto done;
 	}
 	if (uap->pgid == 0)
 		uap->pgid = targp->p_pid;
 	if ((pgrp = pgfind(uap->pgid)) == NULL) {
 		/*
 		 * No such group yet: it may only be created when its ID is
 		 * the target's own PID.
 		 */
 		if (uap->pgid == targp->p_pid) {
 			error = enterpgrp(targp, uap->pgid, newpgrp,
 			    NULL);
 			/* newpgrp was consumed; do not free it below. */
 			if (error == 0)
 				newpgrp = NULL;
 		} else
 			error = EPERM;
 	} else {
 		/* Already a member of the requested group: nothing to do. */
 		if (pgrp == targp->p_pgrp) {
 			PGRP_UNLOCK(pgrp);
 			goto done;
 		}
 		if (pgrp->pg_id != targp->p_pid &&
 		    pgrp->pg_session != curp->p_session) {
 			PGRP_UNLOCK(pgrp);
 			error = EPERM;
 			goto done;
 		}
 		PGRP_UNLOCK(pgrp);
 		error = enterthispgrp(targp, pgrp);
 	}
 done:
 	KASSERT(error == 0 || newpgrp != NULL,
 	    ("setpgid failed and newpgrp is NULL"));
 	/*
 	 * NOTE(review): the retry path re-acquires proctree_lock without
 	 * releasing it here, so whichever callee returned ERESTART is
 	 * presumably expected to have dropped it -- confirm.
 	 */
 	if (error == ERESTART)
 		goto again;
 	sx_xunlock(&proctree_lock);
 	uma_zfree(pgrp_zone, newpgrp);
 	return (error);
 }
 
 /*
  * Three-way comparison of the two gid_t values that 'p1' and 'p2' point
  * to: returns -1, 0 or 1 when the first is respectively smaller than,
  * equal to or greater than the second.
  */
 static int
 gidp_cmp(const void *p1, const void *p2)
 {
 	const gid_t *const lhs = p1;
 	const gid_t *const rhs = p2;
 
 	if (*lhs < *rhs)
 		return (-1);
 	if (*lhs > *rhs)
 		return (1);
 	return (0);
 }
 
 /*
  * Copy the supplementary groups designated by 'wcred' in from userland.
  *
  * Final storage for supplementary groups will be returned via 'groups'.
  * '*groups' must be NULL on input, and if not equal to 'smallgroups'
  * on output, must be freed (M_TEMP) *even if* an error is returned.
  *
  * When SETCREDF_SUPP_GROUPS is set in 'flags', 'wcred->sc_supp_groups' is
  * redirected to the kernel copy on success.  When it is not set, the
  * group count and pointer in 'wcred' are cleared and '*groups' stays NULL.
  *
  * Returns 0 on success, EINVAL if more than 'ngroups_max' groups were
  * requested, or the error from copyin().
  */
 static int
 kern_setcred_copyin_supp_groups(struct setcred *const wcred,
     const u_int flags, gid_t *const smallgroups, gid_t **const groups)
 {
 	MPASS(*groups == NULL);
 
 	if (flags & SETCREDF_SUPP_GROUPS) {
 		size_t nbytes;
 		int error;
 
 		/*
 		 * Check for the limit for number of groups right now in order
 		 * to limit the amount of bytes to copy.
 		 */
 		if (wcred->sc_supp_groups_nb > ngroups_max)
 			return (EINVAL);
 
 		/*
 		 * The element type is 'gid_t', i.e. '**groups'.  Sizing with
 		 * '*groups' (a pointer) would copy more bytes than the
 		 * array holds wherever pointers are wider than 'gid_t', and
 		 * could overrun 'smallgroups'.
 		 */
 		nbytes = wcred->sc_supp_groups_nb * sizeof(**groups);
 
 		/*
 		 * Use the caller-provided small array when the groups fit in
 		 * it, to avoid an allocation.
 		 */
 		*groups = wcred->sc_supp_groups_nb <= CRED_SMALLGROUPS_NB ?
 		    smallgroups : malloc(nbytes, M_TEMP, M_WAITOK);
 
 		error = copyin(wcred->sc_supp_groups, *groups, nbytes);
 		if (error != 0)
 			return (error);
 		wcred->sc_supp_groups = *groups;
 	} else {
 		wcred->sc_supp_groups_nb = 0;
 		wcred->sc_supp_groups = NULL;
 	}
 
 	return (0);
 }
 
 /*
  * Copy a 'struct setcred' (or, when 'is_32bit' is true and
  * COMPAT_FREEBSD32 is configured, a 'struct setcred32') in from userland,
  * together with the supplementary groups and MAC label it refers to, then
  * call kern_setcred() on the result.
  *
  * Returns EINVAL when 'flags' has bits outside SETCREDF_MASK or when 'size'
  * does not match the expected structure size; otherwise the first error
  * from the copy-in steps, or the result of kern_setcred().
  */
 int
 user_setcred(struct thread *td, const u_int flags,
     const void *const uwcred, const size_t size, bool is_32bit)
 {
 	struct setcred wcred;
 #ifdef MAC
 	struct mac mac;
 	/* Pointer to 'struct mac' or 'struct mac32'. */
 	void *umac;
 #endif
 	gid_t smallgroups[CRED_SMALLGROUPS_NB];
 	gid_t *groups = NULL;
 	int error;
 
 	/*
 	 * As the only point of this wrapper function is to copyin() from
 	 * userland, we only interpret the data pieces we need to perform this
 	 * operation and defer further sanity checks to kern_setcred(), except
 	 * that we redundantly check here that no unknown flags have been
 	 * passed.
 	 */
 	if ((flags & ~SETCREDF_MASK) != 0)
 		return (EINVAL);
 
 #ifdef COMPAT_FREEBSD32
 	if (is_32bit) {
 		struct setcred32 wcred32;
 
 		if (size != sizeof(wcred32))
 			return (EINVAL);
 		error = copyin(uwcred, &wcred32, sizeof(wcred32));
 		if (error != 0)
 			return (error);
 		/* These fields have exactly the same sizes and positions. */
 		memcpy(&wcred, &wcred32, &wcred32.setcred32_copy_end -
 		    &wcred32.setcred32_copy_start);
 		/* Remaining fields are pointers and need PTRIN*(). */
 		PTRIN_CP(wcred32, wcred, sc_supp_groups);
 		PTRIN_CP(wcred32, wcred, sc_label);
 	} else
 #endif /* COMPAT_FREEBSD32 */
 	{
 		if (size != sizeof(wcred))
 			return (EINVAL);
 		error = copyin(uwcred, &wcred, sizeof(wcred));
 		if (error != 0)
 			return (error);
 	}
 #ifdef MAC
 	/* Remember the userland label pointer before it is cleared below. */
 	umac = wcred.sc_label;
 #endif
 	/* Also done on !MAC as a defensive measure. */
 	wcred.sc_label = NULL;
 
 	/*
 	 * Copy supplementary groups as needed.  There is no specific
 	 * alternative for 32-bit compatibility as 'gid_t' has the same size
 	 * everywhere.
 	 */
 	error = kern_setcred_copyin_supp_groups(&wcred, flags, smallgroups,
 	    &groups);
 	/* 'groups' may have been allocated even though an error occurred. */
 	if (error != 0)
 		goto free_groups;
 
 #ifdef MAC
 	if ((flags & SETCREDF_MAC_LABEL) != 0) {
 #ifdef COMPAT_FREEBSD32
 		if (is_32bit)
 			error = mac_label_copyin32(umac, &mac, NULL);
 		else
 #endif
 			error = mac_label_copyin(umac, &mac, NULL);
 		if (error != 0)
 			goto free_groups;
 		/* Non-NULL from here on means a label was copied in. */
 		wcred.sc_label = &mac;
 	}
 #endif
 
 	error = kern_setcred(td, flags, &wcred, groups);
 
 #ifdef MAC
 	if (wcred.sc_label != NULL)
 		free_copied_label(wcred.sc_label);
 #endif
 
 free_groups:
 	/* 'smallgroups' lives on the stack; anything else came from malloc. */
 	if (groups != smallgroups)
 		free(groups, M_TEMP);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setcred_args {
 	u_int			 flags;	/* Flags. */
 	const struct setcred	*wcred;
 	size_t			 size;	/* Passed 'setcred' structure length. */
 };
 #endif
 /* ARGSUSED */
 int
 sys_setcred(struct thread *td, struct setcred_args *uap)
 {
 	return (user_setcred(td, uap->flags, uap->wcred, uap->size, false));
 }
 
 /*
  * Apply to the calling process the credential changes selected by 'flags'
  * and described by 'wcred' (user IDs, group IDs, supplementary groups, MAC
  * label).
  *
  * CAUTION: This function normalizes groups in 'wcred'.
  *
  * If 'preallocated_groups' is non-NULL, it must be an already allocated array
  * of size 'wcred->sc_supp_groups_nb' containing the supplementary groups, and
  * 'wcred->sc_supp_groups' then must point to it.
  *
  * Returns 0 on success.  Errors visible here are EINVAL (unknown flags or
  * too many supplementary groups), ENOTSUP (MAC label requested on a kernel
  * built without MAC), EAGAIN (the new credentials could not be installed),
  * and whatever the MAC and privilege checks return.
  */
 int
 kern_setcred(struct thread *const td, const u_int flags,
     struct setcred *const wcred, gid_t *preallocated_groups)
 {
 	struct proc *const p = td->td_proc;
 	struct ucred *new_cred, *old_cred, *to_free_cred;
 	struct uidinfo *uip = NULL, *ruip = NULL;
 #ifdef MAC
 	void *mac_set_proc_data = NULL;
 	bool proc_label_set = false;
 #endif
 	gid_t *groups = NULL;
 	gid_t smallgroups[CRED_SMALLGROUPS_NB];
 	int error;
 	bool cred_set;
 
 	/* Bail out on unrecognized flags. */
 	if (flags & ~SETCREDF_MASK)
 		return (EINVAL);
 
 	/*
 	 * Part 1: We allocate and perform preparatory operations with no locks.
 	 */
 
 	if (flags & SETCREDF_SUPP_GROUPS) {
 		if (wcred->sc_supp_groups_nb > ngroups_max)
 			return (EINVAL);
 		if (preallocated_groups != NULL) {
 			/* Work directly on the caller's array. */
 			groups = preallocated_groups;
 			MPASS(preallocated_groups == wcred->sc_supp_groups);
 		} else {
 			/*
 			 * Work on a private copy: the on-stack array when
 			 * the groups fit in it, a heap allocation otherwise.
 			 */
 			if (wcred->sc_supp_groups_nb <= CRED_SMALLGROUPS_NB)
 				groups = smallgroups;
 			else
 				groups = malloc(wcred->sc_supp_groups_nb *
 				    sizeof(*groups), M_TEMP, M_WAITOK);
 			memcpy(groups, wcred->sc_supp_groups,
 			    wcred->sc_supp_groups_nb * sizeof(*groups));
 		}
 	}
 
 	if (flags & SETCREDF_MAC_LABEL) {
 #ifdef MAC
 		error = mac_set_proc_prepare(td, wcred->sc_label,
 		    &mac_set_proc_data);
 		if (error != 0)
 			goto free_groups;
 #else
 		error = ENOTSUP;
 		goto free_groups;
 #endif
 	}
 
 	/* 'uip' and 'ruip' are released with uifree() in part 3. */
 	if (flags & SETCREDF_UID) {
 		AUDIT_ARG_EUID(wcred->sc_uid);
 		uip = uifind(wcred->sc_uid);
 	}
 	if (flags & SETCREDF_RUID) {
 		AUDIT_ARG_RUID(wcred->sc_ruid);
 		ruip = uifind(wcred->sc_ruid);
 	}
 	if (flags & SETCREDF_SVUID)
 		AUDIT_ARG_SUID(wcred->sc_svuid);
 
 	if (flags & SETCREDF_GID)
 		AUDIT_ARG_EGID(wcred->sc_gid);
 	if (flags & SETCREDF_RGID)
 		AUDIT_ARG_RGID(wcred->sc_rgid);
 	if (flags & SETCREDF_SVGID)
 		AUDIT_ARG_SGID(wcred->sc_svgid);
 	if (flags & SETCREDF_SUPP_GROUPS) {
 		/*
 		 * Output the raw supplementary groups array for better
 		 * traceability.
 		 */
 		AUDIT_ARG_GROUPSET(groups, wcred->sc_supp_groups_nb);
 		/* May update the group count stored in 'wcred'. */
 		groups_normalize(&wcred->sc_supp_groups_nb, groups);
 	}
 
 	/*
 	 * We first completely build the new credentials and only then pass them
 	 * to MAC along with the old ones so that modules can check whether the
 	 * requested transition is allowed.
 	 */
 	new_cred = crget();
 	/*
 	 * Until the new credentials are installed, they are the ones to
 	 * release on the way out.
 	 */
 	to_free_cred = new_cred;
 	if (flags & SETCREDF_SUPP_GROUPS)
 		crextend(new_cred, wcred->sc_supp_groups_nb);
 
 #ifdef MAC
 	mac_cred_setcred_enter();
 #endif
 
 	/*
 	 * Part 2: We grab the process lock as to have a stable view of its
 	 * current credentials, and prepare a copy of them with the requested
 	 * changes applied under that lock.
 	 */
 
 	PROC_LOCK(p);
 	old_cred = crcopysafe(p, new_cred);
 
 	/*
 	 * Change user IDs.
 	 */
 	if (flags & SETCREDF_UID)
 		change_euid(new_cred, uip);
 	if (flags & SETCREDF_RUID)
 		change_ruid(new_cred, ruip);
 	if (flags & SETCREDF_SVUID)
 		change_svuid(new_cred, wcred->sc_svuid);
 
 	/*
 	 * Change groups.
 	 */
 	if (flags & SETCREDF_SUPP_GROUPS)
 		crsetgroups_internal(new_cred, wcred->sc_supp_groups_nb,
 		    groups);
 	if (flags & SETCREDF_GID)
 		change_egid(new_cred, wcred->sc_gid);
 	if (flags & SETCREDF_RGID)
 		change_rgid(new_cred, wcred->sc_rgid);
 	if (flags & SETCREDF_SVGID)
 		change_svgid(new_cred, wcred->sc_svgid);
 
 #ifdef MAC
 	/*
 	 * Change the MAC label.
 	 */
 	if (flags & SETCREDF_MAC_LABEL) {
 		error = mac_set_proc_core(td, new_cred, mac_set_proc_data);
 		if (error != 0)
 			goto unlock_finish;
 		proc_label_set = true;
 	}
 
 	/*
 	 * MAC security modules checks.
 	 */
 	error = mac_cred_check_setcred(flags, old_cred, new_cred);
 	if (error != 0)
 		goto unlock_finish;
 #endif
 	/*
 	 * Privilege check.
 	 */
 	error = priv_check_cred(old_cred, PRIV_CRED_SETCRED);
 	if (error != 0)
 		goto unlock_finish;
 
 	/*
 	 * Set the new credentials, noting that they have changed.
 	 */
 	cred_set = proc_set_cred_enforce_proc_lim(p, new_cred);
 	if (cred_set) {
 		setsugid(p);
 		/* The process now uses 'new_cred'; release the old ones. */
 		to_free_cred = old_cred;
 		MPASS(error == 0);
 	} else
 		error = EAGAIN;
 
 unlock_finish:
 	PROC_UNLOCK(p);
 	/*
 	 * Part 3: After releasing the process lock, we perform cleanups and
 	 * finishing operations.
 	 */
 
 #ifdef MAC
 	if (mac_set_proc_data != NULL)
 		mac_set_proc_finish(td, proc_label_set, mac_set_proc_data);
 	mac_cred_setcred_exit();
 #endif
 	crfree(to_free_cred);
 	if (uip != NULL)
 		uifree(uip);
 	if (ruip != NULL)
 		uifree(ruip);
 free_groups:
 	/* Only a copy allocated by this function is freed here. */
 	if (groups != preallocated_groups && groups != smallgroups)
 		free(groups, M_TEMP); /* Deals with 'groups' being NULL. */
 	return (error);
 }
 
 /*
  * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD
  * compatible.  It says that setting the uid/gid to euid/egid is a special
  * case of "appropriate privilege".  Once the rules are expanded out, this
  * basically means that setuid(nnn) sets all three id's, in all permitted
  * cases unless _POSIX_SAVED_IDS is enabled.  In that case, setuid(getuid())
  * does not set the saved id - this is dangerous for traditional BSD
  * programs.  For this reason, we *really* do not want to set
  * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2.
  */
 #define POSIX_APPENDIX_B_4_2_2
 
 #ifndef _SYS_SYSPROTO_H_
 struct setuid_args {
 	uid_t	uid;
 };
 #endif
 /*
  * setuid(2) system call.
  *
  * Sets the effective uid of the calling process to uap->uid and, unless
  * _POSIX_SAVED_IDS is defined and the "appropriate privileges" test below
  * fails, the real and saved uids as well.  setsugid() is called each time one
  * of the three ids actually changes.
  *
  * Returns 0 on success, otherwise the error reported by
  * mac_cred_check_setuid() (MAC kernels only) or priv_check_cred().
  */
 /* ARGSUSED */
 int
 sys_setuid(struct thread *td, struct setuid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	uid_t uid;
 	struct uidinfo *uip;
 	int error;
 
 	uid = uap->uid;
 	AUDIT_ARG_UID(uid);
 	/*
 	 * The new credential and the uidinfo for the target uid are obtained
 	 * before the process lock is taken.
 	 */
 	newcred = crget();
 	uip = uifind(uid);
 	PROC_LOCK(p);
 	/*
 	 * Copy credentials so other references do not see our changes.
 	 */
 	oldcred = crcopysafe(p, newcred);
 
 #ifdef MAC
 	error = mac_cred_check_setuid(oldcred, uid);
 	if (error)
 		goto fail;
 #endif
 
 	/*
 	 * See if we have "permission" by POSIX 1003.1 rules.
 	 *
 	 * Note that setuid(geteuid()) is a special case of
 	 * "appropriate privileges" in appendix B.4.2.2.  We need
 	 * to use this clause to be compatible with traditional BSD
 	 * semantics.  Basically, it means that "setuid(xx)" sets all
 	 * three id's (assuming you have privs).
 	 *
 	 * Notes on the logic.  We do things in three steps.
 	 * 1: We determine if the euid is going to change, and do EPERM
 	 *    right away.  We unconditionally change the euid later if this
 	 *    test is satisfied, simplifying that part of the logic.
 	 * 2: We determine if the real and/or saved uids are going to
 	 *    change.  Determined by compile options.
 	 * 3: Change euid last. (after tests in #2 for "appropriate privs")
 	 */
 	if (uid != oldcred->cr_ruid &&		/* allow setuid(getuid()) */
 #ifdef _POSIX_SAVED_IDS
 	    uid != oldcred->cr_svuid &&		/* allow setuid(saved uid) */
 #endif
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use BSD-compat clause from B.4.2.2 */
 	    uid != oldcred->cr_uid &&		/* allow setuid(geteuid()) */
 #endif
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETUID)) != 0)
 		goto fail;
 
 #ifdef _POSIX_SAVED_IDS
 	/*
 	 * Do we have "appropriate privileges" (are we root or uid == euid)
 	 * If so, we are changing the real uid and/or saved uid.
 	 */
 	if (
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use the clause from B.4.2.2 */
 	    uid == oldcred->cr_uid ||
 #endif
 	    /* We are using privs. */
 	    priv_check_cred(oldcred, PRIV_CRED_SETUID) == 0)
 #endif
 	/*
 	 * Without _POSIX_SAVED_IDS the 'if' above is compiled out and the
 	 * block below runs unconditionally.
 	 */
 	{
 		/*
 		 * Set the real uid.
 		 */
 		if (uid != oldcred->cr_ruid) {
 			change_ruid(newcred, uip);
 			setsugid(p);
 		}
 		/*
 		 * Set saved uid
 		 *
 		 * XXX always set saved uid even if not _POSIX_SAVED_IDS, as
 		 * the security of seteuid() depends on it.  B.4.2.2 says it
 		 * is important that we should do this.
 		 */
 		if (uid != oldcred->cr_svuid) {
 			change_svuid(newcred, uid);
 			setsugid(p);
 		}
 	}
 
 	/*
 	 * In all permitted cases, we are changing the euid.
 	 */
 	if (uid != oldcred->cr_uid) {
 		change_euid(newcred, uip);
 		setsugid(p);
 	}
 	/*
 	 * This also transfers the proc count to the new user.
 	 */
 	proc_set_cred(p, newcred);
 #ifdef RACCT
 	racct_proc_ucred_changed(p, oldcred, newcred);
 	/*
 	 * Extra reference on newcred; it is dropped by the crfree() in the
 	 * RCTL block below, after the process lock has been released.
 	 * NOTE(review): hold and release sit under different options (RACCT
 	 * vs. RCTL); presumably RCTL is never configured without RACCT, and
 	 * a RACCT-only kernel would keep this reference -- confirm.
 	 */
 	crhold(newcred);
 #endif
 	PROC_UNLOCK(p);
 #ifdef RCTL
 	rctl_proc_ucred_changed(p, newcred);
 	crfree(newcred);
 #endif
 	/* Success: drop our uidinfo reference and the replaced credential. */
 	uifree(uip);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	/* Failure: the process keeps its credential; discard the unused copy. */
 	PROC_UNLOCK(p);
 	uifree(uip);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct seteuid_args {
 	uid_t	euid;
 };
 #endif
 /* ARGSUSED */
 int
 sys_seteuid(struct thread *td, struct seteuid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	uid_t euid;
 	struct uidinfo *euip;
 	int error;
 
 	euid = uap->euid;
 	AUDIT_ARG_EUID(euid);
 	newcred = crget();
 	euip = uifind(euid);
 	PROC_LOCK(p);
 	/*
 	 * Copy credentials so other references do not see our changes.
 	 */
 	oldcred = crcopysafe(p, newcred);
 
 #ifdef MAC
 	error = mac_cred_check_seteuid(oldcred, euid);
 	if (error)
 		goto fail;
 #endif
 
 	if (euid != oldcred->cr_ruid &&		/* allow seteuid(getuid()) */
 	    euid != oldcred->cr_svuid &&	/* allow seteuid(saved uid) */
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID)) != 0)
 		goto fail;
 
 	/*
 	 * Everything's okay, do it.
 	 */
 	if (oldcred->cr_uid != euid) {
 		change_euid(newcred, euip);
 		setsugid(p);
 	}
 	proc_set_cred(p, newcred);
 	PROC_UNLOCK(p);
 	uifree(euip);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	uifree(euip);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setgid_args {
 	gid_t	gid;
 };
 #endif
 /*
  * setgid(2) system call.
  *
  * Sets the effective gid of the calling process to uap->gid and, unless
  * _POSIX_SAVED_IDS is defined and the "appropriate privileges" test below
  * fails, the real and saved gids as well.  setsugid() is called each time one
  * of the three ids actually changes.
  *
  * Returns 0 on success, otherwise the error reported by
  * mac_cred_check_setgid() (MAC kernels only) or priv_check_cred().
  */
 /* ARGSUSED */
 int
 sys_setgid(struct thread *td, struct setgid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	gid_t gid;
 	int error;
 
 	gid = uap->gid;
 	AUDIT_ARG_GID(gid);
 	newcred = crget();
 	PROC_LOCK(p);
 	/*
 	 * Copy credentials so other references do not see our changes.
 	 */
 	oldcred = crcopysafe(p, newcred);
 
 #ifdef MAC
 	error = mac_cred_check_setgid(oldcred, gid);
 	if (error)
 		goto fail;
 #endif
 
 	/*
 	 * See if we have "permission" by POSIX 1003.1 rules.
 	 *
 	 * Note that setgid(getegid()) is a special case of
 	 * "appropriate privileges" in appendix B.4.2.2.  We need
 	 * to use this clause to be compatible with traditional BSD
 	 * semantics.  Basically, it means that "setgid(xx)" sets all
 	 * three id's (assuming you have privs).
 	 *
 	 * For notes on the logic here, see setuid() above.
 	 */
 	if (gid != oldcred->cr_rgid &&		/* allow setgid(getgid()) */
 #ifdef _POSIX_SAVED_IDS
 	    gid != oldcred->cr_svgid &&		/* allow setgid(saved gid) */
 #endif
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use BSD-compat clause from B.4.2.2 */
 	    gid != oldcred->cr_gid && /* allow setgid(getegid()) */
 #endif
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETGID)) != 0)
 		goto fail;
 
 #ifdef _POSIX_SAVED_IDS
 	/*
 	 * Do we have "appropriate privileges" (are we root or gid == egid)
 	 * If so, we are changing the real gid and saved gid.
 	 */
 	if (
 #ifdef POSIX_APPENDIX_B_4_2_2	/* use the clause from B.4.2.2 */
 	    gid == oldcred->cr_gid ||
 #endif
 	    /* We are using privs. */
 	    priv_check_cred(oldcred, PRIV_CRED_SETGID) == 0)
 #endif
 	/*
 	 * Without _POSIX_SAVED_IDS the 'if' above is compiled out and the
 	 * block below runs unconditionally.
 	 */
 	{
 		/*
 		 * Set real gid
 		 */
 		if (oldcred->cr_rgid != gid) {
 			change_rgid(newcred, gid);
 			setsugid(p);
 		}
 		/*
 		 * Set saved gid
 		 *
 		 * XXX always set saved gid even if not _POSIX_SAVED_IDS, as
 		 * the security of setegid() depends on it.  B.4.2.2 says it
 		 * is important that we should do this.
 		 */
 		if (oldcred->cr_svgid != gid) {
 			change_svgid(newcred, gid);
 			setsugid(p);
 		}
 	}
 	/*
 	 * In all permitted cases, we are changing the egid.
 	 */
 	if (oldcred->cr_gid != gid) {
 		change_egid(newcred, gid);
 		setsugid(p);
 	}
 	proc_set_cred(p, newcred);
 	PROC_UNLOCK(p);
 	/* Success: release the credential that was replaced. */
 	crfree(oldcred);
 	return (0);
 
 fail:
 	/* Failure: the process keeps its credential; discard the unused copy. */
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setegid_args {
 	gid_t	egid;
 };
 #endif
 /* ARGSUSED */
 int
 sys_setegid(struct thread *td, struct setegid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	gid_t egid;
 	int error;
 
 	egid = uap->egid;
 	AUDIT_ARG_EGID(egid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = crcopysafe(p, newcred);
 
 #ifdef MAC
 	error = mac_cred_check_setegid(oldcred, egid);
 	if (error)
 		goto fail;
 #endif
 
 	if (egid != oldcred->cr_rgid &&		/* allow setegid(getgid()) */
 	    egid != oldcred->cr_svgid &&	/* allow setegid(saved gid) */
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID)) != 0)
 		goto fail;
 
 	if (oldcred->cr_gid != egid) {
 		change_egid(newcred, egid);
 		setsugid(p);
 	}
 	proc_set_cred(p, newcred);
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setgroups_args {
 	int	gidsetsize;
 	gid_t	*gidset;
 };
 #endif
 /* ARGSUSED */
 int
 sys_setgroups(struct thread *td, struct setgroups_args *uap)
 {
 	gid_t smallgroups[CRED_SMALLGROUPS_NB];
 	gid_t *groups;
 	int gidsetsize, error;
 
 	/*
 	 * Sanity check size now to avoid passing too big a value to copyin(),
 	 * even if kern_setgroups() will do it again.
 	 *
 	 * Ideally, the 'gidsetsize' argument should have been a 'u_int' (and it
 	 * was, in this implementation, for a long time), but POSIX standardized
 	 * getgroups() to take an 'int' and it would be quite entrapping to have
 	 * setgroups() differ.
 	 */
 	gidsetsize = uap->gidsetsize;
 	/* XXXKE Limit to ngroups_max when we change the userland interface. */
 	if (gidsetsize > ngroups_max + 1 || gidsetsize < 0)
 		return (EINVAL);
 
 	if (gidsetsize > CRED_SMALLGROUPS_NB)
 		groups = malloc(gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK);
 	else
 		groups = smallgroups;
 
 	error = copyin(uap->gidset, groups, gidsetsize * sizeof(gid_t));
 	if (error == 0)
 		error = kern_setgroups(td, &gidsetsize, groups);
 
 	if (groups != smallgroups)
 		free(groups, M_TEMP);
 	return (error);
 }
 
 /*
  * In-kernel backend of setgroups(2).
  *
  * 'groups' holds '*ngrpp' entries.  When the list is non-empty, its first
  * entry is taken as the new effective gid and the remaining entries become the
  * supplementary groups.  An empty list clears the supplementary groups and
  * leaves the effective gid alone.
  *
  * CAUTION: This function normalizes 'groups', possibly also changing the value
  * of '*ngrpp' as a consequence.  On return from a non-empty request, '*ngrpp'
  * is the number of supplementary groups left after normalization.
  *
  * Returns 0 on success, EINVAL for an out-of-range count, or the error from
  * mac_cred_check_setgroups() (MAC kernels only) or priv_check_cred().
  */
 int
 kern_setgroups(struct thread *td, int *ngrpp, gid_t *groups)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	int ngrp, error;
 	gid_t egid;
 	bool has_egid;
 
 	ngrp = *ngrpp;
 	/* Sanity check size. */
 	/* XXXKE Limit to ngroups_max when we change the userland interface. */
 	if (ngrp < 0 || ngrp > ngroups_max + 1)
 		return (EINVAL);
 
 	AUDIT_ARG_GROUPSET(groups, ngrp);
 	/*
 	 * setgroups(0, NULL) is a legitimate way of clearing the groups vector
 	 * on non-BSD systems (which generally do not have the egid in the
 	 * groups[0]).  We risk security holes when running non-BSD software if
 	 * we do not do the same.  So we allow and treat an empty list specially
 	 * below (twice).
 	 *
 	 * Whether an egid was passed is recorded before 'ngrp' is adjusted:
 	 * a one-element list leaves 'ngrp' at 0 supplementary groups, yet its
 	 * single entry must still be installed as the effective gid.
 	 */
 	has_egid = (ngrp != 0);
 	if (has_egid) {
 		/*
 		 * To maintain userland compat for now, we use the first group
 		 * as our egid and we'll use the rest as our supplemental
 		 * groups.
 		 */
 		egid = groups[0];
 		ngrp--;
 		groups++;
 
 		groups_normalize(&ngrp, groups);
 		*ngrpp = ngrp;
 	}
 	newcred = crget();
 	crextend(newcred, ngrp);
 	PROC_LOCK(p);
 	/* Work on a private copy; other holders keep seeing the old one. */
 	oldcred = crcopysafe(p, newcred);
 
 #ifdef MAC
 	/*
 	 * We pass NULL here explicitly if we don't have any supplementary
 	 * groups mostly for the sake of normalization, but also to avoid/detect
 	 * a situation where a MAC module has some assumption about the layout
 	 * of `groups` matching historical behavior.
 	 */
 	error = mac_cred_check_setgroups(oldcred, ngrp,
 	    ngrp == 0 ? NULL : groups);
 	if (error)
 		goto fail;
 #endif
 
 	error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS);
 	if (error)
 		goto fail;
 
 	/*
 	 * If some groups were passed, the first one is currently the desired
 	 * egid.  This code is to be removed (along with the egid extraction
 	 * above) when setgroups() is changed to take only supplementary groups.
 	 */
 	if (has_egid)
 		newcred->cr_gid = egid;
 	crsetgroups_internal(newcred, ngrp, groups);
 
 	setsugid(p);
 	proc_set_cred(p, newcred);
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setreuid_args {
 	uid_t	ruid;
 	uid_t	euid;
 };
 #endif
 /*
  * setreuid(2) system call.
  *
  * Sets the real and/or effective uid of the calling process; a value of
  * (uid_t)-1 leaves the corresponding id unchanged.  The saved uid may be
  * updated as a side effect (see below).
  *
  * Returns 0 on success, otherwise the error reported by
  * mac_cred_check_setreuid() (MAC kernels only) or priv_check_cred().
  */
 /* ARGSUSED */
 int
 sys_setreuid(struct thread *td, struct setreuid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	uid_t euid, ruid;
 	struct uidinfo *euip, *ruip;
 	int error;
 
 	euid = uap->euid;
 	ruid = uap->ruid;
 	AUDIT_ARG_EUID(euid);
 	AUDIT_ARG_RUID(ruid);
 	/*
 	 * The new credential and both uidinfo references are obtained before
 	 * the process lock is taken.
 	 */
 	newcred = crget();
 	euip = uifind(euid);
 	ruip = uifind(ruid);
 	PROC_LOCK(p);
 	/* Copy credentials so other references do not see our changes. */
 	oldcred = crcopysafe(p, newcred);
 
 #ifdef MAC
 	error = mac_cred_check_setreuid(oldcred, ruid, euid);
 	if (error)
 		goto fail;
 #endif
 
 	/*
 	 * Privilege is required when a requested real uid is neither the
 	 * current real nor saved uid, or when a requested effective uid is
 	 * none of the current effective, real or saved uids.
 	 */
 	if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
 	      ruid != oldcred->cr_svuid) ||
 	     (euid != (uid_t)-1 && euid != oldcred->cr_uid &&
 	      euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID)) != 0)
 		goto fail;
 
 	if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
 		change_euid(newcred, euip);
 		setsugid(p);
 	}
 	if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
 		change_ruid(newcred, ruip);
 		setsugid(p);
 	}
 	/*
 	 * If a real uid was given, or the resulting effective uid differs
 	 * from the resulting real uid, the saved uid follows the new
 	 * effective uid.
 	 */
 	if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) &&
 	    newcred->cr_svuid != newcred->cr_uid) {
 		change_svuid(newcred, newcred->cr_uid);
 		setsugid(p);
 	}
 	proc_set_cred(p, newcred);
 #ifdef RACCT
 	racct_proc_ucred_changed(p, oldcred, newcred);
 	/*
 	 * Extra reference on newcred; it is dropped by the crfree() in the
 	 * RCTL block below, after the process lock has been released.
 	 * NOTE(review): hold and release sit under different options (RACCT
 	 * vs. RCTL); presumably RCTL is never configured without RACCT --
 	 * confirm.
 	 */
 	crhold(newcred);
 #endif
 	PROC_UNLOCK(p);
 #ifdef RCTL
 	rctl_proc_ucred_changed(p, newcred);
 	crfree(newcred);
 #endif
 	uifree(ruip);
 	uifree(euip);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	/* Failure: the process keeps its credential; discard the unused copy. */
 	PROC_UNLOCK(p);
 	uifree(ruip);
 	uifree(euip);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setregid_args {
 	gid_t	rgid;
 	gid_t	egid;
 };
 #endif
 /* ARGSUSED */
 int
 sys_setregid(struct thread *td, struct setregid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	gid_t egid, rgid;
 	int error;
 
 	egid = uap->egid;
 	rgid = uap->rgid;
 	AUDIT_ARG_EGID(egid);
 	AUDIT_ARG_RGID(rgid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = crcopysafe(p, newcred);
 
 #ifdef MAC
 	error = mac_cred_check_setregid(oldcred, rgid, egid);
 	if (error)
 		goto fail;
 #endif
 
 	if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
 	    rgid != oldcred->cr_svgid) ||
 	     (egid != (gid_t)-1 && egid != oldcred->cr_gid &&
 	     egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID)) != 0)
 		goto fail;
 
 	if (egid != (gid_t)-1 && oldcred->cr_gid != egid) {
 		change_egid(newcred, egid);
 		setsugid(p);
 	}
 	if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
 		change_rgid(newcred, rgid);
 		setsugid(p);
 	}
 	if ((rgid != (gid_t)-1 || newcred->cr_gid != newcred->cr_rgid) &&
 	    newcred->cr_svgid != newcred->cr_gid) {
 		change_svgid(newcred, newcred->cr_gid);
 		setsugid(p);
 	}
 	proc_set_cred(p, newcred);
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 /*
  * setresuid(ruid, euid, suid) is like setreuid except control over the saved
  * uid is explicit.
  *
  * A value of (uid_t)-1 leaves the corresponding id unchanged.  Returns 0 on
  * success, otherwise the error reported by mac_cred_check_setresuid() (MAC
  * kernels only) or priv_check_cred().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct setresuid_args {
 	uid_t	ruid;
 	uid_t	euid;
 	uid_t	suid;
 };
 #endif
 /* ARGSUSED */
 int
 sys_setresuid(struct thread *td, struct setresuid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	uid_t euid, ruid, suid;
 	struct uidinfo *euip, *ruip;
 	int error;
 
 	euid = uap->euid;
 	ruid = uap->ruid;
 	suid = uap->suid;
 	AUDIT_ARG_EUID(euid);
 	AUDIT_ARG_RUID(ruid);
 	AUDIT_ARG_SUID(suid);
 	/*
 	 * The new credential and both uidinfo references are obtained before
 	 * the process lock is taken.
 	 */
 	newcred = crget();
 	euip = uifind(euid);
 	ruip = uifind(ruid);
 	PROC_LOCK(p);
 	/* Copy credentials so other references do not see our changes. */
 	oldcred = crcopysafe(p, newcred);
 
 #ifdef MAC
 	error = mac_cred_check_setresuid(oldcred, ruid, euid, suid);
 	if (error)
 		goto fail;
 #endif
 
 	/*
 	 * Privilege is required as soon as one of the requested ids is not
 	 * already one of the current real, saved or effective uids.
 	 */
 	if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
 	     ruid != oldcred->cr_svuid &&
 	      ruid != oldcred->cr_uid) ||
 	     (euid != (uid_t)-1 && euid != oldcred->cr_ruid &&
 	    euid != oldcred->cr_svuid &&
 	      euid != oldcred->cr_uid) ||
 	     (suid != (uid_t)-1 && suid != oldcred->cr_ruid &&
 	    suid != oldcred->cr_svuid &&
 	      suid != oldcred->cr_uid)) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID)) != 0)
 		goto fail;
 
 	if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
 		change_euid(newcred, euip);
 		setsugid(p);
 	}
 	if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
 		change_ruid(newcred, ruip);
 		setsugid(p);
 	}
 	if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) {
 		change_svuid(newcred, suid);
 		setsugid(p);
 	}
 	proc_set_cred(p, newcred);
 #ifdef RACCT
 	racct_proc_ucred_changed(p, oldcred, newcred);
 	/*
 	 * Extra reference on newcred; it is dropped by the crfree() in the
 	 * RCTL block below, after the process lock has been released.
 	 * NOTE(review): hold and release sit under different options (RACCT
 	 * vs. RCTL); presumably RCTL is never configured without RACCT --
 	 * confirm.
 	 */
 	crhold(newcred);
 #endif
 	PROC_UNLOCK(p);
 #ifdef RCTL
 	rctl_proc_ucred_changed(p, newcred);
 	crfree(newcred);
 #endif
 	uifree(ruip);
 	uifree(euip);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	/* Failure: the process keeps its credential; discard the unused copy. */
 	PROC_UNLOCK(p);
 	uifree(ruip);
 	uifree(euip);
 	crfree(newcred);
 	return (error);
 
 }
 
 /*
  * setresgid(rgid, egid, sgid) is like setregid except control over the saved
  * gid is explicit.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct setresgid_args {
 	gid_t	rgid;
 	gid_t	egid;
 	gid_t	sgid;
 };
 #endif
 /* ARGSUSED */
 int
 sys_setresgid(struct thread *td, struct setresgid_args *uap)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *newcred, *oldcred;
 	gid_t egid, rgid, sgid;
 	int error;
 
 	egid = uap->egid;
 	rgid = uap->rgid;
 	sgid = uap->sgid;
 	AUDIT_ARG_EGID(egid);
 	AUDIT_ARG_RGID(rgid);
 	AUDIT_ARG_SGID(sgid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = crcopysafe(p, newcred);
 
 #ifdef MAC
 	error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid);
 	if (error)
 		goto fail;
 #endif
 
 	if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
 	      rgid != oldcred->cr_svgid &&
 	      rgid != oldcred->cr_gid) ||
 	     (egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
 	      egid != oldcred->cr_svgid &&
 	      egid != oldcred->cr_gid) ||
 	     (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
 	      sgid != oldcred->cr_svgid &&
 	      sgid != oldcred->cr_gid)) &&
 	    (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID)) != 0)
 		goto fail;
 
 	if (egid != (gid_t)-1 && oldcred->cr_gid != egid) {
 		change_egid(newcred, egid);
 		setsugid(p);
 	}
 	if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
 		change_rgid(newcred, rgid);
 		setsugid(p);
 	}
 	if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) {
 		change_svgid(newcred, sgid);
 		setsugid(p);
 	}
 	proc_set_cred(p, newcred);
 	PROC_UNLOCK(p);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getresuid_args {
 	uid_t	*ruid;
 	uid_t	*euid;
 	uid_t	*suid;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getresuid(struct thread *td, struct getresuid_args *uap)
 {
 	struct ucred *cred;
 	int error1 = 0, error2 = 0, error3 = 0;
 
 	cred = td->td_ucred;
 	if (uap->ruid)
 		error1 = copyout(&cred->cr_ruid,
 		    uap->ruid, sizeof(cred->cr_ruid));
 	if (uap->euid)
 		error2 = copyout(&cred->cr_uid,
 		    uap->euid, sizeof(cred->cr_uid));
 	if (uap->suid)
 		error3 = copyout(&cred->cr_svuid,
 		    uap->suid, sizeof(cred->cr_svuid));
 	return (error1 ? error1 : error2 ? error2 : error3);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getresgid_args {
 	gid_t	*rgid;
 	gid_t	*egid;
 	gid_t	*sgid;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getresgid(struct thread *td, struct getresgid_args *uap)
 {
 	struct ucred *cred;
 	int error1 = 0, error2 = 0, error3 = 0;
 
 	cred = td->td_ucred;
 	if (uap->rgid)
 		error1 = copyout(&cred->cr_rgid,
 		    uap->rgid, sizeof(cred->cr_rgid));
 	if (uap->egid)
 		error2 = copyout(&cred->cr_gid,
 		    uap->egid, sizeof(cred->cr_gid));
 	if (uap->sgid)
 		error3 = copyout(&cred->cr_svgid,
 		    uap->sgid, sizeof(cred->cr_svgid));
 	return (error1 ? error1 : error2 ? error2 : error3);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct issetugid_args {
 	int dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_issetugid(struct thread *td, struct issetugid_args *uap)
 {
 	struct proc *p = td->td_proc;
 
 	/*
 	 * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time,
 	 * we use P_SUGID because we consider changing the owners as
 	 * "tainting" as well.
 	 * This is significant for procs that start as root and "become"
 	 * a user without an exec - programs cannot know *everything*
 	 * that libc *might* have put in their data segment.
 	 */
 	td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0;
 	return (0);
 }
 
 /*
  * __setugid(2) system call: on REGRESSION kernels, clears (flag 0) or sets
  * (flag 1) P_SUGID on the calling process and returns 0; any other flag value
  * yields EINVAL.  On other kernels it always returns ENOSYS.
  */
 int
 sys___setugid(struct thread *td, struct __setugid_args *uap)
 {
 #ifdef REGRESSION
 	struct proc *const proc = td->td_proc;
 	const int flag = uap->flag;
 
 	if (flag != 0 && flag != 1)
 		return (EINVAL);
 
 	PROC_LOCK(proc);
 	if (flag == 0)
 		proc->p_flag &= ~P_SUGID;
 	else
 		proc->p_flag |= P_SUGID;
 	PROC_UNLOCK(proc);
 	return (0);
 #else /* !REGRESSION */
 	return (ENOSYS);
 #endif /* REGRESSION */
 }
 
 #ifdef INVARIANTS
 /*
  * Debugging aid (INVARIANTS kernels only): after the length checks, panics
  * unless the 'ngrp' entries of 'groups' are strictly increasing.  Compiles to
  * nothing on other kernels.
  */
 static void
 groups_check_normalized(int ngrp, const gid_t *groups)
 {
 	groups_check_positive_len(ngrp);
 	groups_check_max_len(ngrp);
 
 	/* With fewer than two entries there is no pair to compare. */
 	for (int i = 1; i < ngrp; ++i) {
 		const gid_t before = groups[i - 1];
 		const gid_t current = groups[i];
 
 		if (before >= current)
 			panic("%s: groups[%d] (%u) >= groups[%d] (%u)",
 			    __func__, i - 1, before, i, current);
 	}
 }
 #else
 #define groups_check_normalized(...)
 #endif
 
 /*
  * Tells whether 'gid' is one of the supplementary groups recorded in 'cred',
  * i.e., one of the 'cr_ngroups' entries of 'cr_groups'.
  */
 bool
 group_is_supplementary(const gid_t gid, const struct ucred *const cred)
 {
 	const void *match;
 
 	groups_check_normalized(cred->cr_ngroups, cred->cr_groups);
 
 	/*
 	 * The supplementary groups are kept sorted (crsetgroups() takes care
 	 * of that), which is what makes a binary search valid here.
 	 */
 	match = bsearch(&gid, cred->cr_groups, cred->cr_ngroups, sizeof(gid),
 	    gidp_cmp);
 	return (match != NULL);
 }
 
 /*
  * Tells whether 'gid' belongs to the effective group set of 'cred', that is,
  * whether it is the effective gid or one of the supplementary groups.
  */
 bool
 groupmember(gid_t gid, const struct ucred *cred)
 {
 	groups_check_positive_len(cred->cr_ngroups);
 
 	/* The cheap comparison comes first; the search runs only if needed. */
 	return (gid == cred->cr_gid || group_is_supplementary(gid, cred));
 }
 
 /*
  * Tells whether 'gid' belongs to the real group set of 'cred', that is,
  * whether it is the real gid or one of the supplementary groups.
  */
 bool
 realgroupmember(gid_t gid, const struct ucred *cred)
 {
 	/*
 	 * The comparison against 'cr_rgid' does not touch 'cr_groups', but the
 	 * length is checked up front anyway: a 'struct ucred' failing this
 	 * check is assumed to be invalid, in which case 'cr_rgid' may not
 	 * have been filled in either.
 	 */
 	groups_check_positive_len(cred->cr_ngroups);
 
 	/* The cheap comparison comes first; the search runs only if needed. */
 	return (gid == cred->cr_rgid || group_is_supplementary(gid, cred));
 }
 
 /*
  * Securelevel tests against the jail of the given credential.
  * securelevel_gt() implements (securelevel > level) and securelevel_ge()
  * implements (securelevel >= level).  Beware of the inverted convention:
  * both return EPERM when the comparison holds and 0 when it does not.
  *
  * Due to care taken when setting the securelevel, we know that no jail will
  * be less secure than its parent (or the physical system), so it is
  * sufficient to test the current jail only.
  *
  * XXXRW: Possibly since this has to do with privilege, it should move to
  * kern_priv.c.
  */
 int
 securelevel_gt(struct ucred *cr, int level)
 {
 	if (cr->cr_prison->pr_securelevel > level)
 		return (EPERM);
 	return (0);
 }
 
 /*
  * Returns EPERM if the securelevel of cr's jail is greater than or equal to
  * 'level', 0 otherwise.
  */
 int
 securelevel_ge(struct ucred *cr, int level)
 {
 	if (cr->cr_prison->pr_securelevel >= level)
 		return (EPERM);
 	return (0);
 }
 
 /*
  * 'see_other_uids' controls whether processes and sockets whose credentials
  * carry a different real uid are visible through a variety of system MIBs.
  * XXX: data declarations should be together near the beginning of the file.
  */
 static int	see_other_uids = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW,
     &see_other_uids, 0,
     "Unprivileged processes may see subjects/objects with different real uid");
 
 /*-
  * Apply the 'see_other_uids' policy: may u1 "see" the subject described by
  * u2?  Visibility is granted when the policy is off, when both credentials
  * share the same real uid, or when u1 holds PRIV_SEEOTHERUIDS.
  * Returns: 0 for permitted, ESRCH otherwise
  * Locks: none
  * References: *u1 and *u2 must not change during the call
  *             u1 may equal u2, in which case only one reference is required
  */
 static int
 cr_canseeotheruids(struct ucred *u1, struct ucred *u2)
 {
 	if (see_other_uids || u1->cr_ruid == u2->cr_ruid)
 		return (0);
 
 	/* Restricted, and the real uids differ: only privilege helps. */
 	return (priv_check_cred(u1, PRIV_SEEOTHERUIDS) != 0 ? ESRCH : 0);
 }
 
 /*
  * 'see_other_gids' determines whether or not visibility of processes
  * and sockets with credentials holding different real gids is possible
  * using a variety of system MIBs.
  * XXX: data declarations should be together near the beginning of the file.
  */
 static int	see_other_gids = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW,
     &see_other_gids, 0,
     "Unprivileged processes may see subjects/objects with different real gid");
 
 /*
  * Determine if u1 can "see" the subject specified by u2, according to the
  * 'see_other_gids' policy.  When the policy is in force, u1 sees u2 if u1's
  * real gid or any of u1's supplementary groups belongs to u2's real group set
  * (see realgroupmember()), or if u1 holds PRIV_SEEOTHERGIDS.
  * Returns: 0 for permitted, ESRCH otherwise
  * Locks: none
  * References: *u1 and *u2 must not change during the call
  *             u1 may equal u2, in which case only one reference is required
  */
 static int
 cr_canseeothergids(struct ucred *u1, struct ucred *u2)
 {
 	if (!see_other_gids) {
 		if (realgroupmember(u1->cr_rgid, u2))
 			return (0);
 
 		/*
 		 * Every entry of 'cr_groups' is a supplementary group (this is
 		 * how group_is_supplementary() searches it), so the scan has
 		 * to start at index 0 or the first one would be ignored.
 		 */
 		for (int i = 0; i < u1->cr_ngroups; i++)
 			if (realgroupmember(u1->cr_groups[i], u2))
 				return (0);
 
 		if (priv_check_cred(u1, PRIV_SEEOTHERGIDS) != 0)
 			return (ESRCH);
 	}
 
 	return (0);
 }
 
 /*
  * 'see_jail_proc' controls whether processes and sockets whose credentials
  * carry a different jail id are visible through a variety of system MIBs.
  *
  * XXX: data declarations should be together near the beginning of the file.
  */
 
 static int	see_jail_proc = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, see_jail_proc, CTLFLAG_RW,
     &see_jail_proc, 0,
     "Unprivileged processes may see subjects/objects with different jail ids");
 
 /*-
  * Apply the 'see_jail_proc' policy: may u1 "see" the subject described by
  * u2?
  * Returns: 0 for permitted, ESRCH otherwise
  * Locks: none
  * References: *u1 and *u2 must not change during the call
  *             u1 may equal u2, in which case only one reference is required
  */
 static int
 cr_canseejailproc(struct ucred *u1, struct ucred *u2)
 {
 	/* Policy deactivated. */
 	if (see_jail_proc)
 		return (0);
 	/* Same jail. */
 	if (u1->cr_prison == u2->cr_prison)
 		return (0);
 	/* Privileged. */
 	if (priv_check_cred(u1, PRIV_SEEJAILPROC) == 0)
 		return (0);
 
 	return (ESRCH);
 }
 
+/*
+ * Determine if u1 can tamper with the subject specified by u2, if they are in
+ * different jails and 'unprivileged_parent_tampering' jail policy allows it.
+ *
+ * May be called if u1 and u2 are in the same jail, but it is expected that the
+ * caller has already done a prison_check() prior to calling it.
+ *
+ * The checks are, in order: same prison (always permitted), u1 holding the
+ * privilege 'priv' supplied by the caller, and finally u2's prison allowing
+ * PR_ALLOW_UNPRIV_PARENT_TAMPER.
+ *
+ * Returns: 0 for permitted, EPERM otherwise
+ */
+static int
+cr_can_tamper_with_subjail(struct ucred *u1, struct ucred *u2, int priv)
+{
+
+	MPASS(prison_check(u1, u2) == 0);
+	if (u1->cr_prison == u2->cr_prison)
+		return (0);
+
+	if (priv_check_cred(u1, priv) == 0)
+		return (0);
+
+	/*
+	 * Jails do not maintain a distinct UID space, so without the
+	 * allow.unprivileged_parent_tampering knob, process visibility would
+	 * be the only thing controlling an unprivileged process' ability to
+	 * tamper with a process in a subjail.  The knob is looked up on the
+	 * target's (u2's) prison.
+	 */
+	if (prison_allow(u2, PR_ALLOW_UNPRIV_PARENT_TAMPER))
+		return (0);
+
+	return (EPERM);
+}
+
 /*
  * Common helper of the cr_cansee*() family enforcing the system-wide
  * security.bsd.see_* policies.  The uid, gid and jail policies are consulted
  * in that order and the first refusal is returned.
  * Returns: 0 for permitted, ESRCH otherwise
  */
 int
 cr_bsd_visible(struct ucred *u1, struct ucred *u2)
 {
 	int error;
 
 	if ((error = cr_canseeotheruids(u1, u2)) != 0 ||
 	    (error = cr_canseeothergids(u1, u2)) != 0)
 		return (error);
 
 	return (cr_canseejailproc(u1, u2));
 }
 
 /*-
  * Decide whether u1 "can see" the subject described by u2.  The jail check
  * comes first, then the MAC visibility hook (MAC kernels only), then the
  * security.bsd.see_* policies.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: none
  * References: *u1 and *u2 must not change during the call
  *             u1 may equal u2, in which case only one reference is required
  */
 int
 cr_cansee(struct ucred *u1, struct ucred *u2)
 {
 	int error;
 
 	error = prison_check(u1, u2);
 	if (error != 0)
 		return (error);
 #ifdef MAC
 	error = mac_cred_check_visible(u1, u2);
 	if (error != 0)
 		return (error);
 #endif
 	return (cr_bsd_visible(u1, u2));
 }
 
 /*-
  * Decide whether td "can see" the process p.  A process can always see
  * itself; every other case is delegated to cr_cansee().
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect p->p_ucred must be held.  td really
  *        should be curthread.
  * References: td and p must be valid for the lifetime of the call
  */
 int
 p_cansee(struct thread *td, struct proc *p)
 {
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	return (td->td_proc == p ? 0 : cr_cansee(td->td_ucred, p->p_ucred));
 }
 
 /*
  * 'conservative_signals' prevents the delivery of a broad class of
  * signals by unprivileged processes to processes that have changed their
  * credentials since the last invocation of execve().  This can prevent
  * the leakage of cached information or retained privileges as a result
  * of a common class of signal-related vulnerabilities.  However, this
  * may interfere with some applications that expect to be able to
  * deliver these signals to peer processes after having given up
  * privilege.
  */
 static int	conservative_signals = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW,
     &conservative_signals, 0, "Unprivileged processes prevented from "
     "sending certain signals to processes whose credentials have changed");
 /*-
  * Determine whether cred may deliver the specified signal to proc.
  *
  * The checks run in this order and the first refusal is returned: jail
  * (prison_check()), MAC (MAC kernels only), security.bsd.see_* visibility,
  * 'conservative_signals' restrictions on P_SUGID targets, uid matching, and
  * finally the cross-jail tampering check.
  *
  * Returns: 0 for permitted, an errno value otherwise.
  * Locks: A lock must be held for proc.
  * References: cred and proc must be valid for the lifetime of the call.
  */
 int
 cr_cansignal(struct ucred *cred, struct proc *proc, int signum)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(proc, MA_OWNED);
 	/*
 	 * Jail semantics limit the scope of signalling to proc in the
 	 * same jail as cred, if cred is in jail.
 	 */
 	error = prison_check(cred, proc->p_ucred);
 	if (error)
 		return (error);
 #ifdef MAC
 	if ((error = mac_proc_check_signal(cred, proc, signum)))
 		return (error);
 #endif
 	if ((error = cr_bsd_visible(cred, proc->p_ucred)))
 		return (error);
 
 	/*
 	 * UNIX signal semantics depend on the status of the P_SUGID
 	 * bit on the target process.  If the bit is set, then additional
 	 * restrictions are placed on the set of available signals.
 	 */
 	if (conservative_signals && (proc->p_flag & P_SUGID)) {
 		switch (signum) {
 		case 0:
 		case SIGKILL:
 		case SIGINT:
 		case SIGTERM:
 		case SIGALRM:
 		case SIGSTOP:
 		case SIGTTIN:
 		case SIGTTOU:
 		case SIGTSTP:
 		case SIGHUP:
 		case SIGUSR1:
 		case SIGUSR2:
 			/*
 			 * Generally, permit job and terminal control
 			 * signals.
 			 */
 			break;
 		default:
 			/* Not permitted without privilege. */
 			error = priv_check_cred(cred, PRIV_SIGNAL_SUGID);
 			if (error)
 				return (error);
 		}
 	}
 
 	/*
 	 * Generally, the target credential's ruid or svuid must match the
 	 * subject credential's ruid or euid.  When none of the four
 	 * combinations match, PRIV_SIGNAL_DIFFCRED is required.
 	 */
 	if (cred->cr_ruid != proc->p_ucred->cr_ruid &&
 	    cred->cr_ruid != proc->p_ucred->cr_svuid &&
 	    cred->cr_uid != proc->p_ucred->cr_ruid &&
 	    cred->cr_uid != proc->p_ucred->cr_svuid) {
 		error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED);
 		if (error)
 			return (error);
 	}
 
+	/*
+	 * At this point, the target may be in a different jail than the
+	 * subject -- the subject must be in a parent jail to the target,
+	 * whether it is prison0 or a subordinate of prison0 that has
+	 * children.  Additional privileges are required to allow this, as
+	 * whether the creds are truly equivalent or not must be determined on
+	 * a case-by-case basis.
+	 */
+	error = cr_can_tamper_with_subjail(cred, proc->p_ucred,
+	    PRIV_SIGNAL_DIFFJAIL);
+	if (error)
+		return (error);
+
 	return (0);
 }
 
 /*-
  * Decide whether td may deliver signal 'signum' to process p.  Three cases
  * are granted outright (signalling oneself, SIGCONT within a session, and
  * SIGTHR-range signals between processes sharing a p_leader); everything
  * else is decided by cr_cansignal().
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect various components of td and p
  *        must be held.  td must be curthread, and a lock must be
  *        held for p.
  * References: td and p must be valid for the lifetime of the call
  */
 int
 p_cansignal(struct thread *td, struct proc *p, int signum)
 {
 	struct proc *const self = td->td_proc;
 
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* A process may always signal itself. */
 	if (self == p)
 		return (0);
 
 	/*
 	 * Per UNIX signalling semantics, processes of one session can always
 	 * send each other SIGCONT, whatever the remaining protections say.
 	 */
 	/* XXX: This will require an additional lock of some sort. */
 	if (signum == SIGCONT && self->p_session == p->p_session)
 		return (0);
 
 	/*
 	 * Some compat layers rely on SIGTHR and the signals right above it
 	 * for communication between kernel threads of one process, and expect
 	 * delivery to always succeed, even for suid applications where
 	 * cr_cansignal() could refuse it for security reasons.  This should
 	 * be pretty safe, since rfork(2) is the only way to create two
 	 * processes with the same p_leader.
 	 */
 	if (self->p_leader != NULL && self->p_leader == p->p_leader &&
 	    signum >= SIGTHR && signum < SIGTHR + 4)
 		return (0);
 
 	return (cr_cansignal(td->td_ucred, p, signum));
 }
 
 /*-
  * Determine whether td may reschedule p.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect various components of td and p
  *        must be held.  td must be curthread, and a lock must
  *        be held for p.
  * References: td and p must be valid for the lifetime of the call
  *
  * The checks run in a fixed order and the first one that fails supplies
  * the returned errno.
  */
 int
 p_cansched(struct thread *td, struct proc *p)
 {
 	int error;
 
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* A process may always reschedule itself. */
 	if (td->td_proc == p)
 		return (0);
 	/* Jail, MAC (when compiled in) and BSD visibility checks. */
 	if ((error = prison_check(td->td_ucred, p->p_ucred)))
 		return (error);
 #ifdef MAC
 	if ((error = mac_proc_check_sched(td->td_ucred, p)))
 		return (error);
 #endif
 	if ((error = cr_bsd_visible(td->td_ucred, p->p_ucred)))
 		return (error);
 
 	/*
 	 * Unless the target's real uid matches the subject's real or
 	 * effective uid, PRIV_SCHED_DIFFCRED is required.
 	 */
 	if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid &&
 	    td->td_ucred->cr_uid != p->p_ucred->cr_ruid) {
 		error = priv_check(td, PRIV_SCHED_DIFFCRED);
 		if (error)
 			return (error);
 	}
+
 	/*
 	 * Same jail consideration as at the end of cr_cansignal(): the target
 	 * may live in a jail below the subject's.  NOTE(review): presumably
 	 * PRIV_SCHED_DIFFJAIL is only demanded in that case -- confirm in
 	 * cr_can_tamper_with_subjail().
 	 */
+	error = cr_can_tamper_with_subjail(td->td_ucred, p->p_ucred,
+	    PRIV_SCHED_DIFFJAIL);
+	if (error)
+		return (error);
+
 	return (0);
 }
 
 /*
  * Sysctl handler exposing the PR_ALLOW_UNPRIV_DEBUG permission of the
  * requesting thread's credential as an integer.  Reads report the current
  * value; writes accept only 0 or 1 (EINVAL otherwise) and store the new
  * setting through prison_set_allow().
  */
 static int
 sysctl_unprivileged_proc_debug(SYSCTL_HANDLER_ARGS)
 {
 	struct ucred *cred;
 	int error, val;
 
 	cred = req->td->td_ucred;
 	val = prison_allow(cred, PR_ALLOW_UNPRIV_DEBUG);
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0)
 		return (error);
 	/* Nothing more to do for a pure read. */
 	if (req->newptr == NULL)
 		return (0);
 	if (val < 0 || val > 1)
 		return (EINVAL);
 	prison_set_allow(cred, PR_ALLOW_UNPRIV_DEBUG, val);
 	return (0);
 }
 
 /*
  * The 'unprivileged_proc_debug' flag may be used to disable a variety of
  * unprivileged inter-process debugging services, including some procfs
  * functionality, ptrace(), and ktrace().  In the past, inter-process
  * debugging has been involved in a variety of security problems, and sites
  * not requiring the service might choose to disable it when hardening
  * systems.
  */
 SYSCTL_PROC(_security_bsd, OID_AUTO, unprivileged_proc_debug,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_SECURE |
     CTLFLAG_MPSAFE, 0, 0, sysctl_unprivileged_proc_debug, "I",
     "Unprivileged processes may use process debugging facilities");
 
 /*
  * Return true if the object's owner and group ids are a subset of the
  * active credential's.
  *
  * active_cred: credential of the acting subject.
  * obj_cred:    credential of the object being acted upon.
  */
 bool
 cr_xids_subset(struct ucred *active_cred, struct ucred *obj_cred)
 {
 	int i;
 	bool grpsubset, uidsubset;
 
 	/*
 	 * Is obj_cred's group set a subset of active_cred's effective group
 	 * set?  This covers obj_cred's egid, rgid, svgid and every entry of
 	 * its supplementary group list.  The egid is tested explicitly:
 	 * elsewhere in this file (cru2x(), crsetgroups_and_egid()) cr_gid is
 	 * kept outside of cr_groups[], so walking the array alone would miss
 	 * it.
 	 */
 	grpsubset = groupmember(obj_cred->cr_gid, active_cred) &&
 	    groupmember(obj_cred->cr_rgid, active_cred) &&
 	    groupmember(obj_cred->cr_svgid, active_cred);
 	for (i = 0; grpsubset && i < obj_cred->cr_ngroups; i++)
 		grpsubset = groupmember(obj_cred->cr_groups[i], active_cred);
 
 	/*
 	 * Are all the uids present in obj_cred (euid, svuid and ruid) equal
 	 * to active_cred's effective uid?
 	 */
 	uidsubset = (active_cred->cr_uid == obj_cred->cr_uid &&
 	    active_cred->cr_uid == obj_cred->cr_svuid &&
 	    active_cred->cr_uid == obj_cred->cr_ruid);
 
 	return (uidsubset && grpsubset);
 }
 
 /*-
  * Determine whether td may debug p.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect various components of td and p
  *        must be held.  td must be curthread, and a lock must
  *        be held for p.
  * References: td and p must be valid for the lifetime of the call
  *
  * The checks run in a fixed order and the first one that fails supplies
  * the returned errno.
  */
 int
 p_candebug(struct thread *td, struct proc *p)
 {
 	int error;
 
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* A process may always debug itself. */
 	if (td->td_proc == p)
 		return (0);
 	/* Debugging any other process requires PRIV_DEBUG_UNPRIV. */
 	if ((error = priv_check(td, PRIV_DEBUG_UNPRIV)))
 		return (error);
 	/* Jail, MAC (when compiled in) and BSD visibility checks. */
 	if ((error = prison_check(td->td_ucred, p->p_ucred)))
 		return (error);
 #ifdef MAC
 	if ((error = mac_proc_check_debug(td->td_ucred, p)))
 		return (error);
 #endif
 	if ((error = cr_bsd_visible(td->td_ucred, p->p_ucred)))
 		return (error);
 
 	/*
 	 * If p's gids aren't a subset, or the uids aren't a subset,
 	 * or the credential has changed, require appropriate privilege
 	 * for td to debug p.
 	 */
 	if (!cr_xids_subset(td->td_ucred, p->p_ucred)) {
 		error = priv_check(td, PRIV_DEBUG_DIFFCRED);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Has the credential of the process changed since the last exec()?
 	 */
 	if ((p->p_flag & P_SUGID) != 0) {
 		error = priv_check(td, PRIV_DEBUG_SUGID);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Same jail consideration as at the end of cr_cansignal(): the target
 	 * may live in a jail below the subject's.  NOTE(review): presumably
 	 * PRIV_DEBUG_DIFFJAIL is only demanded in that case -- confirm in
 	 * cr_can_tamper_with_subjail().
 	 */
+	error = cr_can_tamper_with_subjail(td->td_ucred, p->p_ucred,
+	    PRIV_DEBUG_DIFFJAIL);
+	if (error)
+		return (error);
+
 	/* Can't trace init when securelevel > 0. */
 	if (p == initproc) {
 		error = securelevel_gt(td->td_ucred, 0);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Can't trace a process that's currently exec'ing.
 	 *
 	 * XXX: Note, this is not a security policy decision, it's a
 	 * basic correctness/functionality decision.  Therefore, this check
 	 * should be moved to the caller's of p_candebug().
 	 */
 	if ((p->p_flag & P_INEXEC) != 0)
 		return (EBUSY);
 
 	/*
 	 * Denied explicitly: the target carries P2_NOTRACE, which only
 	 * PRIV_DEBUG_DENIED overrides.
 	 */
 	if ((p->p_flag2 & P2_NOTRACE) != 0) {
 		error = priv_check(td, PRIV_DEBUG_DENIED);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*-
  * Determine whether the subject represented by cred can "see" a socket.
  * Returns: 0 for permitted, ENOENT otherwise.
  */
 int
 cr_canseesocket(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	error = prison_check(cred, so->so_cred);
 	if (error)
 		return (ENOENT);
 #ifdef MAC
 	error = mac_socket_check_visible(cred, so);
 	if (error)
 		return (error);
 #endif
 	if (cr_bsd_visible(cred, so->so_cred))
 		return (ENOENT);
 
 	return (0);
 }
 
 /*-
  * Determine whether td can wait for the exit of p.
  * Returns: 0 for permitted, an errno value otherwise
  * Locks: Sufficient locks to protect various components of td and p
  *        must be held.  td must be curthread, and a lock must
  *        be held for p.
  * References: td and p must be valid for the lifetime of the call
 
  */
 int
 p_canwait(struct thread *td, struct proc *p)
 {
 	int error;
 
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((error = prison_check(td->td_ucred, p->p_ucred)))
 		return (error);
 #ifdef MAC
 	if ((error = mac_proc_check_wait(td->td_ucred, p)))
 		return (error);
 #endif
 #if 0
 	/* XXXMAC: This could have odd effects on some shells. */
 	if ((error = cr_bsd_visible(td->td_ucred, p->p_ucred)))
 		return (error);
 #endif
 
 	return (0);
 }
 
 /*
  * Credential management.
  *
  * struct ucred objects are rarely allocated but gain and lose references all
  * the time (e.g., on struct file alloc/dealloc) turning refcount updates into
  * a significant source of cache-line ping ponging. Common cases are worked
  * around by modifying thread-local counter instead if the cred to operate on
  * matches td_realucred.
  *
  * The counter is split into 2 parts:
  * - cr_users -- total count of all struct proc and struct thread objects
  *   which have given cred in p_ucred and td_ucred respectively
  * - cr_ref -- the actual ref count, only valid if cr_users == 0
  *
  * If users == 0 then cr_ref behaves similarly to refcount(9), in particular if
  * the count reaches 0 the object is freeable.
  * If users > 0 and curthread->td_realucred == cred, then updates are performed
  * against td_ucredref.
  * In other cases updates are performed against cr_ref.
  *
  * Changing td_realucred into something else decrements cr_users and transfers
  * accumulated updates.
  */
 /*
  * Register one more user of cr: under cr_mtx, both cr_users and cr_ref are
  * incremented.  cr must already have at least one user (asserted).
  * Returns cr.
  */
 struct ucred *
 crcowget(struct ucred *cr)
 {
 
 	mtx_lock(&cr->cr_mtx);
 	KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p",
 	    __func__, cr->cr_users, cr));
 	cr->cr_ref += 1;
 	cr->cr_users += 1;
 	mtx_unlock(&cr->cr_mtx);
 
 	return (cr);
 }
 
 /*
  * Detach td from its current credential (td_realucred).
  *
  * Under cr_mtx, the thread-local reference delta td_ucredref is folded into
  * cr_ref and cr_users is decremented.  If td was the last user, the
  * credential is returned with its reference left in place for the caller to
  * release; otherwise that reference is dropped here and NULL is returned.
  * td_realucred is cleared in both cases.
  */
 static struct ucred *
 crunuse(struct thread *td)
 {
 	struct ucred *cr, *crold;
 
 	MPASS(td->td_realucred == td->td_ucred);
 	cr = td->td_realucred;
 	mtx_lock(&cr->cr_mtx);
 	/* Transfer the updates accumulated in the thread-local counter. */
 	cr->cr_ref += td->td_ucredref;
 	td->td_ucredref = 0;
 	KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p",
 	    __func__, cr->cr_users, cr));
 	cr->cr_users--;
 	if (cr->cr_users == 0) {
 		/* Last user: hand the credential back to the caller. */
 		KASSERT(cr->cr_ref > 0, ("%s: ref %ld not > 0 on cred %p",
 		    __func__, cr->cr_ref, cr));
 		crold = cr;
 	} else {
 		/* Other users remain: just drop this user's reference. */
 		cr->cr_ref--;
 		crold = NULL;
 	}
 	mtx_unlock(&cr->cr_mtx);
 	td->td_realucred = NULL;
 	return (crold);
 }
 
 /*
  * Batched release of cr on behalf of several threads at once.
  *
  * users: number of users to remove from cr_users (must be > 0).
  * ref:   reference delta accumulated by those users, added to cr_ref.
  *
  * One reference is dropped per removed user.  When neither users nor
  * references remain, the credential is destroyed.
  */
 static void
 crunusebatch(struct ucred *cr, u_int users, long ref)
 {
 
 	KASSERT(users > 0, ("%s: passed users %d not > 0 ; cred %p",
 	    __func__, users, cr));
 	mtx_lock(&cr->cr_mtx);
 	KASSERT(cr->cr_users >= users, ("%s: users %d not > %d on cred %p",
 	    __func__, cr->cr_users, users, cr));
 	cr->cr_users -= users;
 	cr->cr_ref += ref;
 	cr->cr_ref -= users;
 	if (cr->cr_users > 0) {
 		mtx_unlock(&cr->cr_mtx);
 		return;
 	}
 	KASSERT(cr->cr_ref >= 0, ("%s: ref %ld not >= 0 on cred %p",
 	    __func__, cr->cr_ref, cr));
 	if (cr->cr_ref > 0) {
 		mtx_unlock(&cr->cr_mtx);
 		return;
 	}
 	/* Called with cr_mtx still held; crfree_final() destroys the mutex. */
 	crfree_final(cr);
 }
 
 /*
  * Detach td from its credential and, if td turns out to have been the last
  * user, release the reference handed back by crunuse().
  */
 void
 crcowfree(struct thread *td)
 {
 	struct ucred *last;
 
 	last = crunuse(td);
 	if (last == NULL)
 		return;
 	crfree(last);
 }
 
 /*
  * Make curthread use its process' current credential.
  *
  * The process lock must be owned.  Nothing happens, and NULL is returned,
  * if the thread already uses p_ucred.  Otherwise the thread is switched
  * over and the result of crunuse() on the previous credential is returned,
  * i.e., either NULL or a credential the caller has to release.
  */
 struct ucred *
 crcowsync(void)
 {
 	struct ucred *newcr, *oldcr;
 	struct proc *p;
 	struct thread *td;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS(td->td_realucred == td->td_ucred);
 
 	if (p->p_ucred == td->td_realucred)
 		return (NULL);
 
 	/* Acquire the new credential before letting go of the old one. */
 	newcr = crcowget(p->p_ucred);
 	oldcr = crunuse(td);
 	td->td_realucred = newcr;
 	td->td_ucred = newcr;
 
 	return (oldcr);
 }
 
 /*
  * Batching.
  */
 /*
  * Add the credential of the inactive thread td to batch crb and detach td
  * from it.  If the batch currently accumulates counts for a different
  * credential, those are flushed through crunusebatch() first.
  */
 void
 credbatch_add(struct credbatch *crb, struct thread *td)
 {
 	struct ucred *tdcr;
 
 	MPASS(td->td_realucred != NULL);
 	MPASS(td->td_realucred == td->td_ucred);
 	MPASS(TD_GET_STATE(td) == TDS_INACTIVE);
 	tdcr = td->td_realucred;
 	KASSERT(tdcr->cr_users > 0, ("%s: users %d not > 0 on cred %p",
 	    __func__, tdcr->cr_users, tdcr));
 
 	/* Switching credentials: settle what was batched so far. */
 	if (crb->cred != tdcr && crb->users > 0) {
 		MPASS(crb->cred != NULL);
 		crunusebatch(crb->cred, crb->users, crb->ref);
 		crb->users = 0;
 		crb->ref = 0;
 	}
 
 	crb->cred = tdcr;
 	crb->users += 1;
 	crb->ref += td->td_ucredref;
 
 	td->td_ucredref = 0;
 	td->td_realucred = NULL;
 }
 
 /*
  * Flush the counts accumulated in crb to its credential through
  * crunusebatch().  The batch must hold a credential and at least one user
  * (both asserted).
  */
 void
 credbatch_final(struct credbatch *crb)
 {
 
 	MPASS(crb->cred != NULL);
 	MPASS(crb->users > 0);
 	crunusebatch(crb->cred, crb->users, crb->ref);
 }
 
 /*
  * Allocate a zeroed cred structure.
  *
  * The allocation uses M_WAITOK.  The returned credential has its mutex
  * initialized, a reference count of 1, and its group array pointing at the
  * embedded cr_smallgroups storage.
  */
 struct ucred *
 crget(void)
 {
 	struct ucred *newcr;
 
 	newcr = malloc(sizeof(struct ucred), M_CRED, M_WAITOK | M_ZERO);
 	mtx_init(&newcr->cr_mtx, "cred", NULL, MTX_DEF);
 	newcr->cr_ref = 1;
 #ifdef AUDIT
 	audit_cred_init(newcr);
 #endif
 #ifdef MAC
 	mac_cred_init(newcr);
 #endif
 	/* Start out with the embedded small group array. */
 	newcr->cr_groups = newcr->cr_smallgroups;
 	newcr->cr_agroups = nitems(newcr->cr_smallgroups);
 
 	return (newcr);
 }
 
 /*
  * Claim another reference to a ucred structure.
  *
  * If cr is curthread's td_realucred, only the thread-local counter
  * td_ucredref is bumped; otherwise cr_ref is incremented under cr_mtx.
  * Returns cr.
  */
 struct ucred *
 crhold(struct ucred *cr)
 {
 	struct thread *td;
 
 	td = curthread;
 	if (__predict_false(td->td_realucred != cr)) {
 		/* Not this thread's credential: update the shared count. */
 		mtx_lock(&cr->cr_mtx);
 		cr->cr_ref++;
 		mtx_unlock(&cr->cr_mtx);
 		return (cr);
 	}
 
 	KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p",
 	    __func__, cr->cr_users, cr));
 	td->td_ucredref++;
 	return (cr);
 }
 
 /*
  * Release a reference on a cred structure.
  *
  * If cr is curthread's td_realucred, only the thread-local counter
  * td_ucredref is decremented.  Otherwise cr_ref is decremented under
  * cr_mtx, and the structure is destroyed once it has neither users nor
  * references left.
  */
 void
 crfree(struct ucred *cr)
 {
 	struct thread *td;
 
 	td = curthread;
 	if (__predict_true(td->td_realucred == cr)) {
 		KASSERT(cr->cr_users > 0, ("%s: users %d not > 0 on cred %p",
 		    __func__, cr->cr_users, cr));
 		td->td_ucredref--;
 		return;
 	}
 
 	mtx_lock(&cr->cr_mtx);
 	KASSERT(cr->cr_users >= 0, ("%s: users %d not >= 0 on cred %p",
 	    __func__, cr->cr_users, cr));
 	cr->cr_ref--;
 	if (!(cr->cr_users > 0)) {
 		KASSERT(cr->cr_ref >= 0, ("%s: ref %ld not >= 0 on cred %p",
 		    __func__, cr->cr_ref, cr));
 		if (!(cr->cr_ref > 0)) {
 			/* crfree_final() disposes of the held mutex. */
 			crfree_final(cr);
 			return;
 		}
 	}
 	mtx_unlock(&cr->cr_mtx);
 }
 
 /*
  * Destroy a credential that has no users and no references left (both
  * asserted): drop the references it holds on other objects, tear down the
  * audit and MAC state when those are compiled in, destroy its mutex and
  * free its memory, including a separately allocated group array.
  */
 static void
 crfree_final(struct ucred *cr)
 {
 
 	KASSERT(cr->cr_users == 0, ("%s: users %d not == 0 on cred %p",
 	    __func__, cr->cr_users, cr));
 	KASSERT(cr->cr_ref == 0, ("%s: ref %ld not == 0 on cred %p",
 	    __func__, cr->cr_ref, cr));
 
 	/*
 	 * Some callers of crget(), such as nfs_statfs(), allocate a temporary
 	 * credential, but don't allocate a uidinfo structure.
 	 */
 	if (cr->cr_uidinfo != NULL)
 		uifree(cr->cr_uidinfo);
 	if (cr->cr_ruidinfo != NULL)
 		uifree(cr->cr_ruidinfo);
 	if (cr->cr_prison != NULL)
 		prison_free(cr->cr_prison);
 	if (cr->cr_loginclass != NULL)
 		loginclass_free(cr->cr_loginclass);
 #ifdef AUDIT
 	audit_cred_destroy(cr);
 #endif
 #ifdef MAC
 	mac_cred_destroy(cr);
 #endif
 	mtx_destroy(&cr->cr_mtx);
 	/* The embedded cr_smallgroups array is part of 'cr' itself. */
 	if (cr->cr_groups != cr->cr_smallgroups)
 		free(cr->cr_groups, M_CRED);
 	free(cr, M_CRED);
 }
 
 /*
  * Copy a ucred's contents from a template.  Does not block.
  *
  * NOTE(review): crsetgroups() is documented as possibly sleeping when it
  * has to enlarge the group array, so "does not block" presumably relies on
  * 'dest' already having room for src->cr_ngroups groups (as crcopysafe()
  * arranges before calling here) -- confirm for the other callers.
  */
 void
 crcopy(struct ucred *dest, struct ucred *src)
 {
 
 	/* Bulk-copy the fields lying between cr_startcopy and cr_endcopy. */
 	bcopy(&src->cr_startcopy, &dest->cr_startcopy,
 	    (unsigned)((caddr_t)&src->cr_endcopy -
 		(caddr_t)&src->cr_startcopy));
 	dest->cr_flags = src->cr_flags;
 	crsetgroups(dest, src->cr_ngroups, src->cr_groups);
 	/*
 	 * 'dest' now refers to the same uidinfo, prison and login class
 	 * objects as 'src' (presumably through the bcopy() above), so it
 	 * needs its own references on them.
 	 */
 	uihold(dest->cr_uidinfo);
 	uihold(dest->cr_ruidinfo);
 	prison_hold(dest->cr_prison);
 	loginclass_hold(dest->cr_loginclass);
 #ifdef AUDIT
 	audit_cred_copy(src, dest);
 #endif
 #ifdef MAC
 	mac_cred_copy(src, dest);
 #endif
 }
 
 /*
  * Duplicate a cred structure: allocate a fresh one with crget(), fill it
  * from 'cr' with crcopy() and return it.
  */
 struct ucred *
 crdup(struct ucred *cr)
 {
 	struct ucred *copy;
 
 	copy = crget();
 	crcopy(copy, cr);
 
 	return (copy);
 }
 
 /*
  * Fill in a struct xucred based on a struct ucred.
  *
  * 'xcr' is zeroed first.  Its group count includes the effective GID and
  * is capped at the capacity of xcr->cr_groups; surplus supplementary groups
  * are not copied.
  */
 void
 cru2x(struct ucred *cr, struct xucred *xcr)
 {
 	int total;
 
 	bzero(xcr, sizeof(*xcr));
 	xcr->cr_version = XUCRED_VERSION;
 	xcr->cr_uid = cr->cr_uid;
 	xcr->cr_gid = cr->cr_gid;
 
 	/*
 	 * The xucred aliases cr_gid to cr_groups[0] through a union, which is
 	 * somewhat ugly; to avoid bumping the xucred version, its cr_ngroups
 	 * keeps counting the egid in addition to the supplementary groups.
 	 */
 	total = MIN(cr->cr_ngroups + 1, nitems(xcr->cr_groups));
 	xcr->cr_ngroups = total;
 	/* Everything but the egid slot comes from the supplementary list. */
 	bcopy(cr->cr_groups, xcr->cr_sgroups,
 	    (total - 1) * sizeof(cr->cr_groups[0]));
 }
 
 /*
  * Fill in a struct xucred from td's credential (td_ucred) and record the
  * pid of td's process in it.
  */
 void
 cru2xt(struct thread *td, struct xucred *xcr)
 {
 
 	cru2x(td->td_ucred, xcr);
 	xcr->cr_pid = td->td_proc->p_pid;
 }
 
 /*
  * Change process credentials.
  *
  * Callers are responsible for providing the reference for passed credentials
  * and for freeing old ones.  Calls chgproccnt() to correctly account the
  * current process to the proper real UID, if the latter has changed.  Returns
  * whether the operation was successful.  Failure can happen only on
  * 'enforce_proc_lim' being true and if no new process can be accounted to the
  * new real UID because of the current limit (see the inner comment for more
  * details) and the caller does not have privilege (PRIV_PROC_LIMIT) to override
  * that.
  *
  * The process lock must be owned, and 'newcred' must not be in use yet
  * (cr_users == 0, cr_ref == 1); both are asserted.
  */
 static bool
 _proc_set_cred(struct proc *p, struct ucred *newcred, bool enforce_proc_lim)
 {
 	struct ucred *const oldcred = p->p_ucred;
 
 	MPASS(oldcred != NULL);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(newcred->cr_users == 0, ("%s: users %d not 0 on cred %p",
 	    __func__, newcred->cr_users, newcred));
 	KASSERT(newcred->cr_ref == 1, ("%s: ref %ld not 1 on cred %p",
 	    __func__, newcred->cr_ref, newcred));
 
 	if (newcred->cr_ruidinfo != oldcred->cr_ruidinfo) {
 		/*
 		 * XXXOC: This check is flawed but nonetheless the best we can
 		 * currently do as we don't really track limits per UID contrary
 		 * to what we pretend in setrlimit(2).  Until this is reworked,
 		 * we just check here that the number of processes for our new
 		 * real UID doesn't exceed this process' process number limit
 		 * (which is meant to be associated with the current real UID).
 		 */
 		/*
 		 * NOTE(review): a limit argument of 0 presumably makes
 		 * chgproccnt() skip the limit check -- confirm there.
 		 */
 		const int proccnt_changed = chgproccnt(newcred->cr_ruidinfo, 1,
 		    enforce_proc_lim ? lim_cur_proc(p, RLIMIT_NPROC) : 0);
 
 		if (!proccnt_changed) {
 			/*
 			 * The count was not changed: give up unless the old
 			 * credential holds PRIV_PROC_LIMIT, in which case
 			 * the accounting is redone with a limit of 0.
 			 */
 			if (priv_check_cred(oldcred, PRIV_PROC_LIMIT) != 0)
 				return (false);
 			(void)chgproccnt(newcred->cr_ruidinfo, 1, 0);
 		}
 	}
 
 	/* The process stops being a user of the old credential. */
 	mtx_lock(&oldcred->cr_mtx);
 	KASSERT(oldcred->cr_users > 0, ("%s: users %d not > 0 on cred %p",
 	    __func__, oldcred->cr_users, oldcred));
 	oldcred->cr_users--;
 	mtx_unlock(&oldcred->cr_mtx);
 	p->p_ucred = newcred;
 	/*
 	 * Set without taking newcred's mutex; presumably safe because the
 	 * assertions above establish that nobody else uses 'newcred' yet.
 	 */
 	newcred->cr_users = 1;
 	PROC_UPDATE_COW(p);
 	/* Un-account the process from the old real UID if it changed. */
 	if (newcred->cr_ruidinfo != oldcred->cr_ruidinfo)
 		(void)chgproccnt(oldcred->cr_ruidinfo, -1, 0);
 	return (true);
 }
 
 /*
  * Change the credentials of process p to newcred without enforcing the
  * process count limit (see _proc_set_cred()).  The change is asserted to
  * succeed.
  */
 void
 proc_set_cred(struct proc *p, struct ucred *newcred)
 {
 	bool success __diagused = _proc_set_cred(p, newcred, false);
 
 	MPASS(success);
 }
 
 /*
  * Change the credentials of process p to newcred while enforcing the
  * process count limit (see _proc_set_cred()).  Returns false, leaving the
  * process on its old credentials, if _proc_set_cred() refuses the change.
  */
 bool
 proc_set_cred_enforce_proc_lim(struct proc *p, struct ucred *newcred)
 {
 	return (_proc_set_cred(p, newcred, true));
 }
 
 /*
  * Detach the credential from a process that is either a zombie or still
  * new (asserted).  p_ucred is cleared, the process stops counting as a
  * user of the credential, and the credential's reference is released.  If
  * 'decrement_proc_count' is true, the process is also un-accounted from the
  * credential's real UID.
  */
 void
 proc_unset_cred(struct proc *p, bool decrement_proc_count)
 {
 	struct ucred *oldcr;
 
 	MPASS(p->p_state == PRS_ZOMBIE || p->p_state == PRS_NEW);
 	oldcr = p->p_ucred;
 	p->p_ucred = NULL;
 	KASSERT(oldcr->cr_users > 0, ("%s: users %d not > 0 on cred %p",
 	    __func__, oldcr->cr_users, oldcr));
 
 	mtx_lock(&oldcr->cr_mtx);
 	oldcr->cr_users--;
 	/* Without users left, a reference must still be outstanding. */
 	KASSERT(oldcr->cr_users != 0 || oldcr->cr_ref > 0,
 	    ("%s: ref %ld not > 0 on cred %p", __func__, oldcr->cr_ref,
 	    oldcr));
 	mtx_unlock(&oldcr->cr_mtx);
 
 	if (decrement_proc_count)
 		(void)chgproccnt(oldcr->cr_ruidinfo, -1, 0);
 	crfree(oldcr);
 }
 
 /*
  * Copy the current credential of process p into 'cr'.
  *
  * The process lock must be owned.  It is dropped temporarily whenever 'cr'
  * has to be extended to hold the process' groups, after which p_ucred is
  * read again since it may have changed meanwhile.  Returns the credential
  * that was copied from.
  */
 struct ucred *
 crcopysafe(struct proc *p, struct ucred *cr)
 {
 	struct ucred *oldcred;
 	int needed;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	for (oldcred = p->p_ucred; cr->cr_agroups < oldcred->cr_ngroups;
 	    oldcred = p->p_ucred) {
 		needed = oldcred->cr_ngroups;
 		PROC_UNLOCK(p);
 		crextend(cr, needed);
 		PROC_LOCK(p);
 	}
 	crcopy(cr, oldcred);
 
 	return (oldcred);
 }
 
 /*
  * Extend the passed-in credentials to hold n groups.
  *
  * Must not be called after groups have been set: a newly allocated array
  * replaces the old one without its content being copied over.  Nothing is
  * done if the current array is already large enough.  The allocation uses
  * M_WAITOK.
  */
 void
 crextend(struct ucred *cr, int n)
 {
 	size_t nbytes;
 
 	MPASS2(cr->cr_ref == 1, "'cr_ref' must be 1 (referenced, unshared)");
 	MPASS2((cr->cr_flags & CRED_FLAG_GROUPSET) == 0,
 	    "groups on 'cr' already set!");
 	groups_check_positive_len(n);
 	groups_check_max_len(n);
 
 	if (cr->cr_agroups >= n)
 		return;
 
 	nbytes = n * sizeof(gid_t);
 	if (nbytes < n)
 		panic("Too many groups (memory size overflow)! "
 		    "Computation of 'kern.ngroups' should have prevented this, "
 		    "please fix it. In the meantime, reduce 'kern.ngroups'.");
 
 	/*
 	 * Sizes of a page or more are rounded up to a whole number of pages
 	 * (roundup2() assumes PAGE_SIZE is a power of 2).  Smaller sizes are
 	 * rounded up to a power of 2; no assumption on sizeof(gid_t) is
 	 * needed for that.
 	 */
 	if (nbytes >= PAGE_SIZE)
 		nbytes = roundup2(nbytes, PAGE_SIZE);
 	else if (!powerof2(nbytes))
 		/* fls*() return a bit index starting at 1. */
 		nbytes = 1 << flsl(nbytes);
 
 	/* Release the previous array unless it is the embedded one. */
 	if (cr->cr_groups != cr->cr_smallgroups)
 		free(cr->cr_groups, M_CRED);
 
 	cr->cr_groups = malloc(nbytes, M_CRED, M_WAITOK | M_ZERO);
 	cr->cr_agroups = nbytes / sizeof(gid_t);
 }
 
 /*
  * Normalizes a set of groups to be applied to a 'struct ucred'.
  *
  * On return, 'groups' is sorted with gidp_cmp() and free of duplicates, and
  * '*ngrp' has been updated to the number of groups kept.
  */
 static void
 groups_normalize(int *ngrp, gid_t *groups)
 {
 	int count, last;
 
 	count = *ngrp;
 	groups_check_positive_len(count);
 	groups_check_max_len(count);
 
 	/* A single group is normalized by definition. */
 	if (count <= 1)
 		return;
 
 	qsort(groups, count, sizeof(*groups), gidp_cmp);
 
 	/*
 	 * Compact in place: 'last' indexes the most recently kept group, and
 	 * a group is kept only if it differs from that one.
 	 */
 	last = 0;
 	for (int i = 1; i < count; ++i) {
 		if (groups[i] != groups[last])
 			groups[++last] = groups[i];
 	}
 	*ngrp = last + 1;
 
 	groups_check_normalized(*ngrp, groups);
 }
 
 /*
  * Internal function copying groups into a credential.
  *
  * 'ngrp' must be strictly positive, and 'cr' must be unshared and already
  * have room for 'ngrp' groups (see crextend()); all of this is asserted.
  * The 'groups' array is copied verbatim: either it has been normalized in
  * advance (see groups_normalize()), or the credential's groups must be
  * normalized before the structure is used again.
  *
  * The function takes no lock, does not sleep and does not allocate memory,
  * so it can be used under any lock.  See also crsetgroups(), which takes
  * care of extending the credential.
  */
 static void
 crsetgroups_internal(struct ucred *cr, int ngrp, const gid_t *groups)
 {
 
 	MPASS2(cr->cr_ref == 1, "'cr_ref' must be 1 (referenced, unshared)");
 	MPASS2(cr->cr_agroups >= ngrp, "'cr_agroups' too small");
 	groups_check_positive_len(ngrp);
 
 	cr->cr_ngroups = ngrp;
 	cr->cr_flags |= CRED_FLAG_GROUPSET;
 	bcopy(groups, cr->cr_groups, sizeof(gid_t) * ngrp);
 }
 
 /*
  * Copy groups in to a credential after expanding it if required.
  *
  * May sleep in order to allocate memory (except if, e.g., crextend() was
  * called before with 'ngrp' or greater).  Truncates the list to ngroups_max
  * if it is too large.  Array 'groups' doesn't need to be sorted.  If 'ngrp'
  * is 0, the credential is simply left with no groups in its list.
  */
 void
 crsetgroups(struct ucred *cr, int ngrp, const gid_t *groups)
 {
 
 	if (ngroups_max < ngrp)
 		ngrp = ngroups_max;
 
 	cr->cr_ngroups = 0;
 	if (ngrp != 0) {
 		/*
 		 * crextend() asserts that no groups are set, since it may
 		 * switch to new backing storage without preserving the old
 		 * content.  A completely new set is about to be installed
 		 * anyway, so mark the old one as thrown away.
 		 */
 		cr->cr_flags &= ~CRED_FLAG_GROUPSET;
 
 		crextend(cr, ngrp);
 		crsetgroups_internal(cr, ngrp, groups);
 		groups_normalize(&cr->cr_ngroups, cr->cr_groups);
 		return;
 	}
 
 	/* Empty list: nothing to copy, just record that groups are set. */
 	cr->cr_flags |= CRED_FLAG_GROUPSET;
 }
 
 /*
  * Same as crsetgroups() but sets the effective GID as well.
  *
  * Credentials always end up with an effective GID: with an empty array it
  * is 'default_egid' and the group list is emptied; otherwise groups[0]
  * becomes the effective GID and the remaining entries, if any, are
  * installed through crsetgroups().
  */
 void
 crsetgroups_and_egid(struct ucred *cr, int ngrp, const gid_t *groups,
     const gid_t default_egid)
 {
 	if (ngrp != 0) {
 		crsetgroups(cr, ngrp - 1, groups + 1);
 		cr->cr_gid = groups[0];
 		return;
 	}
 
 	cr->cr_gid = default_egid;
 	cr->cr_ngroups = 0;
 	cr->cr_flags |= CRED_FLAG_GROUPSET;
 }
 
 /*
  * Get login name, if available.
  *
  * Copies the login name of the calling process' session, including its
  * terminating NUL, out to uap->namebuf.  uap->namelen is clamped to
  * MAXLOGNAME.  Returns ERANGE if the name and its terminator do not fit in
  * 'namelen' bytes, else the result of copyout().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getlogin_args {
 	char	*namebuf;
 	u_int	namelen;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getlogin(struct thread *td, struct getlogin_args *uap)
 {
 	char login[MAXLOGNAME];
 	struct proc *p;
 	size_t namelen;
 
 	p = td->td_proc;
 	if (uap->namelen > MAXLOGNAME)
 		uap->namelen = MAXLOGNAME;
 
 	PROC_LOCK(p);
 	SESS_LOCK(p->p_session);
 	namelen = strlcpy(login, p->p_session->s_login, uap->namelen);
 	SESS_UNLOCK(p->p_session);
 	PROC_UNLOCK(p);
 
 	/* The terminating NUL has to fit as well. */
 	if (namelen >= uap->namelen)
 		return (ERANGE);
 	return (copyout(login, uap->namebuf, namelen + 1));
 }
 
 /*
  * Set login name.
  *
  * Requires PRIV_PROC_SETLOGIN.  The name is copied in from uap->namebuf
  * into a MAXLOGNAME-sized buffer; a name too long for it is reported as
  * EINVAL.  On success the name is stored in the calling process' session.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct setlogin_args {
 	char	*namebuf;
 };
 #endif
 /* ARGSUSED */
 int
 sys_setlogin(struct thread *td, struct setlogin_args *uap)
 {
 	char logintmp[MAXLOGNAME];
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 
 	/* Guarantees that the strcpy() below cannot overflow s_login. */
 	CTASSERT(sizeof(p->p_session->s_login) >= sizeof(logintmp));
 
 	error = priv_check(td, PRIV_PROC_SETLOGIN);
 	if (error != 0)
 		return (error);
 
 	error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL);
 	if (error == ENAMETOOLONG)
 		return (EINVAL);
 	if (error != 0)
 		return (error);
 
 	AUDIT_ARG_LOGIN(logintmp);
 	PROC_LOCK(p);
 	SESS_LOCK(p->p_session);
 	strcpy(p->p_session->s_login, logintmp);
 	SESS_UNLOCK(p->p_session);
 	PROC_UNLOCK(p);
 
 	return (0);
 }
 
 /*
  * Set the P_SUGID flag on process p.  The process lock must be owned
  * (asserted).
  */
 void
 setsugid(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_flag |= P_SUGID;
 }
 
 /*-
  * Change the effective uid recorded in a credential.
  * Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified;
  *               a reference is taken on euip and the one on the previous
  *               uidinfo is released.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_euid(struct ucred *newcred, struct uidinfo *euip)
 {
 	struct uidinfo *previous;
 
 	previous = newcred->cr_uidinfo;
 	/* Take the new reference before dropping the old one. */
 	uihold(euip);
 	newcred->cr_uid = euip->ui_uid;
 	newcred->cr_uidinfo = euip;
 	uifree(previous);
 }
 
 /*-
  * Change the effective gid recorded in a credential.
  * Side effects: newcred->cr_gid will be modified; nothing else is touched.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_egid(struct ucred *newcred, gid_t egid)
 {
 
 	newcred->cr_gid = egid;
 }
 
 /*-
  * Change the real uid recorded in a credential.
  * Side effects: newcred->cr_ruid and newcred->cr_ruidinfo will be updated;
  *               a reference is taken on ruip and the one on the previous
  *               uidinfo is released.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_ruid(struct ucred *newcred, struct uidinfo *ruip)
 {
 	struct uidinfo *previous;
 
 	previous = newcred->cr_ruidinfo;
 	/* Take the new reference before dropping the old one. */
 	uihold(ruip);
 	newcred->cr_ruid = ruip->ui_uid;
 	newcred->cr_ruidinfo = ruip;
 	uifree(previous);
 }
 
 /*-
  * Change the real gid recorded in a credential.
  * Side effects: newcred->cr_rgid will be updated; nothing else is touched.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_rgid(struct ucred *newcred, gid_t rgid)
 {
 
 	newcred->cr_rgid = rgid;
 }
 
 /*-
  * Change the saved uid recorded in a credential.
  * Side effects: newcred->cr_svuid will be updated; nothing else is touched.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_svuid(struct ucred *newcred, uid_t svuid)
 {
 
 	newcred->cr_svuid = svuid;
 }
 
 /*-
  * Change the saved gid recorded in a credential.
  * Side effects: newcred->cr_svgid will be updated; nothing else is touched.
  * References: newcred must be an exclusive credential reference for the
  *             duration of the call.
  */
 void
 change_svgid(struct ucred *newcred, gid_t svgid)
 {
 
 	newcred->cr_svgid = svgid;
 }
 
 bool allow_ptrace = true;
 SYSCTL_BOOL(_security_bsd, OID_AUTO, allow_ptrace, CTLFLAG_RWTUN,
     &allow_ptrace, 0,
     "Deny ptrace(2) use by returning ENOSYS");
diff --git a/sys/sys/jail.h b/sys/sys/jail.h
index efe5e3ee8db7..0ad80ec53157 100644
--- a/sys/sys/jail.h
+++ b/sys/sys/jail.h
@@ -1,510 +1,513 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 1999 Poul-Henning Kamp.
  * Copyright (c) 2009 James Gritton.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #ifndef _SYS_JAIL_H_
 #define _SYS_JAIL_H_
 
 #ifdef _KERNEL
 struct jail_v0 {
 	u_int32_t	version;
 	char		*path;
 	char		*hostname;
 	u_int32_t	ip_number;
 };
 #endif
 
 struct jail {
 	uint32_t	version;
 	char		*path;
 	char		*hostname;
 	char		*jailname;
 	uint32_t	ip4s;
 	uint32_t	ip6s;
 	struct in_addr	*ip4;
 	struct in6_addr	*ip6;
 };
 #define	JAIL_API_VERSION	2
 
 /*
  * For all xprison structs, always keep the pr_version an int and
  * the first variable so userspace can easily distinguish them.
  */
 #ifndef _KERNEL
 struct xprison_v1 {
 	int		 pr_version;
 	int		 pr_id;
 	char		 pr_path[MAXPATHLEN];
 	char		 pr_host[MAXHOSTNAMELEN];
 	u_int32_t	 pr_ip;
 };
 #endif
 
 struct xprison {
 	int		 pr_version;
 	int		 pr_id;
 	int		 pr_state;
 	cpusetid_t	 pr_cpusetid;
 	char		 pr_path[MAXPATHLEN];
 	char		 pr_host[MAXHOSTNAMELEN];
 	char		 pr_name[MAXHOSTNAMELEN];
 	uint32_t	 pr_ip4s;
 	uint32_t	 pr_ip6s;
 #if 0
 	/*
 	 * sizeof(xprison) will be malloced + size needed for all
 	 * IPv4 and IPv6 addesses. Offsets are based numbers of addresses.
 	 */
 	struct in_addr	 pr_ip4[];
 	struct in6_addr	 pr_ip6[];
 #endif
 };
 #define	XPRISON_VERSION		3
 
 enum prison_state {
     PRISON_STATE_INVALID = 0,	/* New prison, not ready to be seen */
     PRISON_STATE_ALIVE,		/* Current prison, visible to all */
     PRISON_STATE_DYING		/* Removed but holding resources, */
 };				/* optionally visible. */
 
 /*
  * Flags for jail_set and jail_get.
  */
 #define	JAIL_CREATE	0x01	/* Create jail if it doesn't exist */
 #define	JAIL_UPDATE	0x02	/* Update parameters of existing jail */
 #define	JAIL_ATTACH	0x04	/* Attach to jail upon creation */
 #define	JAIL_DYING	0x08	/* Allow getting a dying jail */
 #define	JAIL_SET_MASK	0x0f	/* JAIL_DYING is deprecated/ignored here */
 #define	JAIL_GET_MASK	0x08
 
 #define	JAIL_SYS_DISABLE	0
 #define	JAIL_SYS_NEW		1
 #define	JAIL_SYS_INHERIT	2
 
 #ifndef _KERNEL
 
 struct iovec;
 
 __BEGIN_DECLS
 int jail(struct jail *);
 int jail_set(struct iovec *, unsigned int, int);
 int jail_get(struct iovec *, unsigned int, int);
 int jail_attach(int);
 int jail_remove(int);
 __END_DECLS
 
 #else /* _KERNEL */
 
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/_task.h>
 
 #define JAIL_MAX	999999
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_PRISON);
 #endif
 #endif /* _KERNEL */
 
 #if defined(_KERNEL) || defined(_WANT_PRISON)
 
 #include <sys/osd.h>
 
 #define	HOSTUUIDLEN	64
 #define	DEFAULT_HOSTUUID	"00000000-0000-0000-0000-000000000000"
 #define	OSRELEASELEN	32
 
 #define	JAIL_META_PRIVATE	"meta"
 #define	JAIL_META_SHARED	"env"
 
 struct racct;
 struct prison_racct;
 
 typedef enum {
 	PR_INET		= 0,
 	PR_INET6	= 1,
 	PR_FAMILY_MAX	= 2,
 } pr_family_t;
 
 /*
  * This structure describes a prison.  It is pointed to by all struct
  * ucreds's of the inmates.  pr_ref keeps track of them and is used to
  * delete the structure when the last inmate is dead.
  *
  * Lock key:
  *   (a) allprison_lock
  *   (A) allproc_lock
  *   (c) set only during creation before the structure is shared, no mutex
  *       required to read
  *   (m) locked by pr_mtx
  *   (p) locked by pr_mtx, and also at least shared allprison_lock required
  *       to update
  *   (q) locked by both pr_mtx and allprison_lock
  *   (r) atomic via refcount(9), pr_mtx and allprison_lock required to
  *       decrement to zero
  *   (n) read access granted with the network epoch
  */
 struct prison {
 	TAILQ_ENTRY(prison) pr_list;			/* (a) all prisons */
 	int		 pr_id;				/* (c) prison id */
 	volatile u_int	 pr_ref;			/* (r) refcount */
 	volatile u_int	 pr_uref;			/* (r) user (alive) refcount */
 	unsigned	 pr_flags;			/* (p) PR_* flags */
 	LIST_HEAD(, prison) pr_children;		/* (a) list of child jails */
 	LIST_HEAD(, proc) pr_proclist;			/* (A) list of jailed processes */
 	LIST_ENTRY(prison) pr_sibling;			/* (a) next in parent's list */
 	struct prison	*pr_parent;			/* (c) containing jail */
 	struct mtx	 pr_mtx;
 	struct task	 pr_task;			/* (c) destroy task */
 	struct osd	 pr_osd;			/* (p) additional data */
 	struct cpuset	*pr_cpuset;			/* (p) cpuset */
 	struct vnet	*pr_vnet;			/* (c) network stack */
 	struct vnode	*pr_root;			/* (c) vnode to rdir */
 	struct prison_ip  *pr_addrs[PR_FAMILY_MAX];	/* (p,n) IPs of jail */
 	struct prison_racct *pr_prison_racct;		/* (c) racct jail proxy */
 	void		*pr_sparep[3];
 	int		 pr_childcount;			/* (a) number of child jails */
 	int		 pr_childmax;			/* (p) maximum child jails */
 	unsigned	 pr_allow;			/* (p) PR_ALLOW_* flags */
 	int		 pr_securelevel;		/* (p) securelevel */
 	int		 pr_enforce_statfs;		/* (p) statfs permission */
 	int		 pr_devfs_rsnum;		/* (p) devfs ruleset */
 	enum prison_state pr_state;			/* (q) state in life cycle */
 	volatile int	 pr_exportcnt;			/* (r) count of mount exports */
 	int		 pr_spare;
 	int		 pr_osreldate;			/* (c) kern.osreldate value */
 	unsigned long	 pr_hostid;			/* (p) jail hostid */
 	char		 pr_name[MAXHOSTNAMELEN];	/* (p) admin jail name */
 	char		 pr_path[MAXPATHLEN];		/* (c) chroot path */
 	char		 pr_hostname[MAXHOSTNAMELEN];	/* (p) jail hostname */
 	char		 pr_domainname[MAXHOSTNAMELEN];	/* (p) jail domainname */
 	char		 pr_hostuuid[HOSTUUIDLEN];	/* (p) jail hostuuid */
 	char		 pr_osrelease[OSRELEASELEN];	/* (c) kern.osrelease value */
 };
 
 struct prison_racct {
 	LIST_ENTRY(prison_racct) prr_next;
 	char		prr_name[MAXHOSTNAMELEN];
 	u_int		prr_refcount;
 	struct racct	*prr_racct;
 };
 #endif /* _KERNEL || _WANT_PRISON */
 
 #ifdef _KERNEL
 /* Flag bits set via options */
 #define	PR_PERSIST	0x00000001	/* Can exist without processes */
 #define	PR_HOST		0x00000002	/* Virtualize hostname et al */
 #define	PR_IP4_USER	0x00000004	/* Restrict IPv4 addresses */
 #define	PR_IP6_USER	0x00000008	/* Restrict IPv6 addresses */
 #define	PR_VNET		0x00000010	/* Virtual network stack */
 #define	PR_IP4_SADDRSEL	0x00000080	/* Do IPv4 src addr sel. or use the */
 					/* primary jail address. */
 #define	PR_IP6_SADDRSEL	0x00000100	/* Do IPv6 src addr sel. or use the */
 					/* primary jail address. */
 
 /* Internal flag bits */
 #define	PR_REMOVE	0x01000000	/* In process of being removed */
 #define	PR_IP4		0x02000000	/* IPv4 restricted or disabled */
 					/* by this jail or an ancestor */
 #define	PR_IP6		0x04000000	/* IPv6 restricted or disabled */
 					/* by this jail or an ancestor */
 #define PR_COMPLETE_PROC 0x08000000	/* prison_complete called from */
 					/* prison_proc_free, releases uref */
 
 /*
  * Flags for pr_allow
  * Bits not noted here may be used for dynamic allow.mount.xxxfs.
  */
 #define	PR_ALLOW_SET_HOSTNAME		0x00000001
 #define	PR_ALLOW_SYSVIPC		0x00000002
 #define	PR_ALLOW_RAW_SOCKETS		0x00000004
 #define	PR_ALLOW_CHFLAGS		0x00000008
 #define	PR_ALLOW_MOUNT			0x00000010
 #define	PR_ALLOW_QUOTAS			0x00000020
 #define	PR_ALLOW_SOCKET_AF		0x00000040
 #define	PR_ALLOW_MLOCK			0x00000080
 #define	PR_ALLOW_READ_MSGBUF		0x00000100
 #define	PR_ALLOW_UNPRIV_DEBUG		0x00000200
 #define	PR_ALLOW_SUSER			0x00000400
 #define	PR_ALLOW_RESERVED_PORTS		0x00008000
 #define	PR_ALLOW_KMEM_ACCESS		0x00010000	/* reserved, not used yet */
 #define	PR_ALLOW_NFSD			0x00020000
 #define	PR_ALLOW_EXTATTR		0x00040000
 #define	PR_ALLOW_ADJTIME		0x00080000
 #define	PR_ALLOW_SETTIME		0x00100000
 #define	PR_ALLOW_ROUTING		0x00200000
+#define	PR_ALLOW_UNPRIV_PARENT_TAMPER	0x00400000
 
 /*
  * PR_ALLOW_PRISON0 are the allow flags that we apply by default to prison0,
  * while PR_ALLOW_ALL_STATIC are all of the allow bits that we have allocated at
  * build time.  PR_ALLOW_ALL_STATIC should contain any bit above that we expect
  * to be used on the system, while PR_ALLOW_PRISON0 will be some subset of that.
  */
-#define	PR_ALLOW_ALL_STATIC		0x003f87ff
-#define	PR_ALLOW_PRISON0		(PR_ALLOW_ALL_STATIC)
+#define	PR_ALLOW_ALL_STATIC		0x007f87ff
+#define	PR_ALLOW_PRISON0		\
+    (PR_ALLOW_ALL_STATIC & ~(PR_ALLOW_UNPRIV_PARENT_TAMPER))
 
 /*
  * PR_ALLOW_DIFFERENCES determines which flags are able to be
  * different between the parent and child jail upon creation.
  */
-#define	PR_ALLOW_DIFFERENCES		(PR_ALLOW_UNPRIV_DEBUG)
+#define	PR_ALLOW_DIFFERENCES		\
+    (PR_ALLOW_UNPRIV_DEBUG | PR_ALLOW_UNPRIV_PARENT_TAMPER)
 
 /*
  * OSD methods
  */
 #define	PR_METHOD_CREATE	0
 #define	PR_METHOD_GET		1
 #define	PR_METHOD_SET		2
 #define	PR_METHOD_CHECK		3
 #define	PR_METHOD_ATTACH	4
 #define	PR_METHOD_REMOVE	5
 #define	PR_MAXMETHOD		6
 
 /*
  * Lock/unlock a prison.
  * XXX These exist not so much for general convenience, but to be useable in
  *     the FOREACH_PRISON_DESCENDANT_LOCKED macro which can't handle them in
  *     non-function form as currently defined.
  */
 static __inline void
 prison_lock(struct prison *pr)
 {
 
 	mtx_lock(&pr->pr_mtx);
 }
 
 static __inline void
 prison_unlock(struct prison *pr)
 {
 
 	mtx_unlock(&pr->pr_mtx);
 }
 
 /* Traverse a prison's immediate children. */
 #define	FOREACH_PRISON_CHILD(ppr, cpr)					\
 	LIST_FOREACH(cpr, &(ppr)->pr_children, pr_sibling)
 
 /*
  * Preorder traversal of all of a prison's descendants.
  * This ugly loop allows the macro to be followed by a single block
  * as expected in a looping primitive.
  */
 #define	FOREACH_PRISON_DESCENDANT(ppr, cpr, descend)			\
 	for ((cpr) = (ppr), (descend) = 1;				\
 	    ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children))	\
 	      ? LIST_FIRST(&(cpr)->pr_children)				\
 	      : ((cpr) == (ppr)						\
 		 ? NULL							\
 		 : (((descend) = LIST_NEXT(cpr, pr_sibling) != NULL)	\
 		    ? LIST_NEXT(cpr, pr_sibling)			\
 		    : (cpr)->pr_parent))));)				\
 		if (!(descend))						\
 			;						\
 		else
 
 /*
  * As above, but lock descendants on the way down and unlock on the way up.
  */
 #define	FOREACH_PRISON_DESCENDANT_LOCKED(ppr, cpr, descend)		\
 	for ((cpr) = (ppr), (descend) = 1;				\
 	    ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children))	\
 	      ? LIST_FIRST(&(cpr)->pr_children)				\
 	      : ((cpr) == (ppr)						\
 		 ? NULL							\
 		 : ((prison_unlock(cpr),				\
 		    (descend) = LIST_NEXT(cpr, pr_sibling) != NULL)	\
 		    ? LIST_NEXT(cpr, pr_sibling)			\
 		    : (cpr)->pr_parent))));)				\
 		if ((descend) ? (prison_lock(cpr), 0) : 1)		\
 			;						\
 		else
 
 /*
  * As above, but also keep track of the level descended to.
  */
 #define	FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(ppr, cpr, descend, level)\
 	for ((cpr) = (ppr), (descend) = 1, (level) = 0;			\
 	    ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children))	\
 	      ? (level++, LIST_FIRST(&(cpr)->pr_children))		\
 	      : ((cpr) == (ppr)						\
 		 ? NULL							\
 		 : ((prison_unlock(cpr),				\
 		    (descend) = LIST_NEXT(cpr, pr_sibling) != NULL)	\
 		    ? LIST_NEXT(cpr, pr_sibling)			\
 		    : (level--, (cpr)->pr_parent)))));)			\
 		if ((descend) ? (prison_lock(cpr), 0) : 1)		\
 			;						\
 		else
 
 /*
  * Traverse a prison's descendants, visiting both preorder and postorder.
  */
 #define FOREACH_PRISON_DESCENDANT_PRE_POST(ppr, cpr, descend)		\
 	for ((cpr) = (ppr), (descend) = 1;				\
 	     ((cpr) = (descend)						\
 	      ? ((descend) = !LIST_EMPTY(&(cpr)->pr_children))		\
 		? LIST_FIRST(&(cpr)->pr_children)			\
 		: (cpr)							\
 	      : ((descend) = LIST_NEXT(cpr, pr_sibling) != NULL)	\
 		? LIST_NEXT(cpr, pr_sibling)				\
 		: cpr->pr_parent) != (ppr);)
 
 /*
  * Attributes of the physical system, and the root of the jail tree.
  */
 extern struct	prison prison0;
 
 TAILQ_HEAD(prisonlist, prison);
 extern struct	prisonlist allprison;
 extern struct	sx allprison_lock;
 
 /*
  * Sysctls to describe jail parameters.
  */
 SYSCTL_DECL(_security_jail);
 SYSCTL_DECL(_security_jail_param);
 
 #define SYSCTL_JAIL_PARAM_DECL(name)					\
 	SYSCTL_DECL(_security_jail_param_##name)
 #define	SYSCTL_JAIL_PARAM(module, param, type, fmt, descr)		\
 	SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param,	\
 	    (type) | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_param, fmt, descr)
 #define	SYSCTL_JAIL_PARAM_STRING(module, param, access, len, descr)	\
 	SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param,	\
 	    CTLTYPE_STRING | CTLFLAG_MPSAFE | (access), NULL, len,	\
 	    sysctl_jail_param, "A", descr)
 #define	SYSCTL_JAIL_PARAM_STRUCT(module, param, access, len, fmt, descr) \
 	SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param,	\
 	    CTLTYPE_STRUCT | CTLFLAG_MPSAFE | (access), NULL, len,	\
 	    sysctl_jail_param, fmt, descr)
 #define	SYSCTL_JAIL_PARAM_NODE(module, descr)				\
 	SYSCTL_NODE(_security_jail_param, OID_AUTO, module, CTLFLAG_MPSAFE, \
 	    0, descr)
 #define	SYSCTL_JAIL_PARAM_SUBNODE(parent, module, descr)		\
 	SYSCTL_NODE(_security_jail_param_##parent, OID_AUTO, module,	\
 	    CTLFLAG_MPSAFE, 0, descr)
 #define	SYSCTL_JAIL_PARAM_SYS_NODE(module, access, descr)		\
 	SYSCTL_JAIL_PARAM_NODE(module, descr);				\
 	SYSCTL_JAIL_PARAM(_##module, , CTLTYPE_INT | (access), "E,jailsys", \
 	    descr)
 #define	SYSCTL_JAIL_PARAM_SYS_SUBNODE(parent, module, access, descr)	\
 	SYSCTL_JAIL_PARAM_SUBNODE(parent, module, descr);		\
 	SYSCTL_JAIL_PARAM(_##parent##_##module, , CTLTYPE_INT | (access), \
 	    "E,jailsys", descr)
 
 /*
  * Kernel support functions for jail().
  */
 struct ucred;
 struct mount;
 struct sockaddr;
 struct statfs;
 struct vfsconf;
 
 /*
  * Return 1 if the passed credential is in a jail, otherwise 0.
  */
 #define jailed(cred)	(cred->cr_prison != &prison0)
 
 bool jailed_without_vnet(struct ucred *);
 void getcredhostname(struct ucred *, char *, size_t);
 void getcreddomainname(struct ucred *, char *, size_t);
 void getcredhostuuid(struct ucred *, char *, size_t);
 void getcredhostid(struct ucred *, unsigned long *);
 void getjailname(struct ucred *cred, char *name, size_t len);
 void prison0_init(void);
 bool prison_allow(struct ucred *, unsigned);
 int prison_check(struct ucred *cred1, struct ucred *cred2);
 bool prison_check_nfsd(struct ucred *cred);
 bool prison_owns_vnet(struct prison *pr);
 int prison_canseemount(struct ucred *cred, struct mount *mp);
 void prison_enforce_statfs(struct ucred *cred, struct mount *mp,
     struct statfs *sp);
 struct prison *prison_find(int prid);
 struct prison *prison_find_child(struct prison *, int);
 struct prison *prison_find_name(struct prison *, const char *);
 bool prison_flag(struct ucred *, unsigned);
 void prison_free(struct prison *pr);
 void prison_free_locked(struct prison *pr);
 void prison_hold(struct prison *pr);
 void prison_hold_locked(struct prison *pr);
 void prison_proc_hold(struct prison *);
 void prison_proc_free(struct prison *);
 void prison_proc_link(struct prison *, struct proc *);
 void prison_proc_unlink(struct prison *, struct proc *);
 void prison_proc_iterate(struct prison *, void (*)(struct proc *, void *), void *);
 void prison_set_allow(struct ucred *cred, unsigned flag, int enable);
 bool prison_ischild(struct prison *, struct prison *);
 bool prison_isalive(const struct prison *);
 bool prison_isvalid(struct prison *);
 #if defined(INET) || defined(INET6)
 int prison_ip_check(const struct prison *, const pr_family_t, const void *);
 const void *prison_ip_get0(const struct prison *, const pr_family_t);
 u_int prison_ip_cnt(const struct prison *, const pr_family_t);
 #endif
 #ifdef INET
 bool prison_equal_ip4(struct prison *, struct prison *);
 int prison_get_ip4(struct ucred *cred, struct in_addr *ia);
 int prison_local_ip4(struct ucred *cred, struct in_addr *ia);
 int prison_remote_ip4(struct ucred *cred, struct in_addr *ia);
 int prison_check_ip4(const struct ucred *, const struct in_addr *);
 int prison_check_ip4_locked(const struct prison *, const struct in_addr *);
 bool prison_saddrsel_ip4(struct ucred *, struct in_addr *);
 int prison_qcmp_v4(const void *, const void *);
 bool prison_valid_v4(const void *);
 #endif
 #ifdef INET6
 bool prison_equal_ip6(struct prison *, struct prison *);
 int prison_get_ip6(struct ucred *, struct in6_addr *);
 int prison_local_ip6(struct ucred *, struct in6_addr *, int);
 int prison_remote_ip6(struct ucred *, struct in6_addr *);
 int prison_check_ip6(const struct ucred *, const struct in6_addr *);
 int prison_check_ip6_locked(const struct prison *, const struct in6_addr *);
 bool prison_saddrsel_ip6(struct ucred *, struct in6_addr *);
 int prison_qcmp_v6(const void *, const void *);
 bool prison_valid_v6(const void *);
 #endif
 int prison_check_af(struct ucred *cred, int af);
 int prison_if(struct ucred *cred, const struct sockaddr *sa);
 char *prison_name(struct prison *, struct prison *);
 int prison_priv_check(struct ucred *cred, int priv);
 int sysctl_jail_param(SYSCTL_HANDLER_ARGS);
 unsigned prison_add_allow(const char *prefix, const char *name,
     const char *prefix_descr, const char *descr);
 void prison_add_vfs(struct vfsconf *vfsp);
 void prison_racct_foreach(void (*callback)(struct racct *racct,
     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
     void *arg2, void *arg3);
 struct prison_racct *prison_racct_find(const char *name);
 void prison_racct_hold(struct prison_racct *prr);
 void prison_racct_free(struct prison_racct *prr);
 
 #endif /* _KERNEL */
 #endif /* !_SYS_JAIL_H_ */
diff --git a/sys/sys/priv.h b/sys/sys/priv.h
index 1f73877ab450..9c493629f7cf 100644
--- a/sys/sys/priv.h
+++ b/sys/sys/priv.h
@@ -1,565 +1,568 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2006 nCircle Network Security, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson for the TrustedBSD
  * Project under contract to nCircle Network Security, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY,
  * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Privilege checking interface for BSD kernel.
  */
 #ifndef _SYS_PRIV_H_
 #define	_SYS_PRIV_H_
 
 /*
  * Privilege list, sorted loosely by kernel subsystem.
  *
  * Think carefully before adding or reusing one of these privileges -- are
  * there existing instances referring to the same privilege?  Third party
  * vendors may request the assignment of privileges to be used in loadable
  * modules.  Particular numeric privilege assignments are part of the
  * loadable kernel module ABI, and should not be changed across minor
  * releases.
  *
  * When adding a new privilege, remember to determine if it's appropriate
  * for use in jail, and update the privilege switch in prison_priv_check()
  * in kern_jail.c as necessary.
  */
 
 /*
  * Track beginning of privilege list.
  */
 #define	_PRIV_LOWEST	1
 
 /*
  * The remaining privileges typically correspond to one or a small
  * number of specific privilege checks, and have (relatively) precise
  * meanings.  They are loosely sorted into a set of base system
  * privileges, such as the ability to reboot, and then loosely by
  * subsystem, indicated by a subsystem name.
  */
 #define	_PRIV_ROOT		1	/* Removed. */
 #define	PRIV_ACCT		2	/* Manage process accounting. */
 #define	PRIV_MAXFILES		3	/* Exceed system open files limit. */
 #define	PRIV_MAXPROC		4	/* Exceed system processes limit. */
 #define	PRIV_KTRACE		5	/* Set/clear KTRFAC_ROOT on ktrace. */
 #define	PRIV_SETDUMPER		6	/* Configure dump device. */
 #define	PRIV_REBOOT		8	/* Can reboot system. */
 #define	PRIV_SWAPON		9	/* Can swapon(). */
 #define	PRIV_SWAPOFF		10	/* Can swapoff(). */
 #define	PRIV_MSGBUF		11	/* Can read kernel message buffer. */
 #define	PRIV_IO			12	/* Can perform low-level I/O. */
 #define	PRIV_KEYBOARD		13	/* Reprogram keyboard. */
 #define	PRIV_DRIVER		14	/* Low-level driver privilege. */
 #define	PRIV_ADJTIME		15	/* Set time adjustment. */
 #define	PRIV_NTP_ADJTIME	16	/* Set NTP time adjustment. */
 #define	PRIV_CLOCK_SETTIME	17	/* Can call clock_settime. */
 #define	PRIV_SETTIMEOFDAY	18	/* Can call settimeofday. */
 #define	_PRIV_SETHOSTID		19	/* Removed. */
 #define	_PRIV_SETDOMAINNAME	20	/* Removed. */
 
 /*
  * Audit subsystem privileges.
  */
 #define	PRIV_AUDIT_CONTROL	40	/* Can configure audit. */
 #define	PRIV_AUDIT_FAILSTOP	41	/* Can run during audit fail stop. */
 #define	PRIV_AUDIT_GETAUDIT	42	/* Can get proc audit properties. */
 #define	PRIV_AUDIT_SETAUDIT	43	/* Can set proc audit properties. */
 #define	PRIV_AUDIT_SUBMIT	44	/* Can submit an audit record. */
 
 /*
  * Credential management privileges.
  */
 #define	PRIV_CRED_SETUID	50	/* setuid. */
 #define	PRIV_CRED_SETEUID	51	/* seteuid to !ruid and !svuid. */
 #define	PRIV_CRED_SETGID	52	/* setgid. */
 #define	PRIV_CRED_SETEGID	53	/* setgid to !rgid and !svgid. */
 #define	PRIV_CRED_SETGROUPS	54	/* Set process additional groups. */
 #define	PRIV_CRED_SETREUID	55	/* setreuid. */
 #define	PRIV_CRED_SETREGID	56	/* setregid. */
 #define	PRIV_CRED_SETRESUID	57	/* setresuid. */
 #define	PRIV_CRED_SETRESGID	58	/* setresgid. */
 #define	PRIV_SEEOTHERGIDS	59	/* Exempt bsd.seeothergids. */
 #define	PRIV_SEEOTHERUIDS	60	/* Exempt bsd.seeotheruids. */
 #define	PRIV_SEEJAILPROC	61	/* Exempt from bsd.see_jail_proc. */
 #define	PRIV_CRED_SETCRED	62	/* setcred. */
 
 /*
  * Debugging privileges.
  */
 #define	PRIV_DEBUG_DIFFCRED	80	/* Exempt debugging other users. */
 #define	PRIV_DEBUG_SUGID	81	/* Exempt debugging setuid proc. */
 #define	PRIV_DEBUG_UNPRIV	82	/* Exempt unprivileged debug limit. */
 #define	PRIV_DEBUG_DENIED	83	/* Exempt P2_NOTRACE. */
+#define	PRIV_DEBUG_DIFFJAIL	84	/* Exempt debugging other jails. */
 
 /*
  * Dtrace privileges.
  */
 #define	PRIV_DTRACE_KERNEL	90	/* Allow use of DTrace on the kernel. */
 #define	PRIV_DTRACE_PROC	91	/* Allow attaching DTrace to process. */
 #define	PRIV_DTRACE_USER	92	/* Process may submit DTrace events. */
 
 /*
  * Firmware privilegs.
  */
 #define	PRIV_FIRMWARE_LOAD	100	/* Can load firmware. */
 
 /*
  * Jail privileges.
  */
 #define	PRIV_JAIL_ATTACH	110	/* Attach to a jail. */
 #define	PRIV_JAIL_SET		111	/* Set jail parameters. */
 #define	PRIV_JAIL_REMOVE	112	/* Remove a jail. */
 
 /*
  * Kernel environment privileges.
  */
 #define	PRIV_KENV_SET		120	/* Set kernel env. variables. */
 #define	PRIV_KENV_UNSET		121	/* Unset kernel env. variables. */
 
 /*
  * Loadable kernel module privileges.
  */
 #define	PRIV_KLD_LOAD		130	/* Load a kernel module. */
 #define	PRIV_KLD_UNLOAD		131	/* Unload a kernel module. */
 
 /*
  * Privileges associated with the MAC Framework and specific MAC policy
  * modules.
  */
 #define	PRIV_MAC_PARTITION	140	/* Privilege in mac_partition policy. */
 #define	PRIV_MAC_PRIVS		141	/* Privilege in the mac_privs policy. */
 
 /*
  * Process-related privileges.
  */
 #define	PRIV_PROC_LIMIT		160	/* Exceed user process limit. */
 #define	PRIV_PROC_SETLOGIN	161	/* Can call setlogin. */
 #define	PRIV_PROC_SETRLIMIT	162	/* Can raise resources limits. */
 #define	PRIV_PROC_SETLOGINCLASS	163	/* Can call setloginclass(2). */
 
 /*
  * System V IPC privileges.
  */
 #define	PRIV_IPC_READ		170	/* Can override IPC read perm. */
 #define	PRIV_IPC_WRITE		171	/* Can override IPC write perm. */
 #define	PRIV_IPC_ADMIN		172	/* Can override IPC owner-only perm. */
 #define	PRIV_IPC_MSGSIZE	173	/* Exempt IPC message queue limit. */
 
 /*
  * POSIX message queue privileges.
  */
 #define	PRIV_MQ_ADMIN		180	/* Can override msgq owner-only perm. */
 
 /*
  * Performance monitoring counter privileges.
  */
 #define	PRIV_PMC_MANAGE		190	/* Can administer PMC. */
 #define	PRIV_PMC_SYSTEM		191	/* Can allocate a system-wide PMC. */
 
 /*
  * Scheduling privileges.
  */
 #define	PRIV_SCHED_DIFFCRED	200	/* Exempt scheduling other users. */
 #define	PRIV_SCHED_SETPRIORITY	201	/* Can set lower nice value for proc. */
 #define	PRIV_SCHED_RTPRIO	202	/* Can set real time scheduling. */
 #define	PRIV_SCHED_SETPOLICY	203	/* Can set scheduler policy. */
 #define	PRIV_SCHED_SET		204	/* Can set thread scheduler. */
 #define	PRIV_SCHED_SETPARAM	205	/* Can set thread scheduler params. */
 #define	PRIV_SCHED_CPUSET	206	/* Can manipulate cpusets. */
 #define	PRIV_SCHED_CPUSET_INTR	207	/* Can adjust IRQ to CPU binding. */
 #define	PRIV_SCHED_IDPRIO	208	/* Can set idle time scheduling. */
+#define	PRIV_SCHED_DIFFJAIL	209	/* Exempt scheduling other jails. */
 
 /*
  * POSIX semaphore privileges.
  */
 #define	PRIV_SEM_WRITE		220	/* Can override sem write perm. */
 
 /*
  * Signal privileges.
  */
 #define	PRIV_SIGNAL_DIFFCRED	230	/* Exempt signalling other users. */
 #define	PRIV_SIGNAL_SUGID	231	/* Non-conserv signal setuid proc. */
+#define	PRIV_SIGNAL_DIFFJAIL	232	/* Exempt signalling other jails. */
 
 /*
  * Sysctl privileges.
  */
 #define	PRIV_SYSCTL_DEBUG	240	/* Can invoke sysctl.debug. */
 #define	PRIV_SYSCTL_WRITE	241	/* Can write sysctls. */
 #define	PRIV_SYSCTL_WRITEJAIL	242	/* Can write sysctls, jail permitted. */
 #define	PRIV_SYSCTL_MEMLOCK	243	/* Large requests are not serialized. */
 
 /*
  * TTY privileges.
  */
 #define	PRIV_TTY_CONSOLE	250	/* Set console to tty. */
 #define	PRIV_TTY_DRAINWAIT	251	/* Set tty drain wait time. */
 #define	PRIV_TTY_DTRWAIT	252	/* Set DTR wait on tty. */
 #define	PRIV_TTY_EXCLUSIVE	253	/* Override tty exclusive flag. */
 #define	_PRIV_TTY_PRISON	254	/* Removed. */
 #define	PRIV_TTY_STI		255	/* Simulate input on another tty. */
 #define	PRIV_TTY_SETA		256	/* Set tty termios structure. */
 
 /*
  * UFS-specific privileges.
  */
 #define	PRIV_UFS_EXTATTRCTL	270	/* Can configure EAs on UFS1. */
 #define	PRIV_UFS_QUOTAOFF	271	/* quotaoff(). */
 #define	PRIV_UFS_QUOTAON	272	/* quotaon(). */
 #define	PRIV_UFS_SETUSE		273	/* setuse(). */
 
 /*
  * ZFS-specific privileges.
  */
 #define	PRIV_ZFS_POOL_CONFIG	280	/* Can configure ZFS pools. */
 #define	PRIV_ZFS_INJECT		281	/* Can inject faults in the ZFS fault
 					   injection framework. */
 #define	PRIV_ZFS_JAIL		282	/* Can attach/detach ZFS file systems
 					   to/from jails. */
 
 /*
  * NFS-specific privileges.
  */
 #define	PRIV_NFS_DAEMON		290	/* Can become the NFS daemon. */
 #define	PRIV_NFS_LOCKD		291	/* Can become NFS lock daemon. */
 
 /*
  * VFS privileges.
  */
 #define	PRIV_VFS_READ		310	/* Override vnode DAC read perm. */
 #define	PRIV_VFS_WRITE		311	/* Override vnode DAC write perm. */
 #define	PRIV_VFS_ADMIN		312	/* Override vnode DAC admin perm. */
 #define	PRIV_VFS_EXEC		313	/* Override vnode DAC exec perm. */
 #define	PRIV_VFS_LOOKUP		314	/* Override vnode DAC lookup perm. */
 #define	PRIV_VFS_BLOCKRESERVE	315	/* Can use free block reserve. */
 #define	PRIV_VFS_CHFLAGS_DEV	316	/* Can chflags() a device node. */
 #define	PRIV_VFS_CHOWN		317	/* Can set user; group to non-member. */
 #define	PRIV_VFS_CHROOT		318	/* chroot(). */
 #define	PRIV_VFS_RETAINSUGID	319	/* Can retain sugid bits on change. */
 #define	PRIV_VFS_EXCEEDQUOTA	320	/* Exempt from quota restrictions. */
 #define	PRIV_VFS_EXTATTR_SYSTEM	321	/* Operate on system EA namespace. */
 #define	PRIV_VFS_FCHROOT	322	/* fchroot(). */
 #define	PRIV_VFS_FHOPEN		323	/* Can fhopen(). */
 #define	PRIV_VFS_FHSTAT		324	/* Can fhstat(). */
 #define	PRIV_VFS_FHSTATFS	325	/* Can fhstatfs(). */
 #define	PRIV_VFS_GENERATION	326	/* stat() returns generation number. */
 #define	PRIV_VFS_GETFH		327	/* Can retrieve file handles. */
 #define	PRIV_VFS_GETQUOTA	328	/* getquota(). */
 #define	PRIV_VFS_LINK		329	/* bsd.hardlink_check_uid */
 #define	PRIV_VFS_MKNOD_BAD	330	/* Was: mknod() can mark bad inodes. */
 #define	PRIV_VFS_MKNOD_DEV	331	/* Can mknod() to create dev nodes. */
 #define	PRIV_VFS_MKNOD_WHT	332	/* Can mknod() to create whiteout. */
 #define	PRIV_VFS_MOUNT		333	/* Can mount(). */
 #define	PRIV_VFS_MOUNT_OWNER	334	/* Can manage other users' file systems. */
 #define	PRIV_VFS_MOUNT_EXPORTED	335	/* Can set MNT_EXPORTED on mount. */
 #define	PRIV_VFS_MOUNT_PERM	336	/* Override dev node perms at mount. */
 #define	PRIV_VFS_MOUNT_SUIDDIR	337	/* Can set MNT_SUIDDIR on mount. */
 #define	PRIV_VFS_MOUNT_NONUSER	338	/* Can perform a non-user mount. */
 #define	PRIV_VFS_SETGID		339	/* Can setgid if not in group. */
 #define	PRIV_VFS_SETQUOTA	340	/* setquota(). */
 #define	PRIV_VFS_STICKYFILE	341	/* Can set sticky bit on file. */
 #define	PRIV_VFS_SYSFLAGS	342	/* Can modify system flags. */
 #define	PRIV_VFS_UNMOUNT	343	/* Can unmount(). */
 #define	PRIV_VFS_STAT		344	/* Override vnode MAC stat perm. */
 #define	PRIV_VFS_READ_DIR	345	/* Can read(2) a dirfd, needs sysctl. */
 
 /*
  * Virtual memory privileges.
  */
 #define	PRIV_VM_MADV_PROTECT	360	/* Can set MADV_PROTECT. */
 #define	PRIV_VM_MLOCK		361	/* Can mlock(), mlockall(). */
 #define	PRIV_VM_MUNLOCK		362	/* Can munlock(), munlockall(). */
 #define	PRIV_VM_SWAP_NOQUOTA	363	/*
 					 * Can override the global
 					 * swap reservation limits.
 					 */
 #define	PRIV_VM_SWAP_NORLIMIT	364	/*
 					 * Can override the per-uid
 					 * swap reservation limits.
 					 */
 
 /*
  * Device file system privileges.
  */
 #define	PRIV_DEVFS_RULE		370	/* Can manage devfs rules. */
 #define	PRIV_DEVFS_SYMLINK	371	/* Can create symlinks in devfs. */
 
 /*
  * Random number generator privileges.
  */
 #define	PRIV_RANDOM_RESEED	380	/* Closing /dev/random reseeds. */
 
 /*
  * Network stack privileges.
  */
 #define	PRIV_NET_BRIDGE		390	/* Administer bridge. */
 #define	PRIV_NET_GRE		391	/* Administer GRE. */
 #define	_PRIV_NET_PPP		392	/* Removed. */
 #define	_PRIV_NET_SLIP		393	/* Removed. */
 #define	PRIV_NET_BPF		394	/* Monitor BPF. */
 #define	PRIV_NET_RAW		395	/* Open raw socket. */
 #define	PRIV_NET_ROUTE		396	/* Administer routing. */
 #define	PRIV_NET_TAP		397	/* Can open tap device. */
 #define	PRIV_NET_SETIFMTU	398	/* Set interface MTU. */
 #define	PRIV_NET_SETIFFLAGS	399	/* Set interface flags. */
 #define	PRIV_NET_SETIFCAP	400	/* Set interface capabilities. */
 #define	PRIV_NET_SETIFNAME	401	/* Set interface name. */
 #define	PRIV_NET_SETIFMETRIC	402	/* Set interface metrics. */
 #define	PRIV_NET_SETIFPHYS	403	/* Set interface physical layer prop. */
 #define	PRIV_NET_SETIFMAC	404	/* Set interface MAC label. */
 #define	PRIV_NET_ADDMULTI	405	/* Add multicast addr. to ifnet. */
 #define	PRIV_NET_DELMULTI	406	/* Delete multicast addr. from ifnet. */
 #define	PRIV_NET_HWIOCTL	407	/* Issue hardware ioctl on ifnet. */
 #define	PRIV_NET_SETLLADDR	408	/* Set interface link-level address. */
 #define	PRIV_NET_ADDIFGROUP	409	/* Add new interface group. */
 #define	PRIV_NET_DELIFGROUP	410	/* Delete interface group. */
 #define	PRIV_NET_IFCREATE	411	/* Create cloned interface. */
 #define	PRIV_NET_IFDESTROY	412	/* Destroy cloned interface. */
 #define	PRIV_NET_ADDIFADDR	413	/* Add protocol addr to interface. */
 #define	PRIV_NET_DELIFADDR	414	/* Delete protocol addr on interface. */
 #define	PRIV_NET_LAGG		415	/* Administer lagg interface. */
 #define	PRIV_NET_GIF		416	/* Administer gif interface. */
 #define	PRIV_NET_SETIFVNET	417	/* Move interface to vnet. */
 #define	PRIV_NET_SETIFDESCR	418	/* Set interface description. */
 #define	PRIV_NET_SETIFFIB	419	/* Set interface fib. */
 #define	PRIV_NET_VXLAN		420	/* Administer vxlan. */
 #define	PRIV_NET_SETLANPCP	421	/* Set LAN priority. */
 #define	PRIV_NET_SETVLANPCP	PRIV_NET_SETLANPCP /* Alias Set VLAN priority */
 #define	PRIV_NET_OVPN		422	/* Administer OpenVPN DCO. */
 #define	PRIV_NET_ME		423	/* Administer ME interface. */
 #define	PRIV_NET_WG		424	/* Administer WireGuard interface. */
 
 /*
  * 802.11-related privileges.
  */
 #define	PRIV_NET80211_VAP_GETKEY	440	/* Query VAP 802.11 keys. */
 #define	PRIV_NET80211_VAP_MANAGE	441	/* Administer 802.11 VAP. */
 #define	PRIV_NET80211_VAP_SETMAC	442	/* Set VAP MAC address. */
 #define	PRIV_NET80211_CREATE_VAP	443	/* Create a new VAP. */
 
 /*
  * Placeholder for AppleTalk privileges, not supported anymore.
  */
 #define	_PRIV_NETATALK_RESERVEDPORT	450	/* Bind low port number. */
 
 /*
  * ATM privileges.
  */
 #define	PRIV_NETATM_CFG		460
 #define	PRIV_NETATM_ADD		461
 #define	PRIV_NETATM_DEL		462
 #define	PRIV_NETATM_SET		463
 
 /*
  * Bluetooth privileges.
  */
 #define	PRIV_NETBLUETOOTH_RAW	470	/* Open raw bluetooth socket. */
 
 /*
  * Netgraph and netgraph module privileges.
  */
 #define	PRIV_NETGRAPH_CONTROL	480	/* Open netgraph control socket. */
 #define	PRIV_NETGRAPH_TTY	481	/* Configure tty for netgraph. */
 
 /*
  * IPv4 and IPv6 privileges.
  */
 #define	PRIV_NETINET_RESERVEDPORT	490	/* Bind low port number. */
 #define	PRIV_NETINET_IPFW	491	/* Administer IPFW firewall. */
 #define	PRIV_NETINET_DIVERT	492	/* Open IP divert socket. */
 #define	PRIV_NETINET_PF		493	/* Administer pf firewall. */
 #define	PRIV_NETINET_DUMMYNET	494	/* Administer DUMMYNET. */
 #define	PRIV_NETINET_CARP	495	/* Administer CARP. */
 #define	PRIV_NETINET_MROUTE	496	/* Administer multicast routing. */
 #define	PRIV_NETINET_RAW	497	/* Open netinet raw socket. */
 #define	PRIV_NETINET_GETCRED	498	/* Query netinet pcb credentials. */
 #define	PRIV_NETINET_ADDRCTRL6	499	/* Administer IPv6 address scopes. */
 #define	PRIV_NETINET_ND6	500	/* Administer IPv6 neighbor disc. */
 #define	PRIV_NETINET_SCOPE6	501	/* Administer IPv6 address scopes. */
 #define	PRIV_NETINET_ALIFETIME6	502	/* Administer IPv6 address lifetimes. */
 #define	PRIV_NETINET_IPSEC	503	/* Administer IPSEC. */
 #define	PRIV_NETINET_REUSEPORT	504	/* Allow [rapid] port/address reuse. */
 #define	PRIV_NETINET_SETHDROPTS	505	/* Set certain IPv4/6 header options. */
 #define	PRIV_NETINET_BINDANY	506	/* Allow bind to any address. */
 #define	PRIV_NETINET_HASHKEY	507	/* Get and set hash keys for IPv4/6. */
 #define	PRIV_NETINET_KTLSKEYS	508	/* Read ktls session keys. */
 
 /*
  * Placeholders for IPX/SPX privileges, not supported any more.
  */
 #define	_PRIV_NETIPX_RESERVEDPORT	520	/* Bind low port number. */
 #define	_PRIV_NETIPX_RAW		521	/* Open netipx raw socket. */
 
 /*
  * NCP privileges.
  */
 #define	PRIV_NETNCP		530	/* Use another user's connection. */
 
 /*
  * SMB privileges.
  */
 #define	PRIV_NETSMB		540	/* Use another user's connection. */
 
 /*
  * VM86 privileges.
  */
 #define	PRIV_VM86_INTCALL	550	/* Allow invoking vm86 int handlers. */
 
 /*
  * Pipe buffer privileges.
  */
 #define	PRIV_PIPEBUF		560	/*
 					 * Can allocate reserved pipebuf
 					 * space.
 					 */
 
 /*
  * Set of reserved privilege values, which will be allocated to code as
  * needed, in order to avoid renumbering later privileges due to insertion.
  */
 #define	_PRIV_RESERVED1		561
 #define	_PRIV_RESERVED2		562
 #define	_PRIV_RESERVED3		563
 #define	_PRIV_RESERVED4		564
 #define	_PRIV_RESERVED5		565
 #define	_PRIV_RESERVED6		566
 #define	_PRIV_RESERVED7		567
 #define	_PRIV_RESERVED8		568
 #define	_PRIV_RESERVED9		569
 #define	_PRIV_RESERVED10	570
 #define	_PRIV_RESERVED11	571
 #define	_PRIV_RESERVED12	572
 #define	_PRIV_RESERVED13	573
 #define	_PRIV_RESERVED14	574
 #define	_PRIV_RESERVED15	575
 
 /*
  * Define a set of valid privilege numbers that can be used by loadable
  * modules that don't yet have privilege reservations.  Ideally, these should
  * not be used, since their meaning is opaque to any policies that are aware
  * of specific privileges, such as jail, and as such may be arbitrarily
  * denied.
  */
 #define	PRIV_MODULE0		600
 #define	PRIV_MODULE1		601
 #define	PRIV_MODULE2		602
 #define	PRIV_MODULE3		603
 #define	PRIV_MODULE4		604
 #define	PRIV_MODULE5		605
 #define	PRIV_MODULE6		606
 #define	PRIV_MODULE7		607
 #define	PRIV_MODULE8		608
 #define	PRIV_MODULE9		609
 #define	PRIV_MODULE10		610
 #define	PRIV_MODULE11		611
 #define	PRIV_MODULE12		612
 #define	PRIV_MODULE13		613
 #define	PRIV_MODULE14		614
 #define	PRIV_MODULE15		615
 
 /*
  * DDB(4) privileges.
  */
 #define	PRIV_DDB_CAPTURE	620	/* Allow reading of DDB capture log. */
 
 /*
  * Arla/nnpfs privileges.
  */
 #define	PRIV_NNPFS_DEBUG	630	/* Perform ARLA_VIOC_NNPFSDEBUG. */
 
 /*
  * cpuctl(4) privileges.
  */
 #define PRIV_CPUCTL_WRMSR	640	/* Write model-specific register. */
 #define PRIV_CPUCTL_UPDATE	641	/* Update cpu microcode. */
 
 /*
  * Capi4BSD privileges.
  */
 #define	PRIV_C4B_RESET_CTLR	650	/* Load firmware, reset controller. */
 #define	PRIV_C4B_TRACE		651	/* Unrestricted CAPI message tracing. */
 
 /*
  * OpenAFS privileges.
  */
 #define	PRIV_AFS_ADMIN		660	/* Can change AFS client settings. */
 #define	PRIV_AFS_DAEMON		661	/* Can become the AFS daemon. */
 
 /*
  * Resource Limits privileges.
  */
 #define	PRIV_RCTL_GET_RACCT	670
 #define	PRIV_RCTL_GET_RULES	671
 #define	PRIV_RCTL_GET_LIMITS	672
 #define	PRIV_RCTL_ADD_RULE	673
 #define	PRIV_RCTL_REMOVE_RULE	674
 
 /*
  * mem(4) privileges.
  */
 #define	PRIV_KMEM_READ		680	/* Open mem/kmem for reading. */
 #define	PRIV_KMEM_WRITE		681	/* Open mem/kmem for writing. */
 #define	PRIV_PROC_MEM_WRITE	682	/* Writes via proc_rwmem */
 
 /*
  * Kernel debugger privileges.
  */
 #define	PRIV_KDB_SET_BACKEND	690	/* Allow setting KDB backend. */
 
 /*
  * veriexec override privileges - very rare!
  */
 #define	PRIV_VERIEXEC_DIRECT	700	/* Can override 'indirect' */
 #define	PRIV_VERIEXEC_NOVERIFY	701	/* Can override O_VERIFY */
 #define	PRIV_VERIEXEC_CONTROL	702	/* Can configure veriexec */
 
 /*
  * Track end of privilege list.  This value must stay greater than every
  * privilege number defined above, because PRIV_VALID() compares against it
  * with a strict less-than; bump it whenever a higher privilege is added.
  */
 #define	_PRIV_HIGHEST		703
 
 /*
  * Check whether a privilege number falls strictly between _PRIV_LOWEST and
  * _PRIV_HIGHEST, i.e. within the range known to the privilege system.
  * Invalid privileges presented to the privilege system by a priv_check
  * interface will result in a panic.  The privilege space is sparsely
  * allocated, so this check is only approximate: unassigned numbers inside
  * the range are accepted too.  The argument is expanded twice.
  */
 #define	PRIV_VALID(x)	(_PRIV_LOWEST < (x) && (x) < _PRIV_HIGHEST)
 
 #ifdef _KERNEL
 /*
  * Privilege check interfaces, modeled after historic suser() interfaces, but
  * with the addition of a specific privilege name.  No flags are currently
  * defined for the API.  Historically, flags specified using the real uid
  * instead of the effective uid, and whether or not the check should be
  * allowed in jail.
  */
 /* Only pointers to these structures appear in the prototypes below. */
 struct thread;
 struct ucred;
 int	priv_check(struct thread *td, int priv);
 int	priv_check_cred(struct ucred *cred, int priv);
 int	priv_check_cred_vfs_lookup(struct ucred *cred);
 int	priv_check_cred_vfs_lookup_nomac(struct ucred *cred);
 int	priv_check_cred_vfs_generation(struct ucred *cred);
 #endif /* _KERNEL */
 
 #endif /* !_SYS_PRIV_H_ */
diff --git a/usr.sbin/jail/jail.8 b/usr.sbin/jail/jail.8
index dd7b91d5cefa..421aa9babb4c 100644
--- a/usr.sbin/jail/jail.8
+++ b/usr.sbin/jail/jail.8
@@ -1,1599 +1,1605 @@
 .\" Copyright (c) 2000, 2003 Robert N. M. Watson
 .\" Copyright (c) 2008-2012 James Gritton
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.Dd May 11, 2025
+.Dd August 7, 2025
 .Dt JAIL 8
 .Os
 .Sh NAME
 .Nm jail
 .Nd "manage system jails"
 .Sh SYNOPSIS
 .Ss From Configuration File
 .Nm
 .Op Fl cm
 .Op Fl Cdqv
 .Op Fl f Ar conf_file
 .Op Fl p Ar limit
 .Op Ar jail
 .Nm
 .Op Fl r
 .Op Fl Cqv
 .Op Fl f Ar conf_file
 .Op Fl p Ar limit
 .Op Cm * | Ar jail ...
 .Ss Without Configuration File
 .Nm
 .Op Fl cm
 .Op Fl dhilqv
 .Op Fl J Ar jid_file
 .Op Fl u Ar username
 .Op Fl U Ar username
 .Ar param Ns = Ns Ar value ...
 .Op Cm command Ns = Ns Ar command ...
 .Nm
 .Op Fl rR
 .Op Fl qv
 .Op Cm * | Ar jail ...
 .Ss Show Parameters
 .Nm
 .Op Fl f Ar conf_file
 .Fl e
 .Ar separator
 .Ss Backward Compatibility
 .Nm
 .Op Fl dhilqv
 .Op Fl J Ar jid_file
 .Op Fl u Ar username
 .Op Fl U Ar username
 .Op Fl n Ar jailname
 .Op Fl s Ar securelevel
 .Ar path hostname ip Ns Op Cm \&, Ns Ar ...
 .Ar command ...
 .Sh DESCRIPTION
 The
 .Nm
 utility creates new jails, or modifies or removes existing jails.
 It can also print a list of configured jails and their parameters.
 A jail
 .Pq or Dq prison
 is specified via parameters on the command line, or in the
 .Xr jail.conf 5
 file.
 .Pp
 At least one of the options
 .Fl c ,
 .Fl e ,
 .Fl m
 or
 .Fl r
 must be specified.
 These options are used alone or in combination to describe the operation to
 perform:
 .Bl -tag -width indent
 .It Fl c
 Create a new jail.
 The jail
 .Va jid
 and
 .Va name
 parameters (if specified on the command line)
 must not refer to an existing jail.
 .It Fl e Ar separator
 Exhibit a list of all configured non-wildcard jails and their parameters.
 No jail creation, modification or removal is performed if this option is used.
 The
 .Ar separator
 string is used to separate parameters.
 Use the
 .Xr jls 8
 utility to list running jails.
 .It Fl m
 Modify an existing jail.
 One of the
 .Va jid
 or
 .Va name
 parameters must exist and refer to an existing jail.
 Some parameters may not be changed on a running jail.
 .It Fl r
 Remove the
 .Ar jail
 specified by jid or name.
 All jailed processes are killed, and all jails that are
 children of this jail are also
 removed.
 .It Fl rc
 Restart an existing jail.
 The jail is first removed and then re-created, as if
 .Dq Nm Fl r
 and
 .Dq Nm Fl c
 were run in succession.
 .It Fl cm
 Create a jail if it does not exist, or modify the jail if it does exist.
 .It Fl mr
 Modify an existing jail.
 The jail may be restarted if necessary to modify parameters that could
 not otherwise be changed.
 .It Fl cmr
 Create a jail if it doesn't exist, or modify (and possibly restart) the
 jail if it does exist.
 .El
 .Pp
 Other available options are:
 .Bl -tag -width indent
 .It Fl C
 Clean up after an already-removed jail, running commands and operations
 that are typically run following jail removal.
 .It Fl f Ar conf_file
 Use configuration file
 .Ar conf_file
 instead of the default
 .Pa /etc/jail.conf .
 .It Fl h
 Resolve the
 .Va host.hostname
 parameter (or
 .Va hostname )
 and add all IP addresses returned by the resolver
 to the list of addresses for this jail.
 This is equivalent to the
 .Va ip_hostname
 parameter.
 .It Fl i
 Output (only) the jail identifier of the newly created jail(s).
 This implies the
 .Fl q
 option.
 .It Fl J Ar jid_file
 Write a
 .Ar jid_file
 file, containing the parameters used to start the jail.
 .It Fl l
 Run commands in a clean environment.
 This is deprecated and is equivalent to the
 .Va exec.clean
 parameter.
 .It Fl n Ar jailname
 Set the jail's name.
 This is deprecated and is equivalent to the
 .Va name
 parameter.
 .It Fl p Ar limit
 Limit the number of commands from
 .Va  exec.*
 that can run simultaneously.
 .It Fl q
 Suppress the message printed whenever a jail is created, modified or removed.
 Only error messages will be printed.
 .It Fl R
 A variation of the
 .Fl r
 option that removes an existing jail without using the configuration file.
 No removal-related parameters for this jail will be used \(em the jail will
 simply be removed.
 .It Fl s Ar securelevel
 Set the
 .Va kern.securelevel
 MIB entry to the specified value inside the newly created jail.
 This is deprecated and is equivalent to the
 .Va securelevel
 parameter.
 .It Fl u Ar username
 The user name from the host environment as whom jailed commands should run.
 This is deprecated and is equivalent to the
 .Va exec.jail_user
 and
 .Va exec.system_jail_user
 parameters.
 .It Fl U Ar username
 The user name from the jailed environment as whom jailed commands should run.
 This is deprecated and is equivalent to the
 .Va exec.jail_user
 parameter.
 .It Fl v
 Print a message on every operation, such as running commands and
 mounting filesystems.
 .It Fl d
 This is deprecated and is equivalent to the
 .Va allow.dying
 parameter, which is also deprecated.
 It used to allow making changes to a
 .Va dying
 jail.
 Now such jails are always replaced when a new jail is created with the same
 .Va jid
 or
 .Va name .
 .El
 .Pp
 If no arguments are given after the options, the operation (except
 remove) will be performed on all jails specified in the
 .Xr jail.conf 5
 file.
 A single argument of a jail name will operate only on the specified jail.
 The
 .Fl r
 and
 .Fl R
 options can also remove running jails that aren't in the
 .Xr jail.conf 5
 file, specified by name or jid.
 .Pp
 An argument of
 .Dq *
 is a wildcard that will operate on all jails, regardless of whether
 they appear in
 .Xr jail.conf 5 ;
 this is the surest way for
 .Fl r
 to remove all jails.
 If hierarchical jails exist, a partial-matching wildcard definition may
 be specified.
 For example, an argument of
 .Dq foo.*
 would apply to jails with names like
 .Dq foo.bar
 and
 .Dq foo.bar.baz .
 .Pp
 A jail may also be specified via parameters directly on the command line in
 .Dq name=value
 form, ignoring the contents of
 .Xr jail.conf 5 .
 For backward compatibility, the command line may also have four fixed
 parameters, without names:
 .Ar path ,
 .Ar hostname ,
 .Ar ip ,
 and
 .Ar command .
 .Ss Jail Parameters
 Parameters in the
 .Xr jail.conf 5
 file, or on the command line, are generally of the form
 .Dq name=value .
 Some parameters are boolean, and do not have a value but are set by the
 name alone with or without a
 .Dq no
 prefix, e.g.
 .Va persist
 or
 .Va nopersist .
 They can also be given the values
 .Dq true
 and
 .Dq false .
 Other parameters may have more than one value, specified as a
 comma-separated list, or with
 .Dq +=
 in the configuration file (see
 .Xr jail.conf 5
 for details).
 List-based parameters may also be specified multiple times on the command
 line, i.e.,
 .Dq name=value1,value2
 and
 .Dq name=value1 name=value2
 are equivalent for such parameters.
 .Pp
 The
 .Nm
 utility recognizes two classes of parameters.
 There are the true jail
 parameters that are passed to the kernel when the jail is created,
 which can be seen with
 .Xr jls 8 ,
 and can (usually) be changed with
 .Dq Nm Fl m .
 Then there are pseudo-parameters that are only used by
 .Nm
 itself.
 .Pp
 Jails have a set of core parameters, and kernel modules can add their own
 jail parameters.
 The current set of available parameters can be retrieved via
 .Dq Nm sysctl Fl d Va security.jail.param .
 Any parameters not set will be given default values, often based on the
 current environment.
 The core parameters are:
 .Bl -tag -width indent
 .It Va jid
 The jail identifier.
 This will be assigned automatically to a new jail (or can be explicitly
 set), and can be used to identify the jail for later modification, or
 for such commands as
 .Xr jls 8
 or
 .Xr jexec 8 .
 .It Va name
 The jail name.
 This is an arbitrary string that identifies a jail (except it may not
 contain a
 .Sq \&. ) .
 Like the
 .Va jid ,
 it can be passed to later
 .Nm
 commands, or to
 .Xr jls 8
 or
 .Xr jexec 8 .
 If no
 .Va name
 is supplied, a default is assumed that is the same as the
 .Va jid .
 The
 .Va name
 parameter is implied by the
 .Xr jail.conf 5
 file format, and need not be explicitly set when using the configuration
 file.
 .It Va path
 The directory which is to be the root of the jail.
 Any commands run inside the jail, either by
 .Nm
 or from
 .Xr jexec 8 ,
 are run from this directory.
 .It Va ip4.addr
 A list of IPv4 addresses assigned to the jail.
 If this is set, the jail is restricted to using only these addresses.
 Any attempts to use other addresses fail, and attempts to use wildcard
 addresses silently use the jailed address instead.
 For IPv4 the first address given will be used as the source address
 when source address selection on unbound sockets cannot find a better
 match.
 It is only possible to start multiple jails with the same IP address
 if none of the jails has more than this single overlapping IP address
 assigned to itself.
 .It Va ip4.saddrsel
 A boolean option to change the formerly mentioned behaviour and disable
 IPv4 source address selection for the jail in favour of the primary
 IPv4 address of the jail.
 Source address selection is enabled by default for all jails and the
 .Va ip4.nosaddrsel
 setting of a parent jail is not inherited for any child jails.
 .It Va ip4
 Control the availability of IPv4 addresses.
 Possible values are
 .Dq inherit
 to allow unrestricted access to all system addresses,
 .Dq new
 to restrict addresses via
 .Va ip4.addr ,
 and
 .Dq disable
 to stop the jail from using IPv4 entirely.
 Setting the
 .Va ip4.addr
 parameter implies a value of
 .Dq new .
 .It Va ip6.addr , Va ip6.saddrsel , Va ip6
 A set of IPv6 options for the jail, the counterparts to
 .Va ip4.addr ,
 .Va ip4.saddrsel
 and
 .Va ip4
 above.
 .It Va vnet
 Create the jail with its own virtual network stack,
 with its own network interfaces, addresses, routing table, etc.
 The kernel must have been compiled with the
 .Sy VIMAGE
 option
 for this to be available.
 Possible values are
 .Dq inherit
 to use the system network stack, possibly with restricted IP addresses,
 and
 .Dq new
 to create a new network stack.
 .It Va host.hostname
 The hostname of the jail.
 Other similar parameters are
 .Va host.domainname ,
 .Va host.hostuuid
 and
 .Va host.hostid .
 .It Va host
 Set the origin of hostname and related information.
 Possible values are
 .Dq inherit
 to use the system information and
 .Dq new
 for the jail to use the information from the above fields.
 Setting any of the above fields implies a value of
 .Dq new .
 .It Va securelevel
 The value of the jail's
 .Va kern.securelevel
 sysctl.
 A jail never has a lower securelevel than its parent system, but by
 setting this parameter it may have a higher one.
 If the system securelevel is changed, any jail securelevels will be at
 least as secure.
 .It Va devfs_ruleset
 The number of the devfs ruleset that is enforced for mounting devfs in
 this jail.
 A value of zero (default) means no ruleset is enforced.
 Descendant jails inherit the parent jail's devfs ruleset enforcement.
 Mounting devfs inside a jail is possible only if the
 .Va allow.mount
 and
 .Va allow.mount.devfs
 permissions are effective and
 .Va enforce_statfs
 is set to a value lower than 2.
 Devfs rules and rulesets cannot be viewed or modified from inside a jail.
 .Pp
 NOTE: It is important that only appropriate device nodes in devfs be
 exposed to a jail; access to disk devices in the jail may permit processes
 in the jail to bypass the jail sandboxing by modifying files outside of
 the jail.
 See
 .Xr devfs 8
 for information on how to use devfs rules to limit access to entries
 in the per-jail devfs.
 A simple devfs ruleset for jails is available as ruleset #4 in
 .Pa /etc/defaults/devfs.rules .
 .It Va children.max
 The number of child jails allowed to be created by this jail (or by
 other jails under this jail).
 This limit is zero by default, indicating the jail is not allowed to
 create child jails.
 See the
 .Sx "Hierarchical Jails"
 section for more information.
 .It Va children.cur
 The number of descendants of this jail, including its own child jails
 and any jails created under them.
 .It Va enforce_statfs
 This determines what information processes in a jail are able to get
 about mount points.
 It affects the behaviour of the following syscalls:
 .Xr statfs 2 ,
 .Xr fstatfs 2 ,
 .Xr getfsstat 2 ,
 and
 .Xr fhstatfs 2
 (as well as similar compatibility syscalls).
 When set to 0, all mount points are available without any restrictions.
 When set to 1, only mount points below the jail's chroot directory are
 visible.
 In addition to that, the path to the jail's chroot directory is removed
 from the front of their pathnames.
 When set to 2 (default), the above syscalls can operate only on a mount-point
 where the jail's chroot directory is located.
 .It Va persist
 Setting this boolean parameter allows a jail to exist without any
 processes.
 Normally, a command is run as part of jail creation, and then the jail
 is destroyed as its last process exits.
 A new jail must have either the
 .Va persist
 parameter or
 .Va exec.start
 or
 .Va command
 pseudo-parameter set.
 .It Va cpuset.id
 The ID of the cpuset associated with this jail (read-only).
 .It Va dying
 This is true if the jail is in the process of shutting down (read-only).
 .It Va parent
 The
 .Va jid
 of the parent of this jail, or zero if this is a top-level jail
 (read-only).
 .It Va osrelease
 The string for the jail's
 .Va kern.osrelease
 sysctl and uname -r.
 .It Va osreldate
 The number for the jail's
 .Va kern.osreldate
 and uname -K.
 .It Va meta , Va env
 An arbitrary string associated with the jail.
 Its maximum buffer size is controlled by the global
 .Va security.jail.meta_maxbufsize
 sysctl, which can only be adjusted by the non-jailed root user.
 While the
 .Va meta
 is hidden from the jail, the
 .Va env
 is readable through the
 .Va security.jail.env
 sysctl.
 .Pp
 Each buffer can be treated as a set of key=value\\n strings.
 In order to add or replace a specific key the
 .Va meta.keyname=value
 or
 .Va env.keyname=value
 parameter notations must be used.
 While
 .Va meta.keyname=
 or
 .Va env.keyname=
 reset the value to an empty string, the
 .Va meta.keyname
 or
 .Va env.keyname
 notations, without the equal sign, remove the given key.
 Respectively, the same
 .Va meta.keyname
 or
 .Va env.keyname
 notations are used to query a specific key while reading jail parameters
 using such commands as
 .Xr jls 8 .
 Multiple keys can be queried or modified with a single command.
 .It Va allow.*
 Some restrictions of the jail environment may be set on a per-jail
 basis.
 With the exception of
 .Va allow.set_hostname
 and
 .Va allow.reserved_ports ,
 these boolean parameters are off by default.
 .Bl -tag -width indent
 .It Va allow.set_hostname
 The jail's hostname may be changed via
 .Xr hostname 1
 or
 .Xr sethostname 3 .
 .It Va allow.sysvipc
 A process within the jail has access to System V IPC primitives.
 This is deprecated in favor of the per-module parameters (see below).
 When this parameter is set, it is equivalent to setting
 .Va sysvmsg ,
 .Va sysvsem ,
 and
 .Va sysvshm
 all to
 .Dq inherit .
 .It Va allow.raw_sockets
 The jail root is allowed to create raw sockets.
 Setting this parameter allows utilities like
 .Xr ping 8
 and
 .Xr traceroute 8
 to operate inside the jail.
 If this is set, the source IP addresses are enforced to comply
 with the IP address bound to the jail, regardless of whether or not
 the
 .Dv IP_HDRINCL
 flag has been set on the socket.
 Since raw sockets can be used to configure and interact with various
 network subsystems, extra caution should be used where privileged access
 to jails is given out to untrusted parties.
 .It Va allow.chflags
 Normally, privileged users inside a jail are treated as unprivileged by
 .Xr chflags 2 .
 When this parameter is set, such users are treated as privileged, and
 may manipulate system file flags subject to the usual constraints on
 .Va kern.securelevel .
 .It Va allow.mount
 Privileged users inside the jail will be able to mount and unmount file
 system types marked as jail-friendly.
 The
 .Xr lsvfs 1
 command can be used to find file system types available for mount from
 within a jail.
 This permission is effective only if
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.devfs
 Privileged users inside the jail will be able to mount and unmount the
 devfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 The devfs ruleset should be restricted from the default by using the
 .Va devfs_ruleset
 option.
 .It Va allow.quotas
 The jail root may administer quotas on the jail's filesystem(s).
 This includes filesystems that the jail may share with other jails or
 with non-jailed parts of the system.
 .It Va allow.read_msgbuf
 Jailed users may read the kernel message buffer.
 If the
 .Va security.bsd.unprivileged_read_msgbuf
 MIB entry is zero, this will be restricted to the root user.
 .It Va allow.socket_af
 Sockets within a jail are normally restricted to IPv4, IPv6, local
 (UNIX), and route.
 This allows access to other protocol stacks that have not had jail
 functionality added to them.
 .It Va allow.mlock
 Locking or unlocking physical pages in memory is normally not available
 within a jail.
 When this parameter is set, users may
 .Xr mlock 2
 or
 .Xr munlock 2
 memory subject to
 .Va security.bsd.unprivileged_mlock
 and resource limits.
 .It Va allow.nfsd
 The
 .Xr mountd 8 ,
 .Xr nfsd 8 ,
 .Xr nfsuserd 8 ,
 .Xr gssd 8
 and
 .Xr rpc.tlsservd 8
 daemons are permitted to run inside a properly configured vnet-enabled jail.
 The jail's root must be a file system mount point and
 .Va enforce_statfs
 must not be set to 0, so that
 .Xr mountd 8
 can export file systems visible within the jail.
 .Va enforce_statfs
 must be set to 1 if file systems mounted under the
 jail's file system need to be exported by
 .Xr mountd 8 .
 For exporting only the jail's file system, a setting of 2
 is sufficient.
 If the kernel configuration does not include the
 .Sy NFSD
 option,
 .Pa nfsd.ko
 must be loaded outside of the jails.
 This is normally done by adding
 .Dq nfsd
 to
 .Va kld_list
 in the
 .Xr rc.conf 5
 file outside of the jails.
 Similarly, if
 .Xr gssd 8
 is to be run in a jail, either the kernel
 .Sy KGSSAPI
 option needs to be specified or
 .Dq kgssapi
 and
 .Dq kgssapi_krb5
 need to be in
 .Va kld_list
 in the
 .Xr rc.conf 5
 file outside of the jails.
 .It Va allow.reserved_ports
 The jail root may bind to ports lower than 1024.
+.It Va allow.unprivileged_parent_tampering
+Unprivileged processes in the jail's parent may tamper with processes of the
+same UID in the jail.
+This includes the ability to signal, debug, and use
+.Xr cpuset 1
+on processes that belong to the jail.
 .It Va allow.unprivileged_proc_debug
 Unprivileged processes in the jail may use debugging facilities.
 .It Va allow.suser
 The value of the jail's
 .Va security.bsd.suser_enabled
 sysctl.
 The super-user will be disabled automatically if its parent system has it
 disabled.
 The super-user is enabled by default.
 .It Va allow.extattr
 Allow a privileged process in the jail to manipulate filesystem extended
 attributes in the system namespace.
 .It Va allow.adjtime
 Allow a privileged process in the jail to slowly adjust the global operating
 system time.
 For example through utilities like
 .Xr ntpd 8 .
 .It Va allow.settime
 Allow a privileged process in the jail to set the global operating system date
 and time.
 For example through utilities like
 .Xr date 1 .
 This permission also includes
 .Va allow.adjtime .
 .It Va allow.routing
 Allow a privileged process in the non-VNET jail to modify the system routing
 table.
 .El
 .El
 .Pp
 Kernel modules may add their own parameters, which only exist when the
 module is loaded.
 These are typically headed under a parameter named after the module,
 with values of
 .Dq inherit
 to give the jail full use of the module,
 .Dq new
 to encapsulate the jail in some module-specific way,
 and
 .Dq disable
 to make the module unavailable to the jail.
 There also may be other parameters to define jail behavior within the module.
 Module-specific parameters include:
 .Bl -tag -width indent
 .It Va allow.mount.fdescfs
 Privileged users inside the jail will be able to mount and unmount the
 fdescfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.fusefs
 Privileged users inside the jail will be able to mount and unmount
 fuse-based file systems.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.nullfs
 Privileged users inside the jail will be able to mount and unmount the
 nullfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.procfs
 Privileged users inside the jail will be able to mount and unmount the
 procfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.linprocfs
 Privileged users inside the jail will be able to mount and unmount the
 linprocfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.linsysfs
 Privileged users inside the jail will be able to mount and unmount the
 linsysfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.tmpfs
 privileged users inside the jail will be able to mount and unmount the
 tmpfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.zfs
 privileged users inside the jail will be able to mount and unmount the
 ZFS file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 See
 .Xr zfs-jail 8
 for information on how to configure the ZFS filesystem to operate from
 within a jail.
 .It Va allow.vmm
 The jail may access
 .Xr vmm 4 .
 This flag is only available when the
 .Xr vmm 4
 kernel module is loaded.
 .It Va linux
 Determine how a jail's Linux emulation environment appears.
 A value of
 .Dq inherit
 will keep the same environment, and
 .Dq new
 will give the jail its own environment (still originally inherited when
 the jail is created).
 .It Va linux.osname , linux.osrelease , linux.oss_version
 The Linux OS name, OS release, and OSS version associated with this jail.
 .It Va sysvmsg
 Allow access to SYSV IPC message primitives.
 If set to
 .Dq inherit ,
 all IPC objects on the system are visible to this jail, whether they
 were created by the jail itself, the base system, or other jails.
 If set to
 .Dq new ,
 the jail will have its own key namespace, and can only see the objects
 that it has created;
 the system (or parent jail) has access to the jail's objects, but not to
 its keys.
 If set to
 .Dq disable ,
 the jail cannot perform any sysvmsg-related system calls.
 .It Va sysvsem , sysvshm
 Allow access to SYSV IPC semaphore and shared memory primitives, in the
 same manner as
 .Va sysvmsg .
 .It Va zfs.mount_snapshot
 When set to 1, jailed users may access the contents of ZFS snapshots
 under the filesystem's
 .Pa .zfs
 directory.
 If
 .Va allow.mount.zfs
 is set, the snapshots may also be mounted.
 .El
 .Pp
 There are pseudo-parameters that are not passed to the kernel, but are
 used by
 .Nm
 to set up the jail environment, often by running specified commands
 when jails are created or removed.
 The
 .Va exec.*
 command parameters are
 .Xr sh 1
 command lines that are run in either the system or jail environment.
 They may be given multiple values, which would run the specified
 commands in sequence.
 All commands must succeed (return a zero exit status), or the jail will
 not be created or removed, as appropriate.
 .Pp
 The following variables are added to the environment:
 .Bl -tag -width indent -offset indent
 .It Ev JID
 The
 .Va jid ,
 or jail identifier.
 .It Ev JNAME
 The
 .Va name
 of the jail.
 .It Ev JPATH
 The
 .Va path
 of the jail.
 .El
 .Pp
 The pseudo-parameters are:
 .Bl -tag -width indent
 .It Va exec.prepare
 Command(s) to run in the system environment to prepare a jail for creation.
 These commands are executed before assigning IP addresses and mounting
 filesystems, so they may be used to create a new jail filesystem if it does
 not already exist.
 .It Va exec.prestart
 Command(s) to run in the system environment before a jail is created.
 .It Va exec.created
 Command(s) to run in the system environment right after a jail has been
 created, but before commands (or services) get executed in the jail.
 .It Va exec.start
 Command(s) to run in the jail environment when a jail is created.
 A typical command to run is
 .Dq sh /etc/rc .
 .It Va command
 A synonym for
 .Va exec.start
 for use when specifying a jail directly on the command line.
 Unlike other parameters whose value is a single string,
 .Va command
 uses the remainder of the
 .Nm
 command line as its own arguments.
 .It Va exec.poststart
 Command(s) to run in the system environment after a jail is created,
 and after any
 .Va exec.start
 commands have completed.
 .It Va exec.prestop
 Command(s) to run in the system environment before a jail is removed.
 .It Va exec.stop
 Command(s) to run in the jail environment before a jail is removed,
 and after any
 .Va exec.prestop
 commands have completed.
 A typical command to run is
 .Dq sh /etc/rc.shutdown jail .
 .It Va exec.poststop
 Command(s) to run in the system environment after a jail is removed.
 .It Va exec.release
 Command(s) to run in the system environment after all other actions are done.
 These commands are executed after unmounting filesystems and removing IP
 addresses, so they may be used to remove a jail filesystem if it is no longer
 needed.
 .It Va exec.clean
 Run commands in a clean environment.
 The environment is discarded except for
 .Ev HOME , SHELL , TERM
 and
 .Ev USER .
 .Ev HOME
 and
 .Ev SHELL
 are set to the target login's default values.
 .Ev USER
 is set to the target login.
 .Ev TERM
 is imported from the current environment.
 .Ev PATH
 is set to "/bin:/usr/bin".
 The environment variables from the login class capability database for the
 target login are also set.
 .Ev JID ,
 .Ev JNAME ,
 and
 .Ev JPATH
 are not set.
 If a user is specified (as with
 .Va exec.jail_user ) ,
 commands are run from that (possibly jailed) user's directory.
 .It Va exec.jail_user
 The user to run commands as, when running in the jail environment.
 The default is to run the commands as the current user.
 .It Va exec.system_jail_user
 This boolean option looks for the
 .Va exec.jail_user
 in the system
 .Xr passwd 5
 file, instead of in the jail's file.
 .It Va exec.system_user
 The user to run commands as, when running in the system environment.
 The default is to run the commands as the current user.
 .It Va exec.timeout
 The maximum amount of time to wait for a command to complete, in
 seconds.
 If a command is still running after this timeout has passed,
 the jail will not be created or removed, as appropriate.
 .It Va exec.consolelog
 A file to direct command output (stdout and stderr) to.
 .It Va exec.fib
 The FIB (routing table) to set when running commands inside the jail.
 .It Va stop.timeout
 The maximum amount of time to wait for a jail's processes to exit
 after sending them a
 .Dv SIGTERM
 signal (which happens after the
 .Va exec.stop
 commands have completed).
 After this many seconds have passed, the jail will be removed, which
 will kill any remaining processes.
 If this is set to zero, no
 .Dv SIGTERM
 is sent and the jail is immediately removed.
 The default is 10 seconds.
 .It Va interface
 A network interface to add the jail's IP addresses
 .Va ( ip4.addr
 and
 .Va ip6.addr )
 to.
 An alias for each address will be added to the interface before the
 jail is created, and will be removed from the interface after the
 jail is removed.
 .It Va ip4.addr
 In addition to the IP addresses that are passed to the kernel, an
 interface, netmask and additional parameters (as supported by
 .Xr ifconfig 8 Ns )
 may also be specified, in the form
 .Dq Ar interface Ns | Ns Ar ip-address Ns / Ns Ar netmask param ... .
 If an interface is given before the IP address, an alias for the address
 will be added to that interface, as it is with the
 .Va interface
 parameter.
 If a netmask in either dotted-quad or CIDR form is given
 after an IP address, it will be used when adding the IP alias.
 If additional parameters are specified then they will also be used when
 adding the IP alias.
 .It Va ip6.addr
 In addition to the IP addresses that are passed to the kernel,
 an interface, prefix and additional parameters (as supported by
 .Xr ifconfig 8 Ns )
 may also be specified, in the form
 .Dq Ar interface Ns | Ns Ar ip-address Ns / Ns Ar prefix param ... .
 .It Va vnet.interface
 A list of network interfaces to give to a vnet-enabled jail after it is created.
 The interfaces will automatically be released when the jail is removed.
 .It Va zfs.dataset
 A list of ZFS datasets to be attached to the jail.
 This requires
 .Va allow.mount.zfs
 to be set.
 See
 .Xr zfs-jail 8
 for information on how to configure a ZFS dataset to be operated from
 within a jail.
 .It Va ip_hostname
 Resolve the
 .Va host.hostname
 parameter and add all IP addresses returned by the resolver
 to the list of addresses
 .Po Va ip4.addr
 or
 .Va ip6.addr Pc
 for this jail.
 This may affect default address selection for outgoing IPv4 connections
 from jails.
 The address first returned by the resolver for each address family
 will be used as the primary address.
 .It Va mount
 A filesystem to mount before creating the jail (and to unmount after
 removing it), given as a single
 .Xr fstab 5
 line.
 .It Va mount.fstab
 An
 .Xr fstab 5
 format file containing filesystems to mount before creating a jail.
 .It Va mount.devfs
 Mount a
 .Xr devfs 4
 filesystem on the chrooted
 .Pa /dev
 directory, and apply the ruleset in the
 .Va devfs_ruleset
 parameter (or a default of ruleset 4: devfsrules_jail)
 to restrict the devices visible inside the jail.
 .It Va mount.fdescfs
 Mount a
 .Xr fdescfs 4
 filesystem on the chrooted
 .Pa /dev/fd
 directory.
 .It Va mount.procfs
 Mount a
 .Xr procfs 4
 filesystem on the chrooted
 .Pa /proc
 directory.
 .It Va allow.dying
 This is deprecated and has no effect.
 It used to allow making changes to a
 .Va dying
 jail.
 Now such jails are always replaced when a new jail is created with the same
 .Va jid
 or
 .Va name .
 .It Va depend
 Specify a jail (or jails) that this jail depends on.
 When this jail is to be created, any jail(s) it depends on must already exist.
 If not, they will be created automatically, up to the completion of the last
 .Va exec.poststart
 command, before any action will be taken to create this jail.
 When jails are removed the opposite is true:
 this jail will be removed, up to the last
 .Va exec.poststop
 command, before any jail(s) it depends on are stopped.
 .El
 .Sh EXAMPLES
 Jails are typically set up using one of two philosophies: either to
 constrain a specific application (possibly running with privilege), or
 to create a
 .Dq "virtual system image"
 running a variety of daemons and services.
 In both cases, a fairly complete file system install of
 .Fx
 is
 required, so as to provide the necessary command line tools, daemons,
 libraries, application configuration files, etc.
 However, for a virtual server configuration, a fair amount of
 additional work is required so as to replace the
 .Dq boot
 process.
 This manual page documents the configuration steps necessary to support
 either of these approaches, although the configuration steps may need to be
 refined based on local requirements.
 .Ss "Setting up a Jail Directory Tree"
 To set up a jail directory tree containing an entire
 .Fx
 distribution, the following
 .Xr sh 1
 command script can be used:
 .Bd -literal -offset indent
 D=/here/is/the/jail
 cd /usr/src
 mkdir -p $D
 make world DESTDIR=$D
 make distribution DESTDIR=$D
 .Ed
 .Pp
 In many cases this example would put far more in the jail than needed.
 In the other extreme case a jail might contain only one file:
 the executable to be run in the jail.
 .Pp
 We recommend experimentation, and caution that it is a lot easier to
 start with a
 .Dq fat
 jail and remove things until it stops working,
 than it is to start with a
 .Dq thin
 jail and add things until it works.
 .Ss "Setting Up a Jail"
 Do what was described in
 .Sx "Setting up a Jail Directory Tree"
 to build the jail directory tree.
 For the sake of this example, we will
 assume you built it in
 .Pa /data/jail/testjail ,
 for a jail named
 .Dq testjail .
 Substitute below as needed with your
 own directory, IP address, and hostname.
 .Ss "Setting up the Host Environment"
 First, set up the real system's environment to be
 .Dq jail-friendly .
 For consistency, we will refer to the parent box as the
 .Dq "host environment" ,
 and to the jailed virtual machine as the
 .Dq "jail environment" .
 Since jails are implemented using IP aliases, one of the first things to do
 is to disable IP services on the host system that listen on all local
 IP addresses for a service.
 If a network service is present in the host environment that binds all
 available IP addresses rather than specific IP addresses, it may service
 requests sent to jail IP addresses if the jail did not bind the port.
 This means changing
 .Xr inetd 8
 to only listen on the
 appropriate IP address, and so forth.
 Add the following to
 .Pa /etc/rc.conf
 in the host environment:
 .Bd -literal -offset indent
 sendmail_enable="NO"
 inetd_flags="-wW -a 192.0.2.23"
 rpcbind_enable="NO"
 .Ed
 .Pp
 .Li 192.0.2.23
 is the native IP address for the host system, in this example.
 Daemons that run out of
 .Xr inetd 8
 can be easily configured to use only the specified host IP address.
 Other daemons
 will need to be manually configured \(em for some this is possible through
 .Xr rc.conf 5
 flags entries; for others it is necessary to modify per-application
 configuration files, or to recompile the application.
 The following frequently deployed services must have their individual
 configuration files modified to limit the application to listening
 to a specific IP address:
 .Pp
 To configure
 .Xr sshd 8 ,
 it is necessary to modify
 .Pa /etc/ssh/sshd_config .
 .Pp
 To configure
 .Xr sendmail 8 ,
 it is necessary to modify
 .Pa /etc/mail/sendmail.cf .
 .Pp
 In addition, a number of services must be recompiled in order to run
 them in the host environment.
 This includes most applications providing services using
 .Xr rpc 3 ,
 such as
 .Xr rpcbind 8 ,
 .Xr nfsd 8 ,
 and
 .Xr mountd 8 .
 In general, applications for which it is not possible to specify which
 IP address to bind should not be run in the host environment unless they
 should also service requests sent to jail IP addresses.
 Attempting to serve
 NFS from the host environment may also cause confusion, and cannot be
 easily reconfigured to use only specific IPs, as some NFS services are
 hosted directly from the kernel.
 Any third-party network software running
 in the host environment should also be checked and configured so that it
 does not bind all IP addresses, which would result in those services also
 appearing to be offered by the jail environments.
 .Pp
 Once
 these daemons have been disabled or fixed in the host environment, it is
 best to reboot so that all daemons are in a known state, to reduce the
 potential for confusion later (such as finding that when you send mail
 to a jail, and its sendmail is down, the mail is delivered to the host,
 etc.).
 .Ss "Configuring the Jail"
 Start any jail for the first time without configuring the network
 interface so that you can clean it up a little and set up accounts.
 As
 with any machine (virtual or not), you will need to set a root password, time
 zone, etc.
 Some of these steps apply only if you intend to run a full virtual server
 inside the jail; others apply both for constraining a particular application
 or for running a virtual server.
 .Pp
 Start a shell in the jail:
 .Bd -literal -offset indent
 jail -c path=/data/jail/testjail mount.devfs \\
 	host.hostname=testhostname ip4.addr=192.0.2.100 \\
 	command=/bin/sh
 .Ed
 .Pp
 Assuming no errors, you will end up with a shell prompt within the jail.
 You can now run
 .Xr bsdconfig 8
 and do the post-install configuration to set various configuration options,
 or perform these actions manually by editing
 .Pa /etc/rc.conf ,
 etc.
 .Pp
 .Bl -bullet -offset indent -compact
 .It
 Configure
 .Pa /etc/resolv.conf
 so that name resolution within the jail will work correctly.
 .It
 Run
 .Xr newaliases 1
 to quell
 .Xr sendmail 8
 warnings.
 .It
 Set a root password, probably different from the real host system.
 .It
 Set the timezone.
 .It
 Add accounts for users in the jail environment.
 .It
 Install any packages the environment requires.
 .El
 .Pp
 You may also want to perform any package-specific configuration (web servers,
 SSH servers, etc.), patch up
 .Pa /etc/syslog.conf
 so it logs as you would like, etc.
 If you are not using a virtual server, you may wish to modify
 .Xr syslogd 8
 in the host environment to listen on the syslog socket in the jail
 environment; in this example, the syslog socket would be stored in
 .Pa /data/jail/testjail/var/run/log .
 .Pp
 Exit from the shell, and the jail will be shut down.
 .Ss "Starting the Jail"
 You are now ready to restart the jail and bring up the environment with
 all of its daemons and other programs.
 Create an entry for the jail in
 .Pa /etc/jail.conf :
 .Bd -literal -offset indent
 testjail {
 	path = /data/jail/testjail;
 	mount.devfs;
 	host.hostname = testhostname;
 	ip4.addr = 192.0.2.100;
 	interface = em0;
 	exec.start = "/bin/sh /etc/rc";
 	exec.stop = "/bin/sh /etc/rc.shutdown jail";
 }
 .Ed
 .Pp
 To start a virtual server environment,
 .Pa /etc/rc
 is run to launch various daemons and services, and
 .Pa /etc/rc.shutdown
 is run to shut them down when the jail is removed.
 If you are running a single application in the jail,
 substitute the command used to start the application for
 .Dq /bin/sh /etc/rc ;
 there may be some script available to cleanly shut down the application,
 or it may be sufficient to go without a stop command, and have
 .Nm
 send
 .Dv SIGTERM
 to the application.
 .Pp
 Start the jail by running:
 .Bd -literal -offset indent
 jail -c testjail
 .Ed
 .Pp
 A few warnings may be produced; however, it should all work properly.
 You should be able to see
 .Xr inetd 8 ,
 .Xr syslogd 8 ,
 and other processes running within the jail using
 .Xr ps 1 ,
 with the
 .Ql J
 flag appearing beside jailed processes.
 To see an active list of jails, use
 .Xr jls 8 .
 If
 .Xr sshd 8
 is enabled in the jail environment, you should be able to
 .Xr ssh 1
 to the hostname or IP address of the jailed environment, and log
 in using the accounts you created previously.
 .Pp
 It is possible to have jails started at boot time.
 Please refer to the
 .Dq jail_*
 variables in
 .Xr rc.conf 5
 for more information.
 .Ss "Managing the Jail"
 Normal machine shutdown commands, such as
 .Xr halt 8 ,
 .Xr reboot 8 ,
 and
 .Xr shutdown 8 ,
 cannot be used successfully within the jail.
 To kill all processes from within a jail, you may use one of the
 following commands, depending on what you want to accomplish:
 .Bd -literal -offset indent
 kill -TERM -1
 kill -KILL -1
 .Ed
 .Pp
 This will send the
 .Dv SIGTERM
 or
 .Dv SIGKILL
 signals to all processes in the jail \(em be careful not to run this from
 the host environment!
 Once all of the jail's processes have died, unless the jail was created
 with the
 .Va persist
 parameter, the jail will be removed.
 Depending on
 the intended use of the jail, you may also want to run
 .Pa /etc/rc.shutdown
 from within the jail.
 .Pp
 To shut down the jail from the outside, simply remove it with:
 .Bd -literal -offset indent
 jail -r testjail
 .Ed
 .Pp
 which will run any commands specified by
 .Va exec.stop ,
 and then send
 .Dv SIGTERM
 and eventually
 .Dv SIGKILL
 to any remaining jailed processes.
 .Pp
 The
 .Pa /proc/ Ns Ar pid Ns Pa /status
 file contains, as its last field, the name of the jail in which the
 process runs, or
 .Dq Li -
 to indicate that the process is not running within a jail.
 The
 .Xr ps 1
 command also shows a
 .Ql J
 flag for processes in a jail.
 .Pp
 You can also list/kill processes based on their jail ID.
 To show processes and their jail ID, use the following command:
 .Pp
 .Dl "ps ax -o pid,jid,args"
 .Pp
 To show and then kill processes in jail number 3 use the following commands:
 .Bd -literal -offset indent
 pgrep -lfj 3
 pkill -j 3
 .Ed
 or:
 .Pp
 .Dl "killall -j 3"
 .Ss "Jails and File Systems"
 It is not possible to
 .Xr mount 8
 or
 .Xr umount 8
 any file system inside a jail unless the file system is marked
 jail-friendly, the jail's
 .Va allow.mount
 parameter is set, and the jail's
 .Va enforce_statfs
 parameter is lower than 2.
 .Pp
 Multiple jails sharing the same file system can influence each other.
 For example, a user in one jail can fill the file system,
 leaving no space for processes in the other jail.
 Trying to use
 .Xr quota 1
 to prevent this will not work either, as the file system quotas
 are not aware of jails but only look at the user and group IDs.
 This means the same user ID in two jails share a single file
 system quota.
 One would need to use one file system per jail to make this work.
 .Ss "Sysctl MIB Entries"
 The read-only entry
 .Va security.jail.jailed
 can be used to determine if a process is running inside a jail (value
 is one) or not (value is zero).
 .Pp
 The variable
 .Va security.jail.jail_max_af_ips
 determines how many addresses per address family a jail may have.
 The default is 255.
 .Pp
 Some MIB variables have per-jail settings.
 Changes to these variables by a jailed process do not affect the host
 environment, only the jail environment.
 These variables are
 .Va kern.securelevel ,
 .Va security.bsd.suser_enabled ,
 .Va kern.hostname ,
 .Va kern.domainname ,
 .Va kern.hostid ,
 and
 .Va kern.hostuuid .
 .Ss "Hierarchical Jails"
 By setting a jail's
 .Va children.max
 parameter, processes within a jail may be able to create jails of their own.
 These child jails are kept in a hierarchy, with jails only able to see and/or
 modify the jails they created (or those jails' children).
 Each jail has a read-only
 .Va parent
 parameter, containing the
 .Va jid
 of the jail that created it; a
 .Va jid
 of 0 indicates the jail is a child of the current jail (or is a top-level
 jail if the current process isn't jailed).
 .Pp
 Jailed processes are not allowed to confer greater permissions than they
 themselves are given, e.g., if a jail is created with
 .Va allow.nomount ,
 it is not able to create a jail with
 .Va allow.mount
 set.
 Similarly, such restrictions as
 .Va ip4.addr
 and
 .Va securelevel
 may not be bypassed in child jails.
 .Pp
 A child jail may in turn create its own child jails if its own
 .Va children.max
 parameter is set (remember it is zero by default).
 These jails are visible to and can be modified by their parent and all
 ancestors.
 .Pp
 Jail names reflect this hierarchy, with a full name being an MIB-type string
 separated by dots.
 For example, if a base system process creates a jail
 .Dq foo ,
 and a process under that jail creates another jail
 .Dq bar ,
 then the second jail will be seen as
 .Dq foo.bar
 in the base system (though it is only seen as
 .Dq bar
 to any processes inside jail
 .Dq foo ) .
 Jids on the other hand exist in a single space, and each jail must have a
 unique jid.
 .Pp
 Like the names, a child jail's
 .Va path
 appears relative to its creator's own
 .Va path .
 This is by virtue of the child jail being created in the chrooted
 environment of the first jail.
 .Sh SEE ALSO
 .Xr date 1 ,
 .Xr killall 1 ,
 .Xr lsvfs 1 ,
 .Xr newaliases 1 ,
 .Xr pgrep 1 ,
 .Xr pkill 1 ,
 .Xr ps 1 ,
 .Xr quota 1 ,
 .Xr adjtime 2 ,
 .Xr clock_settime 2 ,
 .Xr jail_set 2 ,
 .Xr ntp_adjtime 2 ,
 .Xr devfs 4 ,
 .Xr fdescfs 4 ,
 .Xr linprocfs 4 ,
 .Xr linsysfs 4 ,
 .Xr procfs 4 ,
 .Xr vmm 4 ,
 .Xr jail.conf 5 ,
 .Xr rc.conf 5 ,
 .Xr sysctl.conf 5 ,
 .Xr bsdconfig 8 ,
 .Xr chroot 8 ,
 .Xr devfs 8 ,
 .Xr halt 8 ,
 .Xr ifconfig 8 ,
 .Xr inetd 8 ,
 .Xr jexec 8 ,
 .Xr jls 8 ,
 .Xr mount 8 ,
 .Xr mountd 8 ,
 .Xr nfsd 8 ,
 .Xr ntpd 8 ,
 .Xr reboot 8 ,
 .Xr rpcbind 8 ,
 .Xr sendmail 8 ,
 .Xr shutdown 8 ,
 .Xr sysctl 8 ,
 .Xr syslogd 8 ,
 .Xr umount 8 ,
 .Xr zfs-jail 8 ,
 .Xr extattr 9
 .Sh HISTORY
 The
 .Nm
 utility appeared in
 .Fx 4.0 .
 Hierarchical/extensible jails were introduced in
 .Fx 8.0 .
 The configuration file was introduced in
 .Fx 9.1 .
 .Sh AUTHORS
 .An -nosplit
 The jail feature was written by
 .An Poul-Henning Kamp
 for R&D Associates
 who contributed it to
 .Fx .
 .Pp
 .An Robert Watson
 wrote the extended documentation, found a few bugs, added
 a few new features, and cleaned up the userland jail environment.
 .Pp
 .An Bjoern A. Zeeb
 added multi-IP jail support for IPv4 and IPv6 based on a patch
 originally done by
 .An Pawel Jakub Dawidek
 for IPv4.
 .Pp
 .An James Gritton
 added the extensible jail parameters, hierarchical jails,
 and the configuration file.
 .Sh BUGS
 It might be a good idea to add an
 address alias flag such that daemons listening on all IPs
 .Pq Dv INADDR_ANY
 will not bind on that address, which would facilitate building a safe
 host environment such that host daemons do not impose on services offered
 from within jails.
 Currently, the simplest answer is to minimize services
 offered on the host, possibly limiting it to services offered from
 .Xr inetd 8
 which is easily configurable.
 .Sh NOTES
 Great care should be taken when managing directories visible within the jail.
 For example, if a jailed process has its current working directory set to a
 directory that is moved out of the jail's chroot, then the process may gain
 access to the file space outside of the jail.
 It is recommended that directories always be copied, rather than moved, out
 of a jail.
 .Pp
 In addition, there are several ways in which an unprivileged user
 outside the jail can cooperate with a privileged user inside the jail
 and thereby obtain elevated privileges in the host environment.
 Most of these attacks can be mitigated by ensuring that the jail root
 is not accessible to unprivileged users in the host environment.
 Regardless, as a general rule, untrusted users with privileged access
 to a jail should not be given access to the host environment.