diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index 65201eb12951..342af50462f2 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -1,4510 +1,4509 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1999 Poul-Henning Kamp. * Copyright (c) 2008 Bjoern A. Zeeb. * Copyright (c) 2009 James Gritton. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ #include #define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000" #define PRISON0_HOSTUUID_MODULE "hostuuid" MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures"); /* Keep struct prison prison0 and some code in kern_jail_set() readable. */ #ifdef INET #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL #endif #else /* !INET */ #ifdef INET6 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL #else #define _PR_IP_SADDRSEL 0 #endif #endif /* prison0 describes what is "real" about the system. */ struct prison prison0 = { .pr_id = 0, .pr_name = "0", .pr_ref = 1, .pr_uref = 1, .pr_path = "/", .pr_securelevel = -1, .pr_devfs_rsnum = 0, .pr_childmax = JAIL_MAX, .pr_hostuuid = DEFAULT_HOSTUUID, .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children), #ifdef VIMAGE .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL, #else .pr_flags = PR_HOST|_PR_IP_SADDRSEL, #endif .pr_allow = PR_ALLOW_ALL_STATIC, }; MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); struct bool_flags { const char *name; const char *noname; volatile u_int flag; }; struct jailsys_flags { const char *name; unsigned disable; unsigned new; }; /* allprison, allprison_racct and lastprid are protected by allprison_lock. 
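 * A typical reader walks the list with the lock held shared, e.g.:
 *
 *	sx_slock(&allprison_lock);
 *	TAILQ_FOREACH(pr, &allprison, pr_list)
 *		(examine pr)
 *	sx_sunlock(&allprison_lock);
 *
 * while creation and removal take the lock exclusive.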
*/ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); LIST_HEAD(, prison_racct) allprison_racct; int lastprid = 0; static int get_next_prid(struct prison **insprp); -static int do_jail_attach(struct thread *td, struct prison *pr); +static int do_jail_attach(struct thread *td, struct prison *pr, int drflags); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); +static int prison_lock_xlock(struct prison *pr, int flags); +static void prison_free_not_last(struct prison *pr); static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable); static char *prison_path(struct prison *pr1, struct prison *pr2); static void prison_remove_one(struct prison *pr); #ifdef RACCT static void prison_racct_attach(struct prison *pr); static void prison_racct_modify(struct prison *pr); static void prison_racct_detach(struct prison *pr); #endif /* Flags for prison_deref */ #define PD_DEREF 0x01 /* Decrement pr_ref */ #define PD_DEUREF 0x02 /* Decrement pr_uref */ #define PD_LOCKED 0x04 /* pr_mtx is held */ #define PD_LIST_SLOCKED 0x08 /* allprison_lock is held shared */ #define PD_LIST_XLOCKED 0x10 /* allprison_lock is held exclusive */ /* * Parameter names corresponding to PR_* flag values. Size values are for kvm * as we cannot figure out the size of a sparse array, or an array without a * terminating entry. */ static struct bool_flags pr_flag_bool[] = { {"persist", "nopersist", PR_PERSIST}, #ifdef INET {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL}, #endif #ifdef INET6 {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL}, #endif }; const size_t pr_flag_bool_size = sizeof(pr_flag_bool); static struct jailsys_flags pr_flag_jailsys[] = { {"host", 0, PR_HOST}, #ifdef VIMAGE {"vnet", 0, PR_VNET}, #endif #ifdef INET {"ip4", PR_IP4_USER, PR_IP4_USER}, #endif #ifdef INET6 {"ip6", PR_IP6_USER, PR_IP6_USER}, #endif }; const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys); /* * Make this array full-size so dynamic parameters can be added. * It is protected by prison0.mtx, but lockless reading is allowed * with an atomic check of the flag values. 
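 * A lockless reader stops at the first entry whose flag is zero, e.g.:
 *
 *	for (bf = pr_flag_allow;
 *	    bf < pr_flag_allow + nitems(pr_flag_allow) &&
 *	    atomic_load_int(&bf->flag) != 0;
 *	    bf++)
 *		(use bf->name / bf->noname)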
*/ static struct bool_flags pr_flag_allow[NBBY * NBPW] = { {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME}, {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC}, {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS}, {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS}, {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT}, {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS}, {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF}, {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK}, {"allow.reserved_ports", "allow.noreserved_ports", PR_ALLOW_RESERVED_PORTS}, {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF}, {"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug", PR_ALLOW_UNPRIV_DEBUG}, {"allow.suser", "allow.nosuser", PR_ALLOW_SUSER}, }; static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC; const size_t pr_flag_allow_size = sizeof(pr_flag_allow); #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | \ PR_ALLOW_RESERVED_PORTS | \ PR_ALLOW_UNPRIV_DEBUG | \ PR_ALLOW_SUSER) #define JAIL_DEFAULT_ENFORCE_STATFS 2 #define JAIL_DEFAULT_DEVFS_RSNUM 0 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS; static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM; #if defined(INET) || defined(INET6) static unsigned jail_max_af_ips = 255; #endif /* * Initialize the parts of prison0 that can't be static-initialized with * constants. This is called from proc0_init() after creating thread0 cpuset. */ void prison0_init(void) { uint8_t *file, *data; size_t size; prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset); prison0.pr_osreldate = osreldate; strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease)); /* If we have a preloaded hostuuid, use it. */ file = preload_search_by_type(PRISON0_HOSTUUID_MODULE); if (file != NULL) { data = preload_fetch_addr(file); size = preload_fetch_size(file); if (data != NULL) { /* * The preloaded data may include trailing whitespace, almost * certainly a newline; skip over any whitespace or * non-printable characters to be safe. */ while (size > 0 && data[size - 1] <= 0x20) { data[size--] = '\0'; } if (validate_uuid(data, size, NULL, 0) == 0) { (void)strlcpy(prison0.pr_hostuuid, data, size + 1); } else if (bootverbose) { printf("hostuuid: preload data malformed: '%s'", data); } } } if (bootverbose) printf("hostuuid: using %s\n", prison0.pr_hostuuid); } /* * struct jail_args { * struct jail *jail; * }; */ int sys_jail(struct thread *td, struct jail_args *uap) { uint32_t version; int error; struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) return (error); switch (version) { case 0: { struct jail_v0 j0; /* FreeBSD single IPv4 jails. */ bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); j.version = j0.version; j.path = j0.path; j.hostname = j0.hostname; j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */ break; } case 1: /* * Version 1 was used by multi-IPv4 jail implementations * that never made it into the official kernel. */ return (EINVAL); case 2: /* JAIL_API_VERSION */ /* FreeBSD multi-IPv4/IPv6,noIP jails. */ error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); break; default: /* Sci-Fi jails are not supported, sorry. 
*/ return (EINVAL); } return (kern_jail(td, &j)); } int kern_jail(struct thread *td, struct jail *j) { struct iovec optiov[2 * (4 + nitems(pr_flag_allow) #ifdef INET + 1 #endif #ifdef INET6 + 1 #endif )]; struct uio opt; char *u_path, *u_hostname, *u_name; struct bool_flags *bf; #ifdef INET uint32_t ip4s; struct in_addr *u_ip4; #endif #ifdef INET6 struct in6_addr *u_ip6; #endif size_t tmplen; int error, enforce_statfs; bzero(&optiov, sizeof(optiov)); opt.uio_iov = optiov; opt.uio_iovcnt = 0; opt.uio_offset = -1; opt.uio_resid = -1; opt.uio_segflg = UIO_SYSSPACE; opt.uio_rw = UIO_READ; opt.uio_td = td; /* Set permissions for top-level jails from sysctls. */ if (!jailed(td->td_ucred)) { for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { optiov[opt.uio_iovcnt].iov_base = __DECONST(char *, (jail_default_allow & bf->flag) ? bf->name : bf->noname); optiov[opt.uio_iovcnt].iov_len = strlen(optiov[opt.uio_iovcnt].iov_base) + 1; opt.uio_iovcnt += 2; } optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); opt.uio_iovcnt++; enforce_statfs = jail_default_enforce_statfs; optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); opt.uio_iovcnt++; } tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; #ifdef INET ip4s = (j->version == 0) ? 1 : j->ip4s; if (ip4s > jail_max_af_ips) return (EINVAL); tmplen += ip4s * sizeof(struct in_addr); #else if (j->ip4s > 0) return (EINVAL); #endif #ifdef INET6 if (j->ip6s > jail_max_af_ips) return (EINVAL); tmplen += j->ip6s * sizeof(struct in6_addr); #else if (j->ip6s > 0) return (EINVAL); #endif u_path = malloc(tmplen, M_TEMP, M_WAITOK); u_hostname = u_path + MAXPATHLEN; u_name = u_hostname + MAXHOSTNAMELEN; #ifdef INET u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); #endif #ifdef INET6 #ifdef INET u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); #else u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); #endif #endif optiov[opt.uio_iovcnt].iov_base = "path"; optiov[opt.uio_iovcnt].iov_len = sizeof("path"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_path; error = copyinstr(j->path, u_path, MAXPATHLEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = "host.hostname"; optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_hostname; error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; if (j->jailname != NULL) { optiov[opt.uio_iovcnt].iov_base = "name"; optiov[opt.uio_iovcnt].iov_len = sizeof("name"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_name; error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; } #ifdef INET optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip4; optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); if (j->version == 0) u_ip4->s_addr = j->ip4s; else { error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } } opt.uio_iovcnt++; #endif #ifdef INET6 optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; optiov[opt.uio_iovcnt].iov_len = 
sizeof("ip6.addr"); opt.uio_iovcnt++; optiov[opt.uio_iovcnt].iov_base = u_ip6; optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); if (error) { free(u_path, M_TEMP); return (error); } opt.uio_iovcnt++; #endif KASSERT(opt.uio_iovcnt <= nitems(optiov), ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } /* * struct jail_set_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_set(struct thread *td, struct jail_set_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_set(td, auio, uap->flags); free(auio, M_IOV); return (error); } int kern_jail_set(struct thread *td, struct uio *optuio, int flags) { struct nameidata nd; #ifdef INET struct in_addr *ip4; #endif #ifdef INET6 struct in6_addr *ip6; #endif struct vfsopt *opt; struct vfsoptlist *opts; struct prison *pr, *deadpr, *inspr, *mypr, *ppr, *tpr; struct vnode *root; char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid; char *g_path, *osrelstr; struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) struct prison *tppr; void *op; #endif unsigned long hid; size_t namelen, onamelen, pnamelen; int born, created, cuflags, descend, drflags, enforce; int error, errmsg_len, errmsg_pos; int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel; int jid, jsys, len, level; int childmax, osreldt, rsnum, slevel; #if defined(INET) || defined(INET6) int ii, ij; #endif #ifdef INET int ip4s, redo_ip4; #endif #ifdef INET6 int ip6s, redo_ip6; #endif uint64_t pr_allow, ch_allow, pr_flags, ch_flags; uint64_t pr_allow_diff; unsigned tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); if (!error && (flags & JAIL_ATTACH)) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); mypr = td->td_ucred->cr_prison; if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0) return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); /* * Check all the parameters before committing to anything. Not all * errors can be caught early, but we may as well try. Also, this * takes care of some expensive stuff (path lookup) before getting * the allprison lock. * * XXX Jails are not filesystems, and jail parameters are not mount * options. But it makes more sense to re-use the vfsopt code * than duplicate it under a different name. 
*/ error = vfs_buildopts(optuio, &opts); if (error) return (error); #ifdef INET ip4 = NULL; #endif #ifdef INET6 ip6 = NULL; #endif g_path = NULL; cuflags = flags & (JAIL_CREATE | JAIL_UPDATE); if (!cuflags) { error = EINVAL; vfs_opterror(opts, "no valid operation (create or update)"); goto done_errmsg; } error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; else if (error != 0) goto done_free; error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel)); if (error == ENOENT) gotslevel = 0; else if (error != 0) goto done_free; else gotslevel = 1; error = vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax)); if (error == ENOENT) gotchildmax = 0; else if (error != 0) goto done_free; else gotchildmax = 1; error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); if (error == ENOENT) gotenforce = 0; else if (error != 0) goto done_free; else if (enforce < 0 || enforce > 2) { error = EINVAL; goto done_free; } else gotenforce = 1; error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum)); if (error == ENOENT) gotrsnum = 0; else if (error != 0) goto done_free; else gotrsnum = 1; pr_flags = ch_flags = 0; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { vfs_flagopt(opts, bf->name, &pr_flags, bf->flag); vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag); } ch_flags |= pr_flags; for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys)); if (error == ENOENT) continue; if (error != 0) goto done_free; switch (jsys) { case JAIL_SYS_DISABLE: if (!jsf->disable) { error = EINVAL; goto done_free; } pr_flags |= jsf->disable; break; case JAIL_SYS_NEW: pr_flags |= jsf->new; break; case JAIL_SYS_INHERIT: break; default: error = EINVAL; goto done_free; } ch_flags |= jsf->new | jsf->disable; } if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { error = EINVAL; vfs_opterror(opts, "new jail must persist or attach"); goto done_errmsg; } #ifdef VIMAGE if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) { error = EINVAL; vfs_opterror(opts, "vnet cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) { error = EINVAL; vfs_opterror(opts, "ip4 cannot be changed after creation"); goto done_errmsg; } #endif #ifdef INET6 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) { error = EINVAL; vfs_opterror(opts, "ip6 cannot be changed after creation"); goto done_errmsg; } #endif pr_allow = ch_allow = 0; for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { vfs_flagopt(opts, bf->name, &pr_allow, bf->flag); vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag); } ch_allow |= pr_allow; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; else if (error != 0) goto done_free; else { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostname", (void **)&host, &len); if (error == ENOENT) host = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || host[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len); if (error == ENOENT) domain = 
NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || domain[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > MAXHOSTNAMELEN) { error = ENAMETOOLONG; goto done_free; } } error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len); if (error == ENOENT) uuid = NULL; else if (error != 0) goto done_free; else { ch_flags |= PR_HOST; pr_flags |= PR_HOST; if (len == 0 || uuid[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len > HOSTUUIDLEN) { error = ENAMETOOLONG; goto done_free; } } #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32; error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32)); hid = hid32; } else #endif error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid)); if (error == ENOENT) gothid = 0; else if (error != 0) goto done_free; else { gothid = 1; ch_flags |= PR_HOST; pr_flags |= PR_HOST; } #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) ip4s = 0; else if (error != 0) goto done_free; else if (ip4s & (sizeof(*ip4) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP4_USER; pr_flags |= PR_IP4_USER; if (ip4s > 0) { ip4s /= sizeof(*ip4); if (ip4s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv4 addresses"); goto done_errmsg; } ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); bcopy(op, ip4, ip4s * sizeof(*ip4)); /* * IP addresses are all sorted but ip[0] to preserve * the primary IP address as given from userland. * This special IP is used for unbound outgoing * connections as well for "loopback" traffic in case * source address selection cannot find any more fitting * address to connect from. */ if (ip4s > 1) qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), prison_qcmp_v4); /* * Check for duplicate addresses and do some simple * zero and broadcast checks. If users give other bogus * addresses it is their problem. * * We do not have to care about byte order for these * checks so we will do them in NBO. 
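 * E.g. user input { 10.0.0.2, 10.0.0.3, 10.0.0.2 } is stored as
 * { 10.0.0.2, 10.0.0.2, 10.0.0.3 } once the tail is sorted, so the
 * duplicate shows up in the adjacent-entry test on the first pass.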
*/ for (ii = 0; ii < ip4s; ii++) { if (ip4[ii].s_addr == INADDR_ANY || ip4[ii].s_addr == INADDR_BROADCAST) { error = EINVAL; goto done_free; } if ((ii+1) < ip4s && (ip4[0].s_addr == ip4[ii+1].s_addr || ip4[ii].s_addr == ip4[ii+1].s_addr)) { error = EINVAL; goto done_free; } } } } #endif #ifdef INET6 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s); if (error == ENOENT) ip6s = 0; else if (error != 0) goto done_free; else if (ip6s & (sizeof(*ip6) - 1)) { error = EINVAL; goto done_free; } else { ch_flags |= PR_IP6_USER; pr_flags |= PR_IP6_USER; if (ip6s > 0) { ip6s /= sizeof(*ip6); if (ip6s > jail_max_af_ips) { error = EINVAL; vfs_opterror(opts, "too many IPv6 addresses"); goto done_errmsg; } ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); bcopy(op, ip6, ip6s * sizeof(*ip6)); if (ip6s > 1) qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), prison_qcmp_v6); for (ii = 0; ii < ip6s; ii++) { if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { error = EINVAL; goto done_free; } if ((ii+1) < ip6s && (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) { error = EINVAL; goto done_free; } } } } #endif #if defined(VIMAGE) && (defined(INET) || defined(INET6)) if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) { error = EINVAL; vfs_opterror(opts, "vnet jails cannot have IP address restrictions"); goto done_errmsg; } #endif error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len); if (error == ENOENT) osrelstr = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osrelease cannot be changed after creation"); goto done_errmsg; } if (len == 0 || osrelstr[len - 1] != '\0') { error = EINVAL; goto done_free; } if (len >= OSRELEASELEN) { error = ENAMETOOLONG; vfs_opterror(opts, "osrelease string must be 1-%d bytes long", OSRELEASELEN - 1); goto done_errmsg; } } error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt)); if (error == ENOENT) osreldt = 0; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be changed after creation"); goto done_errmsg; } if (osreldt == 0) { error = EINVAL; vfs_opterror(opts, "osreldate cannot be 0"); goto done_errmsg; } } root = NULL; error = vfs_getopt(opts, "path", (void **)&path, &len); if (error == ENOENT) path = NULL; else if (error != 0) goto done_free; else { if (flags & JAIL_UPDATE) { error = EINVAL; vfs_opterror(opts, "path cannot be changed after creation"); goto done_errmsg; } if (len == 0 || path[len - 1] != '\0') { error = EINVAL; goto done_free; } NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path, td); error = namei(&nd); if (error) goto done_free; root = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); strlcpy(g_path, path, MAXPATHLEN); error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN); if (error == 0) { path = g_path; } else { /* exit on other errors */ goto done_free; } if (root->v_type != VDIR) { error = ENOTDIR; vput(root); goto done_free; } VOP_UNLOCK(root); } /* * Find the specified jail, or at least its parent. * This abuses the file error codes ENOENT and EEXIST. */ pr = NULL; ppr = mypr; inspr = NULL; if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) { namelc = strrchr(name, '.'); jid = strtoul(namelc != NULL ? 
namelc + 1 : name, &p, 10); if (*p != '\0') jid = 0; } sx_xlock(&allprison_lock); drflags = PD_LIST_XLOCKED; if (jid != 0) { if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); goto done_deref; } /* * See if a requested jid already exists. Keep track of * where it can be inserted later. */ TAILQ_FOREACH(inspr, &allprison, pr_list) { - if (inspr->pr_id == jid) { - mtx_lock(&inspr->pr_mtx); - if (prison_isvalid(inspr)) { - pr = inspr; - drflags |= PD_LOCKED; - inspr = NULL; - } else - mtx_unlock(&inspr->pr_mtx); - break; - } + if (inspr->pr_id < jid) + continue; if (inspr->pr_id > jid) break; + pr = inspr; + mtx_lock(&pr->pr_mtx); + drflags |= PD_LOCKED; + inspr = NULL; + break; } if (pr != NULL) { ppr = pr->pr_parent; /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { /* * Even creators that cannot see the jail will * get EEXIST. */ error = EEXIST; vfs_opterror(opts, "jail %d already exists", jid); goto done_deref; } if (!prison_ischild(mypr, pr)) { /* * Updaters get ENOENT if they cannot see the * jail. This is true even for CREATE | UPDATE, * which normally cannot give this error. */ error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_deref; - } else if (!prison_isalive(pr)) { + } + if (!prison_isalive(pr)) { if (!(flags & JAIL_DYING)) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done_deref; - } else if ((flags & JAIL_ATTACH) || + } + if ((flags & JAIL_ATTACH) || (pr_flags & PR_PERSIST)) { /* * A dying jail might be resurrected * (via attach or persist), but first * it must determine if another jail * has claimed its name. Accomplish * this by implicitly re-setting the * name. */ if (name == NULL) name = prison_name(mypr, pr); } } } else { /* Update: jid must exist. */ if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done_deref; } } } /* * If the caller provided a name, look for a jail by that name. * This has different semantics for creates and updates keyed by jid * (where the name must not already exist in a different jail), * and updates keyed by the name itself (where the name must exist * because that is the jail being updated). */ namelc = NULL; if (name != NULL) { namelc = strrchr(name, '.'); if (namelc == NULL) namelc = name; else { /* * This is a hierarchical name. Split it into the * parent and child names, and make sure the parent * exists or matches an already found jail. */ if (pr != NULL) { if (strncmp(name, ppr->pr_name, namelc - name) || ppr->pr_name[namelc - name] != '\0') { error = EINVAL; vfs_opterror(opts, "cannot change jail's parent"); goto done_deref; } } else { *namelc = '\0'; ppr = prison_find_name(mypr, name); if (ppr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } if (!(flags & JAIL_DYING) && !prison_isalive(ppr)) { mtx_unlock(&ppr->pr_mtx); error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_deref; } mtx_unlock(&ppr->pr_mtx); *namelc = '.'; } namelc++; } if (namelc[0] != '\0') { pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; - name_again: deadpr = NULL; FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && !strcmp(tpr->pr_name + pnamelen, namelc)) { - mtx_lock(&tpr->pr_mtx); if (prison_isalive(tpr)) { if (pr == NULL && cuflags != JAIL_CREATE) { /* * Use this jail * for updates. */ pr = tpr; + mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; break; } /* * Create, or update(jid): * name must not exist in an * active sibling jail. 
*/ error = EEXIST; - mtx_unlock(&tpr->pr_mtx); vfs_opterror(opts, "jail \"%s\" already exists", name); goto done_deref; } if (pr == NULL && - cuflags != JAIL_CREATE && - prison_isvalid(tpr)) + cuflags != JAIL_CREATE) { deadpr = tpr; - mtx_unlock(&tpr->pr_mtx); + } } } /* If no active jail is found, use a dying one. */ if (deadpr != NULL && pr == NULL) { if (flags & JAIL_DYING) { - mtx_lock(&deadpr->pr_mtx); - if (!prison_isvalid(deadpr)) { - mtx_unlock(&deadpr->pr_mtx); - goto name_again; - } pr = deadpr; + mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; } else if (cuflags == JAIL_UPDATE) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done_deref; } } /* Update: name must exist if no jid. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done_deref; } } } /* Update: must provide a jid or name. */ else if (cuflags == JAIL_UPDATE && pr == NULL) { error = ENOENT; vfs_opterror(opts, "update specified no jail"); goto done_deref; } /* If there's no prison to update, create a new one and link it in. */ created = pr == NULL; if (created) { for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent) if (tpr->pr_childcount >= tpr->pr_childmax) { error = EPERM; vfs_opterror(opts, "prison limit exceeded"); goto done_deref; } - mtx_lock(&ppr->pr_mtx); - if (!prison_isvalid(ppr)) { - mtx_unlock(&ppr->pr_mtx); - error = ENOENT; - vfs_opterror(opts, "jail \"%s\" not found", - prison_name(mypr, ppr)); - goto done_deref; - } prison_hold(ppr); - if (refcount_acquire(&ppr->pr_uref)) - mtx_unlock(&ppr->pr_mtx); - else { + if (!refcount_acquire_if_not_zero(&ppr->pr_uref)) { /* This brings the parent back to life. */ + mtx_lock(&ppr->pr_mtx); + refcount_acquire(&ppr->pr_uref); mtx_unlock(&ppr->pr_mtx); error = osd_jail_call(ppr, PR_METHOD_CREATE, opts); if (error) { pr = ppr; drflags |= PD_DEREF | PD_DEUREF; goto done_deref; } - } + } if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) { error = EAGAIN; vfs_opterror(opts, "no available jail IDs"); pr = ppr; drflags |= PD_DEREF | PD_DEUREF; goto done_deref; } pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); + refcount_init(&pr->pr_ref, 0); + refcount_init(&pr->pr_uref, 0); LIST_INIT(&pr->pr_children); mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); TASK_INIT(&pr->pr_task, 0, prison_complete, pr); pr->pr_id = jid; if (inspr != NULL) TAILQ_INSERT_BEFORE(inspr, pr, pr_list); else TAILQ_INSERT_TAIL(&allprison, pr, pr_list); pr->pr_parent = ppr; LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) tpr->pr_childcount++; /* Set some default values, and inherit some from the parent. 
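 * Inherited from the parent: securelevel, the allow mask (ANDed
 * with JAIL_DEFAULT_ALLOW), the devfs ruleset, osrelease/osreldate,
 * and, unless ip4/ip6 are given here, the address lists.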
 */
		if (namelc == NULL)
			namelc = "";
		if (path == NULL) {
			path = "/";
			root = mypr->pr_root;
			vref(root);
		}
		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
		pr->pr_flags |= PR_HOST;
#if defined(INET) || defined(INET6)
#ifdef VIMAGE
		if (!(pr_flags & PR_VNET))
#endif
		{
#ifdef INET
			if (!(ch_flags & PR_IP4_USER))
				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
			else if (!(pr_flags & PR_IP4_USER)) {
				pr->pr_flags |= ppr->pr_flags & PR_IP4;
				if (ppr->pr_ip4 != NULL) {
					pr->pr_ip4s = ppr->pr_ip4s;
					pr->pr_ip4 = malloc(pr->pr_ip4s *
					    sizeof(struct in_addr), M_PRISON,
					    M_WAITOK);
					bcopy(ppr->pr_ip4, pr->pr_ip4,
					    pr->pr_ip4s * sizeof(*pr->pr_ip4));
				}
			}
#endif
#ifdef INET6
			if (!(ch_flags & PR_IP6_USER))
				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
			else if (!(pr_flags & PR_IP6_USER)) {
				pr->pr_flags |= ppr->pr_flags & PR_IP6;
				if (ppr->pr_ip6 != NULL) {
					pr->pr_ip6s = ppr->pr_ip6s;
					pr->pr_ip6 = malloc(pr->pr_ip6s *
					    sizeof(struct in6_addr), M_PRISON,
					    M_WAITOK);
					bcopy(ppr->pr_ip6, pr->pr_ip6,
					    pr->pr_ip6s * sizeof(*pr->pr_ip6));
				}
			}
#endif
		}
#endif
		/* Source address selection is always on by default. */
		pr->pr_flags |= _PR_IP_SADDRSEL;

		pr->pr_securelevel = ppr->pr_securelevel;
		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
		pr->pr_enforce_statfs = jail_default_enforce_statfs;
		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;

		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
		if (osrelstr == NULL)
			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
			    sizeof(pr->pr_osrelease));
		else
			strlcpy(pr->pr_osrelease, osrelstr,
			    sizeof(pr->pr_osrelease));

#ifdef VIMAGE
		/* Allocate a new vnet if specified. */
		pr->pr_vnet = (pr_flags & PR_VNET) ? vnet_alloc() : ppr->pr_vnet;
#endif
		/*
		 * Allocate a dedicated cpuset for each jail.
		 * Unlike other initial settings, this may return an error.
		 */
		error = cpuset_create_root(ppr, &pr->pr_cpuset);
		if (error)
			goto done_deref;

		mtx_lock(&pr->pr_mtx);
		drflags |= PD_LOCKED;
		/*
		 * New prisons do not yet have a reference, because we do not
		 * want others to see the incomplete prison once the
		 * allprison_lock is downgraded.
		 */
	} else {
		/*
		 * Grab a reference for existing prisons, to ensure they
		 * continue to exist for the duration of the call.
		 */
		prison_hold(pr);
		drflags |= PD_DEREF;
#if defined(VIMAGE) && (defined(INET) || defined(INET6))
		if ((pr->pr_flags & PR_VNET) &&
		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
			error = EINVAL;
			vfs_opterror(opts,
			    "vnet jails cannot have IP address restrictions");
			goto done_deref;
		}
#endif
#ifdef INET
		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
			error = EINVAL;
			vfs_opterror(opts,
			    "ip4 cannot be changed after creation");
			goto done_deref;
		}
#endif
#ifdef INET6
		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
			error = EINVAL;
			vfs_opterror(opts,
			    "ip6 cannot be changed after creation");
			goto done_deref;
		}
#endif
	}

	/* Do final error checking before setting anything.
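	 * A child may only tighten inherited limits: e.g. a securelevel
	 * below the parent's, or children.max at or above the parent's,
	 * fails with EPERM.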
*/ if (gotslevel) { if (slevel < ppr->pr_securelevel) { error = EPERM; goto done_deref; } } if (gotchildmax) { if (childmax >= ppr->pr_childmax) { error = EPERM; goto done_deref; } } if (gotenforce) { if (enforce < ppr->pr_enforce_statfs) { error = EPERM; goto done_deref; } } if (gotrsnum) { /* * devfs_rsnum is a uint16_t */ if (rsnum < 0 || rsnum > 65535) { error = EINVAL; goto done_deref; } /* * Nested jails always inherit parent's devfs ruleset */ if (jailed(td->td_ucred)) { if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) { error = EPERM; goto done_deref; } else rsnum = ppr->pr_devfs_rsnum; } } #ifdef INET if (ip4s > 0) { if (ppr->pr_flags & PR_IP4) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. Don't worry * about the parent being unlocked, as any * setting is done with allprison_lock held. */ for (ij = 0; ij < ppr->pr_ip4s; ij++) if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref; } if (ip4s > 1) { for (ii = ij = 1; ii < ip4s; ii++) { if (ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) continue; for (; ij < ppr->pr_ip4s; ij++) if (ip4[ii].s_addr == ppr->pr_ip4[ij].s_addr) break; if (ij == ppr->pr_ip4s) break; } if (ij == ppr->pr_ip4s) { error = EPERM; goto done_deref; } } } /* * Check for conflicting IP addresses. We permit them * if there is no more than one IP on each jail. If * there is a duplicate on a jail with more than one * IP stop checking and return error. */ #ifdef VIMAGE for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #else tppr = &prison0; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif - refcount_load(&tpr->pr_uref) == 0) { + !prison_isalive(tpr)) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP4_USER)) continue; descend = 0; if (tpr->pr_ip4 == NULL || (ip4s == 1 && tpr->pr_ip4s == 1)) continue; for (ii = 0; ii < ip4s; ii++) { if (prison_check_ip4_locked(tpr, &ip4[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref; } } } } #endif #ifdef INET6 if (ip6s > 0) { if (ppr->pr_flags & PR_IP6) { /* * Make sure the new set of IP addresses is a * subset of the parent's list. */ for (ij = 0; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL(&ip6[0], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref; } if (ip6s > 1) { for (ii = ij = 1; ii < ip6s; ii++) { if (IN6_ARE_ADDR_EQUAL(&ip6[ii], &ppr->pr_ip6[0])) continue; for (; ij < ppr->pr_ip6s; ij++) if (IN6_ARE_ADDR_EQUAL( &ip6[ii], &ppr->pr_ip6[ij])) break; if (ij == ppr->pr_ip6s) break; } if (ij == ppr->pr_ip6s) { error = EPERM; goto done_deref; } } } /* Check for conflicting IP addresses. */ #ifdef VIMAGE for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent) if (tppr->pr_flags & PR_VNET) break; #else tppr = &prison0; #endif FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) { if (tpr == pr || #ifdef VIMAGE (tpr != tppr && (tpr->pr_flags & PR_VNET)) || #endif - refcount_load(&tpr->pr_uref) == 0) { + !prison_isalive(tpr)) { descend = 0; continue; } if (!(tpr->pr_flags & PR_IP6_USER)) continue; descend = 0; if (tpr->pr_ip6 == NULL || (ip6s == 1 && tpr->pr_ip6s == 1)) continue; for (ii = 0; ii < ip6s; ii++) { if (prison_check_ip6_locked(tpr, &ip6[ii]) == 0) { error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref; } } } } #endif onamelen = namelen = 0; if (namelc != NULL) { /* Give a default name of the jid. 
Also allow the name to be * explicitly the jid - but not any other number, and only in * normal form (no leading zero/etc). */ if (namelc[0] == '\0') snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid); else if ((strtoul(namelc, &p, 10) != jid || namelc[0] < '1' || namelc[0] > '9') && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric (unless it is the jid)"); goto done_deref; } /* * Make sure the name isn't too long for the prison or its * children. */ pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; onamelen = strlen(pr->pr_name + pnamelen); namelen = strlen(namelc); if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { if (strlen(tpr->pr_name) + (namelen - onamelen) >= sizeof(pr->pr_name)) { error = ENAMETOOLONG; goto done_deref; } } } pr_allow_diff = pr_allow & ~ppr->pr_allow; if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) { error = EPERM; goto done_deref; } /* * Let modules check their parameters. This requires unlocking and * then re-locking the prison, but this is still a valid state as long * as allprison_lock remains xlocked. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_CHECK, opts); if (error != 0) goto done_deref; mtx_lock(&pr->pr_mtx); drflags |= PD_LOCKED; /* At this point, all valid parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done_deref; } } /* Set the parameters of the prison. */ #ifdef INET redo_ip4 = 0; if (pr_flags & PR_IP4_USER) { pr->pr_flags |= PR_IP4; free(pr->pr_ip4, M_PRISON); pr->pr_ip4s = ip4s; pr->pr_ip4 = ip4; ip4 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, NULL)) { redo_ip4 = 1; descend = 0; } } } #endif #ifdef INET6 redo_ip6 = 0; if (pr_flags & PR_IP6_USER) { pr->pr_flags |= PR_IP6; free(pr->pr_ip6, M_PRISON); pr->pr_ip6s = ip6s; pr->pr_ip6 = ip6; ip6 = NULL; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, NULL)) { redo_ip6 = 1; descend = 0; } } } #endif if (gotslevel) { pr->pr_securelevel = slevel; /* Set all child jails to be at least this level. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_securelevel < slevel) tpr->pr_securelevel = slevel; } if (gotchildmax) { pr->pr_childmax = childmax; /* Set all child jails to under this limit. */ FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level) if (tpr->pr_childmax > childmax - level) tpr->pr_childmax = childmax > level ? childmax - level : 0; } if (gotenforce) { pr->pr_enforce_statfs = enforce; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) if (tpr->pr_enforce_statfs < enforce) tpr->pr_enforce_statfs = enforce; } if (gotrsnum) { pr->pr_devfs_rsnum = rsnum; /* Pass this restriction on to the children. */ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) tpr->pr_devfs_rsnum = rsnum; } if (namelc != NULL) { if (ppr == &prison0) strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name)); else snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", ppr->pr_name, namelc); /* Change this component of child names. 
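		 * E.g. renaming "foo" to "quux" rewrites a descendant
		 * named "foo.bar" to "quux.bar" in place.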
*/ FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, strlen(tpr->pr_name + onamelen) + 1); bcopy(pr->pr_name, tpr->pr_name, namelen); } } if (path != NULL) { /* Try to keep a real-rooted full pathname. */ strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; root = NULL; } if (PR_HOST & ch_flags & ~pr_flags) { if (pr->pr_flags & PR_HOST) { /* * Copy the parent's host info. As with pr_ip4 above, * the lack of a lock on the parent is not a problem; * it is always set with allprison_lock at least * shared, and is held exclusively here. */ strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname, sizeof(pr->pr_hostname)); strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname, sizeof(pr->pr_domainname)); strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid, sizeof(pr->pr_hostuuid)); pr->pr_hostid = pr->pr_parent->pr_hostid; } } else if (host != NULL || domain != NULL || uuid != NULL || gothid) { /* Set this prison, and any descendants without PR_HOST. */ if (host != NULL) strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname)); if (domain != NULL) strlcpy(pr->pr_domainname, domain, sizeof(pr->pr_domainname)); if (uuid != NULL) strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid)); if (gothid) pr->pr_hostid = hid; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { if (tpr->pr_flags & PR_HOST) descend = 0; else { if (host != NULL) strlcpy(tpr->pr_hostname, pr->pr_hostname, sizeof(tpr->pr_hostname)); if (domain != NULL) strlcpy(tpr->pr_domainname, pr->pr_domainname, sizeof(tpr->pr_domainname)); if (uuid != NULL) strlcpy(tpr->pr_hostuuid, pr->pr_hostuuid, sizeof(tpr->pr_hostuuid)); if (gothid) tpr->pr_hostid = hid; } } } pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; if ((tallow = ch_allow & ~pr_allow)) prison_set_allow_locked(pr, tallow, 0); /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. Only do this for existing prisons * for now, so new ones will remain unseen until after the module * handlers have completed. */ born = !prison_isalive(pr); if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) { if (pr_flags & PR_PERSIST) { prison_hold(pr); refcount_acquire(&pr->pr_uref); } else { - refcount_release(&pr->pr_ref); drflags |= PD_DEUREF; + prison_free_not_last(pr); } } pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; pr->pr_flags &= ~PR_REMOVE; mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; #ifdef RACCT if (racct_enable && created) prison_racct_attach(pr); #endif /* Locks may have prevented a complete restriction of child IP * addresses. If so, allocate some more memory and try again. */ #ifdef INET while (redo_ip4) { ip4s = pr->pr_ip4s; ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip4 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip4(tpr, ip4)) { if (ip4 != NULL) ip4 = NULL; else redo_ip4 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif #ifdef INET6 while (redo_ip6) { ip6s = pr->pr_ip6s; ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); mtx_lock(&pr->pr_mtx); redo_ip6 = 0; FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { #ifdef VIMAGE if (tpr->pr_flags & PR_VNET) { descend = 0; continue; } #endif if (prison_restrict_ip6(tpr, ip6)) { if (ip6 != NULL) ip6 = NULL; else redo_ip6 = 1; } } mtx_unlock(&pr->pr_mtx); } #endif /* Let the modules do their work. 
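	 * Newly born jails get PR_METHOD_CREATE followed by PR_METHOD_SET;
	 * if either call fails, PR_METHOD_REMOVE runs so modules can undo
	 * the create.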
*/ - sx_downgrade(&allprison_lock); - drflags = (drflags & ~PD_LIST_XLOCKED) | PD_LIST_SLOCKED; if (born) { error = osd_jail_call(pr, PR_METHOD_CREATE, opts); if (error) { (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); goto done_deref; } } error = osd_jail_call(pr, PR_METHOD_SET, opts); if (error) { if (born) (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); goto done_deref; } /* Attach this process to the prison if requested. */ if (flags & JAIL_ATTACH) { - mtx_lock(&pr->pr_mtx); - error = do_jail_attach(td, pr); - drflags &= ~PD_LIST_SLOCKED; + error = do_jail_attach(td, pr, prison_lock_xlock(pr, drflags)); + drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); if (error) { if (created) { /* do_jail_attach has removed the prison. */ pr = NULL; } vfs_opterror(opts, "attach failed"); goto done_deref; } } #ifdef RACCT if (racct_enable && !created) { - if (drflags & PD_LIST_SLOCKED) { - sx_sunlock(&allprison_lock); - drflags &= ~PD_LIST_SLOCKED; + if (drflags & PD_LIST_XLOCKED) { + sx_xunlock(&allprison_lock); + drflags &= ~PD_LIST_XLOCKED; } prison_racct_modify(pr); } #endif td->td_retval[0] = pr->pr_id; if (created) { /* * Add a reference to newly created persistent prisons * (which was not done earlier so that the prison would * not be publicly visible). */ if (pr_flags & PR_PERSIST) { - mtx_lock(&pr->pr_mtx); - drflags |= PD_LOCKED; + drflags = prison_lock_xlock(pr, drflags); refcount_acquire(&pr->pr_ref); refcount_acquire(&pr->pr_uref); } else { /* Non-persistent jails need no further changes. */ pr = NULL; } } done_deref: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (drflags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); if (root != NULL) vrele(root); done_errmsg: if (error) { /* Write the error message back to userspace. */ if (vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len) == 0 && errmsg_len > 0) { errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1; if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } done_free: #ifdef INET free(ip4, M_PRISON); #endif #ifdef INET6 free(ip6, M_PRISON); #endif if (g_path != NULL) free(g_path, M_TEMP); vfs_freeopts(opts); return (error); } /* * Find the next available prison ID. Return the ID on success, or zero * on failure. Also set a pointer to the allprison list entry the prison * should be inserted before. */ static int get_next_prid(struct prison **insprp) { struct prison *inspr; int jid, maxid; jid = lastprid % JAIL_MAX + 1; if (TAILQ_EMPTY(&allprison) || TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) { /* * A common case is for all jails to be implicitly numbered, * which means they'll go on the end of the list, at least * for the first JAIL_MAX times. */ inspr = NULL; } else { /* * Take two passes through the allprison list: first starting * with the proposed jid, then ending with it. */ for (maxid = JAIL_MAX; maxid != 0; ) { TAILQ_FOREACH(inspr, &allprison, pr_list) { if (inspr->pr_id < jid) continue; - if (inspr->pr_id > jid || - refcount_load(&inspr->pr_ref) == 0) { - /* - * Found an opening. This may be a gap - * in the list, or a dead jail with the - * same ID. - */ + if (inspr->pr_id > jid) { + /* Found an opening. 
*/ maxid = 0; break; } if (++jid > maxid) { if (lastprid == maxid || lastprid == 0) { /* * The entire legal range * has been traversed */ return 0; } /* Try again from the start. */ jid = 1; maxid = lastprid; break; } } if (inspr == NULL) { /* Found room at the end of the list. */ break; } } } *insprp = inspr; lastprid = jid; return (jid); } /* * struct jail_get_args { * struct iovec *iovp; * unsigned int iovcnt; * int flags; * }; */ int sys_jail_get(struct thread *td, struct jail_get_args *uap) { struct uio *auio; int error; /* Check that we have an even number of iovecs. */ if (uap->iovcnt & 1) return (EINVAL); error = copyinuio(uap->iovp, uap->iovcnt, &auio); if (error) return (error); error = kern_jail_get(td, auio, uap->flags); if (error == 0) error = copyout(auio->uio_iov, uap->iovp, uap->iovcnt * sizeof (struct iovec)); free(auio, M_IOV); return (error); } int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { struct bool_flags *bf; struct jailsys_flags *jsf; struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos; unsigned f; if (flags & ~JAIL_GET_MASK) return (EINVAL); /* Get the parameter list. */ error = vfs_buildopts(optuio, &opts); if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); mypr = td->td_ucred->cr_prison; pr = NULL; /* * Find the prison specified by one of: lastjid, jid, name. */ sx_slock(&allprison_lock); drflags = PD_LIST_SLOCKED; error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { - if (pr->pr_id > jid && prison_ischild(mypr, pr)) { + if (pr->pr_id > jid && + ((flags & JAIL_DYING) || prison_isalive(pr)) && + prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); - if ((flags & JAIL_DYING) - ? prison_isvalid(pr) : prison_isalive(pr)) - break; - mtx_unlock(&pr->pr_mtx); + drflags |= PD_LOCKED; + goto found_prison; } } - if (pr != NULL) { - drflags |= PD_LOCKED; - goto found_prison; - } error = ENOENT; vfs_opterror(opts, "no jail after %d", jid); goto done; } else if (error != ENOENT) goto done; error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { pr = prison_find_child(mypr, jid); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail %d is dying", jid); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail %d not found", jid); goto done; } } else if (error != ENOENT) goto done; error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == 0) { if (len == 0 || name[len - 1] != '\0') { error = EINVAL; goto done; } pr = prison_find_name(mypr, name); if (pr != NULL) { drflags |= PD_LOCKED; if (!(prison_isalive(pr) || (flags & JAIL_DYING))) { error = ENOENT; vfs_opterror(opts, "jail \"%s\" is dying", name); goto done; } goto found_prison; } error = ENOENT; vfs_opterror(opts, "jail \"%s\" not found", name); goto done; } else if (error != ENOENT) goto done; vfs_opterror(opts, "no jail specified"); error = ENOENT; goto done; found_prison: /* Get the parameters of the prison. */ prison_hold(pr); drflags |= PD_DEREF; td->td_retval[0] = pr->pr_id; error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done; i = (pr->pr_parent == mypr) ? 
0 : pr->pr_parent->pr_id; error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "name", prison_name(mypr, pr)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done; #ifdef INET error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4, pr->pr_ip4s * sizeof(*pr->pr_ip4)); if (error != 0 && error != ENOENT) goto done; #endif #ifdef INET6 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6, pr->pr_ip6s * sizeof(*pr->pr_ip6)); if (error != 0 && error != ENOENT) goto done; #endif error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel, sizeof(pr->pr_securelevel)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.cur", &pr->pr_childcount, sizeof(pr->pr_childcount)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "children.max", &pr->pr_childmax, sizeof(pr->pr_childmax)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostname", pr->pr_hostname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.domainname", pr->pr_domainname); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid); if (error != 0 && error != ENOENT) goto done; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { uint32_t hid32 = pr->pr_hostid; error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32)); } else #endif error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid, sizeof(pr->pr_hostid)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum, sizeof(pr->pr_devfs_rsnum)); if (error != 0 && error != ENOENT) goto done; for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) { i = (pr->pr_flags & bf->flag) ? 1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE : (f == jsf->new) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, jsf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { i = (pr->pr_allow & bf->flag) ? 
1 : 0; error = vfs_setopt(opts, bf->name, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, bf->noname, &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; } i = !prison_isalive(pr); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; i = !i; error = vfs_setopt(opts, "nodying", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate, sizeof(pr->pr_osreldate)); if (error != 0 && error != ENOENT) goto done; error = vfs_setopts(opts, "osrelease", pr->pr_osrelease); if (error != 0 && error != ENOENT) goto done; /* Get the module parameters. */ mtx_unlock(&pr->pr_mtx); drflags &= ~PD_LOCKED; error = osd_jail_call(pr, PR_METHOD_GET, opts); if (error) goto done; prison_deref(pr, drflags); pr = NULL; drflags = 0; /* By now, all parameters should have been noted. */ TAILQ_FOREACH(opt, opts, link) { if (!opt->seen && strcmp(opt->name, "errmsg")) { error = EINVAL; vfs_opterror(opts, "unknown parameter: %s", opt->name); goto done; } } /* Write the fetched parameters back to userspace. */ error = 0; TAILQ_FOREACH(opt, opts, link) { if (opt->pos >= 0 && opt->pos != errmsg_pos) { pos = 2 * opt->pos + 1; optuio->uio_iov[pos].iov_len = opt->len; if (opt->value != NULL) { if (optuio->uio_segflg == UIO_SYSSPACE) { bcopy(opt->value, optuio->uio_iov[pos].iov_base, opt->len); } else { error = copyout(opt->value, optuio->uio_iov[pos].iov_base, opt->len); if (error) break; } } } } done: /* Release any temporary prison holds and/or locks. */ if (pr != NULL) prison_deref(pr, drflags); else if (drflags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); if (error && errmsg_pos >= 0) { /* Write the error message back to userspace. */ vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len); errmsg_pos = 2 * errmsg_pos + 1; if (errmsg_len > 0) { if (optuio->uio_segflg == UIO_SYSSPACE) bcopy(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); else copyout(errmsg, optuio->uio_iov[errmsg_pos].iov_base, errmsg_len); } } vfs_freeopts(opts); return (error); } /* * struct jail_remove_args { * int jid; * }; */ int sys_jail_remove(struct thread *td, struct jail_remove_args *uap) { - struct prison *pr, *cpr, *lpr, *tpr; + struct prison *pr, *cpr, *lpr; int descend, error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } /* Remove all descendants of this prison, then remove this prison. */ prison_hold(pr); if (!LIST_EMPTY(&pr->pr_children)) { mtx_unlock(&pr->pr_mtx); lpr = NULL; FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { - mtx_lock(&cpr->pr_mtx); - if (prison_isvalid(cpr)) { - tpr = cpr; - prison_hold(cpr); - } else { - /* Already removed - do not do it again. */ - tpr = NULL; - } - mtx_unlock(&cpr->pr_mtx); + prison_hold(cpr); if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } - lpr = tpr; + lpr = cpr; } if (lpr != NULL) { mtx_lock(&lpr->pr_mtx); prison_remove_one(lpr); sx_xlock(&allprison_lock); } mtx_lock(&pr->pr_mtx); } prison_remove_one(pr); return (0); } static void prison_remove_one(struct prison *pr) { struct proc *p; int drflags; drflags = PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED; /* * Mark the prison as doomed, so it doesn't accidentally come back * to life. It may still be explicitly brought back by jail_set(2). 
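	 * PR_REMOVE is cleared again in kern_jail_set() should the jail
	 * be resurrected before its last reference goes away.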
*/ pr->pr_flags |= PR_REMOVE; /* If the prison was persistent, it is not anymore. */ if (pr->pr_flags & PR_PERSIST) { - refcount_release(&pr->pr_ref); drflags |= PD_DEUREF; + prison_free_not_last(pr); pr->pr_flags &= ~PR_PERSIST; } /* * jail_remove added a reference. If that's the only one, remove * the prison now. refcount(9) doesn't guarantee the cache coherence * of non-zero counters, so force it here. */ KASSERT(refcount_load(&pr->pr_ref) > 0, ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); if (atomic_load_acq_int(&pr->pr_ref) == 1) { prison_deref(pr, drflags); return; } mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED); /* * Kill all processes unfortunate enough to be attached to this prison. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NEW && p->p_ucred && p->p_ucred->cr_prison == pr) kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); /* Remove the temporary reference added by jail_remove. */ prison_deref(pr, drflags); } /* * struct jail_attach_args { * int jid; * }; */ int sys_jail_attach(struct thread *td, struct jail_attach_args *uap) { struct prison *pr; int error; error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); - /* - * Start with exclusive hold on allprison_lock to ensure that a possible - * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove. - * But then immediately downgrade it since we don't need to stop - * readers. - */ - sx_xlock(&allprison_lock); - sx_downgrade(&allprison_lock); + sx_slock(&allprison_lock); pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); } /* Do not allow a process to attach to a prison that is not alive. */ if (!prison_isalive(pr)) { mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); return (EINVAL); } - return (do_jail_attach(td, pr)); + return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED)); } static int -do_jail_attach(struct thread *td, struct prison *pr) +do_jail_attach(struct thread *td, struct prison *pr, int drflags) { struct proc *p; struct ucred *newcred, *oldcred; int error; + mtx_assert(&pr->pr_mtx, MA_OWNED); + sx_assert(&allprison_lock, SX_LOCKED); /* * XXX: Note that there is a slight race here if two threads * in the same privileged process attempt to attach to two * different jails at the same time. It is important for * user processes not to do this, or they might end up with * a process root from one prison, but attached to the jail * of another. */ refcount_acquire(&pr->pr_ref); refcount_acquire(&pr->pr_uref); + drflags |= PD_DEREF | PD_DEUREF; mtx_unlock(&pr->pr_mtx); + drflags &= ~PD_LOCKED; /* Let modules do whatever they need to prepare for attaching. */ error = osd_jail_call(pr, PR_METHOD_ATTACH, td); if (error) { - prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED); + prison_deref(pr, drflags); return (error); } - sx_sunlock(&allprison_lock); + sx_unlock(&allprison_lock); + drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED); /* * Reparent the newly attached process to this jail. 
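	 * In order: move the process into the prison's cpuset, chroot(2)
	 * it to pr_root, then install a new ucred whose cr_prison points
	 * here. Any failure falls through to e_revert_osd so modules see
	 * the old jail again.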
*/ p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) goto e_revert_osd; vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY); if ((error = change_dir(pr->pr_root, td)) != 0) goto e_unlock; #ifdef MAC if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root))) goto e_unlock; #endif VOP_UNLOCK(pr->pr_root); if ((error = pwd_chroot_chdir(td, pr->pr_root))) goto e_revert_osd; newcred = crget(); PROC_LOCK(p); oldcred = crcopysafe(p, newcred); newcred->cr_prison = pr; proc_set_cred(p, newcred); setsugid(p); #ifdef RACCT racct_proc_ucred_changed(p, oldcred, newcred); crhold(newcred); #endif PROC_UNLOCK(p); #ifdef RCTL rctl_proc_ucred_changed(p, newcred); crfree(newcred); #endif - prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF); + prison_deref(oldcred->cr_prison, drflags); crfree(oldcred); /* * If the prison was killed while changing credentials, die along * with it. */ if (!prison_isalive(pr)) { PROC_LOCK(p); kern_psignal(p, SIGKILL); PROC_UNLOCK(p); } return (0); e_unlock: VOP_UNLOCK(pr->pr_root); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ sx_slock(&allprison_lock); + drflags |= PD_LIST_SLOCKED; (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); - prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED); + prison_deref(pr, drflags); return (error); } /* * Returns a locked prison instance, or NULL on failure. */ struct prison * prison_find(int prid) { struct prison *pr; sx_assert(&allprison_lock, SX_LOCKED); TAILQ_FOREACH(pr, &allprison, pr_list) { - if (pr->pr_id == prid) { - mtx_lock(&pr->pr_mtx); - if (prison_isvalid(pr)) - return (pr); - /* - * Any active prison with the same ID would have - * been inserted before a dead one. - */ - mtx_unlock(&pr->pr_mtx); - break; - } + if (pr->pr_id < prid) + continue; if (pr->pr_id > prid) break; + KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr)); + mtx_lock(&pr->pr_mtx); + return (pr); } return (NULL); } /* * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * prison_find_child(struct prison *mypr, int prid) { struct prison *pr; int descend; sx_assert(&allprison_lock, SX_LOCKED); FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (pr->pr_id == prid) { + KASSERT(prison_isvalid(pr), + ("Found invalid prison %p", pr)); mtx_lock(&pr->pr_mtx); - if (prison_isvalid(pr)) - return (pr); - mtx_unlock(&pr->pr_mtx); + return (pr); } } return (NULL); } /* * Look for the name relative to mypr. Returns a locked prison or NULL. */ struct prison * prison_find_name(struct prison *mypr, const char *name) { struct prison *pr, *deadpr; size_t mylen; int descend; sx_assert(&allprison_lock, SX_LOCKED); mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; - again: deadpr = NULL; FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { if (!strcmp(pr->pr_name + mylen, name)) { - mtx_lock(&pr->pr_mtx); - if (prison_isalive(pr)) + KASSERT(prison_isvalid(pr), + ("Found invalid prison %p", pr)); + if (prison_isalive(pr)) { + mtx_lock(&pr->pr_mtx); return (pr); - if (prison_isvalid(pr)) - deadpr = pr; - mtx_unlock(&pr->pr_mtx); + } + deadpr = pr; } } /* There was no valid prison - perhaps there was a dying one. */ - if (deadpr != NULL) { + if (deadpr != NULL) mtx_lock(&deadpr->pr_mtx); - if (!prison_isvalid(deadpr)) { - mtx_unlock(&deadpr->pr_mtx); - goto again; - } - } return (deadpr); } /* * See if a prison has the specific flag set. 
The prison should be locked, * unless checking for flags that are only set at jail creation (such as * PR_IP4 and PR_IP6), or only the single bit is examined, without regard * to any other prison data. */ int prison_flag(struct ucred *cred, unsigned flag) { return (cred->cr_prison->pr_flags & flag); } int prison_allow(struct ucred *cred, unsigned flag) { return ((cred->cr_prison->pr_allow & flag) != 0); } /* * Hold a prison reference, by incrementing pr_ref. It is generally * an error to hold a prison that does not already have a reference. * A prison record will remain valid as long as it has at least one * reference, and will not be removed as long as either the prison * mutex or the allprison lock is held (allprison_lock may be shared). */ void prison_hold_locked(struct prison *pr) { /* Locking is no longer required. */ prison_hold(pr); } void prison_hold(struct prison *pr) { #ifdef INVARIANTS int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref); KASSERT(was_valid, ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id)); #else refcount_acquire(&pr->pr_ref); #endif } /* * Remove a prison reference. If that was the last reference, the - * prison will be removed (at a later time). Return with the prison - * unlocked. + * prison will be removed (at a later time). */ void prison_free_locked(struct prison *pr) { - int lastref; mtx_assert(&pr->pr_mtx, MA_OWNED); + /* + * Locking is no longer required, but unlock because the caller + * expects it. + */ + mtx_unlock(&pr->pr_mtx); + prison_free(pr); +} + +void +prison_free(struct prison *pr) +{ + KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); - lastref = refcount_release(&pr->pr_ref); - mtx_unlock(&pr->pr_mtx); - if (lastref) { + if (!refcount_release_if_not_last(&pr->pr_ref)) { /* - * Don't remove the prison itself in this context, + * Don't remove the last reference in this context, * in case there are locks held. */ taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } } -void -prison_free(struct prison *pr) +static void +prison_free_not_last(struct prison *pr) { +#ifdef INVARIANTS + int lastref; - /* - * Locking is only required when releasing the last reference. - * This allows assurance that a locked prison will remain valid - * until it is unlocked. - */ KASSERT(refcount_load(&pr->pr_ref) > 0, ("Trying to free dead prison %p (jid=%d).", pr, pr->pr_id)); - if (refcount_release_if_not_last(&pr->pr_ref)) - return; - mtx_lock(&pr->pr_mtx); - prison_free_locked(pr); + lastref = refcount_release(&pr->pr_ref); + KASSERT(!lastref, + ("prison_free_not_last freed last ref on prison %p (jid=%d).", + pr, pr->pr_id)); +#else + refcount_release(&pr->pr_ref); +#endif } /* * Hold a prison for user visibility, by incrementing pr_uref. * It is generally an error to hold a prison that isn't already * user-visible, except through the jail system calls. It is also * an error to hold an invalid prison. A prison record will remain * alive as long as it has at least one user reference, and will not - * be set to the dying state was long as the prison mutex is held. + * be set to the dying state until the prison mutex and allprison_lock + * are both released. */ void prison_proc_hold(struct prison *pr) { #ifdef INVARIANTS int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref); KASSERT(was_alive, ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id)); #else refcount_acquire(&pr->pr_uref); #endif } /* * Remove a prison user reference.
If it was the last reference, the * prison will be considered "dying", and may be removed once all of * its references are dropped. */ void prison_proc_free(struct prison *pr) { /* * Locking is only required when releasing the last reference. * This allows assurance that a locked prison will remain alive * until it is unlocked. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id)); if (!refcount_release_if_not_last(&pr->pr_uref)) { /* * Don't remove the last user reference in this context, * which is expected to be a process that is not only locked, * but also half dead. Add a reference so any calls to * prison_free() won't re-submit the task. */ - refcount_acquire(&pr->pr_ref); + prison_hold(pr); taskqueue_enqueue(taskqueue_thread, &pr->pr_task); } } /* * Complete a call to either prison_free or prison_proc_free. */ static void prison_complete(void *context, int pending) { struct prison *pr = context; + int drflags; - sx_xlock(&allprison_lock); - mtx_lock(&pr->pr_mtx); /* - * If this is completing a call to prison_proc_free, there will still - * be a user reference held; clear that as well as the reference that - * was added. No references are expected if this is completing a call - * to prison_free, but prison_deref is still called for the cleanup. + * This could be called to release the last reference, or the + * last user reference; the existence of a user reference implies + * the latter. There will always be a reference to remove, as + * prison_proc_free adds one. */ - prison_deref(pr, refcount_load(&pr->pr_uref) > 0 - ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED - : PD_LOCKED | PD_LIST_XLOCKED); + drflags = prison_lock_xlock(pr, PD_DEREF); + if (refcount_load(&pr->pr_uref) > 0) + drflags |= PD_DEUREF; + prison_deref(pr, drflags); } /* * Remove a prison reference and/or user reference (usually). * This assumes context that allows sleeping (for allprison_lock), * with no non-sleeping locks held, except perhaps the prison itself. * If there are no more references, release and delist the prison. * On completion, the prison lock and the allprison lock are both * unlocked. */ static void prison_deref(struct prison *pr, int flags) { struct prisonlist freeprison; - struct prison *rpr, *tpr; - int lastref, lasturef; + struct prison *rpr, *ppr, *tpr; TAILQ_INIT(&freeprison); - if (!(flags & PD_LOCKED)) - mtx_lock(&pr->pr_mtx); /* * Release this prison as requested, which may cause its parent * to be released, and then maybe its grandparent, etc. */ for (;;) { if (flags & PD_DEUREF) { + /* Drop a user reference. */ KASSERT(refcount_load(&pr->pr_uref) > 0, ("prison_deref PD_DEUREF on a dead prison (jid=%d)", pr->pr_id)); - lasturef = refcount_release(&pr->pr_uref); - if (lasturef) - refcount_acquire(&pr->pr_ref); - KASSERT(refcount_load(&prison0.pr_uref) > 0, - ("prison0 pr_uref=0")); - } else - lasturef = 0; + if (!refcount_release_if_not_last(&pr->pr_uref)) { + if (!(flags & PD_DEREF)) { + prison_hold(pr); + flags |= PD_DEREF; + } + flags = prison_lock_xlock(pr, flags); + if (refcount_release(&pr->pr_uref)) { + /* + * When the last user reference goes, + * this becomes a dying prison. + */ + KASSERT( + refcount_load(&prison0.pr_uref) > 0, + ("prison0 pr_uref=0")); + mtx_unlock(&pr->pr_mtx); + flags &= ~PD_LOCKED; + (void)osd_jail_call(pr, + PR_METHOD_REMOVE, NULL); + } + } + } if (flags & PD_DEREF) { + /* Drop a reference.
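+ * Dropping the last reference also unlinks the prison from the + * allprison list and passes a reference and user reference + * release on to its parent.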
*/ KASSERT(refcount_load(&pr->pr_ref) > 0, ("prison_deref PD_DEREF on a dead prison (jid=%d)", pr->pr_id)); - lastref = refcount_release(&pr->pr_ref); - } - else - lastref = refcount_load(&pr->pr_ref) == 0; - mtx_unlock(&pr->pr_mtx); - - /* - * Tell the modules if the last user reference was removed - * (even it sticks around in dying state). - */ - if (lasturef) { - if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) { - if (atomic_load_acq_int(&pr->pr_ref) > 1) { - sx_slock(&allprison_lock); - flags |= PD_LIST_SLOCKED; - } else { - sx_xlock(&allprison_lock); - flags |= PD_LIST_XLOCKED; + if (!refcount_release_if_not_last(&pr->pr_ref)) { + flags = prison_lock_xlock(pr, flags); + if (refcount_release(&pr->pr_ref)) { + /* + * When the last reference goes, + * unlink the prison and set it aside. + */ + KASSERT( + refcount_load(&pr->pr_uref) == 0, + ("prison_deref: last ref, " + "but still has %d urefs (jid=%d)", + pr->pr_uref, pr->pr_id)); + KASSERT( + refcount_load(&prison0.pr_ref) != 0, + ("prison0 pr_ref=0")); + TAILQ_REMOVE(&allprison, pr, pr_list); + LIST_REMOVE(pr, pr_sibling); + TAILQ_INSERT_TAIL(&freeprison, pr, + pr_list); + for (ppr = pr->pr_parent; + ppr != NULL; + ppr = ppr->pr_parent) + ppr->pr_childcount--; + /* + * Removing a prison frees references + * from its parent. + */ + mtx_unlock(&pr->pr_mtx); + flags &= ~PD_LOCKED; + pr = pr->pr_parent; + flags |= PD_DEREF | PD_DEUREF; + continue; } } - (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL); - mtx_lock(&pr->pr_mtx); - lastref = refcount_release(&pr->pr_ref); - mtx_unlock(&pr->pr_mtx); } - - if (!lastref) - break; - - if (flags & PD_LIST_SLOCKED) { - if (!sx_try_upgrade(&allprison_lock)) { - sx_sunlock(&allprison_lock); - sx_xlock(&allprison_lock); - } - flags &= ~PD_LIST_SLOCKED; - } else if (!(flags & PD_LIST_XLOCKED)) - sx_xlock(&allprison_lock); - flags |= PD_LIST_XLOCKED; - - TAILQ_REMOVE(&allprison, pr, pr_list); - LIST_REMOVE(pr, pr_sibling); - TAILQ_INSERT_TAIL(&freeprison, pr, pr_list); - for (tpr = pr->pr_parent; tpr != NULL; tpr = tpr->pr_parent) - tpr->pr_childcount--; - - /* Removing a prison frees a reference on its parent. */ - pr = pr->pr_parent; - mtx_lock(&pr->pr_mtx); - flags |= PD_DEREF | PD_DEUREF; + break; } /* Release all the prison locks. */ + if (flags & PD_LOCKED) + mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); else if (flags & PD_LIST_XLOCKED) sx_xunlock(&allprison_lock); /* * Finish removing any unreferenced prisons, which couldn't happen * while allprison_lock was held (to avoid a LOR on vrele). */ TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) { #ifdef VIMAGE if (rpr->pr_vnet != rpr->pr_parent->pr_vnet) vnet_destroy(rpr->pr_vnet); #endif if (rpr->pr_root != NULL) vrele(rpr->pr_root); mtx_destroy(&rpr->pr_mtx); #ifdef INET free(rpr->pr_ip4, M_PRISON); #endif #ifdef INET6 free(rpr->pr_ip6, M_PRISON); #endif if (rpr->pr_cpuset != NULL) cpuset_rel(rpr->pr_cpuset); osd_jail_exit(rpr); #ifdef RACCT if (racct_enable) prison_racct_detach(rpr); #endif + TAILQ_REMOVE(&freeprison, rpr, pr_list); free(rpr, M_PRISON); } } +/* + * Given the current locking state in the flags, make sure allprison_lock + * is held exclusive, and the prison is locked. Return flags indicating + * the new state. + */ +static int +prison_lock_xlock(struct prison *pr, int flags) +{ + + if (!(flags & PD_LIST_XLOCKED)) { + /* + * Get allprison_lock, which may be an upgrade, + * and may require unlocking the prison. 
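+ * Since a mutex cannot be held while sleeping on an sx lock, + * the prison mutex is dropped here and retaken once + * allprison_lock is held exclusive.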
+ */ + if (flags & PD_LOCKED) { + mtx_unlock(&pr->pr_mtx); + flags &= ~PD_LOCKED; + } + if (flags & PD_LIST_SLOCKED) { + if (!sx_try_upgrade(&allprison_lock)) { + sx_sunlock(&allprison_lock); + sx_xlock(&allprison_lock); + } + flags &= ~PD_LIST_SLOCKED; + } else + sx_xlock(&allprison_lock); + flags |= PD_LIST_XLOCKED; + } + if (!(flags & PD_LOCKED)) { + /* Lock the prison mutex. */ + mtx_lock(&pr->pr_mtx); + flags |= PD_LOCKED; + } + return (flags); +} + /* * Set or clear a permission bit in the pr_allow field, passing restrictions * (cleared permission) down to child jails. */ void prison_set_allow(struct ucred *cred, unsigned flag, int enable) { struct prison *pr; pr = cred->cr_prison; sx_slock(&allprison_lock); mtx_lock(&pr->pr_mtx); prison_set_allow_locked(pr, flag, enable); mtx_unlock(&pr->pr_mtx); sx_sunlock(&allprison_lock); } static void prison_set_allow_locked(struct prison *pr, unsigned flag, int enable) { struct prison *cpr; int descend; if (enable != 0) pr->pr_allow |= flag; else { pr->pr_allow &= ~flag; FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) cpr->pr_allow &= ~flag; } } /* * Check if a jail supports the given address family. * * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT * if not. */ int prison_check_af(struct ucred *cred, int af) { struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); pr = cred->cr_prison; #ifdef VIMAGE /* Prisons with their own network stack are not limited. */ if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (af) { #ifdef INET case AF_INET: if (pr->pr_flags & PR_IP4) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif #ifdef INET6 case AF_INET6: if (pr->pr_flags & PR_IP6) { mtx_lock(&pr->pr_mtx); if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) error = EAFNOSUPPORT; mtx_unlock(&pr->pr_mtx); } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). * * Returns 0 if jail doesn't restrict the address family or if address belongs * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if * the jail doesn't allow the address family. The IPv4 address is passed * in NBO. */ int prison_if(struct ucred *cred, const struct sockaddr *sa) { #ifdef INET const struct sockaddr_in *sai; #endif #ifdef INET6 const struct sockaddr_in6 *sai6; #endif int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(sa != NULL, ("%s: sa is NULL", __func__)); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif error = 0; switch (sa->sa_family) { #ifdef INET case AF_INET: sai = (const struct sockaddr_in *)sa; error = prison_check_ip4(cred, &sai->sin_addr); break; #endif #ifdef INET6 case AF_INET6: sai6 = (const struct sockaddr_in6 *)sa; error = prison_check_ip6(cred, &sai6->sin6_addr); break; #endif default: if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); } /* * Return 0 if jails permit p1 to frob p2, otherwise ESRCH. */ int prison_check(struct ucred *cred1, struct ucred *cred2) { return ((cred1->cr_prison == cred2->cr_prison || prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); } /* * Return 1 if p2 is a child of p1, otherwise 0.
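 * (prison_check() above relies on this, so jailed root may operate on * its own child jails but not on siblings or ancestors.)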
*/ int prison_ischild(struct prison *pr1, struct prison *pr2) { for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) if (pr1 == pr2) return (1); return (0); } /* - * Return true if the prison is currently alive. A prison is alive if it is - * valid and holds user references, and it isn't being removed. + * Return true if the prison is currently alive. A prison is alive if it + * holds user references and it isn't being removed. */ bool prison_isalive(struct prison *pr) { - if (__predict_false(refcount_load(&pr->pr_ref) == 0)) - return (false); if (__predict_false(refcount_load(&pr->pr_uref) == 0)) return (false); if (__predict_false(pr->pr_flags & PR_REMOVE)) return (false); return (true); } /* * Return true if the prison is currently valid. A prison is valid if it has * been fully created, and is not being destroyed. Note that dying prisons - * are still considered valid. + * are still considered valid. Invalid prisons won't be found under normal + * circumstances, as they're only put in that state by functions that have + * an exclusive hold on allprison_lock. */ bool prison_isvalid(struct prison *pr) { if (__predict_false(refcount_load(&pr->pr_ref) == 0)) return (false); return (true); } /* * Return 1 if the passed credential is in a jail and that jail does not * have its own virtual network stack, otherwise 0. */ int jailed_without_vnet(struct ucred *cred) { if (!jailed(cred)) return (0); #ifdef VIMAGE if (prison_owns_vnet(cred)) return (0); #endif return (1); } /* * Return the correct hostname (domainname, et al) for the passed credential. */ void getcredhostname(struct ucred *cred, char *buf, size_t size) { struct prison *pr; /* * A NULL credential can be used to shortcut to the physical * system's hostname. */ pr = (cred != NULL) ? cred->cr_prison : &prison0; mtx_lock(&pr->pr_mtx); strlcpy(buf, pr->pr_hostname, size); mtx_unlock(&pr->pr_mtx); } void getcreddomainname(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_domainname, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostuuid(struct ucred *cred, char *buf, size_t size) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(buf, cred->cr_prison->pr_hostuuid, size); mtx_unlock(&cred->cr_prison->pr_mtx); } void getcredhostid(struct ucred *cred, unsigned long *hostid) { mtx_lock(&cred->cr_prison->pr_mtx); *hostid = cred->cr_prison->pr_hostid; mtx_unlock(&cred->cr_prison->pr_mtx); } void getjailname(struct ucred *cred, char *name, size_t len) { mtx_lock(&cred->cr_prison->pr_mtx); strlcpy(name, cred->cr_prison->pr_name, len); mtx_unlock(&cred->cr_prison->pr_mtx); } #ifdef VIMAGE /* * Determine whether the prison represented by cred owns * its vnet rather than having it inherited. * * Returns 1 in case the prison owns the vnet, 0 otherwise. */ int prison_owns_vnet(struct ucred *cred) { /* * vnets cannot be added/removed after jail creation, * so no need to lock here. */ return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0); } #endif /* * Determine whether the subject represented by cred can "see" * status of a mount point. * Returns: 0 for permitted, ENOENT otherwise. * XXX: This function should be called cr_canseemount() and should be * placed in kern_prot.c. 
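 * The pr_enforce_statfs levels used below: 0 shows every mount point, * 2 shows only the jail's own root mount, and 1 also shows mounts under * the jail's root path. For example, with enforce_statfs=1 and a jail * rooted at /jails/j1, a mount on /jails/j1/usr is visible while one on * /var is not.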
*/ int prison_canseemount(struct ucred *cred, struct mount *mp) { struct prison *pr; struct statfs *sp; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return (0); if (pr->pr_root->v_mount == mp) return (0); if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If the jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. * This is an ugly check, but it is the only situation in which the * jail's directory ends with '/'. */ if (strcmp(pr->pr_path, "/") == 0) return (0); len = strlen(pr->pr_path); sp = &mp->mnt_stat; if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0) return (ENOENT); /* * Be sure that we don't have a situation where the jail's root * directory is "/some/path" and the mount point is "/some/pathpath". */ if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/') return (ENOENT); return (0); } void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) { char jpath[MAXPATHLEN]; struct prison *pr; size_t len; pr = cred->cr_prison; if (pr->pr_enforce_statfs == 0) return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", sizeof(sp->f_mntonname)); return; } if (pr->pr_root->v_mount == mp) { /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); *sp->f_mntonname = '/'; return; } /* * If the jail's chroot directory is set to "/" we should be able to see * all mount-points from inside a jail. */ if (strcmp(pr->pr_path, "/") == 0) return; len = strlen(pr->pr_path); strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath)); /* * Clear current buffer data, so we are sure nothing from * the valid path is left there. */ bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); if (*jpath == '\0') { /* Should never happen. */ *sp->f_mntonname = '/'; } else { strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname)); } } /* * Check whether permission for a specific privilege is granted within jail. * We have a specific list of accepted privileges; the rest are denied. */ int prison_priv_check(struct ucred *cred, int priv) { struct prison *pr; int error; /* * Some policies have custom handlers. This routine should not be * called for them. See priv_check_cred(). */ switch (priv) { case PRIV_VFS_LOOKUP: case PRIV_VFS_GENERATION: KASSERT(0, ("prison_priv_check instead of a custom handler " "called for %d\n", priv)); } if (!jailed(cred)) return (0); #ifdef VIMAGE /* * Privileges specific to prisons with a virtual network stack. * There might be a duplicate entry here in case the privilege * is only granted conditionally in the legacy jail case. */ switch (priv) { #ifdef notyet /* * NFS-specific privileges. */ case PRIV_NFS_DAEMON: case PRIV_NFS_LOCKD: #endif /* * Network stack privileges. */ case PRIV_NET_BRIDGE: case PRIV_NET_GRE: case PRIV_NET_BPF: case PRIV_NET_RAW: /* Dup, cond. in legacy jail case.
*/ case PRIV_NET_ROUTE: case PRIV_NET_TAP: case PRIV_NET_SETIFMTU: case PRIV_NET_SETIFFLAGS: case PRIV_NET_SETIFCAP: case PRIV_NET_SETIFDESCR: case PRIV_NET_SETIFNAME: case PRIV_NET_SETIFMETRIC: case PRIV_NET_SETIFPHYS: case PRIV_NET_SETIFMAC: case PRIV_NET_SETLANPCP: case PRIV_NET_ADDMULTI: case PRIV_NET_DELMULTI: case PRIV_NET_HWIOCTL: case PRIV_NET_SETLLADDR: case PRIV_NET_ADDIFGROUP: case PRIV_NET_DELIFGROUP: case PRIV_NET_IFCREATE: case PRIV_NET_IFDESTROY: case PRIV_NET_ADDIFADDR: case PRIV_NET_DELIFADDR: case PRIV_NET_LAGG: case PRIV_NET_GIF: case PRIV_NET_SETIFVNET: case PRIV_NET_SETIFFIB: /* * 802.11-related privileges. */ case PRIV_NET80211_VAP_GETKEY: case PRIV_NET80211_VAP_MANAGE: #ifdef notyet /* * ATM privileges. */ case PRIV_NETATM_CFG: case PRIV_NETATM_ADD: case PRIV_NETATM_DEL: case PRIV_NETATM_SET: /* * Bluetooth privileges. */ case PRIV_NETBLUETOOTH_RAW: #endif /* * Netgraph and netgraph module privileges. */ case PRIV_NETGRAPH_CONTROL: #ifdef notyet case PRIV_NETGRAPH_TTY: #endif /* * IPv4 and IPv6 privileges. */ case PRIV_NETINET_IPFW: case PRIV_NETINET_DIVERT: case PRIV_NETINET_PF: case PRIV_NETINET_DUMMYNET: case PRIV_NETINET_CARP: case PRIV_NETINET_MROUTE: case PRIV_NETINET_RAW: case PRIV_NETINET_ADDRCTRL6: case PRIV_NETINET_ND6: case PRIV_NETINET_SCOPE6: case PRIV_NETINET_ALIFETIME6: case PRIV_NETINET_IPSEC: case PRIV_NETINET_BINDANY: #ifdef notyet /* * NCP privileges. */ case PRIV_NETNCP: /* * SMB privileges. */ case PRIV_NETSMB: #endif /* * No default: or deny here. * In case of no permit, fall through to the next switch(). */ if (cred->cr_prison->pr_flags & PR_VNET) return (0); } #endif /* VIMAGE */ switch (priv) { /* * Allow ktrace privileges for root in jail. */ case PRIV_KTRACE: #if 0 /* * Allow jailed processes to configure audit identity and * submit audit records (login, etc). In the future we may * want to further refine the relationship between audit and * jail. */ case PRIV_AUDIT_GETAUDIT: case PRIV_AUDIT_SETAUDIT: case PRIV_AUDIT_SUBMIT: #endif /* * Allow jailed processes to manipulate process UNIX * credentials in any way they see fit. */ case PRIV_CRED_SETUID: case PRIV_CRED_SETEUID: case PRIV_CRED_SETGID: case PRIV_CRED_SETEGID: case PRIV_CRED_SETGROUPS: case PRIV_CRED_SETREUID: case PRIV_CRED_SETREGID: case PRIV_CRED_SETRESUID: case PRIV_CRED_SETRESGID: /* * Jail implements visibility constraints already, so allow * jailed root to override uid/gid-based constraints. */ case PRIV_SEEOTHERGIDS: case PRIV_SEEOTHERUIDS: /* * Jail implements inter-process debugging limits already, so * allow jailed root various debugging privileges. */ case PRIV_DEBUG_DIFFCRED: case PRIV_DEBUG_SUGID: case PRIV_DEBUG_UNPRIV: /* * Allow jail to set various resource limits and login * properties, and for now, exceed process resource limits. */ case PRIV_PROC_LIMIT: case PRIV_PROC_SETLOGIN: case PRIV_PROC_SETRLIMIT: /* * System V and POSIX IPC privileges are granted in jail. */ case PRIV_IPC_READ: case PRIV_IPC_WRITE: case PRIV_IPC_ADMIN: case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: /* * Jail operations within a jail work on child jails. */ case PRIV_JAIL_ATTACH: case PRIV_JAIL_SET: case PRIV_JAIL_REMOVE: /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other * processes in the same jail. Likewise for signalling. */ case PRIV_SCHED_DIFFCRED: case PRIV_SCHED_CPUSET: case PRIV_SIGNAL_DIFFCRED: case PRIV_SIGNAL_SUGID: /* * Allow jailed processes to write to sysctls marked as jail * writable.
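 * (Such sysctls are declared with the CTLFLAG_PRISON flag.)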
*/ case PRIV_SYSCTL_WRITEJAIL: /* * Allow root in jail to manage a variety of quota * properties. These should likely be conditional on a * configuration option. */ case PRIV_VFS_GETQUOTA: case PRIV_VFS_SETQUOTA: /* * Since Jail relies on chroot() to implement file system * protections, grant many VFS privileges to root in jail. * Be careful to exclude mount-related and NFS-related * privileges. */ case PRIV_VFS_READ: case PRIV_VFS_WRITE: case PRIV_VFS_ADMIN: case PRIV_VFS_EXEC: case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */ case PRIV_VFS_CHFLAGS_DEV: case PRIV_VFS_CHOWN: case PRIV_VFS_CHROOT: case PRIV_VFS_RETAINSUGID: case PRIV_VFS_FCHROOT: case PRIV_VFS_LINK: case PRIV_VFS_SETGID: case PRIV_VFS_STAT: case PRIV_VFS_STICKYFILE: /* * As in the non-jail case, non-root users are expected to be * able to read kernel/physical memory (provided /dev/[k]mem * exists in the jail and they have permission to access it). */ case PRIV_KMEM_READ: return (0); /* * Depending on the global setting, allow privilege of * setting system flags. */ case PRIV_VFS_SYSFLAGS: if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); /* * Depending on the global setting, allow privilege of * mounting/unmounting file systems. */ case PRIV_VFS_MOUNT: case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: pr = cred->cr_prison; prison_lock(pr); if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2) error = 0; else error = EPERM; prison_unlock(pr); return (error); /* * Jails should hold no disposition on the PRIV_VFS_READ_DIR * policy. priv_check_cred will not specifically allow it, and * we may want a MAC policy to allow it. */ case PRIV_VFS_READ_DIR: return (0); /* * Conditionally allow locking (unlocking) physical pages * in memory. */ case PRIV_VM_MLOCK: case PRIV_VM_MUNLOCK: if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK) return (0); else return (EPERM); /* * Conditionally allow jailed root to bind reserved ports. */ case PRIV_NETINET_RESERVEDPORT: if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS) return (0); else return (EPERM); /* * Allow jailed root to reuse in-use ports. */ case PRIV_NETINET_REUSEPORT: return (0); /* * Allow jailed root to set certain IPv4/6 (option) headers. */ case PRIV_NETINET_SETHDROPTS: return (0); /* * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); /* * Since jail implements its own visibility limits on netstat * sysctls, allow getcred. This allows identd to work in * jail. */ case PRIV_NETINET_GETCRED: return (0); /* * Allow jailed root to set loginclass. */ case PRIV_PROC_SETLOGINCLASS: return (0); /* * Do not allow a process inside a jail to read the kernel * message buffer unless explicitly permitted. */ case PRIV_MSGBUF: if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF) return (0); return (EPERM); default: /* * In all remaining cases, deny the privilege request. This * includes almost all network privileges and many system * configuration privileges. */ return (EPERM); } } /* * Return the part of pr2's name that is relative to pr1, or the whole name * if it does not directly follow. */ char * prison_name(struct prison *pr1, struct prison *pr2) { char *name; /* Jails see themselves as "0" (if they see themselves at all).
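 * For example, from within jail "foo", the child jail "foo.bar" is * reported simply as "bar".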
*/ if (pr1 == pr2) return "0"; name = pr2->pr_name; if (prison_ischild(pr1, pr2)) { /* * pr1 isn't locked (and allprison_lock may not be either) * so its length can't be counted on. But the number of dots * can be counted on - and counted. */ for (; pr1 != &prison0; pr1 = pr1->pr_parent) name = strchr(name, '.') + 1; } return (name); } /* * Return the part of pr2's path that is relative to pr1, or the whole path * if it does not directly follow. */ static char * prison_path(struct prison *pr1, struct prison *pr2) { char *path1, *path2; int len1; path1 = pr1->pr_path; path2 = pr2->pr_path; if (!strcmp(path1, "/")) return (path2); len1 = strlen(path1); if (strncmp(path1, path2, len1)) return (path2); if (path2[len1] == '\0') return "/"; if (path2[len1] == '/') return (path2 + len1); return (path2); } /* * Jail-related sysctls. */ static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jails"); static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; #endif #ifdef INET6 struct in6_addr *ip6 = NULL; int ip6s = 0; #endif int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { #if defined(INET) || defined(INET6) again: #endif mtx_lock(&cpr->pr_mtx); #ifdef INET if (cpr->pr_ip4s > 0) { if (ip4s < cpr->pr_ip4s) { ip4s = cpr->pr_ip4s; mtx_unlock(&cpr->pr_mtx); ip4 = realloc(ip4, ip4s * sizeof(struct in_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip4, ip4, cpr->pr_ip4s * sizeof(struct in_addr)); } #endif #ifdef INET6 if (cpr->pr_ip6s > 0) { if (ip6s < cpr->pr_ip6s) { ip6s = cpr->pr_ip6s; mtx_unlock(&cpr->pr_mtx); ip6 = realloc(ip6, ip6s * sizeof(struct in6_addr), M_TEMP, M_WAITOK); goto again; } bcopy(cpr->pr_ip6, ip6, cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif - if (!prison_isvalid(cpr)) { - mtx_unlock(&cpr->pr_mtx); - continue; - } bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; xp->pr_id = cpr->pr_id; xp->pr_state = prison_isalive(cpr) ? 
PRISON_STATE_ALIVE : PRISON_STATE_DYING; strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host)); strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET xp->pr_ip4s = cpr->pr_ip4s; #endif #ifdef INET6 xp->pr_ip6s = cpr->pr_ip6s; #endif mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; #ifdef INET if (xp->pr_ip4s > 0) { error = SYSCTL_OUT(req, ip4, xp->pr_ip4s * sizeof(struct in_addr)); if (error) break; } #endif #ifdef INET6 if (xp->pr_ip6s > 0) { error = SYSCTL_OUT(req, ip6, xp->pr_ip6s * sizeof(struct in6_addr)); if (error) break; } #endif } sx_sunlock(&allprison_lock); free(xp, M_TEMP); #ifdef INET free(ip4, M_TEMP); #endif #ifdef INET6 free(ip6, M_TEMP); #endif return (error); } SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_list, "S", "List of active jails"); static int sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) { int error, injail; injail = jailed(req->td->td_ucred); error = SYSCTL_OUT(req, &injail, sizeof(injail)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); static int sysctl_jail_vnet(SYSCTL_HANDLER_ARGS) { int error, havevnet; #ifdef VIMAGE struct ucred *cred = req->td->td_ucred; havevnet = jailed(cred) && prison_owns_vnet(cred); #else havevnet = 0; #endif error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet)); return (error); } SYSCTL_PROC(_security_jail, OID_AUTO, vnet, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_vnet, "I", "Jail owns vnet?"); #if defined(INET) || defined(INET6) SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, &jail_max_af_ips, 0, "Number of IP addresses a jail may have at most per address family (deprecated)"); #endif /* * Default parameters for jail(2) compatibility. For historical reasons, * the sysctl names have varying similarity to the parameter names. Prisons * just see their own parameters, and can't change them. */ static int sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) { int error, i; /* Get the current flag value, and convert it to a boolean. */ if (req->td->td_ucred->cr_prison == &prison0) { mtx_lock(&prison0.pr_mtx); i = (jail_default_allow & arg2) != 0; mtx_unlock(&prison0.pr_mtx); } else i = prison_allow(req->td->td_ucred, arg2); if (arg1 != NULL) i = !i; error = sysctl_handle_int(oidp, &i, 0, req); if (error || !req->newptr) return (error); i = i ? arg2 : 0; if (arg1 != NULL) i ^= arg2; /* * The sysctls don't have CTLFLAG_PRISON, so assume prison0 * for writing.
*/ mtx_lock(&prison0.pr_mtx); jail_default_allow = (jail_default_allow & ~arg2) | i; mtx_unlock(&prison0.pr_mtx); return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", "Processes in jail can set their hostnames (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", "Processes in jail can use System V IPC primitives (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", "Prison root can create raw sockets (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", "Processes in jail can alter system file flags (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", "Processes in jail can mount/unmount jail-friendly file systems (deprecated)"); static int sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) { struct prison *pr; int level, error; pr = req->td->td_ucred->cr_prison; level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); *(int *)arg1 = level; return (0); } SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), sysctl_jail_default_level, "I", "Processes in jail cannot see all mounted file systems (deprecated)"); SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum), sysctl_jail_default_level, "I", "Ruleset for the devfs filesystem in jail (deprecated)"); /* * Nodes to describe jail parameters. Maximum length of string parameters * is returned in the string itself, and the other parameters exist merely * to make themselves and their types known. */ SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Jail parameters"); int sysctl_jail_param(SYSCTL_HANDLER_ARGS) { int i; long l; size_t s; char numbuf[12]; switch (oidp->oid_kind & CTLTYPE) { case CTLTYPE_LONG: case CTLTYPE_ULONG: l = 0; #ifdef SCTL_MASK32 if (!(req->flags & SCTL_MASK32)) #endif return (SYSCTL_OUT(req, &l, sizeof(l))); case CTLTYPE_INT: case CTLTYPE_UINT: i = 0; return (SYSCTL_OUT(req, &i, sizeof(i))); case CTLTYPE_STRING: snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2); return (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); case CTLTYPE_STRUCT: s = (size_t)arg2; return (SYSCTL_OUT(req, &s, sizeof(s))); } return (0); } /* * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at * jail creation time but cannot be changed in an existing jail. 
*/ SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail secure level"); SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail value for kern.osreldate and uname -K"); SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN, "Jail value for kern.osrelease and uname -r"); SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, "I", "Jail cannot see all mounted file systems"); SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW, "I", "Ruleset for in-jail devfs mounts"); SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail persistence"); #ifdef VIMAGE SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN, "E,jailsys", "Virtual network stack"); #endif SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, "B", "Jail is in the process of shutting down"); SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails"); SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD, "I", "Current number of child jails"); SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW, "I", "Maximum number of child jails"); SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info"); SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail hostname"); SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail NIS domainname"); SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN, "Jail host UUID"); SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW, "LU", "Jail host ID"); SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); #ifdef INET SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN, "Jail IPv4 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), "S,in_addr,a", "Jail IPv4 addresses"); SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv4 source address selection rather than the " "primary jail IPv4 address."); #endif #ifdef INET6 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN, "Jail IPv6 address virtualization"); SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), "S,in6_addr,a", "Jail IPv6 addresses"); SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW, "B", "Do (not) use IPv6 source address selection rather than the " "primary jail IPv6 address."); #endif SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set hostname"); SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may use SYSV IPC"); SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create raw sockets"); SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may alter system file flags"); SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may set file quotas"); SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may lock (unlock) physical pages in memory"); SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail 
may bind sockets to reserved ports"); SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may read the kernel message buffer"); SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW, "B", "Unprivileged processes may use process debugging facilities"); SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW, "B", "Processes in jail with uid 0 have privilege"); SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags"); SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW, "B", "Jail may mount/unmount jail-friendly file systems in general"); /* * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>. Return * its associated bit in the pr_allow bitmask, or zero if the parameter was * not created. */ unsigned prison_add_allow(const char *prefix, const char *name, const char *prefix_descr, const char *descr) { struct bool_flags *bf; struct sysctl_oid *parent; char *allow_name, *allow_noname, *allowed; #ifndef NO_SYSCTL_DESCR char *descr_deprecated; #endif u_int allow_flag; if (prefix ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name) < 0 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 || asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) { free(allow_name, M_PRISON); return 0; } /* * See if this parameter has already been added, i.e. a module was * previously loaded/unloaded. */ mtx_lock(&prison0.pr_mtx); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) { if (strcmp(bf->name, allow_name) == 0) { allow_flag = bf->flag; goto no_add; } } /* * Find a free bit in pr_allow_all, failing if there are none * (which shouldn't happen as long as we keep track of how many * potential dynamic flags exist). */ for (allow_flag = 1;; allow_flag <<= 1) { if (allow_flag == 0) goto no_add; if ((pr_allow_all & allow_flag) == 0) break; } /* Note the parameter in the next open slot in pr_flag_allow. */ for (bf = pr_flag_allow; ; bf++) { if (bf == pr_flag_allow + nitems(pr_flag_allow)) { /* This should never happen, but is not fatal. */ allow_flag = 0; goto no_add; } if (atomic_load_int(&bf->flag) == 0) break; } bf->name = allow_name; bf->noname = allow_noname; pr_allow_all |= allow_flag; /* * prison0 always has permission for the new parameter. * Other jails must have it granted to them. */ prison0.pr_allow |= allow_flag; /* The flag indicates a valid entry, so make sure it is set last. */ atomic_store_rel_int(&bf->flag, allow_flag); mtx_unlock(&prison0.pr_mtx); /* * Create sysctls for the parameter, and the back-compat global * permission. */ parent = prefix ? SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(&sysctl___security_jail_param_allow), OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr) : &sysctl___security_jail_param_allow; (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO, name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_param, "B", descr); if ((prefix ?
asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name) : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) { #ifndef NO_SYSCTL_DESCR (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)", descr); #endif (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag, sysctl_jail_default_allow, "I", descr_deprecated); #ifndef NO_SYSCTL_DESCR free(descr_deprecated, M_TEMP); #endif free(allowed, M_TEMP); } return allow_flag; no_add: mtx_unlock(&prison0.pr_mtx); free(allow_name, M_PRISON); free(allow_noname, M_PRISON); return allow_flag; } /* * The VFS system will register jail-aware filesystems here. They each get * a parameter allow.mount.xxxfs and a flag to check when a jailed user * attempts to mount. */ void prison_add_vfs(struct vfsconf *vfsp) { #ifdef NO_SYSCTL_DESCR vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, NULL); #else char *descr; (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system", vfsp->vfc_name); vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name, NULL, descr); free(descr, M_TEMP); #endif } #ifdef RACCT void prison_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_slock(&allprison_lock); if (pre != NULL) (pre)(); LIST_FOREACH(prr, &allprison_racct, prr_next) (callback)(prr->prr_racct, arg2, arg3); if (post != NULL) (post)(); sx_sunlock(&allprison_lock); } static struct prison_racct * prison_racct_find_locked(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN) return (NULL); LIST_FOREACH(prr, &allprison_racct, prr_next) { if (strcmp(name, prr->prr_name) != 0) continue; /* Found prison_racct with a matching name? */ prison_racct_hold(prr); return (prr); } /* Add new prison_racct. */ prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK); racct_create(&prr->prr_racct); strcpy(prr->prr_name, name); refcount_init(&prr->prr_refcount, 1); LIST_INSERT_HEAD(&allprison_racct, prr, prr_next); return (prr); } struct prison_racct * prison_racct_find(const char *name) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_xlock(&allprison_lock); prr = prison_racct_find_locked(name); sx_xunlock(&allprison_lock); return (prr); } void prison_racct_hold(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); refcount_acquire(&prr->prr_refcount); } static void prison_racct_free_locked(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); if (refcount_release(&prr->prr_refcount)) { racct_destroy(&prr->prr_racct); LIST_REMOVE(prr, prr_next); free(prr, M_PRISON_RACCT); } } void prison_racct_free(struct prison_racct *prr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (refcount_release_if_not_last(&prr->prr_refcount)) return; sx_xlock(&allprison_lock); prison_racct_free_locked(prr); sx_xunlock(&allprison_lock); } static void prison_racct_attach(struct prison *pr) { struct prison_racct *prr; ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_XLOCKED); prr = prison_racct_find_locked(pr->pr_name); KASSERT(prr != NULL, ("cannot find prison_racct")); pr->pr_prison_racct = prr; } /* * Handle jail renaming. From the racct point of view, renaming means * moving from one prison_racct to another. 
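 * The destination prison_racct is looked up (or created) by the new * name, resource usage is transferred with racct_move(), and the old * record is released.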
*/ static void prison_racct_modify(struct prison *pr) { #ifdef RCTL struct proc *p; struct ucred *cred; #endif struct prison_racct *oldprr; ASSERT_RACCT_ENABLED(); sx_slock(&allproc_lock); sx_xlock(&allprison_lock); if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) { sx_xunlock(&allprison_lock); sx_sunlock(&allproc_lock); return; } oldprr = pr->pr_prison_racct; pr->pr_prison_racct = NULL; prison_racct_attach(pr); /* * Move resource utilisation records. */ racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct); #ifdef RCTL /* * Force rctl to reattach rules to processes. */ FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); cred = crhold(p->p_ucred); PROC_UNLOCK(p); rctl_proc_ucred_changed(p, cred); crfree(cred); } #endif sx_sunlock(&allproc_lock); prison_racct_free_locked(oldprr); sx_xunlock(&allprison_lock); } static void prison_racct_detach(struct prison *pr) { ASSERT_RACCT_ENABLED(); sx_assert(&allprison_lock, SA_UNLOCKED); if (pr->pr_prison_racct == NULL) return; prison_racct_free(pr->pr_prison_racct); pr->pr_prison_racct = NULL; } #endif /* RACCT */ #ifdef DDB static void db_show_prison(struct prison *pr) { struct bool_flags *bf; struct jailsys_flags *jsf; #if defined(INET) || defined(INET6) int ii; #endif unsigned f; #ifdef INET char ip4buf[INET_ADDRSTRLEN]; #endif #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" path = %s\n", pr->pr_path); db_printf(" cpuset = %d\n", pr->pr_cpuset ? pr->pr_cpuset->cs_id : -1); #ifdef VIMAGE db_printf(" vnet = %p\n", pr->pr_vnet); #endif db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum); db_printf(" children.max = %d\n", pr->pr_childmax); db_printf(" children.cur = %d\n", pr->pr_childcount); db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = 0x%x", pr->pr_flags); for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++) if (pr->pr_flags & bf->flag) db_printf(" %s", bf->name); for (jsf = pr_flag_jailsys; jsf < pr_flag_jailsys + nitems(pr_flag_jailsys); jsf++) { f = pr->pr_flags & (jsf->disable | jsf->new); db_printf(" %-16s= %s\n", jsf->name, (f != 0 && f == jsf->disable) ? "disable" : (f == jsf->new) ? "new" : "inherit"); } db_printf(" allow = 0x%x", pr->pr_allow); for (bf = pr_flag_allow; bf < pr_flag_allow + nitems(pr_flag_allow) && atomic_load_int(&bf->flag) != 0; bf++) if (pr->pr_allow & bf->flag) db_printf(" %s", bf->name); db_printf("\n"); db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_hostname); db_printf(" host.domainname = %s\n", pr->pr_domainname); db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid); db_printf(" host.hostid = %lu\n", pr->pr_hostid); #ifdef INET db_printf(" ip4s = %d\n", pr->pr_ip4s); for (ii = 0; ii < pr->pr_ip4s; ii++) db_printf(" %s %s\n", ii == 0 ? "ip4.addr =" : " ", inet_ntoa_r(pr->pr_ip4[ii], ip4buf)); #endif #ifdef INET6 db_printf(" ip6s = %d\n", pr->pr_ip6s); for (ii = 0; ii < pr->pr_ip6s; ii++) db_printf(" %s %s\n", ii == 0 ? 
"ip6.addr =" : " ", ip6_sprintf(ip6buf, &pr->pr_ip6[ii])); #endif } DB_SHOW_COMMAND(prison, db_show_prison_command) { struct prison *pr; if (!have_addr) { /* * Show all prisons in the list, and prison0 which is not * listed. */ db_show_prison(&prison0); if (!db_pager_quit) { TAILQ_FOREACH(pr, &allprison, pr_list) { db_show_prison(pr); if (db_pager_quit) break; } } return; } if (addr == 0) pr = &prison0; else { /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr && pr->pr_ref > 0) break; if (pr == NULL) /* Look again, without requiring a reference. */ TAILQ_FOREACH(pr, &allprison, pr_list) if (pr->pr_id == addr) break; if (pr == NULL) /* Assume address points to a valid prison. */ pr = (struct prison *)addr; } db_show_prison(pr); } #endif /* DDB */ diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c index f048234f3a33..435235f0384d 100644 --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -1,1920 +1,1920 @@ /*- * Implementation of SVID messages * * Author: Daniel Boulet * * Copyright 1993 Daniel Boulet and RTMX Inc. * * This system call was implemented by Daniel Boulet under contract from RTMX. * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_msg, "System V message queues support"); static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); static int msginit(void); static int msgunload(void); static int sysvmsg_modload(struct module *, int, void *); static void msq_remove(struct msqid_kernel *); static struct prison *msg_find_prison(struct ucred *); static int msq_prison_cansee(struct prison *, struct msqid_kernel *); static int msg_prison_check(void *, void *); static int msg_prison_set(void *, void *); static int msg_prison_get(void *, void *); static int msg_prison_remove(void *, void *); static void msg_prison_cleanup(struct prison *); #ifdef MSG_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) (void)0 #endif static void msg_freehdr(struct msg *msghdr); #ifndef MSGSSZ #define MSGSSZ 8 /* Each segment must be 2^N long */ #endif #ifndef MSGSEG #define MSGSEG 2048 /* must be less than 32767 */ #endif #define MSGMAX (MSGSSZ*MSGSEG) #ifndef MSGMNB #define MSGMNB 2048 /* max # of bytes in a queue */ #endif #ifndef MSGMNI #define MSGMNI 40 #endif #ifndef MSGTQL #define MSGTQL 40 #endif /* * Based on the configuration parameters described in an SVR2 (yes, two) * config(1m) man page. * * Each message is broken up and stored in segments that are msgssz bytes * long. For efficiency reasons, this should be a power of two. Also, * it doesn't make sense if it is less than 8 or greater than about 256. * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of * two between 8 and 1024 inclusive (and panic's if it isn't). */ struct msginfo msginfo = { MSGMAX, /* max chars in a message */ MSGMNI, /* # of message queue identifiers */ MSGMNB, /* max chars in a queue */ MSGTQL, /* max messages in system */ MSGSSZ, /* size of a message segment */ /* (must be small power of 2 greater than 4) */ MSGSEG /* number of message segments */ }; /* * macros to convert between msqid_ds's and msqid's. * (specific to this implementation) */ #define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) #define MSQID_IX(id) ((id) & 0xffff) #define MSQID_SEQ(id) (((id) >> 16) & 0xffff) /* * The rest of this file is specific to this particular implementation. */ struct msgmap { short next; /* next segment in buffer */ /* -1 -> available */ /* 0..(MSGSEG-1) -> index of next segment */ }; #define MSG_LOCKED 01000 /* Is this msqid_ds locked? */ static int nfree_msgmaps; /* # of free map entries */ static short free_msgmaps; /* head of linked list of free map entries */ static struct msg *free_msghdrs;/* list of free msg headers */ static char *msgpool; /* MSGMAX byte long msg buffer pool */ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct msg *msghdrs; /* MSGTQL msg headers */ static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */ static struct mtx msq_mtx; /* global mutex for message queues. 
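 * All of the msqid, message header, and segment free-list state above * is protected by this single lock.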
*/ static unsigned msg_prison_slot;/* prison OSD slot */ static struct syscall_helper_data msg_syscalls[] = { SYSCALL_INIT_HELPER(msgctl), SYSCALL_INIT_HELPER(msgget), SYSCALL_INIT_HELPER(msgsnd), SYSCALL_INIT_HELPER(msgrcv), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(msgsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data msg32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_msgctl), SYSCALL32_INIT_HELPER(freebsd32_msgsnd), SYSCALL32_INIT_HELPER(freebsd32_msgrcv), SYSCALL32_INIT_HELPER_COMPAT(msgget), SYSCALL32_INIT_HELPER(freebsd32_msgsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl), #endif SYSCALL_INIT_LAST }; #endif static int msginit() { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = msg_prison_check, [PR_METHOD_SET] = msg_prison_set, [PR_METHOD_GET] = msg_prison_get, [PR_METHOD_REMOVE] = msg_prison_remove, }; msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG, M_WAITOK | M_ZERO); /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 * or greater than about 256 so ... */ i = 8; while (i < 1024 && i != msginfo.msgssz) i <<= 1; if (i != msginfo.msgssz) { DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, msginfo.msgssz)); panic("msginfo.msgssz not a small power of 2"); } if (msginfo.msgseg > 32767) { DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg)); panic("msginfo.msgseg > 32767"); } for (i = 0; i < msginfo.msgseg; i++) { if (i > 0) msgmaps[i-1].next = i; msgmaps[i].next = -1; /* implies entry is available */ } free_msgmaps = 0; nfree_msgmaps = msginfo.msgseg; for (i = 0; i < msginfo.msgtql; i++) { msghdrs[i].msg_type = 0; if (i > 0) msghdrs[i-1].msg_next = &msghdrs[i]; msghdrs[i].msg_next = NULL; #ifdef MAC mac_sysvmsg_init(&msghdrs[i]); #endif } free_msghdrs = &msghdrs[0]; for (i = 0; i < msginfo.msgmni; i++) { msqids[i].u.msg_qbytes = 0; /* implies entry is available */ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */ msqids[i].u.msg_perm.mode = 0; #ifdef MAC mac_sysvmsq_init(&msqids[i]); #endif } mtx_init(&msq_mtx, "msq", NULL, MTX_DEF); /* Set current prisons according to their allow.sysvipc. 
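 * Each prison that currently allows SysV IPC gets &prison0 recorded in
 * its OSD slot as the root of its message namespace; osd_reserve() is
 * used so that no allocation happens while a prison lock is held.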
*/ msg_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(msg_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, msg_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(msg_prison_slot); prison_lock(pr); - if (prison_isvalid(pr) && (pr->pr_allow & PR_ALLOW_SYSVIPC)) { + if (pr->pr_allow & PR_ALLOW_SYSVIPC) { (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(msg_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(msg32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int msgunload() { struct msqid_kernel *msqkptr; int msqid; #ifdef MAC int i; #endif syscall_helper_unregister(msg_syscalls); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(msg32_syscalls); #endif for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 || (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) break; } if (msqid != msginfo.msgmni) return (EBUSY); if (msg_prison_slot != 0) osd_jail_deregister(msg_prison_slot); #ifdef MAC for (i = 0; i < msginfo.msgtql; i++) mac_sysvmsg_destroy(&msghdrs[i]); for (msqid = 0; msqid < msginfo.msgmni; msqid++) mac_sysvmsq_destroy(&msqids[msqid]); #endif free(msgpool, M_MSG); free(msgmaps, M_MSG); free(msghdrs, M_MSG); free(msqids, M_MSG); mtx_destroy(&msq_mtx); return (0); } static int sysvmsg_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = msginit(); if (error != 0) msgunload(); break; case MOD_UNLOAD: error = msgunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvmsg_mod = { "sysvmsg", &sysvmsg_modload, NULL }; DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST); MODULE_VERSION(sysvmsg, 1); static void msg_freehdr(struct msg *msghdr) { while (msghdr->msg_ts > 0) { short next; if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) panic("msghdr->msg_spot out of range"); next = msgmaps[msghdr->msg_spot].next; msgmaps[msghdr->msg_spot].next = free_msgmaps; free_msgmaps = msghdr->msg_spot; nfree_msgmaps++; msghdr->msg_spot = next; if (msghdr->msg_ts >= msginfo.msgssz) msghdr->msg_ts -= msginfo.msgssz; else msghdr->msg_ts = 0; } if (msghdr->msg_spot != -1) panic("msghdr->msg_spot != -1"); msghdr->msg_next = free_msghdrs; free_msghdrs = msghdr; #ifdef MAC mac_sysvmsg_cleanup(msghdr); #endif } static void msq_remove(struct msqid_kernel *msqkptr) { struct msg *msghdr; racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes); crfree(msqkptr->cred); msqkptr->cred = NULL; /* Free the message headers */ msghdr = msqkptr->u.__msg_first; while (msghdr != NULL) { struct msg *msghdr_tmp; /* Free the segments of each message */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msghdr_tmp = msghdr; msghdr = msghdr->msg_next; msg_freehdr(msghdr_tmp); } if (msqkptr->u.msg_cbytes != 0) panic("msg_cbytes is screwed up"); if (msqkptr->u.msg_qnum != 0) panic("msg_qnum is screwed up"); msqkptr->u.msg_qbytes = 0; /* Mark it as free */ #ifdef MAC 
mac_sysvmsq_cleanup(msqkptr); #endif wakeup(msqkptr); } static struct prison * msg_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); return rpr; } static int msq_prison_cansee(struct prison *rpr, struct msqid_kernel *msqkptr) { if (msqkptr->cred == NULL || !(rpr == msqkptr->cred->cr_prison || prison_ischild(rpr, msqkptr->cred->cr_prison))) return (EINVAL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct msgctl_args { int msqid; int cmd; struct msqid_ds *buf; }; #endif int sys_msgctl(struct thread *td, struct msgctl_args *uap) { int msqid = uap->msqid; int cmd = uap->cmd; struct msqid_ds msqbuf; int error; DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf)); if (cmd == IPC_SET && (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0) return (error); error = kern_msgctl(td, msqid, cmd, &msqbuf); if (cmd == IPC_STAT && error == 0) error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds)); return (error); } int kern_msgctl(struct thread *td, int msqid, int cmd, struct msqid_ds *msqbuf) { int rval, error, msqix; struct msqid_kernel *msqkptr; struct prison *rpr; rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_CMD(cmd); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such msqid\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } error = msq_prison_cansee(rpr, msqkptr); if (error != 0) { DPRINTF(("requester can't see prison\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd); if (error != 0) goto done2; #endif error = 0; rval = 0; switch (cmd) { case IPC_RMID: { #ifdef MAC struct msg *msghdr; #endif if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; #ifdef MAC /* * Check that the thread has MAC access permissions to * individual msghdrs. Note: We need to do this in a * separate loop because the actual loop alters the * msq/msghdr info as it progresses, and there is no going * back if half the way through we discover that the * thread cannot free a certain msghdr. The msq will get * into an inconsistent state. */ for (msghdr = msqkptr->u.__msg_first; msghdr != NULL; msghdr = msghdr->msg_next) { error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr); if (error != 0) goto done2; } #endif msq_remove(msqkptr); } break; case IPC_SET: AUDIT_ARG_SVIPC_PERM(&msqbuf->msg_perm); if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) { error = priv_check(td, PRIV_IPC_MSGSIZE); if (error) goto done2; } if (msqbuf->msg_qbytes > msginfo.msgmnb) { DPRINTF(("can't increase msg_qbytes beyond %d" "(truncating)\n", msginfo.msgmnb)); msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ } if (msqbuf->msg_qbytes == 0) { DPRINTF(("can't reduce msg_qbytes to 0\n")); error = EINVAL; /* non-standard errno! 
*/ goto done2; } msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) | (msqbuf->msg_perm.mode & 0777); msqkptr->u.msg_qbytes = msqbuf->msg_qbytes; msqkptr->u.msg_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } *msqbuf = msqkptr->u; if (td->td_ucred->cr_prison != msqkptr->cred->cr_prison) msqbuf->msg_perm.key = IPC_PRIVATE; /* * Try to hide the fact that the structure layout is shared by * both the kernel and userland. These pointers are not useful * to userspace. */ msqbuf->__msg_first = msqbuf->__msg_last = NULL; break; default: DPRINTF(("invalid command %d\n", cmd)); error = EINVAL; goto done2; } if (error == 0) td->td_retval[0] = rval; done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgget_args { key_t key; int msgflg; }; #endif int sys_msgget(struct thread *td, struct msgget_args *uap) { int msqid, error = 0; int key = uap->key; int msgflg = uap->msgflg; struct ucred *cred = td->td_ucred; struct msqid_kernel *msqkptr = NULL; DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); if (msg_find_prison(cred) == NULL) return (ENOSYS); mtx_lock(&msq_mtx); if (key != IPC_PRIVATE) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->cred != NULL && msqkptr->cred->cr_prison == cred->cr_prison && msqkptr->u.msg_perm.key == key) break; } if (msqid < msginfo.msgmni) { DPRINTF(("found public key\n")); if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } AUDIT_ARG_SVIPC_ID(IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm)); if ((error = ipcperm(td, &msqkptr->u.msg_perm, msgflg & 0700))) { DPRINTF(("requester doesn't have 0%o access\n", msgflg & 0700)); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqget(cred, msqkptr); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the msqid_ds\n")); if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. 
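 * (The copier clears MSG_LOCKED and wakeup()s the queue once its
 * copy completes or fails.)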
*/ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes == 0 && (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0) break; } if (msqid == msginfo.msgmni) { DPRINTF(("no more msqid_ds's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NMSGQ, 1); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; msqkptr->u.msg_perm.uid = cred->cr_uid; msqkptr->u.msg_perm.cgid = cred->cr_gid; msqkptr->u.msg_perm.gid = cred->cr_gid; msqkptr->u.msg_perm.mode = (msgflg & 0777); msqkptr->cred = crhold(cred); /* Make sure that the returned msqid is unique */ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff; msqkptr->u.__msg_first = NULL; msqkptr->u.__msg_last = NULL; msqkptr->u.msg_cbytes = 0; msqkptr->u.msg_qnum = 0; msqkptr->u.msg_qbytes = msginfo.msgmnb; msqkptr->u.msg_lspid = 0; msqkptr->u.msg_lrpid = 0; msqkptr->u.msg_stime = 0; msqkptr->u.msg_rtime = 0; msqkptr->u.msg_ctime = time_second; #ifdef MAC mac_sysvmsq_create(cred, msqkptr); #endif AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: /* Construct the unique msqid */ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm); done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgsnd_args { int msqid; const void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; int msgflg; }; #endif int kern_msgsnd(struct thread *td, int msqid, const void *msgp, size_t msgsz, int msgflg, long mtype) { int msqix, segs_needed, error = 0; struct msqid_kernel *msqkptr; struct msg *msghdr; struct prison *rpr; short next; #ifdef RACCT size_t saved_msgsz = 0; #endif rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); mtx_lock(&msq_mtx); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); error = EINVAL; goto done2; } msqkptr = &msqids[msqix]; AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = msq_prison_cansee(rpr, msqkptr))) { DPRINTF(("requester can't see prison\n")); goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) { DPRINTF(("requester doesn't have write access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } saved_msgsz = msgsz; if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) { racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } PROC_UNLOCK(td->td_proc); } #endif segs_needed = howmany(msgsz, msginfo.msgssz); DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, segs_needed)); for (;;) { int need_more_resources = 0; /* * check msgsz * (inside this loop in case msg_qbytes changes while we sleep) */ if (msgsz > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n")); error = EINVAL; goto 
done3; } if (msqkptr->u.msg_perm.mode & MSG_LOCKED) { DPRINTF(("msqid is locked\n")); need_more_resources = 1; } if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n")); need_more_resources = 1; } if (segs_needed > nfree_msgmaps) { DPRINTF(("segs_needed > nfree_msgmaps\n")); need_more_resources = 1; } if (free_msghdrs == NULL) { DPRINTF(("no more msghdrs\n")); need_more_resources = 1; } if (need_more_resources) { int we_own_it; if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("need more resources but caller " "doesn't want to wait\n")); error = EAGAIN; goto done3; } if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) { DPRINTF(("we don't own the msqid_ds\n")); we_own_it = 0; } else { /* Force later arrivals to wait for our request */ DPRINTF(("we own the msqid_ds\n")); msqkptr->u.msg_perm.mode |= MSG_LOCKED; we_own_it = 1; } DPRINTF(("msgsnd: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgsnd", hz); DPRINTF(("msgsnd: good morning, error=%d\n", error)); if (we_own_it) msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; if (error == EWOULDBLOCK) { DPRINTF(("msgsnd: timed out\n")); continue; } if (error != 0) { DPRINTF(("msgsnd: interrupted system call\n")); error = EINTR; goto done3; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done3; } } else { DPRINTF(("got all the resources that we need\n")); break; } } /* * We have the resources that we need. * Make sure! */ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) panic("msg_perm.mode & MSG_LOCKED"); if (segs_needed > nfree_msgmaps) panic("segs_needed > nfree_msgmaps"); if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) panic("msgsz + msg_cbytes > msg_qbytes"); if (free_msghdrs == NULL) panic("no more msghdrs"); /* * Re-lock the msqid_ds in case we page-fault when copying in the * message */ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) panic("msqid_ds is already locked"); msqkptr->u.msg_perm.mode |= MSG_LOCKED; /* * Allocate a message header */ msghdr = free_msghdrs; free_msghdrs = msghdr->msg_next; msghdr->msg_spot = -1; msghdr->msg_ts = msgsz; msghdr->msg_type = mtype; #ifdef MAC /* * XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here * immediately? Or, should it be checked just before the msg is * enqueued in the msgq (as it is done now)? 
*/ mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr); #endif /* * Allocate space for the message */ while (segs_needed > 0) { if (nfree_msgmaps <= 0) panic("not enough msgmaps"); if (free_msgmaps == -1) panic("nil free_msgmaps"); next = free_msgmaps; if (next <= -1) panic("next too low #1"); if (next >= msginfo.msgseg) panic("next out of range #1"); DPRINTF(("allocating segment %d to message\n", next)); free_msgmaps = msgmaps[next].next; nfree_msgmaps--; msgmaps[next].next = msghdr->msg_spot; msghdr->msg_spot = next; segs_needed--; } /* * Validate the message type */ if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; goto done3; } /* * Copy in the message body */ next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; if (msgsz > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; if (next <= -1) panic("next too low #2"); if (next >= msginfo.msgseg) panic("next out of range #2"); mtx_unlock(&msq_mtx); if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz], tlen)) != 0) { mtx_lock(&msq_mtx); DPRINTF(("error %d copying in message segment\n", error)); msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); goto done3; } mtx_lock(&msq_mtx); msgsz -= tlen; msgp = (const char *)msgp + tlen; next = msgmaps[next].next; } if (next != -1) panic("didn't use all the msg segments"); /* * We've got the message. Unlock the msqid_ds. */ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; /* * Make sure that the msqid_ds is still allocated. */ if (msqkptr->u.msg_qbytes == 0) { msg_freehdr(msghdr); wakeup(msqkptr); error = EIDRM; goto done3; } #ifdef MAC /* * Note: Since the task/thread allocates the msghdr and usually * primes it with its own MAC label, for a majority of policies, it * won't be necessary to check whether the msghdr has access * permissions to the msgq. The mac_sysvmsq_check_msqsnd check would * suffice in that case. However, this hook may be required where * individual policies derive a non-identical label for the msghdr * from the current thread label and may want to check the msghdr * enqueue permissions, along with read/write permissions to the * msgq. 
*/ error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr); if (error != 0) { msg_freehdr(msghdr); wakeup(msqkptr); goto done3; } #endif /* * Put the message into the queue */ if (msqkptr->u.__msg_first == NULL) { msqkptr->u.__msg_first = msghdr; msqkptr->u.__msg_last = msghdr; } else { msqkptr->u.__msg_last->msg_next = msghdr; msqkptr->u.__msg_last = msghdr; } msqkptr->u.__msg_last->msg_next = NULL; msqkptr->u.msg_cbytes += msghdr->msg_ts; msqkptr->u.msg_qnum++; msqkptr->u.msg_lspid = td->td_proc->p_pid; msqkptr->u.msg_stime = time_second; wakeup(msqkptr); td->td_retval[0] = 0; done3: #ifdef RACCT if (racct_enable && error != 0) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz); PROC_UNLOCK(td->td_proc); } #endif done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgsnd(struct thread *td, struct msgsnd_args *uap) { int error; long mtype; DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgflg)); if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) { DPRINTF(("error %d copying the message type\n", error)); return (error); } return (kern_msgsnd(td, uap->msqid, (const char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgflg, mtype)); } #ifndef _SYS_SYSPROTO_H_ struct msgrcv_args { int msqid; void *msgp; size_t msgsz; long msgtyp; int msgflg; }; #endif /* XXX msgp is actually mtext. */ int kern_msgrcv(struct thread *td, int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg, long *mtype) { size_t len; struct msqid_kernel *msqkptr; struct msg *msghdr; struct prison *rpr; int msqix, error = 0; short next; rpr = msg_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_ID(msqid); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); AUDIT_ARG_SVIPC_PERM(&msqkptr->u.msg_perm); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = msq_prison_cansee(rpr, msqkptr))) { DPRINTF(("requester can't see prison\n")); goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif msghdr = NULL; while (msghdr == NULL) { if (msgtyp == 0) { msghdr = msqkptr->u.__msg_first; if (msghdr != NULL) { if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("first message on the queue " "is too big (want %zu, got %d)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv(td->td_ucred, msghdr); if (error != 0) goto done2; #endif if (msqkptr->u.__msg_first == msqkptr->u.__msg_last) { msqkptr->u.__msg_first = NULL; msqkptr->u.__msg_last = NULL; } else { msqkptr->u.__msg_first = msghdr->msg_next; if (msqkptr->u.__msg_first == NULL) panic("msg_first/last screwed up #1"); } } } else { struct msg *previous; struct msg **prev; previous = NULL; prev = &(msqkptr->u.__msg_first); while ((msghdr = *prev) != NULL) { /* * Is this message's type an exact match or is * this message's type less than or equal to * the absolute value of a negative msgtyp? 
* Note that the second half of this test can * NEVER be true if msgtyp is positive since * msg_type is always positive! */ if (msgtyp == msghdr->msg_type || msghdr->msg_type <= -msgtyp) { DPRINTF(("found message type %ld, " "requested %ld\n", msghdr->msg_type, msgtyp)); if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("requested message " "on the queue is too big " "(want %zu, got %hu)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv( td->td_ucred, msghdr); if (error != 0) goto done2; #endif *prev = msghdr->msg_next; if (msghdr == msqkptr->u.__msg_last) { if (previous == NULL) { if (prev != &msqkptr->u.__msg_first) panic("__msg_first/last screwed up #2"); msqkptr->u.__msg_first = NULL; msqkptr->u.__msg_last = NULL; } else { if (prev == &msqkptr->u.__msg_first) panic("__msg_first/last screwed up #3"); msqkptr->u.__msg_last = previous; } } break; } previous = msghdr; prev = &(msghdr->msg_next); } } /* * We've either extracted the msghdr for the appropriate * message or there isn't one. * If there is one then bail out of this loop. */ if (msghdr != NULL) break; /* * Hmph! No message found. Does the user want to wait? */ if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("no appropriate message found (msgtyp=%ld)\n", msgtyp)); /* The SVID says to return ENOMSG. */ error = ENOMSG; goto done2; } /* * Wait for something to happen */ DPRINTF(("msgrcv: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgrcv", 0); DPRINTF(("msgrcv: good morning (error=%d)\n", error)); if (error != 0) { DPRINTF(("msgrcv: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0 || msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } /* * Return the message to the user. * * First, do the bookkeeping (before we risk being interrupted). */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msqkptr->u.msg_lrpid = td->td_proc->p_pid; msqkptr->u.msg_rtime = time_second; racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts); /* * Make msgsz the actual amount that we'll be returning. * Note that this effectively truncates the message if it is too long * (since msgsz is never increased). */ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz, msghdr->msg_ts)); if (msgsz > msghdr->msg_ts) msgsz = msghdr->msg_ts; *mtype = msghdr->msg_type; /* * Return the segments to the user */ next = msghdr->msg_spot; for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; if (msgsz - len > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz - len; if (next <= -1) panic("next too low #3"); if (next >= msginfo.msgseg) panic("next out of range #3"); mtx_unlock(&msq_mtx); error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen); mtx_lock(&msq_mtx); if (error != 0) { DPRINTF(("error (%d) copying out message segment\n", error)); msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } msgp = (char *)msgp + tlen; next = msgmaps[next].next; } /* * Done, return the actual number of bytes copied out. 
*/ msg_freehdr(msghdr); wakeup(msqkptr); td->td_retval[0] = msgsz; done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgrcv(struct thread *td, struct msgrcv_args *uap) { int error; long mtype; DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg)); if ((error = kern_msgrcv(td, uap->msqid, (char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0) DPRINTF(("error %d copying the message type\n", error)); return (error); } static int sysctl_msqids(SYSCTL_HANDLER_ARGS) { struct msqid_kernel tmsqk; #ifdef COMPAT_FREEBSD32 struct msqid_kernel32 tmsqk32; #endif struct prison *pr, *rpr; void *outaddr; size_t outsize; int error, i; pr = req->td->td_ucred->cr_prison; rpr = msg_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < msginfo.msgmni; i++) { mtx_lock(&msq_mtx); if (msqids[i].u.msg_qbytes == 0 || rpr == NULL || msq_prison_cansee(rpr, &msqids[i]) != 0) bzero(&tmsqk, sizeof(tmsqk)); else { tmsqk = msqids[i]; if (tmsqk.cred->cr_prison != pr) tmsqk.u.msg_perm.key = IPC_PRIVATE; } mtx_unlock(&msq_mtx); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { bzero(&tmsqk32, sizeof(tmsqk32)); freebsd32_ipcperm_out(&tmsqk.u.msg_perm, &tmsqk32.u.msg_perm); /* Don't copy u.msg_first or u.msg_last */ CP(tmsqk, tmsqk32, u.msg_cbytes); CP(tmsqk, tmsqk32, u.msg_qnum); CP(tmsqk, tmsqk32, u.msg_qbytes); CP(tmsqk, tmsqk32, u.msg_lspid); CP(tmsqk, tmsqk32, u.msg_lrpid); CP(tmsqk, tmsqk32, u.msg_stime); CP(tmsqk, tmsqk32, u.msg_rtime); CP(tmsqk, tmsqk32, u.msg_ctime); /* Don't copy label or cred */ outaddr = &tmsqk32; outsize = sizeof(tmsqk32); } else #endif { /* Don't leak kernel pointers */ tmsqk.u.__msg_first = NULL; tmsqk.u.__msg_last = NULL; tmsqk.label = NULL; tmsqk.cred = NULL; /* * XXX: some padding also exists, but we take care to * allocate our pool of msqid_kernel structs with * zeroed memory so this should be OK. */ outaddr = &tmsqk; outsize = sizeof(tmsqk); } error = SYSCTL_OUT(req, outaddr, outsize); if (error != 0) break; } return (error); } SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, "Maximum message size"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0, "Number of message queue identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0, "Maximum number of bytes in a queue"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0, "Maximum number of messages in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0, "Size of a message segment"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0, "Number of message segments"); SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_msqids, "", "Array of struct msqid_kernel for each potential message queue"); static int msg_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvmsg is a jailsys integer. * It must be "disable" if the parent jail is disabled. 
*/ error = vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, msg_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int msg_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvmsg controls which jail is the root of the associated msgs (this * jail or same as the parent), or if the feature is available at all. */ if (vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, msg_prison_slot); if (orpr != NULL) osd_jail_del(pr, msg_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) msg_prison_cleanup(pr); /* Disable all child jails as well. */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, msg_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, msg_prison_slot); prison_unlock(tpr); if (trpr == tpr) msg_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, msg_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(msg_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, msg_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, msg_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) msg_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, msg_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, msg_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) msg_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int msg_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvmsg based on the jail's root prison. */ prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvmsg", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int msg_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; prison_lock(pr); rpr = osd_jail_get(pr, msg_prison_slot); prison_unlock(pr); if (rpr == pr) msg_prison_cleanup(pr); return (0); } static void msg_prison_cleanup(struct prison *pr) { struct msqid_kernel *msqkptr; int i; /* Remove any msqs that belong to this jail. 
*/ mtx_lock(&msq_mtx); for (i = 0; i < msginfo.msgmni; i++) { msqkptr = &msqids[i]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->cred != NULL && msqkptr->cred->cr_prison == pr) msq_remove(msqkptr); } mtx_unlock(&msq_mtx); } SYSCTL_JAIL_PARAM_SYS_NODE(sysvmsg, CTLFLAG_RW, "SYSV message queues"); #ifdef COMPAT_FREEBSD32 int freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: return (freebsd7_freebsd32_msgctl(td, (struct freebsd7_freebsd32_msgctl_args *)&uap->a2)); case 2: return (freebsd32_msgsnd(td, (struct freebsd32_msgsnd_args *)&uap->a2)); case 3: return (freebsd32_msgrcv(td, (struct freebsd32_msgrcv_args *)&uap->a2)); default: return (sys_msgsys(td, (struct msgsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_msgctl(struct thread *td, struct freebsd7_freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32_old msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, __msg_first); PTRIN_CP(msqbuf32, msqbuf, __msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqbuf32, sizeof(msqbuf32)); freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, __msg_first); PTROUT_CP(msqbuf, msqbuf32, __msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } #endif int freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32 msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, __msg_first); PTRIN_CP(msqbuf32, msqbuf, __msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, __msg_first); PTROUT_CP(msqbuf, msqbuf32, __msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, 
msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } int freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap) { const void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0) return (error); mtype = mtype32; return (kern_msgsnd(td, uap->msqid, (const char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgflg, mtype)); } int freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap) { void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = kern_msgrcv(td, uap->msqid, (char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); mtype32 = (int32_t)mtype; return (copyout(&mtype32, msgp, sizeof(mtype32))); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *msgcalls[] = { (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget, (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv }; /* * Entry point for all MSG calls. * * XXX actually varargs. * struct msgsys_args { * int which; * int a2; * int a3; * int a4; * int a5; * int a6; * } *uap; */ int sys_msgsys(struct thread *td, struct msgsys_args *uap) { int error; AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(msgcalls)) return (EINVAL); error = (*msgcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_msgctl_args { int msqid; int cmd; struct msqid_ds_old *buf; }; #endif int freebsd7_msgctl(struct thread *td, struct freebsd7_msgctl_args *uap) { struct msqid_ds_old msqold; struct msqid_ds msqbuf; int error; DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd, uap->buf)); if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqold, sizeof(msqold)); if (error) return (error); ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm); CP(msqold, msqbuf, __msg_first); CP(msqold, msqbuf, __msg_last); CP(msqold, msqbuf, msg_cbytes); CP(msqold, msqbuf, msg_qnum); CP(msqold, msqbuf, msg_qbytes); CP(msqold, msqbuf, msg_lspid); CP(msqold, msqbuf, msg_lrpid); CP(msqold, msqbuf, msg_stime); CP(msqold, msqbuf, msg_rtime); CP(msqold, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqold, sizeof(msqold)); ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm); CP(msqbuf, msqold, __msg_first); CP(msqbuf, msqold, __msg_last); CP(msqbuf, msqold, msg_cbytes); CP(msqbuf, msqold, msg_qnum); CP(msqbuf, msqold, msg_qbytes); CP(msqbuf, msqold, msg_lspid); CP(msqbuf, msqold, msg_lrpid); CP(msqbuf, msqold, msg_stime); CP(msqbuf, msqold, msg_rtime); CP(msqbuf, msqold, msg_ctime); error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old)); } return (error); } #undef CP #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c index deee60d87a5a..dd8925246d1e 100644 --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -1,2010 +1,2010 @@ /*- * Implementation of SVID semaphores * * Author: Daniel Boulet * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003-2005 McAfee, Inc. 
* Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_sem, "System V semaphores support"); static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); #ifdef SEM_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static int seminit(void); static int sysvsem_modload(struct module *, int, void *); static int semunload(void); static void semexit_myhook(void *arg, struct proc *p); static int sysctl_sema(SYSCTL_HANDLER_ARGS); static int semvalid(int semid, struct prison *rpr, struct semid_kernel *semakptr); static void sem_remove(int semidx, struct ucred *cred); static struct prison *sem_find_prison(struct ucred *); static int sem_prison_cansee(struct prison *, struct semid_kernel *); static int sem_prison_check(void *, void *); static int sem_prison_set(void *, void *); static int sem_prison_get(void *, void *); static int sem_prison_remove(void *, void *); static void sem_prison_cleanup(struct prison *); #ifndef _SYS_SYSPROTO_H_ struct __semctl_args; int __semctl(struct thread *td, struct __semctl_args *uap); struct semget_args; int semget(struct thread *td, struct semget_args *uap); struct semop_args; int semop(struct thread *td, struct semop_args *uap); #endif static struct sem_undo *semu_alloc(struct thread *td); static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval); static void semundo_clear(int semid, int semnum); static struct mtx sem_mtx; /* semaphore global lock */ static struct mtx sem_undo_mtx; static int semtot = 0; static struct semid_kernel *sema; /* semaphore id pool */ static struct mtx *sema_mtx; /* semaphore id pool mutexes*/ static struct sem *sem; /* semaphore pool */ LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */ LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */ static int *semu; /* undo structure pool */ static eventhandler_tag semexit_tag; static unsigned sem_prison_slot; /* prison OSD slot */ #define SEMUNDO_MTX sem_undo_mtx #define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX); #define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX); #define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how)); struct sem { u_short semval; /* semaphore value */ pid_t sempid; /* pid of last operation */ u_short semncnt; /* # awaiting semval > cval */ u_short semzcnt; /* # awaiting semval = 0 */ }; /* * Undo structure (one per process) */ struct sem_undo { LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */ struct proc *un_proc; /* owner of this structure */ short un_cnt; /* # of active entries */ struct undo { short un_adjval; /* adjust on exit values */ short un_num; /* semaphore # */ int un_id; /* semid */ unsigned short un_seq; } un_ent[1]; /* undo entries */ }; /* * Configuration parameters */ #ifndef SEMMNI #define SEMMNI 50 /* # of semaphore identifiers */ #endif #ifndef SEMMNS #define SEMMNS 340 /* # of semaphores in system */ #endif #ifndef SEMUME #define SEMUME 50 /* max # of undo entries per process */ #endif #ifndef SEMMNU #define SEMMNU 150 /* # of undo structures in system */ #endif /* shouldn't need tuning */ #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif #ifndef SEMOPM #define SEMOPM 100 /* max # of operations per semop call */ #endif #define SEMVMX 32767 /* semaphore maximum value */ #define 
SEMAEM 16384 /* adjust on exit max value */ /* * Due to the way semaphore memory is allocated, we have to ensure that * SEMUSZ is properly aligned. */ #define SEM_ALIGN(bytes) roundup2(bytes, sizeof(long)) /* actual size of an undo structure */ #define SEMUSZ(x) SEM_ALIGN(offsetof(struct sem_undo, un_ent[(x)])) /* * Macro to find a particular sem_undo vector */ #define SEMU(ix) \ ((struct sem_undo *)(((intptr_t)semu) + (ix) * seminfo.semusz)) /* * semaphore info struct */ struct seminfo seminfo = { .semmni = SEMMNI, /* # of semaphore identifiers */ .semmns = SEMMNS, /* # of semaphores in system */ .semmnu = SEMMNU, /* # of undo structures in system */ .semmsl = SEMMSL, /* max # of semaphores per id */ .semopm = SEMOPM, /* max # of operations per semop call */ .semume = SEMUME, /* max # of undo entries per process */ .semusz = SEMUSZ(SEMUME), /* size in bytes of undo structure */ .semvmx = SEMVMX, /* semaphore maximum value */ .semaem = SEMAEM, /* adjust on exit max value */ }; SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, "Maximum number of semaphores in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0, "Maximum number of undo structures in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RWTUN, &seminfo.semmsl, 0, "Max semaphores per id"); SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0, "Max operations per semop call"); SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0, "Max undo entries per process"); SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RD, &seminfo.semusz, 0, "Size in bytes of undo structure"); SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RWTUN, &seminfo.semvmx, 0, "Semaphore maximum value"); SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RWTUN, &seminfo.semaem, 0, "Adjust on exit max value"); SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_sema, "", "Array of struct semid_kernel for each potential semaphore"); static struct syscall_helper_data sem_syscalls[] = { SYSCALL_INIT_HELPER(__semctl), SYSCALL_INIT_HELPER(semget), SYSCALL_INIT_HELPER(semop), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(semsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data sem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_semctl), SYSCALL32_INIT_HELPER_COMPAT(semget), SYSCALL32_INIT_HELPER_COMPAT(semop), SYSCALL32_INIT_HELPER(freebsd32_semsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl), #endif SYSCALL_INIT_LAST }; #endif static int seminit(void) { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = sem_prison_check, [PR_METHOD_SET] = sem_prison_set, [PR_METHOD_GET] = sem_prison_get, [PR_METHOD_REMOVE] = sem_prison_remove, }; sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM, M_WAITOK | M_ZERO); sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM, M_WAITOK | M_ZERO); seminfo.semusz = SEMUSZ(seminfo.semume); semu = 
malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); for (i = 0; i < seminfo.semmni; i++) { sema[i].u.__sem_base = 0; sema[i].u.sem_perm.mode = 0; sema[i].u.sem_perm.seq = 0; #ifdef MAC mac_sysvsem_init(&sema[i]); #endif } for (i = 0; i < seminfo.semmni; i++) mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF); LIST_INIT(&semu_free_list); for (i = 0; i < seminfo.semmnu; i++) { struct sem_undo *suptr = SEMU(i); suptr->un_proc = NULL; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); } LIST_INIT(&semu_list); mtx_init(&sem_mtx, "sem", NULL, MTX_DEF); mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF); semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL, EVENTHANDLER_PRI_ANY); /* Set current prisons according to their allow.sysvipc. */ sem_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(sem_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, sem_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(sem_prison_slot); prison_lock(pr); - if (prison_isvalid(pr) && (pr->pr_allow & PR_ALLOW_SYSVIPC)) { + if (pr->pr_allow & PR_ALLOW_SYSVIPC) { (void)osd_jail_set_reserved(pr, sem_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(sem_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sem32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int semunload(void) { int i; /* XXXKIB */ if (semtot != 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(sem32_syscalls); #endif syscall_helper_unregister(sem_syscalls); EVENTHANDLER_DEREGISTER(process_exit, semexit_tag); if (sem_prison_slot != 0) osd_jail_deregister(sem_prison_slot); #ifdef MAC for (i = 0; i < seminfo.semmni; i++) mac_sysvsem_destroy(&sema[i]); #endif free(sem, M_SEM); free(sema, M_SEM); free(semu, M_SEM); for (i = 0; i < seminfo.semmni; i++) mtx_destroy(&sema_mtx[i]); free(sema_mtx, M_SEM); mtx_destroy(&sem_mtx); mtx_destroy(&sem_undo_mtx); return (0); } static int sysvsem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = seminit(); break; case MOD_UNLOAD: error = semunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvsem_mod = { "sysvsem", &sysvsem_modload, NULL }; DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sysvsem, 1); /* * Allocate a new sem_undo structure for a process * (returns ptr to structure or NULL if no more room) */ static struct sem_undo * semu_alloc(struct thread *td) { struct sem_undo *suptr; SEMUNDO_LOCKASSERT(MA_OWNED); if ((suptr = LIST_FIRST(&semu_free_list)) == NULL) return (NULL); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_list, suptr, un_next); suptr->un_cnt = 0; suptr->un_proc = td->td_proc; return (suptr); } static int semu_try_free(struct sem_undo *suptr) { SEMUNDO_LOCKASSERT(MA_OWNED); if (suptr->un_cnt != 0) return (0); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); return (1); } /* * Adjust a particular entry for a particular proc */ static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval) { struct proc *p = td->td_proc; struct sem_undo *suptr; 
struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); /* Look for and remember the sem_undo if the caller doesn't provide it */ suptr = *supptr; if (suptr == NULL) { LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) { *supptr = suptr; break; } } if (suptr == NULL) { if (adjval == 0) return(0); suptr = semu_alloc(td); if (suptr == NULL) return (ENOSPC); *supptr = suptr; } } /* * Look for the requested entry and adjust it (delete if adjval becomes * 0). */ sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid || sunptr->un_num != semnum) continue; if (adjval != 0) { adjval += sunptr->un_adjval; if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); } sunptr->un_adjval = adjval; if (sunptr->un_adjval == 0) { suptr->un_cnt--; if (i < suptr->un_cnt) suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; if (suptr->un_cnt == 0) semu_try_free(suptr); } return (0); } /* Didn't find the right entry - create it */ if (adjval == 0) return (0); if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); if (suptr->un_cnt != seminfo.semume) { sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; sunptr->un_id = semid; sunptr->un_num = semnum; sunptr->un_seq = semseq; } else return (EINVAL); return (0); } static void semundo_clear(int semid, int semnum) { struct sem_undo *suptr, *suptr1; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) { sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid) continue; if (semnum == -1 || sunptr->un_num == semnum) { suptr->un_cnt--; if (i < suptr->un_cnt) { suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; continue; } semu_try_free(suptr); } if (semnum != -1) break; } } } static int semvalid(int semid, struct prison *rpr, struct semid_kernel *semakptr) { return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) || sem_prison_cansee(rpr, semakptr) ? EINVAL : 0); } static void sem_remove(int semidx, struct ucred *cred) { struct semid_kernel *semakptr; int i; KASSERT(semidx >= 0 && semidx < seminfo.semmni, ("semidx out of bounds")); mtx_assert(&sem_mtx, MA_OWNED); semakptr = &sema[semidx]; KASSERT(semakptr->u.__sem_base - sem + semakptr->u.sem_nsems <= semtot, ("sem_remove: sema %d corrupted sem pointer %p %p %d %d", semidx, semakptr->u.__sem_base, sem, semakptr->u.sem_nsems, semtot)); semakptr->u.sem_perm.cuid = cred ? cred->cr_uid : 0; semakptr->u.sem_perm.uid = cred ? 
cred->cr_uid : 0; semakptr->u.sem_perm.mode = 0; racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems); crfree(semakptr->cred); semakptr->cred = NULL; SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); #ifdef MAC mac_sysvsem_cleanup(semakptr); #endif wakeup(semakptr); for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.__sem_base > semakptr->u.__sem_base) mtx_lock_flags(&sema_mtx[i], LOP_DUPOK); } for (i = semakptr->u.__sem_base - sem + semakptr->u.sem_nsems; i < semtot; i++) sem[i - semakptr->u.sem_nsems] = sem[i]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.__sem_base > semakptr->u.__sem_base) { sema[i].u.__sem_base -= semakptr->u.sem_nsems; mtx_unlock(&sema_mtx[i]); } } semtot -= semakptr->u.sem_nsems; } static struct prison * sem_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, sem_prison_slot); prison_unlock(pr); return rpr; } static int sem_prison_cansee(struct prison *rpr, struct semid_kernel *semakptr) { if (semakptr->cred == NULL || !(rpr == semakptr->cred->cr_prison || prison_ischild(rpr, semakptr->cred->cr_prison))) return (EINVAL); return (0); } /* * Note that the user-mode half of this passes a union, not a pointer. */ #ifndef _SYS_SYSPROTO_H_ struct __semctl_args { int semid; int semnum; int cmd; union semun *arg; }; #endif int sys___semctl(struct thread *td, struct __semctl_args *uap) { struct semid_ds dsbuf; union semun arg, semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsbuf, sizeof(dsbuf)); if (error) return (error); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: error = copyout(&dsbuf, arg.buf, sizeof(dsbuf)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } int kern_semctl(struct thread *td, int semid, int semnum, int cmd, union semun *arg, register_t *rval) { u_short *array; struct ucred *cred = td->td_ucred; int i, error; struct prison *rpr; struct semid_ds *sbuf; struct semid_kernel *semakptr; struct mtx *sema_mtxp; u_short usval, count; int semidx; DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); AUDIT_ARG_SVIPC_CMD(cmd); AUDIT_ARG_SVIPC_ID(semid); rpr = sem_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); array = NULL; switch(cmd) { case SEM_STAT: /* * For this command we assume semid is an array index * rather than an IPC id.
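 * (i.e. semid here is a raw slot number in [0, semmni), and the real
 * IPC id is handed back through *rval via IXSEQ_TO_IPCID().)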
*/ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if ((error = sem_prison_cansee(rpr, semakptr))) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); if (cred->cr_prison != semakptr->cred->cr_prison) arg->buf->sem_perm.key = IPC_PRIVATE; *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm); mtx_unlock(sema_mtxp); return (0); } semidx = IPCID_TO_IX(semid); if (semidx < 0 || semidx >= seminfo.semmni) return (EINVAL); semakptr = &sema[semidx]; sema_mtxp = &sema_mtx[semidx]; if (cmd == IPC_RMID) mtx_lock(&sem_mtx); mtx_lock(sema_mtxp); #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif error = 0; *rval = 0; switch (cmd) { case IPC_RMID: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sem_remove(semidx, cred); break; case IPC_SET: AUDIT_ARG_SVIPC_PERM(&arg->buf->sem_perm); if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sbuf = arg->buf; semakptr->u.sem_perm.uid = sbuf->sem_perm.uid; semakptr->u.sem_perm.gid = sbuf->sem_perm.gid; semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode & ~0777) | (sbuf->sem_perm.mode & 0777); semakptr->u.sem_ctime = time_second; break; case IPC_STAT: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); if (cred->cr_prison != semakptr->cred->cr_prison) arg->buf->sem_perm.key = IPC_PRIVATE; /* * Try to hide the fact that the structure layout is shared by * both the kernel and userland. This pointer is not useful to * userspace. */ arg->buf->__sem_base = NULL; break; case GETNCNT: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.__sem_base[semnum].semncnt; break; case GETPID: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.__sem_base[semnum].sempid; break; case GETVAL: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.__sem_base[semnum].semval; break; case GETALL: /* * Unfortunately, callers of this function don't know * in advance how many semaphores are in this set. * While we could just allocate the maximum size array * and pass the actual size back to the caller, that * won't work for SETALL since we can't copyin() more * data than the user specified as we may return a * spurious EFAULT. * * Note that the number of semaphores in a set is * fixed for the life of that set. 
The only way that * the 'count' could change while we are blocked in * malloc() is if this semaphore set were destroyed * and a new one created with the same index. * However, semvalid() will catch that due to the * sequence number unless exactly 0x8000 (or a * multiple thereof) semaphore sets for the same index * are created and destroyed while we are in malloc! * */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); mtx_lock(sema_mtxp); if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) array[i] = semakptr->u.__sem_base[i].semval; mtx_unlock(sema_mtxp); error = copyout(array, arg->array, count * sizeof(*array)); mtx_lock(sema_mtxp); break; case GETZCNT: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.__sem_base[semnum].semzcnt; break; case SETVAL: if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } if (arg->val < 0 || arg->val > seminfo.semvmx) { error = ERANGE; goto done2; } semakptr->u.__sem_base[semnum].semval = arg->val; SEMUNDO_LOCK(); semundo_clear(semidx, semnum); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case SETALL: /* * See comment on GETALL for why 'count' shouldn't change * and why we require a userland buffer. */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); error = copyin(arg->array, array, count * sizeof(*array)); mtx_lock(sema_mtxp); if (error) break; if ((error = semvalid(semid, rpr, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) { usval = array[i]; if (usval > seminfo.semvmx) { error = ERANGE; break; } semakptr->u.__sem_base[i].semval = usval; } SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; default: error = EINVAL; break; } done2: mtx_unlock(sema_mtxp); if (cmd == IPC_RMID) mtx_unlock(&sem_mtx); if (array != NULL) free(array, M_TEMP); return(error); } #ifndef _SYS_SYSPROTO_H_ struct semget_args { key_t key; int nsems; int semflg; }; #endif int sys_semget(struct thread *td, struct semget_args *uap) { int semid, error = 0; int key = uap->key; int nsems = uap->nsems; int semflg = uap->semflg; struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); AUDIT_ARG_VALUE(semflg); if (sem_find_prison(cred) == NULL) return (ENOSYS); mtx_lock(&sem_mtx); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) && sema[semid].cred != NULL && sema[semid].cred->cr_prison == cred->cr_prison && sema[semid].u.sem_perm.key == key) break; } if (semid < seminfo.semmni) { AUDIT_ARG_SVIPC_ID(semid); DPRINTF(("found public key\n")); if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } if ((error = ipcperm(td, &sema[semid].u.sem_perm, semflg & 0700))) { goto done2; } if (nsems > 0 &&
sema[semid].u.sem_nsems < nsems) { DPRINTF(("too small\n")); error = EINVAL; goto done2; } #ifdef MAC error = mac_sysvsem_check_semget(cred, &sema[semid]); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the semid_kernel\n")); if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { if (nsems <= 0 || nsems > seminfo.semmsl) { DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl)); error = EINVAL; goto done2; } if (nsems > seminfo.semmns - semtot) { DPRINTF(( "not enough semaphores left (need %d, got %d)\n", nsems, seminfo.semmns - semtot)); error = ENOSPC; goto done2; } for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0) break; } if (semid == seminfo.semmni) { DPRINTF(("no more semid_kernel's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NSEM, nsems); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("semid %d is available\n", semid)); mtx_lock(&sema_mtx[semid]); KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0, ("Lost semaphore %d", semid)); sema[semid].u.sem_perm.key = key; sema[semid].u.sem_perm.cuid = cred->cr_uid; sema[semid].u.sem_perm.uid = cred->cr_uid; sema[semid].u.sem_perm.cgid = cred->cr_gid; sema[semid].u.sem_perm.gid = cred->cr_gid; sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].cred = crhold(cred); sema[semid].u.sem_perm.seq = (sema[semid].u.sem_perm.seq + 1) & 0x7fff; sema[semid].u.sem_nsems = nsems; sema[semid].u.sem_otime = 0; sema[semid].u.sem_ctime = time_second; sema[semid].u.__sem_base = &sem[semtot]; semtot += nsems; bzero(sema[semid].u.__sem_base, sizeof(sema[semid].u.__sem_base[0])*nsems); #ifdef MAC mac_sysvsem_create(cred, &sema[semid]); #endif mtx_unlock(&sema_mtx[semid]); DPRINTF(("sembase = %p, next = %p\n", sema[semid].u.__sem_base, &sem[semtot])); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm); done2: mtx_unlock(&sem_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct semop_args { int semid; struct sembuf *sops; size_t nsops; }; #endif int sys_semop(struct thread *td, struct semop_args *uap) { #define SMALL_SOPS 8 struct sembuf small_sops[SMALL_SOPS]; int semid = uap->semid; size_t nsops = uap->nsops; struct prison *rpr; struct sembuf *sops; struct semid_kernel *semakptr; struct sembuf *sopptr = NULL; struct sem *semptr = NULL; struct sem_undo *suptr; struct mtx *sema_mtxp; size_t i, j, k; int error; int do_wakeup, do_undos; unsigned short seq; #ifdef SEM_DEBUG sops = NULL; #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); AUDIT_ARG_SVIPC_ID(semid); rpr = sem_find_prison(td->td_ucred); if (sem == NULL) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); /* Allocate memory for sem_ops */ if (nsops <= SMALL_SOPS) sops = small_sops; else if (nsops > seminfo.semopm) { DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, nsops)); return (E2BIG); } else { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) { PROC_UNLOCK(td->td_proc); return (E2BIG); } PROC_UNLOCK(td->td_proc); } #endif sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); } if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { 
DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error, uap->sops, sops, nsops * sizeof(sops[0]))); if (sops != small_sops) free(sops, M_TEMP); return (error); } semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } seq = semakptr->u.sem_perm.seq; if (seq != IPCID_TO_SEQ(uap->semid)) { error = EINVAL; goto done2; } if ((error = sem_prison_cansee(rpr, semakptr)) != 0) goto done2; /* * Initial pass through sops to see what permissions are needed. * Also perform any checks that don't need repeating on each * attempt to satisfy the request vector. */ j = 0; /* permission needed */ do_undos = 0; for (i = 0; i < nsops; i++) { sopptr = &sops[i]; if (sopptr->sem_num >= semakptr->u.sem_nsems) { error = EFBIG; goto done2; } if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) do_undos = 1; j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A; } if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) { DPRINTF(("error = %d from ipaccess\n", error)); goto done2; } #ifdef MAC error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j); if (error != 0) goto done2; #endif /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already * performed are rolled back and we go to sleep until some other * process wakes us up. At this point, we start all over again. * * This ensures that from the perspective of other tasks, a set * of requests is atomic (never partially satisfied). */ for (;;) { do_wakeup = 0; error = 0; /* error return if necessary */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.__sem_base[sopptr->sem_num]; DPRINTF(( "semop: semakptr=%p, __sem_base=%p, " "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n", semakptr, semakptr->u.__sem_base, semptr, sopptr->sem_num, semptr->semval, sopptr->sem_op, (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait")); if (sopptr->sem_op < 0) { if (semptr->semval + sopptr->sem_op < 0) { DPRINTF(("semop: can't do it now\n")); break; } else { semptr->semval += sopptr->sem_op; if (semptr->semval == 0 && semptr->semzcnt > 0) do_wakeup = 1; } } else if (sopptr->sem_op == 0) { if (semptr->semval != 0) { DPRINTF(("semop: not zero now\n")); break; } } else if (semptr->semval + sopptr->sem_op > seminfo.semvmx) { error = ERANGE; break; } else { if (semptr->semncnt > 0) do_wakeup = 1; semptr->semval += sopptr->sem_op; } } /* * Did we get through the entire vector? */ if (i >= nsops) goto done; /* * No ... rollback anything that we've already done */ DPRINTF(("semop: rollback 0 through %d\n", i-1)); for (j = 0; j < i; j++) semakptr->u.__sem_base[sops[j].sem_num].semval -= sops[j].sem_op; /* If we detected an error, return it */ if (error != 0) goto done2; /* * If the request that we couldn't satisfy has the * NOWAIT flag set then return with EAGAIN. 
*/ if (sopptr->sem_flg & IPC_NOWAIT) { error = EAGAIN; goto done2; } if (sopptr->sem_op == 0) semptr->semzcnt++; else semptr->semncnt++; DPRINTF(("semop: good night!\n")); error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH, "semwait", 0); DPRINTF(("semop: good morning (error=%d)!\n", error)); /* return code is checked below, after sem[nz]cnt-- */ /* * Make sure that the semaphore still exists */ seq = semakptr->u.sem_perm.seq; if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || seq != IPCID_TO_SEQ(uap->semid)) { error = EIDRM; goto done2; } /* * Renew the semaphore's pointer after wakeup since * during msleep __sem_base may have been modified and semptr * is not valid any more */ semptr = &semakptr->u.__sem_base[sopptr->sem_num]; /* * The semaphore is still alive. Readjust the count of * waiting processes. */ if (sopptr->sem_op == 0) semptr->semzcnt--; else semptr->semncnt--; /* * Is it really morning, or was our sleep interrupted? * (Delayed check of msleep() return code because we * need to decrement sem[nz]cnt either way.) */ if (error != 0) { error = EINTR; goto done2; } DPRINTF(("semop: good morning!\n")); } done: /* * Process any SEM_UNDO requests. */ if (do_undos) { SEMUNDO_LOCK(); suptr = NULL; for (i = 0; i < nsops; i++) { /* * We only need to deal with SEM_UNDO's for non-zero * op's. */ int adjval; if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; error = semundo_adjust(td, &suptr, semid, seq, sops[i].sem_num, -adjval); if (error == 0) continue; /* * Oh-Oh! We ran out of either sem_undo's or undo's. * Rollback the adjustments to this point and then * rollback the semaphore ups and down so we can return * with an error with all structures restored. We * rollback the undo's in the exact reverse order that * we applied them. This guarantees that we won't run * out of space as we roll things back out. */ for (j = 0; j < i; j++) { k = i - j - 1; if ((sops[k].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[k].sem_op; if (adjval == 0) continue; if (semundo_adjust(td, &suptr, semid, seq, sops[k].sem_num, adjval) != 0) panic("semop - can't undo undos"); } for (j = 0; j < nsops; j++) semakptr->u.__sem_base[sops[j].sem_num].semval -= sops[j].sem_op; DPRINTF(("error = %d from semundo_adjust\n", error)); SEMUNDO_UNLOCK(); goto done2; } /* loop through the sops */ SEMUNDO_UNLOCK(); } /* if (do_undos) */ /* We're definitely done - set the sempid's and time */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.__sem_base[sopptr->sem_num]; semptr->sempid = td->td_proc->p_pid; } semakptr->u.sem_otime = time_second; /* * Do a wakeup if any semaphore was up'd whilst something was * sleeping on it. */ if (do_wakeup) { DPRINTF(("semop: doing wakeup\n")); wakeup(semakptr); DPRINTF(("semop: back from wakeup\n")); } DPRINTF(("semop: done\n")); td->td_retval[0] = 0; done2: mtx_unlock(sema_mtxp); if (sops != small_sops) free(sops, M_TEMP); return (error); } /* * Go through the undo structures for this process and apply the adjustments to * semaphores. */ static void semexit_myhook(void *arg, struct proc *p) { struct sem_undo *suptr; struct semid_kernel *semakptr; struct mtx *sema_mtxp; int semid, semnum, adjval, ix; unsigned short seq; /* * Go through the chain of undo vectors looking for one * associated with this process. 
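 *
 * A vector exists only for processes that used SEM_UNDO, e.g.
 * after an illustrative userland call (not part of this file)
 * such as:
 *
 *	struct sembuf sb = { .sem_num = 0, .sem_op = -1,
 *	    .sem_flg = SEM_UNDO };
 *
 *	semop(id, &sb, 1);
 *
 * for which the kernel recorded a compensating +1 adjustment;
 * applying it below backs out the "-1" even if the process never
 * performed the matching "+1" itself.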
*/ if (LIST_EMPTY(&semu_list)) return; SEMUNDO_LOCK(); LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) break; } if (suptr == NULL) { SEMUNDO_UNLOCK(); return; } LIST_REMOVE(suptr, un_next); DPRINTF(("proc @%p has undo structure with %d entries\n", p, suptr->un_cnt)); /* * If there are any active undo elements then process them. */ if (suptr->un_cnt > 0) { SEMUNDO_UNLOCK(); for (ix = 0; ix < suptr->un_cnt; ix++) { semid = suptr->un_ent[ix].un_id; semnum = suptr->un_ent[ix].un_num; adjval = suptr->un_ent[ix].un_adjval; seq = suptr->un_ent[ix].un_seq; semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || (semakptr->u.sem_perm.seq != seq)) { mtx_unlock(sema_mtxp); continue; } if (semnum >= semakptr->u.sem_nsems) panic("semexit - semnum out of range"); DPRINTF(( "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n", suptr->un_proc, suptr->un_ent[ix].un_id, suptr->un_ent[ix].un_num, suptr->un_ent[ix].un_adjval, semakptr->u.__sem_base[semnum].semval)); if (adjval < 0 && semakptr->u.__sem_base[semnum].semval < -adjval) semakptr->u.__sem_base[semnum].semval = 0; else semakptr->u.__sem_base[semnum].semval += adjval; wakeup(semakptr); DPRINTF(("semexit: back from wakeup\n")); mtx_unlock(sema_mtxp); } SEMUNDO_LOCK(); } /* * Deallocate the undo vector. */ DPRINTF(("removing vector\n")); suptr->un_proc = NULL; suptr->un_cnt = 0; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); SEMUNDO_UNLOCK(); } static int sysctl_sema(SYSCTL_HANDLER_ARGS) { struct prison *pr, *rpr; struct semid_kernel tsemak; #ifdef COMPAT_FREEBSD32 struct semid_kernel32 tsemak32; #endif void *outaddr; size_t outsize; int error, i; pr = req->td->td_ucred->cr_prison; rpr = sem_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < seminfo.semmni; i++) { mtx_lock(&sema_mtx[i]); if ((sema[i].u.sem_perm.mode & SEM_ALLOC) == 0 || rpr == NULL || sem_prison_cansee(rpr, &sema[i]) != 0) bzero(&tsemak, sizeof(tsemak)); else { tsemak = sema[i]; if (tsemak.cred->cr_prison != pr) tsemak.u.sem_perm.key = IPC_PRIVATE; } mtx_unlock(&sema_mtx[i]); #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { bzero(&tsemak32, sizeof(tsemak32)); freebsd32_ipcperm_out(&tsemak.u.sem_perm, &tsemak32.u.sem_perm); /* Don't copy u.__sem_base */ CP(tsemak, tsemak32, u.sem_nsems); CP(tsemak, tsemak32, u.sem_otime); CP(tsemak, tsemak32, u.sem_ctime); /* Don't copy label or cred */ outaddr = &tsemak32; outsize = sizeof(tsemak32); } else #endif { tsemak.u.__sem_base = NULL; tsemak.label = NULL; tsemak.cred = NULL; outaddr = &tsemak; outsize = sizeof(tsemak); } error = SYSCTL_OUT(req, outaddr, outsize); if (error != 0) break; } return (error); } static int sem_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvsem is a jailsys integer. * It must be "disable" if the parent jail is disabled. 
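 *
 * The accepted values, as acted on by sem_prison_set() below:
 *	JAIL_SYS_DISABLE	no SysV semaphores in this jail
 *	JAIL_SYS_NEW		the jail gets its own semaphore namespace
 *	JAIL_SYS_INHERIT	the jail shares its parent's namespace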
*/ error = vfs_copyopt(opts, "sysvsem", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, sem_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int sem_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvsem controls which jail is the root of the associated sems (this * jail or same as the parent), or if the feature is available at all. */ if (vfs_copyopt(opts, "sysvsem", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, sem_prison_slot); if (orpr != NULL) osd_jail_del(pr, sem_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) sem_prison_cleanup(pr); /* Disable all child jails as well. */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, sem_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, sem_prison_slot); prison_unlock(tpr); if (trpr == tpr) sem_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, sem_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(sem_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, sem_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, sem_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) sem_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, sem_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, sem_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) sem_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int sem_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvsem based on the jail's root prison. */ prison_lock(pr); rpr = osd_jail_get(pr, sem_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvsem", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int sem_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; prison_lock(pr); rpr = osd_jail_get(pr, sem_prison_slot); prison_unlock(pr); if (rpr == pr) sem_prison_cleanup(pr); return (0); } static void sem_prison_cleanup(struct prison *pr) { int i; /* Remove any sems that belong to this jail. 
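 *
 * "Belong" means created with this prison's own credential: only
 * entries whose cred->cr_prison equals pr are torn down; sets
 * created in ancestor jails are left alone.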
*/ mtx_lock(&sem_mtx); for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].cred != NULL && sema[i].cred->cr_prison == pr) { mtx_lock(&sema_mtx[i]); sem_remove(i, NULL); mtx_unlock(&sema_mtx[i]); } } mtx_unlock(&sem_mtx); } SYSCTL_JAIL_PARAM_SYS_NODE(sysvsem, CTLFLAG_RW, "SYSV semaphores"); #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *semcalls[] = { (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget, (sy_call_t *)sys_semop }; /* * Entry point for all SEM calls. */ int sys_semsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct semsys_args /* { int which; int a2; int a3; int a4; int a5; } */ *uap; { int error; AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(semcalls)) return (EINVAL); error = (*semcalls[uap->which])(td, &uap->a2); return (error); } #ifndef _SYS_SYSPROTO_H_ struct freebsd7___semctl_args { int semid; int semnum; int cmd; union semun_old *arg; }; #endif int freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap) { struct semid_ds_old dsold; struct semid_ds dsbuf; union semun_old arg; union semun semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsold, sizeof(dsold)); if (error) return (error); ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm); CP(dsold, dsbuf, __sem_base); CP(dsold, dsbuf, sem_nsems); CP(dsold, dsbuf, sem_otime); CP(dsold, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsold, sizeof(dsold)); ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm); CP(dsbuf, dsold, __sem_base); CP(dsbuf, dsold, sem_nsems); CP(dsbuf, dsold, sem_otime); CP(dsbuf, dsold, sem_ctime); error = copyout(&dsold, arg.buf, sizeof(dsold)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD{4,5,6,7} */ #ifdef COMPAT_FREEBSD32 int freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: return (freebsd7_freebsd32_semctl(td, (struct freebsd7_freebsd32_semctl_args *)&uap->a2)); default: return (sys_semsys(td, (struct semsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_semctl(struct thread *td, struct freebsd7_freebsd32_semctl_args *uap) { struct semid_ds32_old dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = 
copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, __sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, __sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif int freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap) { struct semid_ds32 dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, __sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, __sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD32 */ diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index ad5f0030b965..2e7ae927dcc3 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -1,1703 +1,1703 @@ /*- * SPDX-License-Identifier: BSD-4-Clause AND BSD-2-Clause-FreeBSD * * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Adam Glass and Charles * Hannum. * 4. The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $NetBSD: sysv_shm.c,v 1.39 1997/10/07 10:02:03 drochner Exp $ */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_shm, "System V shared memory segments support"); static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); #define SHMSEG_FREE 0x0200 #define SHMSEG_REMOVED 0x0400 #define SHMSEG_ALLOCATED 0x0800 static int shm_last_free, shm_nused, shmalloced; vm_size_t shm_committed; static struct shmid_kernel *shmsegs; static unsigned shm_prison_slot; struct shmmap_state { vm_offset_t va; int shmid; }; static void shm_deallocate_segment(struct shmid_kernel *); static int shm_find_segment_by_key(struct prison *, key_t); static struct shmid_kernel *shm_find_segment(struct prison *, int, bool); static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *); static int shmget_allocate_segment(struct thread *td, key_t key, size_t size, int mode); static int shmget_existing(struct thread *td, size_t size, int shmflg, int mode, int segnum); static void shmrealloc(void); static int shminit(void); static int sysvshm_modload(struct module *, int, void *); static int shmunload(void); #ifndef SYSVSHM static void shmexit_myhook(struct vmspace *vm); static void shmfork_myhook(struct proc *p1, struct proc *p2); #endif static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS); static void shm_remove(struct shmid_kernel *, int); static struct prison *shm_find_prison(struct ucred *); static int shm_prison_cansee(struct prison *, struct shmid_kernel *); static int shm_prison_check(void *, void *); static int shm_prison_set(void *, void *); static int shm_prison_get(void *, void *); static int shm_prison_remove(void *, void *); static void shm_prison_cleanup(struct prison *); /* * Tuneable values. */ #ifndef SHMMAXPGS #define SHMMAXPGS 131072 /* Note: sysv shared memory is swap backed. 
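 *
 * That is, segments are pageable, backed by swap (OBJT_SWAP)
 * objects rather than wired memory unless kern.ipc.shm_use_phys
 * is set; see shmget_allocate_segment() and the SYSCTL/TUNABLE
 * declarations below.  An illustrative loader.conf(5) sketch for
 * retuning the limits (values are examples, not recommendations):
 *
 *	kern.ipc.shmmni=256
 *	kern.ipc.shmall=262144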
*/ #endif #ifndef SHMMAX #define SHMMAX (SHMMAXPGS*PAGE_SIZE) #endif #ifndef SHMMIN #define SHMMIN 1 #endif #ifndef SHMMNI #define SHMMNI 192 #endif #ifndef SHMSEG #define SHMSEG 128 #endif #ifndef SHMALL #define SHMALL (SHMMAXPGS) #endif struct shminfo shminfo = { .shmmax = SHMMAX, .shmmin = SHMMIN, .shmmni = SHMMNI, .shmseg = SHMSEG, .shmall = SHMALL }; static int shm_use_phys; static int shm_allow_removed = 1; SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RWTUN, &shminfo.shmmax, 0, "Maximum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RWTUN, &shminfo.shmmin, 0, "Minimum shared memory segment size"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0, "Number of shared memory identifiers"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0, "Number of segments per process"); SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RWTUN, &shminfo.shmall, 0, "Maximum number of pages available for shared memory"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RWTUN, &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core"); SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RWTUN, &shm_allow_removed, 0, "Enable/Disable attachment to attached segments marked for removal"); SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_shmsegs, "", "Array of struct shmid_kernel for each potential shared memory segment"); static struct sx sysvshmsx; #define SYSVSHM_LOCK() sx_xlock(&sysvshmsx) #define SYSVSHM_UNLOCK() sx_xunlock(&sysvshmsx) #define SYSVSHM_ASSERT_LOCKED() sx_assert(&sysvshmsx, SA_XLOCKED) static int shm_find_segment_by_key(struct prison *pr, key_t key) { int i; for (i = 0; i < shmalloced; i++) if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) && shmsegs[i].cred != NULL && shmsegs[i].cred->cr_prison == pr && shmsegs[i].u.shm_perm.key == key) return (i); return (-1); } /* * Finds segment either by shmid if is_shmid is true, or by segnum if * is_shmid is false. */ static struct shmid_kernel * shm_find_segment(struct prison *rpr, int arg, bool is_shmid) { struct shmid_kernel *shmseg; int segnum; segnum = is_shmid ? 
IPCID_TO_IX(arg) : arg; if (segnum < 0 || segnum >= shmalloced) return (NULL); shmseg = &shmsegs[segnum]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || (!shm_allow_removed && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) || (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg)) || shm_prison_cansee(rpr, shmseg) != 0) return (NULL); return (shmseg); } static void shm_deallocate_segment(struct shmid_kernel *shmseg) { vm_size_t size; SYSVSHM_ASSERT_LOCKED(); vm_object_deallocate(shmseg->object); shmseg->object = NULL; size = round_page(shmseg->u.shm_segsz); shm_committed -= btoc(size); shm_nused--; shmseg->u.shm_perm.mode = SHMSEG_FREE; #ifdef MAC mac_sysvshm_cleanup(shmseg); #endif racct_sub_cred(shmseg->cred, RACCT_NSHM, 1); racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size); crfree(shmseg->cred); shmseg->cred = NULL; } static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s) { struct shmid_kernel *shmseg; int segnum, result; vm_size_t size; SYSVSHM_ASSERT_LOCKED(); segnum = IPCID_TO_IX(shmmap_s->shmid); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; size = round_page(shmseg->u.shm_segsz); result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size); if (result != KERN_SUCCESS) return (EINVAL); shmmap_s->shmid = -1; shmseg->u.shm_dtime = time_second; if (--shmseg->u.shm_nattch == 0 && (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } return (0); } static void shm_remove(struct shmid_kernel *shmseg, int segnum) { shmseg->u.shm_perm.key = IPC_PRIVATE; shmseg->u.shm_perm.mode |= SHMSEG_REMOVED; if (shmseg->u.shm_nattch == 0) { shm_deallocate_segment(shmseg); shm_last_free = segnum; } } static struct prison * shm_find_prison(struct ucred *cred) { struct prison *pr, *rpr; pr = cred->cr_prison; prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); return rpr; } static int shm_prison_cansee(struct prison *rpr, struct shmid_kernel *shmseg) { if (shmseg->cred == NULL || !(rpr == shmseg->cred->cr_prison || prison_ischild(rpr, shmseg->cred->cr_prison))) return (EINVAL); return (0); } static int kern_shmdt_locked(struct thread *td, const void *shmaddr) { struct proc *p = td->td_proc; struct shmmap_state *shmmap_s; #ifdef MAC int error; #endif int i; SYSVSHM_ASSERT_LOCKED(); if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) return (EINVAL); AUDIT_ARG_SVIPC_ID(shmmap_s->shmid); for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1 && shmmap_s->va == (vm_offset_t)shmaddr) { break; } } if (i == shminfo.shmseg) return (EINVAL); #ifdef MAC error = mac_sysvshm_check_shmdt(td->td_ucred, &shmsegs[IPCID_TO_IX(shmmap_s->shmid)]); if (error != 0) return (error); #endif return (shm_delete_mapping(p->p_vmspace, shmmap_s)); } #ifndef _SYS_SYSPROTO_H_ struct shmdt_args { const void *shmaddr; }; #endif int sys_shmdt(struct thread *td, struct shmdt_args *uap) { int error; SYSVSHM_LOCK(); error = kern_shmdt_locked(td, uap->shmaddr); SYSVSHM_UNLOCK(); return (error); } static int kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr, int shmflg) { struct prison *rpr; struct proc *p = td->td_proc; struct shmid_kernel *shmseg; struct shmmap_state *shmmap_s; vm_offset_t attach_va; vm_prot_t prot; vm_size_t size; int cow, error, find_space, i, rv; AUDIT_ARG_SVIPC_ID(shmid); AUDIT_ARG_VALUE(shmflg); SYSVSHM_ASSERT_LOCKED(); rpr = 
shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); shmmap_s = p->p_vmspace->vm_shm; if (shmmap_s == NULL) { shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state), M_SHM, M_WAITOK); for (i = 0; i < shminfo.shmseg; i++) shmmap_s[i].shmid = -1; KASSERT(p->p_vmspace->vm_shm == NULL, ("raced")); p->p_vmspace->vm_shm = shmmap_s; } shmseg = shm_find_segment(rpr, shmid, true); if (shmseg == NULL) return (EINVAL); error = ipcperm(td, &shmseg->u.shm_perm, (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); if (error != 0) return (error); #ifdef MAC error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg); if (error != 0) return (error); #endif for (i = 0; i < shminfo.shmseg; i++) { if (shmmap_s->shmid == -1) break; shmmap_s++; } if (i >= shminfo.shmseg) return (EMFILE); size = round_page(shmseg->u.shm_segsz); prot = VM_PROT_READ; cow = MAP_INHERIT_SHARE | MAP_PREFAULT_PARTIAL; if ((shmflg & SHM_RDONLY) == 0) prot |= VM_PROT_WRITE; if (shmaddr != NULL) { if ((shmflg & SHM_RND) != 0) attach_va = rounddown2((vm_offset_t)shmaddr, SHMLBA); else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) attach_va = (vm_offset_t)shmaddr; else return (EINVAL); if ((shmflg & SHM_REMAP) != 0) cow |= MAP_REMAP; find_space = VMFS_NO_SPACE; } else { /* * This is just a hint to vm_map_find() about where to * put it. */ attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr + lim_max(td, RLIMIT_DATA)); find_space = VMFS_OPTIMAL_SPACE; } vm_object_reference(shmseg->object); rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object, 0, &attach_va, size, 0, find_space, prot, prot, cow); if (rv != KERN_SUCCESS) { vm_object_deallocate(shmseg->object); return (ENOMEM); } shmmap_s->va = attach_va; shmmap_s->shmid = shmid; shmseg->u.shm_lpid = p->p_pid; shmseg->u.shm_atime = time_second; shmseg->u.shm_nattch++; td->td_retval[0] = attach_va; return (error); } int kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg) { int error; SYSVSHM_LOCK(); error = kern_shmat_locked(td, shmid, shmaddr, shmflg); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmat_args { int shmid; const void *shmaddr; int shmflg; }; #endif int sys_shmat(struct thread *td, struct shmat_args *uap) { return (kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg)); } static int kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { struct prison *rpr; struct shmid_kernel *shmseg; struct shmid_ds *shmidp; struct shm_info shm_info; int error; SYSVSHM_ASSERT_LOCKED(); rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); AUDIT_ARG_SVIPC_ID(shmid); AUDIT_ARG_SVIPC_CMD(cmd); switch (cmd) { /* * It is possible that kern_shmctl is being called from the Linux ABI * layer, in which case, we will need to implement IPC_INFO. It should * be noted that other shmctl calls will be funneled through here for * Linux binaries as well. * * NB: The Linux ABI layer will convert this data to structure(s) more * consistent with the Linux ABI. */ case IPC_INFO: memcpy(buf, &shminfo, sizeof(shminfo)); if (bufsz) *bufsz = sizeof(shminfo); td->td_retval[0] = shmalloced; return (0); case SHM_INFO: { shm_info.used_ids = shm_nused; shm_info.shm_rss = 0; /*XXX where to get from ? */ shm_info.shm_tot = 0; /*XXX where to get from ? */ shm_info.shm_swp = 0; /*XXX where to get from ? */ shm_info.swap_attempts = 0; /*XXX where to get from ? */ shm_info.swap_successes = 0; /*XXX where to get from ?
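 * (Linux-specific statistics that FreeBSD does not maintain;
 * SHM_INFO simply reports them as zero.)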
*/ memcpy(buf, &shm_info, sizeof(shm_info)); if (bufsz != NULL) *bufsz = sizeof(shm_info); td->td_retval[0] = shmalloced; return (0); } } shmseg = shm_find_segment(rpr, shmid, cmd != SHM_STAT); if (shmseg == NULL) return (EINVAL); #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd); if (error != 0) return (error); #endif switch (cmd) { case SHM_STAT: case IPC_STAT: shmidp = (struct shmid_ds *)buf; error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) return (error); memcpy(shmidp, &shmseg->u, sizeof(struct shmid_ds)); if (td->td_ucred->cr_prison != shmseg->cred->cr_prison) shmidp->shm_perm.key = IPC_PRIVATE; if (bufsz != NULL) *bufsz = sizeof(struct shmid_ds); if (cmd == SHM_STAT) { td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm); } break; case IPC_SET: shmidp = (struct shmid_ds *)buf; AUDIT_ARG_SVIPC_PERM(&shmidp->shm_perm); error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shmseg->u.shm_perm.uid = shmidp->shm_perm.uid; shmseg->u.shm_perm.gid = shmidp->shm_perm.gid; shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & ~ACCESSPERMS) | (shmidp->shm_perm.mode & ACCESSPERMS); shmseg->u.shm_ctime = time_second; break; case IPC_RMID: error = ipcperm(td, &shmseg->u.shm_perm, IPC_M); if (error != 0) return (error); shm_remove(shmseg, IPCID_TO_IX(shmid)); break; #if 0 case SHM_LOCK: case SHM_UNLOCK: #endif default: error = EINVAL; break; } return (error); } int kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz) { int error; SYSVSHM_LOCK(); error = kern_shmctl_locked(td, shmid, cmd, buf, bufsz); SYSVSHM_UNLOCK(); return (error); } #ifndef _SYS_SYSPROTO_H_ struct shmctl_args { int shmid; int cmd; struct shmid_ds *buf; }; #endif int sys_shmctl(struct thread *td, struct shmctl_args *uap) { int error; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not support this.
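 *
 * Native code is expected to use IPC_STAT with a real segment id
 * instead; an illustrative userland sketch (not part of this
 * file):
 *
 *	struct shmid_ds ds;
 *
 *	if (shmctl(id, IPC_STAT, &ds) == 0)
 *		printf("%zu bytes, %d attaches\n",
 *		    (size_t)ds.shm_segsz, (int)ds.shm_nattch);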
*/ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds)))) goto done; } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: error = copyout(&buf, uap->buf, bufsz); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } static int shmget_existing(struct thread *td, size_t size, int shmflg, int mode, int segnum) { struct shmid_kernel *shmseg; #ifdef MAC int error; #endif SYSVSHM_ASSERT_LOCKED(); KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; if ((shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) return (EEXIST); #ifdef MAC error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, shmflg); if (error != 0) return (error); #endif if (size != 0 && size > shmseg->u.shm_segsz) return (EINVAL); td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } static int shmget_allocate_segment(struct thread *td, key_t key, size_t size, int mode) { struct ucred *cred = td->td_ucred; struct shmid_kernel *shmseg; vm_object_t shm_object; int i, segnum; SYSVSHM_ASSERT_LOCKED(); if (size < shminfo.shmmin || size > shminfo.shmmax) return (EINVAL); if (shm_nused >= shminfo.shmmni) /* Any shmids left? */ return (ENOSPC); size = round_page(size); if (shm_committed + btoc(size) > shminfo.shmall) return (ENOMEM); if (shm_last_free < 0) { shmrealloc(); /* Maybe expand the shmsegs[] array. */ for (i = 0; i < shmalloced; i++) if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE) break; if (i == shmalloced) return (ENOSPC); segnum = i; } else { segnum = shm_last_free; shm_last_free = -1; } KASSERT(segnum >= 0 && segnum < shmalloced, ("segnum %d shmalloced %d", segnum, shmalloced)); shmseg = &shmsegs[segnum]; #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_NSHM, 1)) { PROC_UNLOCK(td->td_proc); return (ENOSPC); } if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) { racct_sub(td->td_proc, RACCT_NSHM, 1); PROC_UNLOCK(td->td_proc); return (ENOMEM); } PROC_UNLOCK(td->td_proc); } #endif /* * We make sure that we have allocated a pager before we need * to. */ shm_object = vm_pager_allocate(shm_use_phys ? 
OBJT_PHYS : OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0, cred); if (shm_object == NULL) { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_NSHM, 1); racct_sub(td->td_proc, RACCT_SHMSIZE, size); PROC_UNLOCK(td->td_proc); } #endif return (ENOMEM); } shmseg->object = shm_object; shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid; shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid; shmseg->u.shm_perm.mode = (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; shmseg->u.shm_perm.key = key; shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff; shmseg->cred = crhold(cred); shmseg->u.shm_segsz = size; shmseg->u.shm_cpid = td->td_proc->p_pid; shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0; shmseg->u.shm_atime = shmseg->u.shm_dtime = 0; #ifdef MAC mac_sysvshm_create(cred, shmseg); #endif shmseg->u.shm_ctime = time_second; shm_committed += btoc(size); shm_nused++; td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm); return (0); } #ifndef _SYS_SYSPROTO_H_ struct shmget_args { key_t key; size_t size; int shmflg; }; #endif int sys_shmget(struct thread *td, struct shmget_args *uap) { int segnum, mode; int error; if (shm_find_prison(td->td_ucred) == NULL) return (ENOSYS); mode = uap->shmflg & ACCESSPERMS; SYSVSHM_LOCK(); if (uap->key == IPC_PRIVATE) { error = shmget_allocate_segment(td, uap->key, uap->size, mode); } else { segnum = shm_find_segment_by_key(td->td_ucred->cr_prison, uap->key); if (segnum >= 0) error = shmget_existing(td, uap->size, uap->shmflg, mode, segnum); else if ((uap->shmflg & IPC_CREAT) == 0) error = ENOENT; else error = shmget_allocate_segment(td, uap->key, uap->size, mode); } SYSVSHM_UNLOCK(); return (error); } #ifdef SYSVSHM void shmfork(struct proc *p1, struct proc *p2) #else static void shmfork_myhook(struct proc *p1, struct proc *p2) #endif { struct shmmap_state *shmmap_s; size_t size; int i; SYSVSHM_LOCK(); size = shminfo.shmseg * sizeof(struct shmmap_state); shmmap_s = malloc(size, M_SHM, M_WAITOK); bcopy(p1->p_vmspace->vm_shm, shmmap_s, size); p2->p_vmspace->vm_shm = shmmap_s; for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) { if (shmmap_s->shmid != -1) { KASSERT(IPCID_TO_IX(shmmap_s->shmid) >= 0 && IPCID_TO_IX(shmmap_s->shmid) < shmalloced, ("segnum %d shmalloced %d", IPCID_TO_IX(shmmap_s->shmid), shmalloced)); shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++; } } SYSVSHM_UNLOCK(); } #ifdef SYSVSHM void shmexit(struct vmspace *vm) #else static void shmexit_myhook(struct vmspace *vm) #endif { struct shmmap_state *base, *shm; int i; base = vm->vm_shm; if (base != NULL) { vm->vm_shm = NULL; SYSVSHM_LOCK(); for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) { if (shm->shmid != -1) shm_delete_mapping(vm, shm); } SYSVSHM_UNLOCK(); free(base, M_SHM); } } static void shmrealloc(void) { struct shmid_kernel *newsegs; int i; SYSVSHM_ASSERT_LOCKED(); if (shmalloced >= shminfo.shmmni) return; newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK | M_ZERO); for (i = 0; i < shmalloced; i++) bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0])); for (; i < shminfo.shmmni; i++) { newsegs[i].u.shm_perm.mode = SHMSEG_FREE; newsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&newsegs[i]); #endif } free(shmsegs, M_SHM); shmsegs = newsegs; shmalloced = shminfo.shmmni; } static struct syscall_helper_data shm_syscalls[] = { SYSCALL_INIT_HELPER(shmat), SYSCALL_INIT_HELPER(shmctl), SYSCALL_INIT_HELPER(shmdt), SYSCALL_INIT_HELPER(shmget), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ 
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl), #endif #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) SYSCALL_INIT_HELPER(shmsys), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data shm32_syscalls[] = { SYSCALL32_INIT_HELPER_COMPAT(shmat), SYSCALL32_INIT_HELPER_COMPAT(shmdt), SYSCALL32_INIT_HELPER_COMPAT(shmget), SYSCALL32_INIT_HELPER(freebsd32_shmsys), SYSCALL32_INIT_HELPER(freebsd32_shmctl), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl), #endif SYSCALL_INIT_LAST }; #endif static int shminit(void) { struct prison *pr; void **rsv; int i, error; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_CHECK] = shm_prison_check, [PR_METHOD_SET] = shm_prison_set, [PR_METHOD_GET] = shm_prison_get, [PR_METHOD_REMOVE] = shm_prison_remove, }; #ifndef BURN_BRIDGES if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0) printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n"); #endif if (shminfo.shmmax == SHMMAX) { /* Initialize shmmax dealing with possible overflow. */ for (i = PAGE_SIZE; i != 0; i--) { shminfo.shmmax = shminfo.shmall * i; if ((shminfo.shmmax / shminfo.shmall) == (u_long)i) break; } } shmalloced = shminfo.shmmni; shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK|M_ZERO); for (i = 0; i < shmalloced; i++) { shmsegs[i].u.shm_perm.mode = SHMSEG_FREE; shmsegs[i].u.shm_perm.seq = 0; #ifdef MAC mac_sysvshm_init(&shmsegs[i]); #endif } shm_last_free = 0; shm_nused = 0; shm_committed = 0; sx_init(&sysvshmsx, "sysvshmsx"); #ifndef SYSVSHM shmexit_hook = &shmexit_myhook; shmfork_hook = &shmfork_myhook; #endif /* Set current prisons according to their allow.sysvipc. */ shm_prison_slot = osd_jail_register(NULL, methods); rsv = osd_reserve(shm_prison_slot); prison_lock(&prison0); (void)osd_jail_set_reserved(&prison0, shm_prison_slot, rsv, &prison0); prison_unlock(&prison0); rsv = NULL; sx_slock(&allprison_lock); TAILQ_FOREACH(pr, &allprison, pr_list) { if (rsv == NULL) rsv = osd_reserve(shm_prison_slot); prison_lock(pr); - if (prison_isvalid(pr) && (pr->pr_allow & PR_ALLOW_SYSVIPC)) { + if (pr->pr_allow & PR_ALLOW_SYSVIPC) { (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, &prison0); rsv = NULL; } prison_unlock(pr); } if (rsv != NULL) osd_free_reserved(rsv); sx_sunlock(&allprison_lock); error = syscall_helper_register(shm_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(shm32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int shmunload(void) { int i; if (shm_nused > 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(shm32_syscalls); #endif syscall_helper_unregister(shm_syscalls); if (shm_prison_slot != 0) osd_jail_deregister(shm_prison_slot); for (i = 0; i < shmalloced; i++) { #ifdef MAC mac_sysvshm_destroy(&shmsegs[i]); #endif /* * Objects might be still mapped into the processes * address spaces. Actual free would happen on the * last mapping destruction. 
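 *
 * vm_object_deallocate() here drops only the reference taken at
 * segment creation; each surviving mapping holds its own object
 * reference, so the pages are freed when the last of those
 * mappings is torn down.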
*/ if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE) vm_object_deallocate(shmsegs[i].object); } free(shmsegs, M_SHM); #ifndef SYSVSHM shmexit_hook = NULL; shmfork_hook = NULL; #endif sx_destroy(&sysvshmsx); return (0); } static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS) { struct shmid_kernel tshmseg; #ifdef COMPAT_FREEBSD32 struct shmid_kernel32 tshmseg32; #endif struct prison *pr, *rpr; void *outaddr; size_t outsize; int error, i; SYSVSHM_LOCK(); pr = req->td->td_ucred->cr_prison; rpr = shm_find_prison(req->td->td_ucred); error = 0; for (i = 0; i < shmalloced; i++) { if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 || rpr == NULL || shm_prison_cansee(rpr, &shmsegs[i]) != 0) { bzero(&tshmseg, sizeof(tshmseg)); tshmseg.u.shm_perm.mode = SHMSEG_FREE; } else { tshmseg = shmsegs[i]; if (tshmseg.cred->cr_prison != pr) tshmseg.u.shm_perm.key = IPC_PRIVATE; } #ifdef COMPAT_FREEBSD32 if (SV_CURPROC_FLAG(SV_ILP32)) { bzero(&tshmseg32, sizeof(tshmseg32)); freebsd32_ipcperm_out(&tshmseg.u.shm_perm, &tshmseg32.u.shm_perm); CP(tshmseg, tshmseg32, u.shm_segsz); CP(tshmseg, tshmseg32, u.shm_lpid); CP(tshmseg, tshmseg32, u.shm_cpid); CP(tshmseg, tshmseg32, u.shm_nattch); CP(tshmseg, tshmseg32, u.shm_atime); CP(tshmseg, tshmseg32, u.shm_dtime); CP(tshmseg, tshmseg32, u.shm_ctime); /* Don't copy object, label, or cred */ outaddr = &tshmseg32; outsize = sizeof(tshmseg32); } else #endif { tshmseg.object = NULL; tshmseg.label = NULL; tshmseg.cred = NULL; outaddr = &tshmseg; outsize = sizeof(tshmseg); } error = SYSCTL_OUT(req, outaddr, outsize); if (error != 0) break; } SYSVSHM_UNLOCK(); return (error); } static int shm_prison_check(void *obj, void *data) { struct prison *pr = obj; struct prison *prpr; struct vfsoptlist *opts = data; int error, jsys; /* * sysvshm is a jailsys integer. * It must be "disable" if the parent jail is disabled. */ error = vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)); if (error != ENOENT) { if (error != 0) return (error); switch (jsys) { case JAIL_SYS_DISABLE: break; case JAIL_SYS_NEW: case JAIL_SYS_INHERIT: prison_lock(pr->pr_parent); prpr = osd_jail_get(pr->pr_parent, shm_prison_slot); prison_unlock(pr->pr_parent); if (prpr == NULL) return (EPERM); break; default: return (EINVAL); } } return (0); } static int shm_prison_set(void *obj, void *data) { struct prison *pr = obj; struct prison *tpr, *orpr, *nrpr, *trpr; struct vfsoptlist *opts = data; void *rsv; int jsys, descend; /* * sysvshm controls which jail is the root of the associated segments * (this jail or same as the parent), or if the feature is available * at all. */ if (vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)) == ENOENT) jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0) ? JAIL_SYS_INHERIT : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0) ? JAIL_SYS_DISABLE : -1; if (jsys == JAIL_SYS_DISABLE) { prison_lock(pr); orpr = osd_jail_get(pr, shm_prison_slot); if (orpr != NULL) osd_jail_del(pr, shm_prison_slot); prison_unlock(pr); if (orpr != NULL) { if (orpr == pr) shm_prison_cleanup(pr); /* Disable all child jails as well. 
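 *
 * Clearing "descend" prunes the walk: a child with no sysvshm
 * OSD slot cannot have the feature enabled anywhere beneath it,
 * so FOREACH_PRISON_DESCENDANT() skips that subtree.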
*/ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, shm_prison_slot); if (trpr != NULL) { osd_jail_del(tpr, shm_prison_slot); prison_unlock(tpr); if (trpr == tpr) shm_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } else if (jsys != -1) { if (jsys == JAIL_SYS_NEW) nrpr = pr; else { prison_lock(pr->pr_parent); nrpr = osd_jail_get(pr->pr_parent, shm_prison_slot); prison_unlock(pr->pr_parent); } rsv = osd_reserve(shm_prison_slot); prison_lock(pr); orpr = osd_jail_get(pr, shm_prison_slot); if (orpr != nrpr) (void)osd_jail_set_reserved(pr, shm_prison_slot, rsv, nrpr); else osd_free_reserved(rsv); prison_unlock(pr); if (orpr != nrpr) { if (orpr == pr) shm_prison_cleanup(pr); if (orpr != NULL) { /* Change child jails matching the old root, */ FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { prison_lock(tpr); trpr = osd_jail_get(tpr, shm_prison_slot); if (trpr == orpr) { (void)osd_jail_set(tpr, shm_prison_slot, nrpr); prison_unlock(tpr); if (trpr == tpr) shm_prison_cleanup(tpr); } else { prison_unlock(tpr); descend = 0; } } } } } return (0); } static int shm_prison_get(void *obj, void *data) { struct prison *pr = obj; struct prison *rpr; struct vfsoptlist *opts = data; int error, jsys; /* Set sysvshm based on the jail's root prison. */ prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); jsys = rpr == NULL ? JAIL_SYS_DISABLE : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT; error = vfs_setopt(opts, "sysvshm", &jsys, sizeof(jsys)); if (error == ENOENT) error = 0; return (error); } static int shm_prison_remove(void *obj, void *data __unused) { struct prison *pr = obj; struct prison *rpr; SYSVSHM_LOCK(); prison_lock(pr); rpr = osd_jail_get(pr, shm_prison_slot); prison_unlock(pr); if (rpr == pr) shm_prison_cleanup(pr); SYSVSHM_UNLOCK(); return (0); } static void shm_prison_cleanup(struct prison *pr) { struct shmid_kernel *shmseg; int i; /* Remove any segments that belong to this jail. */ for (i = 0; i < shmalloced; i++) { shmseg = &shmsegs[i]; if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) && shmseg->cred != NULL && shmseg->cred->cr_prison == pr) { shm_remove(shmseg, i); } } } SYSCTL_JAIL_PARAM_SYS_NODE(sysvshm, CTLFLAG_RW, "SYSV shared memory"); #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) struct oshmid_ds { struct ipc_perm_old shm_perm; /* operation perms */ int shm_segsz; /* size of segment (bytes) */ u_short shm_cpid; /* pid, creator */ u_short shm_lpid; /* pid, last operation */ short shm_nattch; /* no. 
of current attaches */ time_t shm_atime; /* last attach time */ time_t shm_dtime; /* last detach time */ time_t shm_ctime; /* last change time */ void *shm_handle; /* internal handle for shm segment */ }; struct oshmctl_args { int shmid; int cmd; struct oshmid_ds *ubuf; }; static int oshmctl(struct thread *td, struct oshmctl_args *uap) { #ifdef COMPAT_43 int error = 0; struct prison *rpr; struct shmid_kernel *shmseg; struct oshmid_ds outbuf; rpr = shm_find_prison(td->td_ucred); if (rpr == NULL) return (ENOSYS); if (uap->cmd != IPC_STAT) { return (freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap)); } SYSVSHM_LOCK(); shmseg = shm_find_segment(rpr, uap->shmid, true); if (shmseg == NULL) { SYSVSHM_UNLOCK(); return (EINVAL); } error = ipcperm(td, &shmseg->u.shm_perm, IPC_R); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #ifdef MAC error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd); if (error != 0) { SYSVSHM_UNLOCK(); return (error); } #endif ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm); outbuf.shm_segsz = shmseg->u.shm_segsz; outbuf.shm_cpid = shmseg->u.shm_cpid; outbuf.shm_lpid = shmseg->u.shm_lpid; outbuf.shm_nattch = shmseg->u.shm_nattch; outbuf.shm_atime = shmseg->u.shm_atime; outbuf.shm_dtime = shmseg->u.shm_dtime; outbuf.shm_ctime = shmseg->u.shm_ctime; outbuf.shm_handle = shmseg->object; SYSVSHM_UNLOCK(); return (copyout(&outbuf, uap->ubuf, sizeof(outbuf))); #else return (EINVAL); #endif } /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *shmcalls[] = { (sy_call_t *)sys_shmat, (sy_call_t *)oshmctl, (sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget, (sy_call_t *)freebsd7_shmctl }; #ifndef _SYS_SYSPROTO_H_ /* XXX actually varargs. */ struct shmsys_args { int which; int a2; int a3; int a4; }; #endif int sys_shmsys(struct thread *td, struct shmsys_args *uap) { AUDIT_ARG_SVIPC_WHICH(uap->which); if (uap->which < 0 || uap->which >= nitems(shmcalls)) return (EINVAL); return ((*shmcalls[uap->which])(td, &uap->a2)); } #endif /* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */ #ifdef COMPAT_FREEBSD32 int freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) AUDIT_ARG_SVIPC_WHICH(uap->which); switch (uap->which) { case 0: { /* shmat */ struct shmat_args ap; ap.shmid = uap->a2; ap.shmaddr = PTRIN(uap->a3); ap.shmflg = uap->a4; return (sysent[SYS_shmat].sy_call(td, &ap)); } case 2: { /* shmdt */ struct shmdt_args ap; ap.shmaddr = PTRIN(uap->a2); return (sysent[SYS_shmdt].sy_call(td, &ap)); } case 3: { /* shmget */ struct shmget_args ap; ap.key = uap->a2; ap.size = uap->a3; ap.shmflg = uap->a4; return (sysent[SYS_shmget].sy_call(td, &ap)); } case 4: { /* shmctl */ struct freebsd7_freebsd32_shmctl_args ap; ap.shmid = uap->a2; ap.cmd = uap->a3; ap.buf = PTRIN(uap->a4); return (freebsd7_freebsd32_shmctl(td, &ap)); } case 1: /* oshmctl */ default: return (EINVAL); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_shmctl(struct thread *td, struct freebsd7_freebsd32_shmctl_args *uap) { int error; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32_old shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, 
sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: memset(&u32.shmid_ds32, 0, sizeof(u32.shmid_ds32)); freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if (u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); u32.shmid_ds32.shm_internal = 0; error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif int freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap) { int error; union { struct shmid_ds shmid_ds; struct shm_info shm_info; struct shminfo shminfo; } u; union { struct shmid_ds32 shmid_ds32; struct shm_info32 shm_info32; struct shminfo32 shminfo32; } u32; size_t sz; if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &u32.shmid_ds32, sizeof(u32.shmid_ds32)))) goto done; freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm, &u.shmid_ds.shm_perm); CP(u32.shmid_ds32, u.shmid_ds, shm_segsz); CP(u32.shmid_ds32, u.shmid_ds, shm_lpid); CP(u32.shmid_ds32, u.shmid_ds, shm_cpid); CP(u32.shmid_ds32, u.shmid_ds, shm_nattch); CP(u32.shmid_ds32, u.shmid_ds, shm_atime); CP(u32.shmid_ds32, u.shmid_ds, shm_dtime); CP(u32.shmid_ds32, u.shmid_ds, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_INFO: CP(u.shminfo, u32.shminfo32, shmmax); CP(u.shminfo, u32.shminfo32, shmmin); CP(u.shminfo, u32.shminfo32, shmmni); CP(u.shminfo, u32.shminfo32, shmseg); CP(u.shminfo, u32.shminfo32, shmall); error = copyout(&u32.shminfo32, uap->buf, sizeof(u32.shminfo32)); break; case SHM_INFO: CP(u.shm_info, u32.shm_info32, used_ids); CP(u.shm_info, u32.shm_info32, shm_rss); CP(u.shm_info, u32.shm_info32, shm_tot); CP(u.shm_info, u32.shm_info32, shm_swp); CP(u.shm_info, u32.shm_info32, swap_attempts); CP(u.shm_info, u32.shm_info32, swap_successes); error = copyout(&u32.shm_info32, uap->buf, sizeof(u32.shm_info32)); break; case SHM_STAT: case IPC_STAT: 
freebsd32_ipcperm_out(&u.shmid_ds.shm_perm, &u32.shmid_ds32.shm_perm); if (u.shmid_ds.shm_segsz > INT32_MAX) u32.shmid_ds32.shm_segsz = INT32_MAX; else CP(u.shmid_ds, u32.shmid_ds32, shm_segsz); CP(u.shmid_ds, u32.shmid_ds32, shm_lpid); CP(u.shmid_ds, u32.shmid_ds32, shm_cpid); CP(u.shmid_ds, u32.shmid_ds32, shm_nattch); CP(u.shmid_ds, u32.shmid_ds32, shm_atime); CP(u.shmid_ds, u32.shmid_ds32, shm_dtime); CP(u.shmid_ds, u32.shmid_ds32, shm_ctime); error = copyout(&u32.shmid_ds32, uap->buf, sizeof(u32.shmid_ds32)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) #ifndef _SYS_SYSPROTO_H_ struct freebsd7_shmctl_args { int shmid; int cmd; struct shmid_ds_old *buf; }; #endif int freebsd7_shmctl(struct thread *td, struct freebsd7_shmctl_args *uap) { int error; struct shmid_ds_old old; struct shmid_ds buf; size_t bufsz; /* * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support * Linux binaries. If we see the call come through the FreeBSD ABI, * return an error back to the user since we do not to support this. */ if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO || uap->cmd == SHM_STAT) return (EINVAL); /* IPC_SET needs to copyin the buffer before calling kern_shmctl */ if (uap->cmd == IPC_SET) { if ((error = copyin(uap->buf, &old, sizeof(old)))) goto done; ipcperm_old2new(&old.shm_perm, &buf.shm_perm); CP(old, buf, shm_segsz); CP(old, buf, shm_lpid); CP(old, buf, shm_cpid); CP(old, buf, shm_nattch); CP(old, buf, shm_atime); CP(old, buf, shm_dtime); CP(old, buf, shm_ctime); } error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz); if (error) goto done; /* Cases in which we need to copyout */ switch (uap->cmd) { case IPC_STAT: memset(&old, 0, sizeof(old)); ipcperm_new2old(&buf.shm_perm, &old.shm_perm); if (buf.shm_segsz > INT_MAX) old.shm_segsz = INT_MAX; else CP(buf, old, shm_segsz); CP(buf, old, shm_lpid); CP(buf, old, shm_cpid); if (buf.shm_nattch > SHRT_MAX) old.shm_nattch = SHRT_MAX; else CP(buf, old, shm_nattch); CP(buf, old, shm_atime); CP(buf, old, shm_dtime); CP(buf, old, shm_ctime); old.shm_internal = NULL; error = copyout(&old, uap->buf, sizeof(old)); break; } done: if (error) { /* Invalidate the return value */ td->td_retval[0] = -1; } return (error); } #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ static int sysvshm_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = shminit(); if (error != 0) shmunload(); break; case MOD_UNLOAD: error = shmunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvshm_mod = { "sysvshm", &sysvshm_modload, NULL }; DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST); MODULE_VERSION(sysvshm, 1); diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c index dc94ce213d08..5c1775a261fc 100644 --- a/sys/kern/uipc_mqueue.c +++ b/sys/kern/uipc_mqueue.c @@ -1,2948 +1,2945 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted; each message queue then appears * in the mounted directory, where the user can change a queue's permissions * and ownership, or remove a queue. Manually creating a file in the * directory causes a message queue with the same name to be created in the * kernel, with the default message queue attributes applied; this method is * not recommended, since the mq_open syscall allows the user to specify * different attributes. The filesystem can also be mounted multiple times * at different mount points, but all mounts show the same contents. * * 2) The standard POSIX message queue API. These syscalls do not use the * vfs layer, but operate directly on the internal data structures; this * allows the user to use the IPC facility without having to mount the * mqueue filesystem.
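 *
 * A minimal userland sketch of method 2 (illustrative only -- the queue
 * name and flags here are arbitrary, and nothing below is part of this
 * change); note that no mqueuefs mount is needed:
 *
 *	#include <mqueue.h>
 *	#include <fcntl.h>
 *
 *	mqd_t md = mq_open("/example_q", O_RDWR | O_CREAT, 0600, NULL);
 *	if (md != (mqd_t)-1) {
 *		(void)mq_send(md, "hi", 2, 0);
 *		(void)mq_close(md);
 *	}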
*/ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... 
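 * (a message is allocated as sizeof(struct mqueue_msg) + msg_size
 * bytes, with the payload stored inline immediately after this
 * header; see mqueue_loadmsg() below)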
*/ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and manipulation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file
number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_add_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. 
to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? 
FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root directory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked.
*/ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? 
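 * If so, check write access on the directory and return EJUSTRETURN
 * with SAVENAME set -- the usual VFS convention indicating that the
 * name is held for an upcoming VOP_CREATE().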
*/ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif 
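/*
 * (Illustrative userland sketch, not part of this change: the queue-file
 * lifecycle normally runs through the POSIX wrappers, which enter
 * kern_kmq_open() and sys_kmq_unlink() below rather than the VOPs above;
 * the VOPs serve a mounted mqueuefs view of the same namespace.  The
 * queue name here is arbitrary.)
 *
 *	mqd_t md = mq_open("/example_q", O_RDWR | O_CREAT | O_EXCL,
 *	    0600, NULL);
 *	...
 *	mq_close(md);
 *	mq_unlink("/example_q");	(reaches do_unlink() above)
 */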
static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). 
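 * In short: setting explicit timestamps requires owner privilege
 * (VADMIN), while a utimes(path, NULL)-style update (VA_UTIMES_NULL)
 * is also allowed with plain write access.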
*/ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; entry.d_off = offset + entry.d_reclen; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } dirent_terminate(&entry); if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory.
*/ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; struct prison *tpr; struct mqfs_node *pn, *tpn; - int found; + struct vnode *pr_root; - found = 0; + pr_root = pr->pr_root; + if (pr->pr_parent->pr_root == pr_root) + return (0); TAILQ_FOREACH(tpr, &allprison, pr_list) { - prison_lock(tpr); - if (tpr != pr && prison_isvalid(tpr) && - tpr->pr_root == pr->pr_root) - found = 1; - prison_unlock(tpr); + if (tpr != pr && tpr->pr_root == pr_root) + return (0); } - if (!found) { - /* - * No jails are rooted in this directory anymore, - * so no queues should be either. - */ - sx_xlock(&mqfs_data.mi_lock); - LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, - mn_sibling, tpn) { - if (pn->mn_pr_root == pr->pr_root) - (void)do_unlink(pn, curthread->td_ucred); - } - sx_xunlock(&mqfs_data.mi_lock); + /* + * No jails are rooted in this directory anymore, + * so no queues should be either. 
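+ * (The pr_parent check above handles the common case quickly: if the
+ * parent shares this root, a jail with the root always remains alive.)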
+ */ + sx_xlock(&mqfs_data.mi_lock); + LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, + mn_sibling, tpn) { + if (pn->mn_pr_root == pr_root) + (void)do_unlink(pn, curthread->td_ucred); } + sx_xunlock(&mqfs_data.mi_lock); return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. 
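 * (Strictly, on the send side it is lack of room -- a full queue --
 * that can block, not lack of data; see _mqueue_send() below.
 *
 * Roughly how the userland wrappers map onto these arguments; this
 * mapping is illustrative, not a definitive libc reference:
 *
 *	mq_send(md, buf, len, prio)		- waitok true, no timeout
 *	mq_timedsend(md, buf, len, prio, &abs)	- waitok true, abs timeout
 *	O_NONBLOCK set on the descriptor	- waitok false)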
*/ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send a realtime signal to the process which registered itself * successfully via mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. If waitok is false, the thread will not be * blocked if there is no data in the queue; otherwise, the absolute * time will be checked. */
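/*
 * (Note the receive contract enforced below: msg_len must be at least
 * the queue's mq_msgsize or EMSGSIZE is returned, and the message
 * dequeued is the oldest one of the highest priority present.)
 */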
*/ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct pwddesc *pdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); pdp = td->td_proc->p_pd; cmode = (((mode & 
~pdp->pd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); /* * "." and ".." are magic directories, populated on the fly, and cannot * be opened as queues. */ if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. 
/*
 * Syscall to unlink a message queue.
 */
int
sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
{
	char path[MQFS_NAMELEN+1];
	struct mqfs_node *pn;
	int error, len;

	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
	if (error)
		return (error);

	len = strlen(path);
	if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL)
		return (EINVAL);
	if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0)
		return (EINVAL);
	AUDIT_ARG_UPATH1_CANON(path);

	sx_xlock(&mqfs_data.mi_lock);
	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred);
	if (pn != NULL)
		error = do_unlink(pn, td->td_ucred);
	else
		error = ENOENT;
	sx_xunlock(&mqfs_data.mi_lock);
	return (error);
}

typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **);

/*
 * Get a message queue from the given file descriptor.
 */
static int
_getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func,
    struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
{
	struct mqfs_node *pn;
	int error;

	error = func(td, fd, rightsp, fpp);
	if (error)
		return (error);
	if (&mqueueops != (*fpp)->f_ops) {
		fdrop(*fpp, td);
		return (EBADF);
	}
	pn = (*fpp)->f_data;
	if (ppn)
		*ppn = pn;
	if (pmq)
		*pmq = pn->mn_data;
	return (0);
}

static __inline int
getmq(struct thread *td, int fd, struct file **fpp,
    struct mqfs_node **ppn, struct mqueue **pmq)
{
	return _getmq(td, fd, &cap_event_rights, fget, fpp, ppn, pmq);
}

static __inline int
getmq_read(struct thread *td, int fd, struct file **fpp,
    struct mqfs_node **ppn, struct mqueue **pmq)
{
	return _getmq(td, fd, &cap_read_rights, fget_read, fpp, ppn, pmq);
}

static __inline int
getmq_write(struct thread *td, int fd, struct file **fpp,
    struct mqfs_node **ppn, struct mqueue **pmq)
{
	return _getmq(td, fd, &cap_write_rights, fget_write, fpp, ppn, pmq);
}

static int
kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
    struct mq_attr *oattr)
{
	struct mqueue *mq;
	struct file *fp;
	u_int oflag, flag;
	int error;

	AUDIT_ARG_FD(mqd);
	if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
		return (EINVAL);
	error = getmq(td, mqd, &fp, NULL, &mq);
	if (error)
		return (error);
	oattr->mq_maxmsg = mq->mq_maxmsg;
	oattr->mq_msgsize = mq->mq_msgsize;
	oattr->mq_curmsgs = mq->mq_curmsgs;
	if (attr != NULL) {
		do {
			oflag = flag = fp->f_flag;
			flag &= ~O_NONBLOCK;
			flag |= (attr->mq_flags & O_NONBLOCK);
		} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
	} else
		oflag = fp->f_flag;
	oattr->mq_flags = (O_NONBLOCK & oflag);
	fdrop(fp, td);
	return (error);
}
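/*
 * Illustrative only: kern_kmq_setattr() above rejects everything but
 * O_NONBLOCK, so the one runtime-mutable attribute can be toggled from
 * userland roughly as follows (sketch; error handling omitted, "mqd" is
 * assumed to be an already-open POSIX message queue descriptor).
 */
#if 0
#include <mqueue.h>
#include <fcntl.h>

void
example_set_nonblock(mqd_t mqd)
{
	struct mq_attr attr, oattr;

	attr.mq_flags = O_NONBLOCK;	/* any other flag bit -> EINVAL */
	mq_setattr(mqd, &attr, &oattr);	/* oattr holds the previous state */
}
#endif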
int
sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
{
	struct mq_attr attr, oattr;
	int error;

	if (uap->attr != NULL) {
		error = copyin(uap->attr, &attr, sizeof(attr));
		if (error != 0)
			return (error);
	}
	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ?
	    &attr : NULL, &oattr);
	if (error == 0 && uap->oattr != NULL) {
		bzero(oattr.__reserved, sizeof(oattr.__reserved));
		error = copyout(&oattr, uap->oattr, sizeof(oattr));
	}
	return (error);
}

int
sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
{
	struct mqueue *mq;
	struct file *fp;
	struct timespec *abs_timeout, ets;
	int error;
	int waitok;

	AUDIT_ARG_FD(uap->mqd);
	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
	if (error)
		return (error);
	if (uap->abs_timeout != NULL) {
		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
		if (error != 0)
			goto out;
		abs_timeout = &ets;
	} else
		abs_timeout = NULL;
	waitok = !(fp->f_flag & O_NONBLOCK);
	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
	    uap->msg_prio, waitok, abs_timeout);
out:
	fdrop(fp, td);
	return (error);
}

int
sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
{
	struct mqueue *mq;
	struct file *fp;
	struct timespec *abs_timeout, ets;
	int error, waitok;

	AUDIT_ARG_FD(uap->mqd);
	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
	if (error)
		return (error);
	if (uap->abs_timeout != NULL) {
		error = copyin(uap->abs_timeout, &ets, sizeof(ets));
		if (error != 0)
			goto out;
		abs_timeout = &ets;
	} else
		abs_timeout = NULL;
	waitok = !(fp->f_flag & O_NONBLOCK);
	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
	    uap->msg_prio, waitok, abs_timeout);
out:
	fdrop(fp, td);
	return (error);
}

static int
kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev)
{
	struct filedesc *fdp;
	struct proc *p;
	struct mqueue *mq;
	struct file *fp, *fp2;
	struct mqueue_notifier *nt, *newnt = NULL;
	int error;

	AUDIT_ARG_FD(mqd);
	if (sigev != NULL) {
		if (sigev->sigev_notify != SIGEV_SIGNAL &&
		    sigev->sigev_notify != SIGEV_THREAD_ID &&
		    sigev->sigev_notify != SIGEV_NONE)
			return (EINVAL);
		if ((sigev->sigev_notify == SIGEV_SIGNAL ||
		    sigev->sigev_notify == SIGEV_THREAD_ID) &&
		    !_SIG_VALID(sigev->sigev_signo))
			return (EINVAL);
	}
	p = td->td_proc;
	fdp = td->td_proc->p_fd;
	error = getmq(td, mqd, &fp, NULL, &mq);
	if (error)
		return (error);
again:
	FILEDESC_SLOCK(fdp);
	fp2 = fget_locked(fdp, mqd);
	if (fp2 == NULL) {
		FILEDESC_SUNLOCK(fdp);
		error = EBADF;
		goto out;
	}
#ifdef CAPABILITIES
	error = cap_check(cap_rights(fdp, mqd), &cap_event_rights);
	if (error) {
		FILEDESC_SUNLOCK(fdp);
		goto out;
	}
#endif
	if (fp2 != fp) {
		FILEDESC_SUNLOCK(fdp);
		error = EBADF;
		goto out;
	}
	mtx_lock(&mq->mq_mutex);
	FILEDESC_SUNLOCK(fdp);
	if (sigev != NULL) {
		if (mq->mq_notifier != NULL) {
			error = EBUSY;
		} else {
			PROC_LOCK(p);
			nt = notifier_search(p, mqd);
			if (nt == NULL) {
				if (newnt == NULL) {
					PROC_UNLOCK(p);
					mtx_unlock(&mq->mq_mutex);
					newnt = notifier_alloc();
					goto again;
				}
			}

			if (nt != NULL) {
				sigqueue_take(&nt->nt_ksi);
				if (newnt != NULL) {
					notifier_free(newnt);
					newnt = NULL;
				}
			} else {
				nt = newnt;
				newnt = NULL;
				ksiginfo_init(&nt->nt_ksi);
				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
				nt->nt_ksi.ksi_code = SI_MESGQ;
				nt->nt_proc = p;
				nt->nt_ksi.ksi_mqd = mqd;
				notifier_insert(p, nt);
			}
			nt->nt_sigev = *sigev;
			mq->mq_notifier = nt;
			PROC_UNLOCK(p);
			/*
			 * If there are no receivers and the message queue
			 * is not empty, we should send the notification
			 * as soon as possible.
			 */
			if (mq->mq_receivers == 0 &&
			    !TAILQ_EMPTY(&mq->mq_msgq))
				mqueue_send_notification(mq);
		}
	} else {
		notifier_remove(p, mq, mqd);
	}
	mtx_unlock(&mq->mq_mutex);
out:
	fdrop(fp, td);
	if (newnt != NULL)
		notifier_free(newnt);
	return (error);
}
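/*
 * Illustrative only: the SIGEV_SIGNAL path handled by kern_kmq_notify()
 * above corresponds to a userland registration like the sketch below
 * (the signal choice is an arbitrary example).
 */
#if 0
#include <mqueue.h>
#include <signal.h>

void
example_register_notify(mqd_t mqd)
{
	struct sigevent sev;

	sev.sigev_notify = SIGEV_SIGNAL;  /* or SIGEV_THREAD_ID / SIGEV_NONE */
	sev.sigev_signo = SIGUSR1;
	mq_notify(mqd, &sev);	/* EBUSY if a notifier is already registered */
	/* Passing a NULL sigevent removes this process's registration. */
}
#endif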
int
sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
{
	struct sigevent ev, *evp;
	int error;

	if (uap->sigev == NULL) {
		evp = NULL;
	} else {
		error = copyin(uap->sigev, &ev, sizeof(ev));
		if (error != 0)
			return (error);
		evp = &ev;
	}
	return (kern_kmq_notify(td, uap->mqd, evp));
}

static void
mqueue_fdclose(struct thread *td, int fd, struct file *fp)
{
	struct mqueue *mq;
#ifdef INVARIANTS
	struct filedesc *fdp;

	fdp = td->td_proc->p_fd;
	FILEDESC_LOCK_ASSERT(fdp);
#endif

	if (fp->f_ops == &mqueueops) {
		mq = FPTOMQ(fp);
		mtx_lock(&mq->mq_mutex);
		notifier_remove(td->td_proc, mq, fd);

		/* Have to wake up threads in the same process. */
		if (mq->mq_flags & MQ_RSEL) {
			mq->mq_flags &= ~MQ_RSEL;
			selwakeup(&mq->mq_rsel);
		}
		if (mq->mq_flags & MQ_WSEL) {
			mq->mq_flags &= ~MQ_WSEL;
			selwakeup(&mq->mq_wsel);
		}
		mtx_unlock(&mq->mq_mutex);
	}
}

static void
mq_proc_exit(void *arg __unused, struct proc *p)
{
	struct filedesc *fdp;
	struct file *fp;
	struct mqueue *mq;
	int i;

	fdp = p->p_fd;
	FILEDESC_SLOCK(fdp);
	for (i = 0; i < fdp->fd_nfiles; ++i) {
		fp = fget_locked(fdp, i);
		if (fp != NULL && fp->f_ops == &mqueueops) {
			mq = FPTOMQ(fp);
			mtx_lock(&mq->mq_mutex);
			notifier_remove(p, FPTOMQ(fp), i);
			mtx_unlock(&mq->mq_mutex);
		}
	}
	FILEDESC_SUNLOCK(fdp);
	KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
}

static int
mqf_poll(struct file *fp, int events, struct ucred *active_cred,
    struct thread *td)
{
	struct mqueue *mq = FPTOMQ(fp);
	int revents = 0;

	mtx_lock(&mq->mq_mutex);
	if (events & (POLLIN | POLLRDNORM)) {
		if (mq->mq_curmsgs) {
			revents |= events & (POLLIN | POLLRDNORM);
		} else {
			mq->mq_flags |= MQ_RSEL;
			selrecord(td, &mq->mq_rsel);
		}
	}
	if (events & POLLOUT) {
		if (mq->mq_curmsgs < mq->mq_maxmsg)
			revents |= POLLOUT;
		else {
			mq->mq_flags |= MQ_WSEL;
			selrecord(td, &mq->mq_wsel);
		}
	}
	mtx_unlock(&mq->mq_mutex);
	return (revents);
}
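/*
 * Illustrative only: mqf_poll() above and the mqf_kqfilter() handler below
 * are what make a queue descriptor usable with poll(2), select(2), and
 * kqueue(2).  A userland kqueue sketch; FreeBSD's mq_getfd_np() is assumed
 * to be available here to obtain the underlying descriptor:
 */
#if 0
#include <sys/event.h>
#include <mqueue.h>

int
example_wait_readable(mqd_t mqd)
{
	struct kevent kev;
	int kq;

	kq = kqueue();
	if (kq == -1)
		return (-1);
	/* EVFILT_READ fires while mq_curmsgs != 0 (see filt_mqread()). */
	EV_SET(&kev, mq_getfd_np(mqd), EVFILT_READ, EV_ADD, 0, 0, NULL);
	return (kevent(kq, &kev, 1, &kev, 1, NULL));
}
#endif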
static int
mqf_close(struct file *fp, struct thread *td)
{
	struct mqfs_node *pn;

	fp->f_ops = &badfileops;
	pn = fp->f_data;
	fp->f_data = NULL;
	sx_xlock(&mqfs_data.mi_lock);
	mqnode_release(pn);
	sx_xunlock(&mqfs_data.mi_lock);
	return (0);
}

static int
mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
    struct thread *td)
{
	struct mqfs_node *pn = fp->f_data;

	bzero(st, sizeof *st);
	sx_xlock(&mqfs_data.mi_lock);
	st->st_atim = pn->mn_atime;
	st->st_mtim = pn->mn_mtime;
	st->st_ctim = pn->mn_ctime;
	st->st_birthtim = pn->mn_birth;
	st->st_uid = pn->mn_uid;
	st->st_gid = pn->mn_gid;
	st->st_mode = S_IFIFO | pn->mn_mode;
	sx_xunlock(&mqfs_data.mi_lock);
	return (0);
}

static int
mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
    struct thread *td)
{
	struct mqfs_node *pn;
	int error;

	error = 0;
	pn = fp->f_data;
	sx_xlock(&mqfs_data.mi_lock);
	error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN,
	    active_cred);
	if (error != 0)
		goto out;
	pn->mn_mode = mode & ACCESSPERMS;
out:
	sx_xunlock(&mqfs_data.mi_lock);
	return (error);
}

static int
mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
    struct thread *td)
{
	struct mqfs_node *pn;
	int error;

	error = 0;
	pn = fp->f_data;
	sx_xlock(&mqfs_data.mi_lock);
	if (uid == (uid_t)-1)
		uid = pn->mn_uid;
	if (gid == (gid_t)-1)
		gid = pn->mn_gid;
	if (((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
	    (gid != pn->mn_gid && !groupmember(gid, active_cred))) &&
	    (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN)))
		goto out;
	pn->mn_uid = uid;
	pn->mn_gid = gid;
out:
	sx_xunlock(&mqfs_data.mi_lock);
	return (error);
}

static int
mqf_kqfilter(struct file *fp, struct knote *kn)
{
	struct mqueue *mq = FPTOMQ(fp);
	int error = 0;

	if (kn->kn_filter == EVFILT_READ) {
		kn->kn_fop = &mq_rfiltops;
		knlist_add(&mq->mq_rsel.si_note, kn, 0);
	} else if (kn->kn_filter == EVFILT_WRITE) {
		kn->kn_fop = &mq_wfiltops;
		knlist_add(&mq->mq_wsel.si_note, kn, 0);
	} else
		error = EINVAL;
	return (error);
}

static void
filt_mqdetach(struct knote *kn)
{
	struct mqueue *mq = FPTOMQ(kn->kn_fp);

	if (kn->kn_filter == EVFILT_READ)
		knlist_remove(&mq->mq_rsel.si_note, kn, 0);
	else if (kn->kn_filter == EVFILT_WRITE)
		knlist_remove(&mq->mq_wsel.si_note, kn, 0);
	else
		panic("filt_mqdetach");
}

static int
filt_mqread(struct knote *kn, long hint)
{
	struct mqueue *mq = FPTOMQ(kn->kn_fp);

	mtx_assert(&mq->mq_mutex, MA_OWNED);
	return (mq->mq_curmsgs != 0);
}

static int
filt_mqwrite(struct knote *kn, long hint)
{
	struct mqueue *mq = FPTOMQ(kn->kn_fp);

	mtx_assert(&mq->mq_mutex, MA_OWNED);
	return (mq->mq_curmsgs < mq->mq_maxmsg);
}

static int
mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif,
    struct filedesc *fdp)
{

	kif->kf_type = KF_TYPE_MQUEUE;
	return (0);
}

static struct fileops mqueueops = {
	.fo_read = invfo_rdwr,
	.fo_write = invfo_rdwr,
	.fo_truncate = invfo_truncate,
	.fo_ioctl = invfo_ioctl,
	.fo_poll = mqf_poll,
	.fo_kqfilter = mqf_kqfilter,
	.fo_stat = mqf_stat,
	.fo_close = mqf_close,
	.fo_chmod = mqf_chmod,
	.fo_chown = mqf_chown,
	.fo_sendfile = invfo_sendfile,
	.fo_fill_kinfo = mqf_fill_kinfo,
	.fo_flags = DFLAG_PASSABLE,
};

static struct vop_vector mqfs_vnodeops = {
	.vop_default = &default_vnodeops,
	.vop_access = mqfs_access,
	.vop_cachedlookup = mqfs_lookup,
	.vop_lookup = vfs_cache_lookup,
	.vop_reclaim = mqfs_reclaim,
	.vop_create = mqfs_create,
	.vop_remove = mqfs_remove,
	.vop_inactive = mqfs_inactive,
	.vop_open = mqfs_open,
	.vop_close = mqfs_close,
	.vop_getattr = mqfs_getattr,
	.vop_setattr = mqfs_setattr,
	.vop_read = mqfs_read,
	.vop_write = VOP_EOPNOTSUPP,
	.vop_readdir = mqfs_readdir,
	.vop_mkdir = VOP_EOPNOTSUPP,
	.vop_rmdir = VOP_EOPNOTSUPP
};
VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops);

static struct vfsops mqfs_vfsops = {
	.vfs_init = mqfs_init,
	.vfs_uninit = mqfs_uninit,
	.vfs_mount = mqfs_mount,
	.vfs_unmount = mqfs_unmount,
	.vfs_root = mqfs_root,
	.vfs_statfs = mqfs_statfs,
};

static struct vfsconf mqueuefs_vfsconf = {
	.vfc_version = VFS_VERSION,
	.vfc_name = "mqueuefs",
	.vfc_vfsops = &mqfs_vfsops,
	.vfc_typenum = -1,
	.vfc_flags = VFCF_SYNTHETIC
};

static struct syscall_helper_data mq_syscalls[] = {
	SYSCALL_INIT_HELPER(kmq_open),
	SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED),
	SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED),
	SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED),
	SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED),
	SYSCALL_INIT_HELPER(kmq_unlink),
	SYSCALL_INIT_LAST
};

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>

static void
mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
{

	to->mq_flags = from->mq_flags;
	to->mq_maxmsg = from->mq_maxmsg;
	to->mq_msgsize = from->mq_msgsize;
	to->mq_curmsgs = from->mq_curmsgs;
}

static void
mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
{

	to->mq_flags = from->mq_flags;
	to->mq_maxmsg = from->mq_maxmsg;
	to->mq_msgsize = from->mq_msgsize;
	to->mq_curmsgs = from->mq_curmsgs;
}
int
freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
{
	struct mq_attr attr;
	struct mq_attr32 attr32;
	int flags, error;

	if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC)
		return (EINVAL);
	flags = FFLAGS(uap->flags);
	if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
		error = copyin(uap->attr, &attr32, sizeof(attr32));
		if (error)
			return (error);
		mq_attr_from32(&attr32, &attr);
	}
	return (kern_kmq_open(td, uap->path, flags, uap->mode,
	    uap->attr != NULL ? &attr : NULL));
}

int
freebsd32_kmq_setattr(struct thread *td,
    struct freebsd32_kmq_setattr_args *uap)
{
	struct mq_attr attr, oattr;
	struct mq_attr32 attr32, oattr32;
	int error;

	if (uap->attr != NULL) {
		error = copyin(uap->attr, &attr32, sizeof(attr32));
		if (error != 0)
			return (error);
		mq_attr_from32(&attr32, &attr);
	}
	error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ?
	    &attr : NULL, &oattr);
	if (error == 0 && uap->oattr != NULL) {
		mq_attr_to32(&oattr, &oattr32);
		bzero(oattr32.__reserved, sizeof(oattr32.__reserved));
		error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
	}
	return (error);
}

int
freebsd32_kmq_timedsend(struct thread *td,
    struct freebsd32_kmq_timedsend_args *uap)
{
	struct mqueue *mq;
	struct file *fp;
	struct timespec32 ets32;
	struct timespec *abs_timeout, ets;
	int error;
	int waitok;

	AUDIT_ARG_FD(uap->mqd);
	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
	if (error)
		return (error);
	if (uap->abs_timeout != NULL) {
		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
		if (error != 0)
			goto out;
		CP(ets32, ets, tv_sec);
		CP(ets32, ets, tv_nsec);
		abs_timeout = &ets;
	} else
		abs_timeout = NULL;
	waitok = !(fp->f_flag & O_NONBLOCK);
	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
	    uap->msg_prio, waitok, abs_timeout);
out:
	fdrop(fp, td);
	return (error);
}

int
freebsd32_kmq_timedreceive(struct thread *td,
    struct freebsd32_kmq_timedreceive_args *uap)
{
	struct mqueue *mq;
	struct file *fp;
	struct timespec32 ets32;
	struct timespec *abs_timeout, ets;
	int error, waitok;

	AUDIT_ARG_FD(uap->mqd);
	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
	if (error)
		return (error);
	if (uap->abs_timeout != NULL) {
		error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
		if (error != 0)
			goto out;
		CP(ets32, ets, tv_sec);
		CP(ets32, ets, tv_nsec);
		abs_timeout = &ets;
	} else
		abs_timeout = NULL;
	waitok = !(fp->f_flag & O_NONBLOCK);
	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
	    uap->msg_prio, waitok, abs_timeout);
out:
	fdrop(fp, td);
	return (error);
}

int
freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap)
{
	struct sigevent ev, *evp;
	struct sigevent32 ev32;
	int error;

	if (uap->sigev == NULL) {
		evp = NULL;
	} else {
		error = copyin(uap->sigev, &ev32, sizeof(ev32));
		if (error != 0)
			return (error);
		error = convert_sigevent32(&ev32, &ev);
		if (error != 0)
			return (error);
		evp = &ev;
	}
	return (kern_kmq_notify(td, uap->mqd, evp));
}

static struct syscall_helper_data mq32_syscalls[] = {
	SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED),
	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED),
	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED),
	SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED),
	SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
	SYSCALL_INIT_LAST
};
#endif

static int
mqinit(void)
{
	int error;

	error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD);
	if (error != 0)
		return (error);
#ifdef COMPAT_FREEBSD32
	error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD);
	if (error != 0)
		return (error);
#endif
	return (0);
}
static int
mqunload(void)
{

#ifdef COMPAT_FREEBSD32
	syscall32_helper_unregister(mq32_syscalls);
#endif
	syscall_helper_unregister(mq_syscalls);
	return (0);
}

static int
mq_modload(struct module *module, int cmd, void *arg)
{
	int error = 0;

	error = vfs_modevent(module, cmd, arg);
	if (error != 0)
		return (error);

	switch (cmd) {
	case MOD_LOAD:
		error = mqinit();
		if (error != 0)
			mqunload();
		break;
	case MOD_UNLOAD:
		error = mqunload();
		break;
	default:
		break;
	}
	return (error);
}

static moduledata_t mqueuefs_mod = {
	"mqueuefs",
	mq_modload,
	&mqueuefs_vfsconf
};
DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
MODULE_VERSION(mqueuefs, 1);
diff --git a/sys/sys/jail.h b/sys/sys/jail.h
index 2ac6aabdbd43..a48e189729dc 100644
--- a/sys/sys/jail.h
+++ b/sys/sys/jail.h
@@ -1,450 +1,451 @@
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 1999 Poul-Henning Kamp.
 * Copyright (c) 2009 James Gritton.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef _SYS_JAIL_H_
#define _SYS_JAIL_H_

#ifdef _KERNEL
struct jail_v0 {
	u_int32_t	version;
	char		*path;
	char		*hostname;
	u_int32_t	ip_number;
};
#endif

struct jail {
	uint32_t	version;
	char		*path;
	char		*hostname;
	char		*jailname;
	uint32_t	ip4s;
	uint32_t	ip6s;
	struct in_addr	*ip4;
	struct in6_addr	*ip6;
};
#define	JAIL_API_VERSION	2

/*
 * For all xprison structs, always keep the pr_version an int and
 * the first variable so userspace can easily distinguish them.
 */
#ifndef _KERNEL
struct xprison_v1 {
	int		 pr_version;
	int		 pr_id;
	char		 pr_path[MAXPATHLEN];
	char		 pr_host[MAXHOSTNAMELEN];
	u_int32_t	 pr_ip;
};
#endif

struct xprison {
	int		 pr_version;
	int		 pr_id;
	int		 pr_state;
	cpusetid_t	 pr_cpusetid;
	char		 pr_path[MAXPATHLEN];
	char		 pr_host[MAXHOSTNAMELEN];
	char		 pr_name[MAXHOSTNAMELEN];
	uint32_t	 pr_ip4s;
	uint32_t	 pr_ip6s;
#if 0
	/*
	 * sizeof(xprison) will be malloced + size needed for all
	 * IPv4 and IPv6 addresses.  Offsets are based on the numbers
	 * of addresses.
	 */
	struct in_addr	 pr_ip4[];
	struct in6_addr	 pr_ip6[];
#endif
};
#define	XPRISON_VERSION	3

#define	PRISON_STATE_INVALID	0
#define	PRISON_STATE_ALIVE	1
#define	PRISON_STATE_DYING	2
/*
 * Flags for jail_set and jail_get.
 */
#define	JAIL_CREATE	0x01	/* Create jail if it doesn't exist */
#define	JAIL_UPDATE	0x02	/* Update parameters of existing jail */
#define	JAIL_ATTACH	0x04	/* Attach to jail upon creation */
#define	JAIL_DYING	0x08	/* Allow getting a dying jail */
#define	JAIL_SET_MASK	0x0f
#define	JAIL_GET_MASK	0x08

#define	JAIL_SYS_DISABLE	0
#define	JAIL_SYS_NEW		1
#define	JAIL_SYS_INHERIT	2

#ifndef _KERNEL

struct iovec;

__BEGIN_DECLS
int jail(struct jail *);
int jail_set(struct iovec *, unsigned int, int);
int jail_get(struct iovec *, unsigned int, int);
int jail_attach(int);
int jail_remove(int);
__END_DECLS

#else /* _KERNEL */

#include <sys/queue.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/_task.h>

#define	JAIL_MAX	999999

#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_PRISON);
#endif
#endif /* _KERNEL */

#if defined(_KERNEL) || defined(_WANT_PRISON)

#include <sys/osd.h>

#define	HOSTUUIDLEN	64
#define	OSRELEASELEN	32

struct racct;
struct prison_racct;

/*
 * This structure describes a prison.  It is pointed to by all struct
 * ucreds's of the inmates.  pr_ref keeps track of them and is used to
 * delete the structure when the last inmate is dead.
 *
 * Lock key:
 *   (a) allprison_lock
 *   (c) set only during creation before the structure is shared, no mutex
 *       required to read
 *   (m) locked by pr_mtx
 *   (p) locked by pr_mtx, and also at least shared allprison_lock required
 *       to update
- *   (r) atomic via refcount(9), pr_mtx required to decrement to zero
+ *   (r) atomic via refcount(9), pr_mtx and allprison_lock required to
+ *       decrement to zero
 */
struct prison {
	TAILQ_ENTRY(prison) pr_list;		/* (a) all prisons */
	int		 pr_id;			/* (c) prison id */
	volatile u_int	 pr_ref;		/* (r) refcount */
	volatile u_int	 pr_uref;		/* (r) user (alive) refcount */
	unsigned	 pr_flags;		/* (p) PR_* flags */
	LIST_HEAD(, prison) pr_children;	/* (a) list of child jails */
	LIST_ENTRY(prison) pr_sibling;		/* (a) next in parent's list */
	struct prison	*pr_parent;		/* (c) containing jail */
	struct mtx	 pr_mtx;
	struct task	 pr_task;		/* (c) destroy task */
	struct osd	 pr_osd;		/* (p) additional data */
	struct cpuset	*pr_cpuset;		/* (p) cpuset */
	struct vnet	*pr_vnet;		/* (c) network stack */
	struct vnode	*pr_root;		/* (c) vnode to rdir */
	int		 pr_ip4s;		/* (p) number of v4 IPs */
	int		 pr_ip6s;		/* (p) number of v6 IPs */
	struct in_addr	*pr_ip4;		/* (p) v4 IPs of jail */
	struct in6_addr	*pr_ip6;		/* (p) v6 IPs of jail */
	struct prison_racct *pr_prison_racct;	/* (c) racct jail proxy */
	void		*pr_sparep[3];
	int		 pr_childcount;		/* (a) number of child jails */
	int		 pr_childmax;		/* (p) maximum child jails */
	unsigned	 pr_allow;		/* (p) PR_ALLOW_* flags */
	int		 pr_securelevel;	/* (p) securelevel */
	int		 pr_enforce_statfs;	/* (p) statfs permission */
	int		 pr_devfs_rsnum;	/* (p) devfs ruleset */
	int		 pr_spare[3];
	int		 pr_osreldate;		/* (c) kern.osreldate value */
	unsigned long	 pr_hostid;		/* (p) jail hostid */
	char		 pr_name[MAXHOSTNAMELEN];	/* (p) admin jail name */
	char		 pr_path[MAXPATHLEN];	/* (c) chroot path */
	char		 pr_hostname[MAXHOSTNAMELEN];	/* (p) jail hostname */
	char		 pr_domainname[MAXHOSTNAMELEN];	/* (p) jail domainname */
	char		 pr_hostuuid[HOSTUUIDLEN];	/* (p) jail hostuuid */
	char		 pr_osrelease[OSRELEASELEN];	/* (c) kern.osrelease value */
};

struct prison_racct {
	LIST_ENTRY(prison_racct) prr_next;
	char		prr_name[MAXHOSTNAMELEN];
	u_int		prr_refcount;
	struct racct	*prr_racct;
};
#endif /* _KERNEL || _WANT_PRISON */
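/*
 * Illustrative only: jail_set(2)/jail_get(2) declared above take name/value
 * iovec pairs.  A minimal userland sketch of creating a persistent jail
 * (the parameter values are arbitrary examples; booleans such as "persist"
 * are set by passing a NULL value of zero length):
 */
#if 0
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/uio.h>

int
example_create_jail(void)
{
	struct iovec iov[6];

	iov[0].iov_base = "name";	iov[0].iov_len = sizeof("name");
	iov[1].iov_base = "example";	iov[1].iov_len = sizeof("example");
	iov[2].iov_base = "path";	iov[2].iov_len = sizeof("path");
	iov[3].iov_base = "/";		iov[3].iov_len = sizeof("/");
	iov[4].iov_base = "persist";	iov[4].iov_len = sizeof("persist");
	iov[5].iov_base = NULL;		iov[5].iov_len = 0;
	return (jail_set(iov, 6, JAIL_CREATE));	/* returns the new jid */
}
#endif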
#ifdef _KERNEL

/* Flag bits set via options */
#define	PR_PERSIST	0x00000001	/* Can exist without processes */
#define	PR_HOST		0x00000002	/* Virtualize hostname et al */
#define	PR_IP4_USER	0x00000004	/* Restrict IPv4 addresses */
#define	PR_IP6_USER	0x00000008	/* Restrict IPv6 addresses */
#define	PR_VNET		0x00000010	/* Virtual network stack */
#define	PR_IP4_SADDRSEL	0x00000080	/* Do IPv4 src addr sel. or use the */
					/* primary jail address. */
#define	PR_IP6_SADDRSEL	0x00000100	/* Do IPv6 src addr sel. or use the */
					/* primary jail address. */

/* Internal flag bits */
#define	PR_REMOVE	0x01000000	/* In process of being removed */
#define	PR_IP4		0x02000000	/* IPv4 restricted or disabled */
					/* by this jail or an ancestor */
#define	PR_IP6		0x04000000	/* IPv6 restricted or disabled */
					/* by this jail or an ancestor */

/*
 * Flags for pr_allow
 * Bits not noted here may be used for dynamic allow.mount.xxxfs.
 */
#define	PR_ALLOW_SET_HOSTNAME		0x00000001
#define	PR_ALLOW_SYSVIPC		0x00000002
#define	PR_ALLOW_RAW_SOCKETS		0x00000004
#define	PR_ALLOW_CHFLAGS		0x00000008
#define	PR_ALLOW_MOUNT			0x00000010
#define	PR_ALLOW_QUOTAS			0x00000020
#define	PR_ALLOW_SOCKET_AF		0x00000040
#define	PR_ALLOW_MLOCK			0x00000080
#define	PR_ALLOW_READ_MSGBUF		0x00000100
#define	PR_ALLOW_UNPRIV_DEBUG		0x00000200
#define	PR_ALLOW_SUSER			0x00000400
#define	PR_ALLOW_RESERVED_PORTS		0x00008000
#define	PR_ALLOW_KMEM_ACCESS		0x00010000	/* reserved, not used yet */
#define	PR_ALLOW_ALL_STATIC		0x000187ff

/*
 * PR_ALLOW_DIFFERENCES determines which flags are able to be
 * different between the parent and child jail upon creation.
 */
#define	PR_ALLOW_DIFFERENCES		(PR_ALLOW_UNPRIV_DEBUG)

/*
 * OSD methods
 */
#define	PR_METHOD_CREATE	0
#define	PR_METHOD_GET		1
#define	PR_METHOD_SET		2
#define	PR_METHOD_CHECK		3
#define	PR_METHOD_ATTACH	4
#define	PR_METHOD_REMOVE	5
#define	PR_MAXMETHOD		6

/*
 * Lock/unlock a prison.
 * XXX These exist not so much for general convenience, but to be usable in
 *     the FOREACH_PRISON_DESCENDANT_LOCKED macro which can't handle them in
 *     non-function form as currently defined.
 */
static __inline void
prison_lock(struct prison *pr)
{
	mtx_lock(&pr->pr_mtx);
}

static __inline void
prison_unlock(struct prison *pr)
{
	mtx_unlock(&pr->pr_mtx);
}

/* Traverse a prison's immediate children. */
#define	FOREACH_PRISON_CHILD(ppr, cpr)					\
	LIST_FOREACH(cpr, &(ppr)->pr_children, pr_sibling)

/*
 * Preorder traversal of all of a prison's descendants.
 * This ugly loop allows the macro to be followed by a single block
 * as expected in a looping primitive.
 */
#define	FOREACH_PRISON_DESCENDANT(ppr, cpr, descend)			\
	for ((cpr) = (ppr), (descend) = 1;				\
	    ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children))	\
	      ? LIST_FIRST(&(cpr)->pr_children)				\
	      : ((cpr) == (ppr)						\
		 ? NULL							\
		 : (((descend) = LIST_NEXT(cpr, pr_sibling) != NULL)	\
		    ? LIST_NEXT(cpr, pr_sibling)			\
		    : (cpr)->pr_parent))));)				\
		if (!(descend))						\
			;						\
		else

/*
 * As above, but lock descendants on the way down and unlock on the way up.
 */
#define	FOREACH_PRISON_DESCENDANT_LOCKED(ppr, cpr, descend)		\
	for ((cpr) = (ppr), (descend) = 1;				\
	    ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children))	\
	      ? LIST_FIRST(&(cpr)->pr_children)				\
	      : ((cpr) == (ppr)						\
		 ? NULL							\
		 : ((prison_unlock(cpr),				\
		    (descend) = LIST_NEXT(cpr, pr_sibling) != NULL)	\
		    ? LIST_NEXT(cpr, pr_sibling)			\
		    : (cpr)->pr_parent))));)				\
		if ((descend) ? (prison_lock(cpr), 0) : 1)		\
			;						\
		else
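/*
 * Illustrative only: the traversal macros above are used like ordinary
 * looping primitives.  A sketch of counting the descendants of a prison;
 * the caller is assumed to hold allprison_lock at least shared, per the
 * (a) lock key on pr_children:
 */
#if 0
static int
example_count_descendants(struct prison *ppr)
{
	struct prison *cpr;
	int count = 0, descend;

	sx_assert(&allprison_lock, SA_LOCKED);
	FOREACH_PRISON_DESCENDANT(ppr, cpr, descend) {
		count++;
	}
	return (count);
}
#endif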
/*
 * As above, but also keep track of the level descended to.
 */
#define	FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(ppr, cpr, descend, level)\
	for ((cpr) = (ppr), (descend) = 1, (level) = 0;			\
	    ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children))	\
	      ? (level++, LIST_FIRST(&(cpr)->pr_children))		\
	      : ((cpr) == (ppr)						\
		 ? NULL							\
		 : ((prison_unlock(cpr),				\
		    (descend) = LIST_NEXT(cpr, pr_sibling) != NULL)	\
		    ? LIST_NEXT(cpr, pr_sibling)			\
		    : (level--, (cpr)->pr_parent)))));)			\
		if ((descend) ? (prison_lock(cpr), 0) : 1)		\
			;						\
		else

/*
 * Attributes of the physical system, and the root of the jail tree.
 */
extern struct prison prison0;

TAILQ_HEAD(prisonlist, prison);
extern struct prisonlist allprison;
extern struct sx allprison_lock;

/*
 * Sysctls to describe jail parameters.
 */
SYSCTL_DECL(_security_jail_param);

#define	SYSCTL_JAIL_PARAM(module, param, type, fmt, descr)		\
    SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param,	\
	(type) | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_param, fmt, descr)
#define	SYSCTL_JAIL_PARAM_STRING(module, param, access, len, descr)	\
    SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param,	\
	CTLTYPE_STRING | CTLFLAG_MPSAFE | (access), NULL, len,		\
	sysctl_jail_param, "A", descr)
#define	SYSCTL_JAIL_PARAM_STRUCT(module, param, access, len, fmt, descr)\
    SYSCTL_PROC(_security_jail_param ## module, OID_AUTO, param,	\
	CTLTYPE_STRUCT | CTLFLAG_MPSAFE | (access), NULL, len,		\
	sysctl_jail_param, fmt, descr)
#define	SYSCTL_JAIL_PARAM_NODE(module, descr)				\
    SYSCTL_NODE(_security_jail_param, OID_AUTO, module, CTLFLAG_MPSAFE,	\
	0, descr)
#define	SYSCTL_JAIL_PARAM_SUBNODE(parent, module, descr)		\
    SYSCTL_NODE(_security_jail_param_##parent, OID_AUTO, module,	\
	CTLFLAG_MPSAFE, 0, descr)
#define	SYSCTL_JAIL_PARAM_SYS_NODE(module, access, descr)		\
    SYSCTL_JAIL_PARAM_NODE(module, descr);				\
    SYSCTL_JAIL_PARAM(_##module, , CTLTYPE_INT | (access), "E,jailsys",	\
	descr)

/*
 * Kernel support functions for jail().
 */
struct ucred;
struct mount;
struct sockaddr;
struct statfs;
struct vfsconf;

/*
 * Return 1 if the passed credential is in a jail, otherwise 0.
 */
#define	jailed(cred)	(cred->cr_prison != &prison0)

int jailed_without_vnet(struct ucred *);
void getcredhostname(struct ucred *, char *, size_t);
void getcreddomainname(struct ucred *, char *, size_t);
void getcredhostuuid(struct ucred *, char *, size_t);
void getcredhostid(struct ucred *, unsigned long *);
void getjailname(struct ucred *cred, char *name, size_t len);
void prison0_init(void);
int prison_allow(struct ucred *, unsigned);
int prison_check(struct ucred *cred1, struct ucred *cred2);
int prison_owns_vnet(struct ucred *);
int prison_canseemount(struct ucred *cred, struct mount *mp);
void prison_enforce_statfs(struct ucred *cred, struct mount *mp,
    struct statfs *sp);
struct prison *prison_find(int prid);
struct prison *prison_find_child(struct prison *, int);
struct prison *prison_find_name(struct prison *, const char *);
int prison_flag(struct ucred *, unsigned);
void prison_free(struct prison *pr);
void prison_free_locked(struct prison *pr);
void prison_hold(struct prison *pr);
void prison_hold_locked(struct prison *pr);
void prison_proc_hold(struct prison *);
void prison_proc_free(struct prison *);
void prison_set_allow(struct ucred *cred, unsigned flag, int enable);
int prison_ischild(struct prison *, struct prison *);
bool prison_isalive(struct prison *);
bool prison_isvalid(struct prison *);
int prison_equal_ip4(struct prison *, struct prison *);
int prison_get_ip4(struct ucred *cred, struct in_addr *ia);
int prison_local_ip4(struct ucred *cred, struct in_addr *ia);
int prison_remote_ip4(struct ucred *cred, struct in_addr *ia);
int prison_check_ip4(const struct ucred *, const struct in_addr *);
int prison_check_ip4_locked(const struct prison *, const struct in_addr *);
int prison_saddrsel_ip4(struct ucred *, struct in_addr *);
int prison_restrict_ip4(struct prison *, struct in_addr *);
int prison_qcmp_v4(const void *, const void *);
#ifdef INET6
int prison_equal_ip6(struct prison *, struct prison *);
int prison_get_ip6(struct ucred *, struct in6_addr *);
int prison_local_ip6(struct ucred *, struct in6_addr *, int);
int prison_remote_ip6(struct ucred *, struct in6_addr *);
int prison_check_ip6(const struct ucred *, const struct in6_addr *);
int prison_check_ip6_locked(const struct prison *, const struct in6_addr *);
int prison_saddrsel_ip6(struct ucred *, struct in6_addr *);
int prison_restrict_ip6(struct prison *, struct in6_addr *);
int prison_qcmp_v6(const void *, const void *);
#endif
int prison_check_af(struct ucred *cred, int af);
int prison_if(struct ucred *cred, const struct sockaddr *sa);
char *prison_name(struct prison *, struct prison *);
int prison_priv_check(struct ucred *cred, int priv);
int sysctl_jail_param(SYSCTL_HANDLER_ARGS);
unsigned prison_add_allow(const char *prefix, const char *name,
    const char *prefix_descr, const char *descr);
void prison_add_vfs(struct vfsconf *vfsp);
void prison_racct_foreach(void (*callback)(struct racct *racct,
    void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
    void *arg2, void *arg3);
struct prison_racct *prison_racct_find(const char *name);
void prison_racct_hold(struct prison_racct *prr);
void prison_racct_free(struct prison_racct *prr);

#endif /* _KERNEL */
#endif /* !_SYS_JAIL_H_ */