Index: head/sys/kern/sysv_msg.c
===================================================================
--- head/sys/kern/sysv_msg.c	(revision 298584)
+++ head/sys/kern/sysv_msg.c	(revision 298585)
@@ -1,1589 +1,1897 @@
 /*-
  * Implementation of SVID messages
  *
  * Author:  Daniel Boulet
  *
  * Copyright 1993 Daniel Boulet and RTMX Inc.
  *
  * This system call was implemented by Daniel Boulet under contract from RTMX.
  *
  * Redistribution and use in source forms, with and without modification,
  * are permitted provided that this entire comment appears intact.
  *
  * Redistribution in binary form may occur without any restrictions.
  * Obviously, it would be nice if you gave credit where credit is due
  * but requiring it would be too onerous.
  *
  * This software is provided ``AS IS'' without any warranties of any kind.
  */
 /*-
  * Copyright (c) 2003-2005 McAfee, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by McAfee
  * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
  * program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_sysvipc.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
+#include <sys/mount.h>
 #include <sys/msg.h>
 #include <sys/racct.h>
+#include <sys/sbuf.h>
+#include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/jail.h>
 
 #include <security/mac/mac_framework.h>
 
 FEATURE(sysv_msg, "System V message queues support");
 
 static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues");
 
 static int msginit(void);
 static int msgunload(void);
 static int sysvmsg_modload(struct module *, int, void *);
+static void msq_remove(struct msqid_kernel *);
+static struct prison *msg_find_prison(struct ucred *);
+static int msq_prison_cansee(struct prison *, struct msqid_kernel *);
+static int msg_prison_check(void *, void *);
+static int msg_prison_set(void *, void *);
+static int msg_prison_get(void *, void *);
+static int msg_prison_remove(void *, void *);
+static void msg_prison_cleanup(struct prison *);
 
 
 #ifdef MSG_DEBUG
 #define DPRINTF(a)	printf a
 #else
 #define DPRINTF(a)	(void)0
 #endif
 
 static void msg_freehdr(struct msg *msghdr);
 
 #ifndef MSGSSZ
 #define MSGSSZ	8		/* Each segment must be 2^N long */
 #endif
 #ifndef MSGSEG
 #define MSGSEG	2048		/* must be less than 32767 */
 #endif
 #define MSGMAX	(MSGSSZ*MSGSEG)
 #ifndef MSGMNB
 #define MSGMNB	2048		/* max # of bytes in a queue */
 #endif
 #ifndef MSGMNI
 #define MSGMNI	40
 #endif
 #ifndef MSGTQL
 #define MSGTQL	40
 #endif
 
 /*
  * Based on the configuration parameters described in an SVR2 (yes, two)
  * config(1m) man page.
  *
  * Each message is broken up and stored in segments that are msgssz bytes
  * long.  For efficiency reasons, this should be a power of two.  Also,
  * it doesn't make sense if it is less than 8 or greater than about 256.
  * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of
  * two between 8 and 1024 inclusive (and panic's if it isn't).
  */
 struct msginfo msginfo = {
                 MSGMAX,         /* max chars in a message */
                 MSGMNI,         /* # of message queue identifiers */
                 MSGMNB,         /* max chars in a queue */
                 MSGTQL,         /* max messages in system */
                 MSGSSZ,         /* size of a message segment */
                 		/* (must be small power of 2 greater than 4) */
                 MSGSEG          /* number of message segments */
 };
 
 /*
  * macros to convert between msqid_ds's and msqid's.
  * (specific to this implementation)
  */
 #define MSQID(ix,ds)	((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000))
 #define MSQID_IX(id)	((id) & 0xffff)
 #define MSQID_SEQ(id)	(((id) >> 16) & 0xffff)
 
 /*
  * The rest of this file is specific to this particular implementation.
  */
 
 struct msgmap {
 	short	next;		/* next segment in buffer */
     				/* -1 -> available */
     				/* 0..(MSGSEG-1) -> index of next segment */
 };
 
 #define MSG_LOCKED	01000	/* Is this msqid_ds locked? */
 
 static int nfree_msgmaps;	/* # of free map entries */
 static short free_msgmaps;	/* head of linked list of free map entries */
 static struct msg *free_msghdrs;/* list of free msg headers */
 static char *msgpool;		/* MSGMAX byte long msg buffer pool */
 static struct msgmap *msgmaps;	/* MSGSEG msgmap structures */
 static struct msg *msghdrs;	/* MSGTQL msg headers */
 static struct msqid_kernel *msqids;	/* MSGMNI msqid_kernel struct's */
 static struct mtx msq_mtx;	/* global mutex for message queues. */
+static unsigned msg_prison_slot;/* prison OSD slot */
 
 static struct syscall_helper_data msg_syscalls[] = {
 	SYSCALL_INIT_HELPER(msgctl),
 	SYSCALL_INIT_HELPER(msgget),
 	SYSCALL_INIT_HELPER(msgsnd),
 	SYSCALL_INIT_HELPER(msgrcv),
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 	SYSCALL_INIT_HELPER(msgsys),
 	SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl),
 #endif
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_ipc.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 static struct syscall_helper_data msg32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_msgctl),
 	SYSCALL32_INIT_HELPER(freebsd32_msgsnd),
 	SYSCALL32_INIT_HELPER(freebsd32_msgrcv),
 	SYSCALL32_INIT_HELPER_COMPAT(msgget),
 	SYSCALL32_INIT_HELPER(freebsd32_msgsys),
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 	SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl),
 #endif
 	SYSCALL_INIT_LAST
 };
 #endif
 
 static int
 msginit()
 {
+	struct prison *pr;
+	void *rsv;
 	int i, error;
+	osd_method_t methods[PR_MAXMETHOD] = {
+	    [PR_METHOD_CHECK] =		msg_prison_check,
+	    [PR_METHOD_SET] =		msg_prison_set,
+	    [PR_METHOD_GET] =		msg_prison_get,
+	    [PR_METHOD_REMOVE] =	msg_prison_remove,
+	};
 
 	msginfo.msgmax = msginfo.msgseg * msginfo.msgssz;
 	msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK);
 	msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK);
 	msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK);
 	msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG,
 	    M_WAITOK);
 
 	/*
 	 * msginfo.msgssz should be a power of two for efficiency reasons.
 	 * It is also pretty silly if msginfo.msgssz is less than 8
 	 * or greater than about 256 so ...
 	 */
 
 	i = 8;
 	while (i < 1024 && i != msginfo.msgssz)
 		i <<= 1;
     	if (i != msginfo.msgssz) {
 		DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz,
 		    msginfo.msgssz));
 		panic("msginfo.msgssz not a small power of 2");
 	}
 
 	if (msginfo.msgseg > 32767) {
 		DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg));
 		panic("msginfo.msgseg > 32767");
 	}
 
 	for (i = 0; i < msginfo.msgseg; i++) {
 		if (i > 0)
 			msgmaps[i-1].next = i;
 		msgmaps[i].next = -1;	/* implies entry is available */
 	}
 	free_msgmaps = 0;
 	nfree_msgmaps = msginfo.msgseg;
 
 	for (i = 0; i < msginfo.msgtql; i++) {
 		msghdrs[i].msg_type = 0;
 		if (i > 0)
 			msghdrs[i-1].msg_next = &msghdrs[i];
 		msghdrs[i].msg_next = NULL;
 #ifdef MAC
 		mac_sysvmsg_init(&msghdrs[i]);
 #endif
     	}
 	free_msghdrs = &msghdrs[0];
 
 	for (i = 0; i < msginfo.msgmni; i++) {
 		msqids[i].u.msg_qbytes = 0;	/* implies entry is available */
 		msqids[i].u.msg_perm.seq = 0;	/* reset to a known value */
 		msqids[i].u.msg_perm.mode = 0;
 #ifdef MAC
 		mac_sysvmsq_init(&msqids[i]);
 #endif
 	}
 	mtx_init(&msq_mtx, "msq", NULL, MTX_DEF);
 
+	/* Store &prison0 in the slot of prison0 and of allow.sysvipc jails. */
+	msg_prison_slot = osd_jail_register(NULL, methods);
+	rsv = osd_reserve(msg_prison_slot);
+	prison_lock(&prison0);
+	(void)osd_jail_set_reserved(&prison0, msg_prison_slot, rsv, &prison0);
+	prison_unlock(&prison0);
+	rsv = NULL;			/* passed to prison0's slot above */
+	sx_slock(&allprison_lock);
+	TAILQ_FOREACH(pr, &allprison, pr_list) {
+		if (rsv == NULL)	/* reserve before taking pr's lock */
+			rsv = osd_reserve(msg_prison_slot);
+		prison_lock(pr);
+		if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) {
+			(void)osd_jail_set_reserved(pr, msg_prison_slot, rsv,
+			    &prison0);
+			rsv = NULL;	/* used; re-reserve on next pass */
+		}
+		prison_unlock(pr);
+	}
+	if (rsv != NULL)		/* last reservation went unused */
+		osd_free_reserved(rsv);
+	sx_sunlock(&allprison_lock);
+
 	error = syscall_helper_register(msg_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	error = syscall32_helper_register(msg32_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #endif
 	return (0);
 }
 
 static int
 msgunload()
 {
 	struct msqid_kernel *msqkptr;
 	int msqid;
 #ifdef MAC
 	int i;
 #endif
 
 	syscall_helper_unregister(msg_syscalls);
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(msg32_syscalls);
 #endif
 
 	for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
 		/*
 		 * Look for an unallocated and unlocked msqid_ds.
 		 * msqid_ds's can be locked by msgsnd or msgrcv while
 		 * they are copying the message in/out.  We can't
 		 * re-use the entry until they release it.
 		 */
 		msqkptr = &msqids[msqid];
 		if (msqkptr->u.msg_qbytes != 0 ||
 		    (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
 			break;
 	}
 	if (msqid != msginfo.msgmni)
 		return (EBUSY);
 
+	if (msg_prison_slot != 0)
+		osd_jail_deregister(msg_prison_slot);
 #ifdef MAC
 	for (i = 0; i < msginfo.msgtql; i++)
 		mac_sysvmsg_destroy(&msghdrs[i]);
 	for (msqid = 0; msqid < msginfo.msgmni; msqid++)
 		mac_sysvmsq_destroy(&msqids[msqid]);
 #endif
 	free(msgpool, M_MSG);
 	free(msgmaps, M_MSG);
 	free(msghdrs, M_MSG);
 	free(msqids, M_MSG);
 	mtx_destroy(&msq_mtx);
 	return (0);
 }
 
 
 static int
 sysvmsg_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		error = msginit();
 		if (error != 0)
 			msgunload();
 		break;
 	case MOD_UNLOAD:
 		error = msgunload();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t sysvmsg_mod = {
 	"sysvmsg",
 	&sysvmsg_modload,
 	NULL
 };
 
 DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST);
 MODULE_VERSION(sysvmsg, 1);
 
 static void
 msg_freehdr(msghdr)
 	struct msg *msghdr;
 {
 	while (msghdr->msg_ts > 0) {
 		short next;
 		if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg)
 			panic("msghdr->msg_spot out of range");
 		next = msgmaps[msghdr->msg_spot].next;
 		msgmaps[msghdr->msg_spot].next = free_msgmaps;
 		free_msgmaps = msghdr->msg_spot;
 		nfree_msgmaps++;
 		msghdr->msg_spot = next;
 		if (msghdr->msg_ts >= msginfo.msgssz)
 			msghdr->msg_ts -= msginfo.msgssz;
 		else
 			msghdr->msg_ts = 0;
 	}
 	if (msghdr->msg_spot != -1)
 		panic("msghdr->msg_spot != -1");
 	msghdr->msg_next = free_msghdrs;
 	free_msghdrs = msghdr;
 #ifdef MAC
 	mac_sysvmsg_cleanup(msghdr);
 #endif
 }
 
+/* Free a queue: its racct charges, cred, queued messages, and slot. */
+static void
+msq_remove(struct msqid_kernel *msqkptr)
+{
+	struct msg *msghdr;
+
+	racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1);
+	racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum);
+	racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes);
+	crfree(msqkptr->cred);
+	msqkptr->cred = NULL;
+
+	/* Hand every queued header (and its segments) to msg_freehdr() */
+	msghdr = msqkptr->u.msg_first;
+	while (msghdr != NULL) {
+		struct msg *msghdr_tmp;
+
+		/* Advance first: msg_freehdr() overwrites msg_next */
+		msqkptr->u.msg_cbytes -= msghdr->msg_ts;
+		msqkptr->u.msg_qnum--;
+		msghdr_tmp = msghdr;
+		msghdr = msghdr->msg_next;
+		msg_freehdr(msghdr_tmp);
+	}
+
+	if (msqkptr->u.msg_cbytes != 0)
+		panic("msg_cbytes is screwed up");
+	if (msqkptr->u.msg_qnum != 0)
+		panic("msg_qnum is screwed up");
+
+	msqkptr->u.msg_qbytes = 0;	/* Mark it as free */
+#ifdef MAC
+	mac_sysvmsq_cleanup(msqkptr);
+#endif
+
+	wakeup(msqkptr);	/* sleepers on this queue recheck msg_qbytes */
+}
+
+/* Return the prison recorded in cred's jail OSD slot, or NULL if none. */
+static struct prison *
+msg_find_prison(struct ucred *cred)
+{
+	struct prison *rpr;
+
+	prison_lock(cred->cr_prison);
+	rpr = osd_jail_get(cred->cr_prison, msg_prison_slot);
+	prison_unlock(cred->cr_prison);
+	return (rpr);
+}
+
+/* 0 if the queue's prison is rpr or a prison_ischild() of it, else EINVAL. */
+static int
+msq_prison_cansee(struct prison *rpr, struct msqid_kernel *msqkptr)
+{
+
+	if (msqkptr->cred != NULL && (rpr == msqkptr->cred->cr_prison ||
+	    prison_ischild(rpr, msqkptr->cred->cr_prison)))
+		return (0);
+	return (EINVAL);
+}
+
 #ifndef _SYS_SYSPROTO_H_
 struct msgctl_args {
 	int	msqid;
 	int	cmd;
 	struct	msqid_ds *buf;
 };
 #endif
 int
 sys_msgctl(td, uap)
 	struct thread *td;
 	register struct msgctl_args *uap;
 {
 	int msqid = uap->msqid;
 	int cmd = uap->cmd;
 	struct msqid_ds msqbuf;
 	int error;
 
 	DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf));
 	if (cmd == IPC_SET &&
 	    (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0)
 		return (error);
 	error = kern_msgctl(td, msqid, cmd, &msqbuf);
 	if (cmd == IPC_STAT && error == 0)
 		error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds));
 	return (error);
 }
 
 int
 kern_msgctl(td, msqid, cmd, msqbuf)
 	struct thread *td;
 	int msqid;
 	int cmd;
 	struct msqid_ds *msqbuf;
 {
 	int rval, error, msqix;
 	register struct msqid_kernel *msqkptr;
+	struct prison *rpr;
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	rpr = msg_find_prison(td->td_ucred);
+	if (rpr == NULL)
 		return (ENOSYS);
 
 	msqix = IPCID_TO_IX(msqid);
 
 	if (msqix < 0 || msqix >= msginfo.msgmni) {
 		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
 		    msginfo.msgmni));
 		return (EINVAL);
 	}
 
 	msqkptr = &msqids[msqix];
 
 	mtx_lock(&msq_mtx);
 	if (msqkptr->u.msg_qbytes == 0) {
 		DPRINTF(("no such msqid\n"));
 		error = EINVAL;
 		goto done2;
 	}
 	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 		DPRINTF(("wrong sequence number\n"));
 		error = EINVAL;
 		goto done2;
 	}
+
+	error = msq_prison_cansee(rpr, msqkptr);
+	if (error != 0) {
+		DPRINTF(("requester can't see prison\n"));
+		goto done2;
+	}
+
 #ifdef MAC
 	error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd);
 	if (error != 0)
 		goto done2;
 #endif
 
 	error = 0;
 	rval = 0;
 
 	switch (cmd) {
 
 	case IPC_RMID:
 	{
+#ifdef MAC
 		struct msg *msghdr;
+#endif
 		if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
 			goto done2;
 
 #ifdef MAC
 		/*
 		 * Check that the thread has MAC access permissions to
 		 * individual msghdrs.  Note: We need to do this in a
 		 * separate loop because the actual loop alters the
 		 * msq/msghdr info as it progresses, and there is no going
 		 * back if half the way through we discover that the
 		 * thread cannot free a certain msghdr.  The msq will get
 		 * into an inconsistent state.
 		 */
 		for (msghdr = msqkptr->u.msg_first; msghdr != NULL;
 		    msghdr = msghdr->msg_next) {
 			error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr);
 			if (error != 0)
 				goto done2;
 		}
 #endif
 
-		racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1);
-		racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum);
-		racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes);
-		crfree(msqkptr->cred);
-		msqkptr->cred = NULL;
-
-		/* Free the message headers */
-		msghdr = msqkptr->u.msg_first;
-		while (msghdr != NULL) {
-			struct msg *msghdr_tmp;
-
-			/* Free the segments of each message */
-			msqkptr->u.msg_cbytes -= msghdr->msg_ts;
-			msqkptr->u.msg_qnum--;
-			msghdr_tmp = msghdr;
-			msghdr = msghdr->msg_next;
-			msg_freehdr(msghdr_tmp);
-		}
-
-		if (msqkptr->u.msg_cbytes != 0)
-			panic("msg_cbytes is screwed up");
-		if (msqkptr->u.msg_qnum != 0)
-			panic("msg_qnum is screwed up");
-
-		msqkptr->u.msg_qbytes = 0;	/* Mark it as free */
-
-#ifdef MAC
-		mac_sysvmsq_cleanup(msqkptr);
-#endif
-
-		wakeup(msqkptr);
+		msq_remove(msqkptr);
 	}
 
 		break;
 
 	case IPC_SET:
 		if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
 			goto done2;
 		if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) {
 			error = priv_check(td, PRIV_IPC_MSGSIZE);
 			if (error)
 				goto done2;
 		}
 		if (msqbuf->msg_qbytes > msginfo.msgmnb) {
 			DPRINTF(("can't increase msg_qbytes beyond %d"
 			    "(truncating)\n", msginfo.msgmnb));
 			msqbuf->msg_qbytes = msginfo.msgmnb;	/* silently restrict qbytes to system limit */
 		}
 		if (msqbuf->msg_qbytes == 0) {
 			DPRINTF(("can't reduce msg_qbytes to 0\n"));
 			error = EINVAL;		/* non-standard errno! */
 			goto done2;
 		}
 		msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid;	/* change the owner */
 		msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid;	/* change the owner */
 		msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) |
 		    (msqbuf->msg_perm.mode & 0777);
 		msqkptr->u.msg_qbytes = msqbuf->msg_qbytes;
 		msqkptr->u.msg_ctime = time_second;
 		break;
 
 	case IPC_STAT:
 		if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
 			DPRINTF(("requester doesn't have read access\n"));
 			goto done2;
 		}
 		*msqbuf = msqkptr->u;
+		if (td->td_ucred->cr_prison != msqkptr->cred->cr_prison)
+			msqbuf->msg_perm.key = IPC_PRIVATE;
 		break;
 
 	default:
 		DPRINTF(("invalid command %d\n", cmd));
 		error = EINVAL;
 		goto done2;
 	}
 
 	if (error == 0)
 		td->td_retval[0] = rval;
 done2:
 	mtx_unlock(&msq_mtx);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct msgget_args {
 	key_t	key;
 	int	msgflg;
 };
 #endif
 
 int
 sys_msgget(td, uap)
 	struct thread *td;
 	register struct msgget_args *uap;
 {
 	int msqid, error = 0;
 	int key = uap->key;
 	int msgflg = uap->msgflg;
 	struct ucred *cred = td->td_ucred;
 	register struct msqid_kernel *msqkptr = NULL;
 
 	DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg));
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	if (msg_find_prison(cred) == NULL)
 		return (ENOSYS);
 
 	mtx_lock(&msq_mtx);
 	if (key != IPC_PRIVATE) {
 		for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
 			msqkptr = &msqids[msqid];
 			if (msqkptr->u.msg_qbytes != 0 &&
+			    msqkptr->cred != NULL &&
+			    msqkptr->cred->cr_prison == cred->cr_prison &&
 			    msqkptr->u.msg_perm.key == key)
 				break;
 		}
 		if (msqid < msginfo.msgmni) {
 			DPRINTF(("found public key\n"));
 			if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) {
 				DPRINTF(("not exclusive\n"));
 				error = EEXIST;
 				goto done2;
 			}
 			if ((error = ipcperm(td, &msqkptr->u.msg_perm,
 			    msgflg & 0700))) {
 				DPRINTF(("requester doesn't have 0%o access\n",
 				    msgflg & 0700));
 				goto done2;
 			}
 #ifdef MAC
 			error = mac_sysvmsq_check_msqget(cred, msqkptr);
 			if (error != 0)
 				goto done2;
 #endif
 			goto found;
 		}
 	}
 
 	DPRINTF(("need to allocate the msqid_ds\n"));
 	if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) {
 		for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
 			/*
 			 * Look for an unallocated and unlocked msqid_ds.
 			 * msqid_ds's can be locked by msgsnd or msgrcv while
 			 * they are copying the message in/out.  We can't
 			 * re-use the entry until they release it.
 			 */
 			msqkptr = &msqids[msqid];
 			if (msqkptr->u.msg_qbytes == 0 &&
 			    (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0)
 				break;
 		}
 		if (msqid == msginfo.msgmni) {
 			DPRINTF(("no more msqid_ds's available\n"));
 			error = ENOSPC;
 			goto done2;
 		}
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(td->td_proc);
 			error = racct_add(td->td_proc, RACCT_NMSGQ, 1);
 			PROC_UNLOCK(td->td_proc);
 			if (error != 0) {
 				error = ENOSPC;
 				goto done2;
 			}
 		}
 #endif
 		DPRINTF(("msqid %d is available\n", msqid));
 		msqkptr->u.msg_perm.key = key;
 		msqkptr->u.msg_perm.cuid = cred->cr_uid;
 		msqkptr->u.msg_perm.uid = cred->cr_uid;
 		msqkptr->u.msg_perm.cgid = cred->cr_gid;
 		msqkptr->u.msg_perm.gid = cred->cr_gid;
 		msqkptr->u.msg_perm.mode = (msgflg & 0777);
 		msqkptr->cred = crhold(cred);
 		/* Make sure that the returned msqid is unique */
 		msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff;
 		msqkptr->u.msg_first = NULL;
 		msqkptr->u.msg_last = NULL;
 		msqkptr->u.msg_cbytes = 0;
 		msqkptr->u.msg_qnum = 0;
 		msqkptr->u.msg_qbytes = msginfo.msgmnb;
 		msqkptr->u.msg_lspid = 0;
 		msqkptr->u.msg_lrpid = 0;
 		msqkptr->u.msg_stime = 0;
 		msqkptr->u.msg_rtime = 0;
 		msqkptr->u.msg_ctime = time_second;
 #ifdef MAC
 		mac_sysvmsq_create(cred, msqkptr);
 #endif
 	} else {
 		DPRINTF(("didn't find it and wasn't asked to create it\n"));
 		error = ENOENT;
 		goto done2;
 	}
 
 found:
 	/* Construct the unique msqid */
 	td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm);
 done2:
 	mtx_unlock(&msq_mtx);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct msgsnd_args {
 	int	msqid;
 	const void	*msgp;
 	size_t	msgsz;
 	int	msgflg;
 };
 #endif
 int
 kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
 	struct thread *td;
 	int msqid;
 	const void *msgp;	/* XXX msgp is actually mtext. */
 	size_t msgsz;
 	int msgflg;
 	long mtype;
 {
 	int msqix, segs_needed, error = 0;
 	register struct msqid_kernel *msqkptr;
 	register struct msg *msghdr;
+	struct prison *rpr;
 	short next;
 #ifdef RACCT
 	size_t saved_msgsz;
 #endif
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	rpr = msg_find_prison(td->td_ucred);
+	if (rpr == NULL)
 		return (ENOSYS);
 
 	mtx_lock(&msq_mtx);
 	msqix = IPCID_TO_IX(msqid);
 
 	if (msqix < 0 || msqix >= msginfo.msgmni) {
 		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
 		    msginfo.msgmni));
 		error = EINVAL;
 		goto done2;
 	}
 
 	msqkptr = &msqids[msqix];
 	if (msqkptr->u.msg_qbytes == 0) {
 		DPRINTF(("no such message queue id\n"));
 		error = EINVAL;
 		goto done2;
 	}
 	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 		DPRINTF(("wrong sequence number\n"));
 		error = EINVAL;
 		goto done2;
 	}
 
+	if ((error = msq_prison_cansee(rpr, msqkptr))) {
+		DPRINTF(("requester can't see prison\n"));
+		goto done2;
+	}
+
 	if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) {
 		DPRINTF(("requester doesn't have write access\n"));
 		goto done2;
 	}
 
 #ifdef MAC
 	error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr);
 	if (error != 0)
 		goto done2;
 #endif
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(td->td_proc);
 		if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) {
 			PROC_UNLOCK(td->td_proc);
 			error = EAGAIN;
 			goto done2;
 		}
 		saved_msgsz = msgsz;
 		if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) {
 			racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
 			PROC_UNLOCK(td->td_proc);
 			error = EAGAIN;
 			goto done2;
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 
 	segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
 	DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz,
 	    msginfo.msgssz, segs_needed));
 	for (;;) {
 		int need_more_resources = 0;
 
 		/*
 		 * check msgsz
 		 * (inside this loop in case msg_qbytes changes while we sleep)
 		 */
 
 		if (msgsz > msqkptr->u.msg_qbytes) {
 			DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n"));
 			error = EINVAL;
 			goto done3;
 		}
 
 		if (msqkptr->u.msg_perm.mode & MSG_LOCKED) {
 			DPRINTF(("msqid is locked\n"));
 			need_more_resources = 1;
 		}
 		if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) {
 			DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n"));
 			need_more_resources = 1;
 		}
 		if (segs_needed > nfree_msgmaps) {
 			DPRINTF(("segs_needed > nfree_msgmaps\n"));
 			need_more_resources = 1;
 		}
 		if (free_msghdrs == NULL) {
 			DPRINTF(("no more msghdrs\n"));
 			need_more_resources = 1;
 		}
 
 		if (need_more_resources) {
 			int we_own_it;
 
 			if ((msgflg & IPC_NOWAIT) != 0) {
 				DPRINTF(("need more resources but caller "
 				    "doesn't want to wait\n"));
 				error = EAGAIN;
 				goto done3;
 			}
 
 			if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) {
 				DPRINTF(("we don't own the msqid_ds\n"));
 				we_own_it = 0;
 			} else {
 				/* Force later arrivals to wait for our
 				   request */
 				DPRINTF(("we own the msqid_ds\n"));
 				msqkptr->u.msg_perm.mode |= MSG_LOCKED;
 				we_own_it = 1;
 			}
 			DPRINTF(("msgsnd:  goodnight\n"));
 			error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
 			    "msgsnd", hz);
 			DPRINTF(("msgsnd:  good morning, error=%d\n", error));
 			if (we_own_it)
 				msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
 			if (error == EWOULDBLOCK) {
 				DPRINTF(("msgsnd:  timed out\n"));
 				continue;
 			}
 			if (error != 0) {
 				DPRINTF(("msgsnd:  interrupted system call\n"));
 				error = EINTR;
 				goto done3;
 			}
 
 			/*
 			 * Make sure that the msq queue still exists
 			 */
 
 			if (msqkptr->u.msg_qbytes == 0) {
 				DPRINTF(("msqid deleted\n"));
 				error = EIDRM;
 				goto done3;
 			}
 
 		} else {
 			DPRINTF(("got all the resources that we need\n"));
 			break;
 		}
 	}
 
 	/*
 	 * We have the resources that we need.
 	 * Make sure!
 	 */
 
 	if (msqkptr->u.msg_perm.mode & MSG_LOCKED)
 		panic("msg_perm.mode & MSG_LOCKED");
 	if (segs_needed > nfree_msgmaps)
 		panic("segs_needed > nfree_msgmaps");
 	if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes)
 		panic("msgsz + msg_cbytes > msg_qbytes");
 	if (free_msghdrs == NULL)
 		panic("no more msghdrs");
 
 	/*
 	 * Re-lock the msqid_ds in case we page-fault when copying in the
 	 * message
 	 */
 
 	if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
 		panic("msqid_ds is already locked");
 	msqkptr->u.msg_perm.mode |= MSG_LOCKED;
 
 	/*
 	 * Allocate a message header
 	 */
 
 	msghdr = free_msghdrs;
 	free_msghdrs = msghdr->msg_next;
 	msghdr->msg_spot = -1;
 	msghdr->msg_ts = msgsz;
 	msghdr->msg_type = mtype;
 #ifdef MAC
 	/*
 	 * XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here
 	 * immediately?  Or, should it be checked just before the msg is
 	 * enqueued in the msgq (as it is done now)?
 	 */
 	mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr);
 #endif
 
 	/*
 	 * Allocate space for the message
 	 */
 
 	while (segs_needed > 0) {
 		if (nfree_msgmaps <= 0)
 			panic("not enough msgmaps");
 		if (free_msgmaps == -1)
 			panic("nil free_msgmaps");
 		next = free_msgmaps;
 		if (next <= -1)
 			panic("next too low #1");
 		if (next >= msginfo.msgseg)
 			panic("next out of range #1");
 		DPRINTF(("allocating segment %d to message\n", next));
 		free_msgmaps = msgmaps[next].next;
 		nfree_msgmaps--;
 		msgmaps[next].next = msghdr->msg_spot;
 		msghdr->msg_spot = next;
 		segs_needed--;
 	}
 
 	/*
 	 * Validate the message type
 	 */
 
 	if (msghdr->msg_type < 1) {
 		msg_freehdr(msghdr);
 		msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
 		wakeup(msqkptr);
 		DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type));
 		error = EINVAL;
 		goto done3;
 	}
 
 	/*
 	 * Copy in the message body
 	 */
 
 	next = msghdr->msg_spot;
 	while (msgsz > 0) {
 		size_t tlen;
 		if (msgsz > msginfo.msgssz)
 			tlen = msginfo.msgssz;
 		else
 			tlen = msgsz;
 		if (next <= -1)
 			panic("next too low #2");
 		if (next >= msginfo.msgseg)
 			panic("next out of range #2");
 		mtx_unlock(&msq_mtx);
 		if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz],
 		    tlen)) != 0) {
 			mtx_lock(&msq_mtx);
 			DPRINTF(("error %d copying in message segment\n",
 			    error));
 			msg_freehdr(msghdr);
 			msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
 			wakeup(msqkptr);
 			goto done3;
 		}
 		mtx_lock(&msq_mtx);
 		msgsz -= tlen;
 		msgp = (const char *)msgp + tlen;
 		next = msgmaps[next].next;
 	}
 	if (next != -1)
 		panic("didn't use all the msg segments");
 
 	/*
 	 * We've got the message.  Unlock the msqid_ds.
 	 */
 
 	msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
 
 	/*
 	 * Make sure that the msqid_ds is still allocated.
 	 */
 
 	if (msqkptr->u.msg_qbytes == 0) {
 		msg_freehdr(msghdr);
 		wakeup(msqkptr);
 		error = EIDRM;
 		goto done3;
 	}
 
 #ifdef MAC
 	/*
 	 * Note: Since the task/thread allocates the msghdr and usually
 	 * primes it with its own MAC label, for a majority of policies, it
 	 * won't be necessary to check whether the msghdr has access
 	 * permissions to the msgq.  The mac_sysvmsq_check_msqsnd check would
 	 * suffice in that case.  However, this hook may be required where
 	 * individual policies derive a non-identical label for the msghdr
 	 * from the current thread label and may want to check the msghdr
 	 * enqueue permissions, along with read/write permissions to the
 	 * msgq.
 	 */
 	error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr);
 	if (error != 0) {
 		msg_freehdr(msghdr);
 		wakeup(msqkptr);
 		goto done3;
 	}
 #endif
 
 	/*
 	 * Put the message into the queue
 	 */
 	if (msqkptr->u.msg_first == NULL) {
 		msqkptr->u.msg_first = msghdr;
 		msqkptr->u.msg_last = msghdr;
 	} else {
 		msqkptr->u.msg_last->msg_next = msghdr;
 		msqkptr->u.msg_last = msghdr;
 	}
 	msqkptr->u.msg_last->msg_next = NULL;
 
 	msqkptr->u.msg_cbytes += msghdr->msg_ts;
 	msqkptr->u.msg_qnum++;
 	msqkptr->u.msg_lspid = td->td_proc->p_pid;
 	msqkptr->u.msg_stime = time_second;
 
 	wakeup(msqkptr);
 	td->td_retval[0] = 0;
 done3:
 #ifdef RACCT
 	if (racct_enable && error != 0) {
 		PROC_LOCK(td->td_proc);
 		racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
 		racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz);
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 done2:
 	mtx_unlock(&msq_mtx);
 	return (error);
 }
 
 int
 sys_msgsnd(td, uap)
 	struct thread *td;
 	register struct msgsnd_args *uap;
 {
 	int error;
 	long mtype;
 
 	DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp,
 	    uap->msgsz, uap->msgflg));
 
 	if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) {
 		DPRINTF(("error %d copying the message type\n", error));
 		return (error);
 	}
 	return (kern_msgsnd(td, uap->msqid,
 	    (const char *)uap->msgp + sizeof(mtype),
 	    uap->msgsz, uap->msgflg, mtype));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct msgrcv_args {
 	int	msqid;
 	void	*msgp;
 	size_t	msgsz;
 	long	msgtyp;
 	int	msgflg;
 };
 #endif
 /*
  * Receive one message from queue msqid on behalf of thread td.
  *
  * msgp points at the user's text buffer (the type word is not part of it)
  * and msgsz is its size.  msgtyp selects the message: 0 takes the first
  * message on the queue; otherwise the first message whose type equals
  * msgtyp, or is <= -msgtyp, is taken.  On success the message type is
  * stored in *mtype, the number of text bytes copied out is placed in
  * td->td_retval[0] and 0 is returned.
  *
  * Errors produced here: ENOSYS (no prison found for the caller's
  * credential), EINVAL (msqid out of range, unused or stale), E2BIG (message
  * longer than msgsz and MSG_NOERROR not set), ENOMSG (nothing suitable and
  * IPC_NOWAIT set), EINTR (msleep() returned non-zero), EIDRM (queue went
  * away while sleeping), plus whatever msq_prison_cansee(), ipcperm(), the
  * MAC hooks or copyout() return.
  */
 int
 kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype)
 	struct thread *td;
 	int msqid;
 	void *msgp;	/* XXX msgp is actually mtext. */
 	size_t msgsz;
 	long msgtyp;
 	int msgflg;
 	long *mtype;
 {
 	size_t len;
 	register struct msqid_kernel *msqkptr;
 	register struct msg *msghdr;
+	struct prison *rpr;
 	int msqix, error = 0;
 	short next;
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	rpr = msg_find_prison(td->td_ucred);
+	if (rpr == NULL)
 		return (ENOSYS);
 
 	msqix = IPCID_TO_IX(msqid);
 
 	if (msqix < 0 || msqix >= msginfo.msgmni) {
 		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
 		    msginfo.msgmni));
 		return (EINVAL);
 	}
 
 	msqkptr = &msqids[msqix];
 	mtx_lock(&msq_mtx);
 	if (msqkptr->u.msg_qbytes == 0) {
 		DPRINTF(("no such message queue id\n"));
 		error = EINVAL;
 		goto done2;
 	}
 	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 		DPRINTF(("wrong sequence number\n"));
 		error = EINVAL;
 		goto done2;
 	}
 
+	if ((error = msq_prison_cansee(rpr, msqkptr))) {
+		DPRINTF(("requester can't see prison\n"));
+		goto done2;
+	}
+
 	if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
 		DPRINTF(("requester doesn't have read access\n"));
 		goto done2;
 	}
 
 #ifdef MAC
 	error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr);
 	if (error != 0)
 		goto done2;
 #endif
 
 	/*
 	 * Loop until a suitable message has been unlinked from the queue.
 	 * Each pass either finds one, fails with an error, or sleeps on the
 	 * queue and rescans after being woken.
 	 */
 	msghdr = NULL;
 	while (msghdr == NULL) {
 		if (msgtyp == 0) {
 			msghdr = msqkptr->u.msg_first;
 			if (msghdr != NULL) {
 				if (msgsz < msghdr->msg_ts &&
 				    (msgflg & MSG_NOERROR) == 0) {
 					DPRINTF(("first message on the queue "
 					    "is too big (want %zu, got %d)\n",
 					    msgsz, msghdr->msg_ts));
 					error = E2BIG;
 					goto done2;
 				}
 #ifdef MAC
 				error = mac_sysvmsq_check_msgrcv(td->td_ucred,
 				    msghdr);
 				if (error != 0)
 					goto done2;
 #endif
 				if (msqkptr->u.msg_first == msqkptr->u.msg_last) {
 					msqkptr->u.msg_first = NULL;
 					msqkptr->u.msg_last = NULL;
 				} else {
 					msqkptr->u.msg_first = msghdr->msg_next;
 					if (msqkptr->u.msg_first == NULL)
 						panic("msg_first/last screwed up #1");
 				}
 			}
 		} else {
 			struct msg *previous;
 			struct msg **prev;
 
 			previous = NULL;
 			prev = &(msqkptr->u.msg_first);
 			while ((msghdr = *prev) != NULL) {
 				/*
 				 * Is this message's type an exact match or is
 				 * this message's type less than or equal to
 				 * the absolute value of a negative msgtyp?
 				 * Note that the second half of this test can
 				 * NEVER be true if msgtyp is positive since
 				 * msg_type is always positive!
 				 */
 				/*
 				 * NOTE(review): -msgtyp is evaluated on a
 				 * long; nothing visible here rejects
 				 * msgtyp == LONG_MIN, whose negation
 				 * overflows -- confirm callers.
 				 */
 
 				if (msgtyp == msghdr->msg_type ||
 				    msghdr->msg_type <= -msgtyp) {
 					DPRINTF(("found message type %ld, "
 					    "requested %ld\n",
 					    msghdr->msg_type, msgtyp));
 					if (msgsz < msghdr->msg_ts &&
 					    (msgflg & MSG_NOERROR) == 0) {
 						DPRINTF(("requested message "
 						    "on the queue is too big "
 						    "(want %zu, got %hu)\n",
 						    msgsz, msghdr->msg_ts));
 						error = E2BIG;
 						goto done2;
 					}
 #ifdef MAC
 					error = mac_sysvmsq_check_msgrcv(
 					    td->td_ucred, msghdr);
 					if (error != 0)
 						goto done2;
 #endif
 					*prev = msghdr->msg_next;
 					if (msghdr == msqkptr->u.msg_last) {
 						if (previous == NULL) {
 							if (prev !=
 							    &msqkptr->u.msg_first)
 								panic("msg_first/last screwed up #2");
 							msqkptr->u.msg_first =
 							    NULL;
 							msqkptr->u.msg_last =
 							    NULL;
 						} else {
 							if (prev ==
 							    &msqkptr->u.msg_first)
 								panic("msg_first/last screwed up #3");
 							msqkptr->u.msg_last =
 							    previous;
 						}
 					}
 					break;
 				}
 				previous = msghdr;
 				prev = &(msghdr->msg_next);
 			}
 		}
 
 		/*
 		 * We've either extracted the msghdr for the appropriate
 		 * message or there isn't one.
 		 * If there is one then bail out of this loop.
 		 */
 
 		if (msghdr != NULL)
 			break;
 
 		/*
 		 * Hmph!  No message found.  Does the user want to wait?
 		 */
 
 		if ((msgflg & IPC_NOWAIT) != 0) {
 			DPRINTF(("no appropriate message found (msgtyp=%ld)\n",
 			    msgtyp));
 			/* The SVID says to return ENOMSG. */
 			error = ENOMSG;
 			goto done2;
 		}
 
 		/*
 		 * Wait for something to happen
 		 */
 
 		DPRINTF(("msgrcv:  goodnight\n"));
 		error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
 		    "msgrcv", 0);
 		DPRINTF(("msgrcv:  good morning (error=%d)\n", error));
 
 		/* Any non-zero msleep() result is reported as EINTR. */
 		if (error != 0) {
 			DPRINTF(("msgrcv:  interrupted system call\n"));
 			error = EINTR;
 			goto done2;
 		}
 
 		/*
 		 * Make sure that the msq queue still exists
 		 */
 
 		if (msqkptr->u.msg_qbytes == 0 ||
 		    msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 			DPRINTF(("msqid deleted\n"));
 			error = EIDRM;
 			goto done2;
 		}
 	}
 
 	/*
 	 * Return the message to the user.
 	 *
 	 * First, do the bookkeeping (before we risk being interrupted).
 	 */
 
 	msqkptr->u.msg_cbytes -= msghdr->msg_ts;
 	msqkptr->u.msg_qnum--;
 	msqkptr->u.msg_lrpid = td->td_proc->p_pid;
 	msqkptr->u.msg_rtime = time_second;
 
 	racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1);
 	racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts);
 
 	/*
 	 * Make msgsz the actual amount that we'll be returning.
 	 * Note that this effectively truncates the message if it is too long
 	 * (since msgsz is never increased).
 	 */
 
 	DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz,
 	    msghdr->msg_ts));
 	if (msgsz > msghdr->msg_ts)
 		msgsz = msghdr->msg_ts;
 	*mtype = msghdr->msg_type;
 
 	/*
 	 * Return the segments to the user
 	 */
 
 	next = msghdr->msg_spot;
 	for (len = 0; len < msgsz; len += msginfo.msgssz) {
 		size_t tlen;
 
 		if (msgsz - len > msginfo.msgssz)
 			tlen = msginfo.msgssz;
 		else
 			tlen = msgsz - len;
 		if (next <= -1)
 			panic("next too low #3");
 		if (next >= msginfo.msgseg)
 			panic("next out of range #3");
 		/* msq_mtx is dropped for the duration of each copyout(). */
 		mtx_unlock(&msq_mtx);
 		error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen);
 		mtx_lock(&msq_mtx);
 		if (error != 0) {
 			/*
 			 * The message was already unlinked and accounted
 			 * for above, so it is released rather than requeued.
 			 */
 			DPRINTF(("error (%d) copying out message segment\n",
 			    error));
 			msg_freehdr(msghdr);
 			wakeup(msqkptr);
 			goto done2;
 		}
 		msgp = (char *)msgp + tlen;
 		next = msgmaps[next].next;
 	}
 
 	/*
 	 * Done, return the actual number of bytes copied out.
 	 */
 
 	msg_freehdr(msghdr);
 	wakeup(msqkptr);
 	td->td_retval[0] = msgsz;
 done2:
 	mtx_unlock(&msq_mtx);
 	return (error);
 }
 
 int
 sys_msgrcv(td, uap)
 	struct thread *td;
 	register struct msgrcv_args *uap;
 {
 	int error;
 	long mtype;
 
 	DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid,
 	    uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg));
 
 	if ((error = kern_msgrcv(td, uap->msqid,
 	    (char *)uap->msgp + sizeof(mtype), uap->msgsz,
 	    uap->msgtyp, uap->msgflg, &mtype)) != 0)
 		return (error);
 	if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0)
 		DPRINTF(("error %d copying the message type\n", error));
 	return (error);
 }
 
 /*
  * Handler for the kern.ipc.msqids sysctl: export msqids[] as an array of
  * msginfo.msgmni struct msqid_kernel entries.
  *
  * Slots that are unused (msg_qbytes == 0), or that the requester's prison
  * may not see, are reported as all-zero entries.  Visible queues whose
  * credential belongs to a different prison than the requester's are copied
  * with their key replaced by IPC_PRIVATE.
  *
  * NOTE(review): no msq_mtx acquisition is visible in this function while
  * msqids[] and msqkptr->cred are read -- confirm that this is intended.
  */
 static int
 sysctl_msqids(SYSCTL_HANDLER_ARGS)
 {
+	struct sbuf sb;
+	struct msqid_kernel tmp, empty;
+	struct msqid_kernel *msqkptr;
+	struct prison *rpr;
+	int error, i;
 
-	return (SYSCTL_OUT(req, msqids,
-	    sizeof(struct msqid_kernel) * msginfo.msgmni));
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		goto done;
+	rpr = msg_find_prison(req->td->td_ucred);
+	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct msqid_kernel) *
+	    msginfo.msgmni, req);
+
+	bzero(&empty, sizeof(empty));	/* stand-in for hidden/unused slots */
+	for (i = 0; i < msginfo.msgmni; i++) {
+		msqkptr = &msqids[i];
+		if (msqkptr->u.msg_qbytes == 0 || rpr == NULL ||
+		    msq_prison_cansee(rpr, msqkptr) != 0) {
+			msqkptr = &empty;
+		} else if (req->td->td_ucred->cr_prison !=
+		    msqkptr->cred->cr_prison) {
+			bcopy(msqkptr, &tmp, sizeof(tmp));
+			msqkptr = &tmp;
+			msqkptr->u.msg_perm.key = IPC_PRIVATE;
+		}
+
+		sbuf_bcat(&sb, msqkptr, sizeof(*msqkptr));
+	}
+	error = sbuf_finish(&sb);
+	sbuf_delete(&sb);
+
+done:
+	return (error);
 }
 
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0,
     "Maximum message size");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0,
     "Number of message queue identifiers");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0,
     "Maximum number of bytes in a queue");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0,
     "Maximum number of messages in the system");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0,
     "Size of a message segment");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0,
     "Number of message segments");
 SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD,
     NULL, 0, sysctl_msqids, "", "Message queue IDs");
 
+static int
+msg_prison_check(void *obj, void *data)
+{
+	struct prison *pr = obj;
+	struct prison *prpr;
+	struct vfsoptlist *opts = data;
+	int error, jsys;
+
+	/*
+	 * Validate a requested "sysvmsg" jail parameter for prison pr
+	 * (obj), read from the option list in data.
+	 *
+	 * sysvmsg is a jailsys integer.
+	 * It must be "disable" if the parent jail is disabled.
+	 *
+	 * If the option is absent (ENOENT) there is nothing to check and 0
+	 * is returned; any other vfs_copyopt() error is passed back as is.
+	 * "new" and "inherit" are refused with EPERM when the parent has
+	 * nothing stored in msg_prison_slot, and an unknown value yields
+	 * EINVAL.
+	 *
+	 * NOTE(review): presumably installed as the PR_METHOD_CHECK jail
+	 * OSD method, as sem_prison_check is in seminit(); the registration
+	 * for messages is not visible in this chunk.
+	 */
+	error = vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys));
+	if (error != ENOENT) {
+		if (error != 0)
+			return (error);
+		switch (jsys) {
+		case JAIL_SYS_DISABLE:
+			break;
+		case JAIL_SYS_NEW:
+		case JAIL_SYS_INHERIT:
+			prison_lock(pr->pr_parent);
+			prpr = osd_jail_get(pr->pr_parent, msg_prison_slot);
+			prison_unlock(pr->pr_parent);
+			if (prpr == NULL)
+				return (EPERM);
+			break;
+		default:
+			return (EINVAL);
+		}
+	}
+
+	return (0);
+}
+
+static int
+msg_prison_set(void *obj, void *data)
+{
+	struct prison *pr = obj;
+	struct prison *tpr, *orpr, *nrpr, *trpr;
+	struct vfsoptlist *opts = data;
+	void *rsv;
+	int jsys, descend;
+
+	/*
+	 * Apply a "sysvmsg" setting to prison pr (obj) by updating the root
+	 * prison kept in its msg_prison_slot OSD entry.  Always returns 0.
+	 *
+	 * sysvmsg controls which jail is the root of the associated msgs (this
+	 * jail or same as the parent), or if the feature is available at all.
+	 *
+	 * When "sysvmsg" itself is absent the legacy flags are consulted:
+	 * allow.sysvipc maps to JAIL_SYS_INHERIT, allow.nosysvipc maps to
+	 * JAIL_SYS_DISABLE, and -1 means the jail is left unchanged.
+	 *
+	 * Whenever a jail that was its own root (osd value == the jail)
+	 * loses that role, msg_prison_cleanup() removes the queues it owns.
+	 */
+	if (vfs_copyopt(opts, "sysvmsg", &jsys, sizeof(jsys)) == ENOENT)
+		jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0)
+		    ? JAIL_SYS_INHERIT
+		    : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0)
+		    ? JAIL_SYS_DISABLE
+		    : -1;
+	if (jsys == JAIL_SYS_DISABLE) {
+		prison_lock(pr);
+		orpr = osd_jail_get(pr, msg_prison_slot);
+		if (orpr != NULL)
+			osd_jail_del(pr, msg_prison_slot);
+		prison_unlock(pr);
+		if (orpr != NULL) {
+			if (orpr == pr)
+				msg_prison_cleanup(pr);
+			/*
+			 * Disable all child jails as well.  Setting descend
+			 * to 0 skips the subtree below a child that already
+			 * has no entry.
+			 */
+			FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
+				prison_lock(tpr);
+				trpr = osd_jail_get(tpr, msg_prison_slot);
+				if (trpr != NULL) {
+					osd_jail_del(tpr, msg_prison_slot);
+					prison_unlock(tpr);
+					if (trpr == tpr)
+						msg_prison_cleanup(tpr);
+				} else {
+					prison_unlock(tpr);
+					descend = 0;
+				}
+			}
+		}
+	} else if (jsys != -1) {
+		if (jsys == JAIL_SYS_NEW)
+			nrpr = pr;
+		else {
+			prison_lock(pr->pr_parent);
+			nrpr = osd_jail_get(pr->pr_parent, msg_prison_slot);
+			prison_unlock(pr->pr_parent);
+		}
+		rsv = osd_reserve(msg_prison_slot);
+		prison_lock(pr);
+		orpr = osd_jail_get(pr, msg_prison_slot);
+		if (orpr != nrpr)
+			(void)osd_jail_set_reserved(pr, msg_prison_slot, rsv,
+			    nrpr);
+		else
+			osd_free_reserved(rsv);
+		prison_unlock(pr);
+		if (orpr != nrpr) {
+			if (orpr == pr)
+				msg_prison_cleanup(pr);
+			if (orpr != NULL) {
+				/*
+				 * Change child jails matching the old root.
+				 * Children with a different root are left
+				 * alone and their subtrees are skipped.
+				 */
+				FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
+					prison_lock(tpr);
+					trpr = osd_jail_get(tpr,
+					    msg_prison_slot);
+					if (trpr == orpr) {
+						(void)osd_jail_set(tpr,
+						    msg_prison_slot, nrpr);
+						prison_unlock(tpr);
+						if (trpr == tpr)
+							msg_prison_cleanup(tpr);
+					} else {
+						prison_unlock(tpr);
+						descend = 0;
+					}
+				}
+			}
+		}
+	}
+
+	return (0);
+}
+
+static int
+msg_prison_get(void *obj, void *data)
+{
+	struct prison *pr = obj;
+	struct prison *rpr;
+	struct vfsoptlist *opts = data;
+	int error, jsys;
+
+	/*
+	 * Set sysvmsg based on the jail's root prison.
+	 *
+	 * No root recorded in msg_prison_slot reports JAIL_SYS_DISABLE, a
+	 * jail that is its own root reports JAIL_SYS_NEW, and anything else
+	 * reports JAIL_SYS_INHERIT.  ENOENT from vfs_setopt() (the option
+	 * was not asked for) is not treated as an error.
+	 */
+	prison_lock(pr);
+	rpr = osd_jail_get(pr, msg_prison_slot);
+	prison_unlock(pr);
+	jsys = rpr == NULL ? JAIL_SYS_DISABLE
+	    : rpr == pr ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
+	error = vfs_setopt(opts, "sysvmsg", &jsys, sizeof(jsys));
+	if (error == ENOENT)
+		error = 0;
+	return (error);
+}
+
+static int
+msg_prison_remove(void *obj, void *data __unused)
+{
+	struct prison *pr = obj;
+	struct prison *rpr;
+	/*
+	 * Look up the root prison recorded for pr (obj) in msg_prison_slot.
+	 * Only a jail that is its own root has its message queues removed
+	 * here; in every other case nothing is done.  Always returns 0.
+	 */
+	prison_lock(pr);
+	rpr = osd_jail_get(pr, msg_prison_slot);
+	prison_unlock(pr);
+	if (rpr == pr)
+		msg_prison_cleanup(pr);
+	return (0);
+}
+
+static void
+msg_prison_cleanup(struct prison *pr)
+{
+	struct msqid_kernel *msqkptr;
+	int i;
+
+	/*
+	 * Remove any msqs that belong to this jail.
+	 *
+	 * Every msqids[] slot is scanned under msq_mtx; a slot counts as
+	 * belonging to pr when it is in use (msg_qbytes != 0) and its
+	 * credential's cr_prison is pr.
+	 */
+	mtx_lock(&msq_mtx);
+	for (i = 0; i < msginfo.msgmni; i++) {
+		msqkptr = &msqids[i];
+		if (msqkptr->u.msg_qbytes != 0 &&
+		    msqkptr->cred != NULL && msqkptr->cred->cr_prison == pr)
+			msq_remove(msqkptr);
+	}
+	mtx_unlock(&msq_mtx);
+}
+
+SYSCTL_JAIL_PARAM_SYS_NODE(sysvmsg, CTLFLAG_RW, "SYSV message queues");
+
 #ifdef COMPAT_FREEBSD32
 /*
  * 32-bit compat entry point for the multiplexed msgsys() call.
  *
  * Sub-calls 0, 2 and 3 need 32-bit specific argument handling and are
  * routed to the freebsd32 variants; everything else goes through the native
  * sys_msgsys().  Without any of the COMPAT_FREEBSD4..7 options the call is
  * handed to nosys().
  */
 int
 freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap)
 {
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 	void *args;
 	int error;
 
 	/* The selected call's own arguments start at a2. */
 	args = &uap->a2;
 	switch (uap->which) {
 	case 0:
 		error = freebsd7_freebsd32_msgctl(td, args);
 		break;
 	case 2:
 		error = freebsd32_msgsnd(td, args);
 		break;
 	case 3:
 		error = freebsd32_msgrcv(td, args);
 		break;
 	default:
 		error = sys_msgsys(td, (struct msgsys_args *)uap);
 		break;
 	}
 	return (error);
 #else
 	return (nosys(td, NULL));
 #endif
 }
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 int
 freebsd7_freebsd32_msgctl(struct thread *td,
     struct freebsd7_freebsd32_msgctl_args *uap)
 {
 	struct msqid_ds msqbuf;
 	struct msqid_ds32_old msqbuf32;
 	int error;
 
 	if (uap->cmd == IPC_SET) {
 		error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
 		if (error)
 			return (error);
 		freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
 		PTRIN_CP(msqbuf32, msqbuf, msg_first);
 		PTRIN_CP(msqbuf32, msqbuf, msg_last);
 		CP(msqbuf32, msqbuf, msg_cbytes);
 		CP(msqbuf32, msqbuf, msg_qnum);
 		CP(msqbuf32, msqbuf, msg_qbytes);
 		CP(msqbuf32, msqbuf, msg_lspid);
 		CP(msqbuf32, msqbuf, msg_lrpid);
 		CP(msqbuf32, msqbuf, msg_stime);
 		CP(msqbuf32, msqbuf, msg_rtime);
 		CP(msqbuf32, msqbuf, msg_ctime);
 	}
 	error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
 	if (error)
 		return (error);
 	if (uap->cmd == IPC_STAT) {
 		bzero(&msqbuf32, sizeof(msqbuf32));
 		freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
 		PTROUT_CP(msqbuf, msqbuf32, msg_first);
 		PTROUT_CP(msqbuf, msqbuf32, msg_last);
 		CP(msqbuf, msqbuf32, msg_cbytes);
 		CP(msqbuf, msqbuf32, msg_qnum);
 		CP(msqbuf, msqbuf32, msg_qbytes);
 		CP(msqbuf, msqbuf32, msg_lspid);
 		CP(msqbuf, msqbuf32, msg_lrpid);
 		CP(msqbuf, msqbuf32, msg_stime);
 		CP(msqbuf, msqbuf32, msg_rtime);
 		CP(msqbuf, msqbuf32, msg_ctime);
 		error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
 	}
 	return (error);
 }
 #endif
 
 int
 freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap)
 {
 	struct msqid_ds msqbuf;
 	struct msqid_ds32 msqbuf32;
 	int error;
 
 	if (uap->cmd == IPC_SET) {
 		error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
 		if (error)
 			return (error);
 		freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
 		PTRIN_CP(msqbuf32, msqbuf, msg_first);
 		PTRIN_CP(msqbuf32, msqbuf, msg_last);
 		CP(msqbuf32, msqbuf, msg_cbytes);
 		CP(msqbuf32, msqbuf, msg_qnum);
 		CP(msqbuf32, msqbuf, msg_qbytes);
 		CP(msqbuf32, msqbuf, msg_lspid);
 		CP(msqbuf32, msqbuf, msg_lrpid);
 		CP(msqbuf32, msqbuf, msg_stime);
 		CP(msqbuf32, msqbuf, msg_rtime);
 		CP(msqbuf32, msqbuf, msg_ctime);
 	}
 	error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
 	if (error)
 		return (error);
 	if (uap->cmd == IPC_STAT) {
 		freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
 		PTROUT_CP(msqbuf, msqbuf32, msg_first);
 		PTROUT_CP(msqbuf, msqbuf32, msg_last);
 		CP(msqbuf, msqbuf32, msg_cbytes);
 		CP(msqbuf, msqbuf32, msg_qnum);
 		CP(msqbuf, msqbuf32, msg_qbytes);
 		CP(msqbuf, msqbuf32, msg_lspid);
 		CP(msqbuf, msqbuf32, msg_lrpid);
 		CP(msqbuf, msqbuf32, msg_stime);
 		CP(msqbuf, msqbuf32, msg_rtime);
 		CP(msqbuf, msqbuf32, msg_ctime);
 		error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
 	}
 	return (error);
 }
 
 int
 freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap)
 {
 	const void *msgp;
 	long mtype;
 	int32_t mtype32;
 	int error;
 
 	msgp = PTRIN(uap->msgp);
 	if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0)
 		return (error);
 	mtype = mtype32;
 	return (kern_msgsnd(td, uap->msqid,
 	    (const char *)msgp + sizeof(mtype32),
 	    uap->msgsz, uap->msgflg, mtype));
 }
 
 int
 freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap)
 {
 	void *msgp;
 	long mtype;
 	int32_t mtype32;
 	int error;
 
 	msgp = PTRIN(uap->msgp);
 	if ((error = kern_msgrcv(td, uap->msqid,
 	    (char *)msgp + sizeof(mtype32), uap->msgsz,
 	    uap->msgtyp, uap->msgflg, &mtype)) != 0)
 		return (error);
 	mtype32 = (int32_t)mtype;
 	return (copyout(&mtype32, msgp, sizeof(mtype32)));
 }
 #endif
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
 /*
  * Dispatch table for sys_msgsys(), indexed by uap->which:
  * 0 = freebsd7_msgctl, 1 = sys_msgget, 2 = sys_msgsnd, 3 = sys_msgrcv.
  */
 static sy_call_t *msgcalls[] = {
 	(sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget,
 	(sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv
 };
 
 /*
  * Entry point for all MSG calls.
  */
 /*
  * Multiplexed entry point: uap->which selects an entry of msgcalls[] and
  * the selected handler's return value is passed back.  An out-of-range
  * selector yields EINVAL.
  */
 int
 sys_msgsys(td, uap)
 	struct thread *td;
 	/* XXX actually varargs. */
 	struct msgsys_args /* {
 		int	which;
 		int	a2;
 		int	a3;
 		int	a4;
 		int	a5;
 		int	a6;
 	} */ *uap;
 {
 	int error;
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
-		return (ENOSYS);
 	if (uap->which < 0 || uap->which >= nitems(msgcalls))
 		return (EINVAL);
 	/* The handler's own argument list starts at a2. */
 	error = (*msgcalls[uap->which])(td, &uap->a2);
 	return (error);
 }
 
 #ifndef CP
 #define CP(src, dst, fld)	do { (dst).fld = (src).fld; } while (0)
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd7_msgctl_args {
 	int	msqid;
 	int	cmd;
 	struct	msqid_ds_old *buf;
 };
 #endif
 int
 freebsd7_msgctl(td, uap)
 	struct thread *td;
 	struct freebsd7_msgctl_args *uap;
 {
 	struct msqid_ds_old msqold;
 	struct msqid_ds msqbuf;
 	int error;
 
 	DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd,
 	    uap->buf));
 	if (uap->cmd == IPC_SET) {
 		error = copyin(uap->buf, &msqold, sizeof(msqold));
 		if (error)
 			return (error);
 		ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm);
 		CP(msqold, msqbuf, msg_first);
 		CP(msqold, msqbuf, msg_last);
 		CP(msqold, msqbuf, msg_cbytes);
 		CP(msqold, msqbuf, msg_qnum);
 		CP(msqold, msqbuf, msg_qbytes);
 		CP(msqold, msqbuf, msg_lspid);
 		CP(msqold, msqbuf, msg_lrpid);
 		CP(msqold, msqbuf, msg_stime);
 		CP(msqold, msqbuf, msg_rtime);
 		CP(msqold, msqbuf, msg_ctime);
 	}
 	error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
 	if (error)
 		return (error);
 	if (uap->cmd == IPC_STAT) {
 		bzero(&msqold, sizeof(msqold));
 		ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm);
 		CP(msqbuf, msqold, msg_first);
 		CP(msqbuf, msqold, msg_last);
 		CP(msqbuf, msqold, msg_cbytes);
 		CP(msqbuf, msqold, msg_qnum);
 		CP(msqbuf, msqold, msg_qbytes);
 		CP(msqbuf, msqold, msg_lspid);
 		CP(msqbuf, msqold, msg_lrpid);
 		CP(msqbuf, msqold, msg_stime);
 		CP(msqbuf, msqold, msg_rtime);
 		CP(msqbuf, msqold, msg_ctime);
 		error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old));
 	}
 	return (error);
 }
 
 #undef CP
 
 #endif	/* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
 	   COMPAT_FREEBSD7 */
Index: head/sys/kern/sysv_sem.c
===================================================================
--- head/sys/kern/sysv_sem.c	(revision 298584)
+++ head/sys/kern/sysv_sem.c	(revision 298585)
@@ -1,1660 +1,1962 @@
 /*-
  * Implementation of SVID semaphores
  *
  * Author:  Daniel Boulet
  *
  * This software is provided ``AS IS'' without any warranties of any kind.
  */
 /*-
  * Copyright (c) 2003-2005 McAfee, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by McAfee
  * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
  * program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_sysvipc.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/racct.h>
+#include <sys/sbuf.h>
 #include <sys/sem.h>
+#include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/jail.h>
 
 #include <security/mac/mac_framework.h>
 
 FEATURE(sysv_sem, "System V semaphores support");
 
 static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores");
 
 #ifdef SEM_DEBUG
 #define DPRINTF(a)	printf a
 #else
 #define DPRINTF(a)
 #endif
 
 static int seminit(void);
 static int sysvsem_modload(struct module *, int, void *);
 static int semunload(void);
 static void semexit_myhook(void *arg, struct proc *p);
 static int sysctl_sema(SYSCTL_HANDLER_ARGS);
-static int semvalid(int semid, struct semid_kernel *semakptr);
+static int semvalid(int semid, struct prison *rpr,
+    struct semid_kernel *semakptr);
+static void sem_remove(int semidx, struct ucred *cred);
+static struct prison *sem_find_prison(struct ucred *);
+static int sem_prison_cansee(struct prison *, struct semid_kernel *);
+static int sem_prison_check(void *, void *);
+static int sem_prison_set(void *, void *);
+static int sem_prison_get(void *, void *);
+static int sem_prison_remove(void *, void *);
+static void sem_prison_cleanup(struct prison *);
 
 #ifndef _SYS_SYSPROTO_H_
 struct __semctl_args;
 int __semctl(struct thread *td, struct __semctl_args *uap);
 struct semget_args;
 int semget(struct thread *td, struct semget_args *uap);
 struct semop_args;
 int semop(struct thread *td, struct semop_args *uap);
 #endif
 
 static struct sem_undo *semu_alloc(struct thread *td);
 static int semundo_adjust(struct thread *td, struct sem_undo **supptr,
     int semid, int semseq, int semnum, int adjval);
 static void semundo_clear(int semid, int semnum);
 
 static struct mtx	sem_mtx;	/* semaphore global lock */
 static struct mtx sem_undo_mtx;
 static int	semtot = 0;
 static struct semid_kernel *sema;	/* semaphore id pool */
 static struct mtx *sema_mtx;	/* semaphore id pool mutexes*/
 static struct sem *sem;		/* semaphore pool */
 LIST_HEAD(, sem_undo) semu_list;	/* list of active undo structures */
 LIST_HEAD(, sem_undo) semu_free_list;	/* list of free undo structures */
 static int	*semu;		/* undo structure pool */
 static eventhandler_tag semexit_tag;
+static unsigned sem_prison_slot;	/* prison OSD slot */
 
 #define SEMUNDO_MTX		sem_undo_mtx
 #define SEMUNDO_LOCK()		mtx_lock(&SEMUNDO_MTX);
 #define SEMUNDO_UNLOCK()	mtx_unlock(&SEMUNDO_MTX);
 #define SEMUNDO_LOCKASSERT(how)	mtx_assert(&SEMUNDO_MTX, (how));
 
 struct sem {
 	u_short	semval;		/* semaphore value */
 	pid_t	sempid;		/* pid of last operation */
 	u_short	semncnt;	/* # awaiting semval > cval */
 	u_short	semzcnt;	/* # awaiting semval = 0 */
 };
 
 /*
  * Undo structure (one per process)
  */
 /*
  * Structures of this type live in the semu pool and are addressed with
  * SEMU(ix) using a stride of seminfo.semusz bytes.  un_next links a
  * structure onto either semu_list (in use) or semu_free_list (free).
  * Although un_ent is declared with one element, SEMUSZ sizes each structure
  * to hold SEMUME entries.
  */
 struct sem_undo {
 	LIST_ENTRY(sem_undo) un_next;	/* ptr to next active undo structure */
 	struct	proc *un_proc;		/* owner of this structure */
 	short	un_cnt;			/* # of active entries */
 	struct undo {
 		short	un_adjval;	/* adjust on exit values */
 		short	un_num;		/* semaphore # */
 		int	un_id;		/* semid */
 		unsigned short un_seq;	/* presumably the semid's sequence number -- TODO confirm */
 	} un_ent[1];			/* undo entries */
 };
 
 /*
  * Configuration parameters
  */
 #ifndef SEMMNI
 #define SEMMNI	50		/* # of semaphore identifiers */
 #endif
 #ifndef SEMMNS
 #define SEMMNS	340		/* # of semaphores in system */
 #endif
 #ifndef SEMUME
 #define SEMUME	50		/* max # of undo entries per process */
 #endif
 #ifndef SEMMNU
 #define SEMMNU	150		/* # of undo structures in system */
 #endif
 
 /* shouldn't need tuning */
 #ifndef SEMMSL
 #define SEMMSL	SEMMNS		/* max # of semaphores per id */
 #endif
 #ifndef SEMOPM
 #define SEMOPM	100		/* max # of operations per semop call */
 #endif
 
 #define SEMVMX	32767		/* semaphore maximum value */
 #define SEMAEM	16384		/* adjust on exit max value */
 
 /*
  * Due to the way semaphore memory is allocated, we have to ensure that
  * SEMUSZ is properly aligned.
  */
 
 #define	SEM_ALIGN(bytes) roundup2(bytes, sizeof(long))
 
 /* actual size of an undo structure */
 #define SEMUSZ	SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME]))
 
 /*
  * Macro to find a particular sem_undo vector
  */
 #define SEMU(ix) \
 	((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz))
 
 /*
  * semaphore info struct
  */
 struct seminfo seminfo = {
                 SEMMNI,         /* # of semaphore identifiers */
                 SEMMNS,         /* # of semaphores in system */
                 SEMMNU,         /* # of undo structures in system */
                 SEMMSL,         /* max # of semaphores per id */
                 SEMOPM,         /* max # of operations per semop call */
                 SEMUME,         /* max # of undo entries per process */
                 SEMUSZ,         /* size in bytes of undo structure */
                 SEMVMX,         /* semaphore maximum value */
                 SEMAEM          /* adjust on exit max value */
 };
 
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0,
     "Number of semaphore identifiers");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0,
     "Maximum number of semaphores in the system");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0,
     "Maximum number of undo structures in the system");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RWTUN, &seminfo.semmsl, 0,
     "Max semaphores per id");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0,
     "Max operations per semop call");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0,
     "Max undo entries per process");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0,
     "Size in bytes of undo structure");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RWTUN, &seminfo.semvmx, 0,
     "Semaphore maximum value");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RWTUN, &seminfo.semaem, 0,
     "Adjust on exit max value");
 SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD,
     NULL, 0, sysctl_sema, "", "Semaphore id pool");
 
 static struct syscall_helper_data sem_syscalls[] = {
 	SYSCALL_INIT_HELPER(__semctl),
 	SYSCALL_INIT_HELPER(semget),
 	SYSCALL_INIT_HELPER(semop),
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 	SYSCALL_INIT_HELPER(semsys),
 	SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl),
 #endif
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_ipc.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 static struct syscall_helper_data sem32_syscalls[] = {
 	SYSCALL32_INIT_HELPER(freebsd32_semctl),
 	SYSCALL32_INIT_HELPER_COMPAT(semget),
 	SYSCALL32_INIT_HELPER_COMPAT(semop),
 	SYSCALL32_INIT_HELPER(freebsd32_semsys),
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 	SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl),
 #endif
 	SYSCALL_INIT_LAST
 };
 #endif
 
 /*
  * Module load-time initialisation: allocate the semaphore, id, mutex and
  * undo pools, initialise locks and lists, hook process exit, set up the
  * per-prison OSD slot and register the system calls.
  *
  * Returns 0, or the error from syscall registration.  On failure nothing is
  * torn down here; sysvsem_modload() calls semunload() in that case.
  */
 static int
 seminit(void)
 {
+	struct prison *pr;
+	void *rsv;
 	int i, error;
+	osd_method_t methods[PR_MAXMETHOD] = {
+	    [PR_METHOD_CHECK] =		sem_prison_check,
+	    [PR_METHOD_SET] =		sem_prison_set,
+	    [PR_METHOD_GET] =		sem_prison_get,
+	    [PR_METHOD_REMOVE] =	sem_prison_remove,
+	};
 
 	sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK);
 	sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM,
 	    M_WAITOK);
 	sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM,
 	    M_WAITOK | M_ZERO);
 	semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK);
 
 	/* Start with every semaphore id slot cleared. */
 	for (i = 0; i < seminfo.semmni; i++) {
 		sema[i].u.sem_base = 0;
 		sema[i].u.sem_perm.mode = 0;
 		sema[i].u.sem_perm.seq = 0;
 #ifdef MAC
 		mac_sysvsem_init(&sema[i]);
 #endif
 	}
 	for (i = 0; i < seminfo.semmni; i++)
 		mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF);
 	/* Every undo structure starts out unowned on the free list. */
 	LIST_INIT(&semu_free_list);
 	for (i = 0; i < seminfo.semmnu; i++) {
 		struct sem_undo *suptr = SEMU(i);
 		suptr->un_proc = NULL;
 		LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
 	}
 	LIST_INIT(&semu_list);
 	mtx_init(&sem_mtx, "sem", NULL, MTX_DEF);
 	mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF);
 	semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL,
 	    EVENTHANDLER_PRI_ANY);
 
+	/*
+	 * Set current prisons according to their allow.sysvipc.
+	 *
+	 * prison0 becomes its own root.  Every other existing prison that
+	 * has PR_ALLOW_SYSVIPC set and a positive pr_ref gets prison0 as
+	 * its root.  The OSD reservation is made before prison_lock() is
+	 * taken and is reused on the next pass when a prison did not
+	 * consume it; a leftover reservation is freed after the loop.
+	 */
+	sem_prison_slot = osd_jail_register(NULL, methods);
+	rsv = osd_reserve(sem_prison_slot);
+	prison_lock(&prison0);
+	(void)osd_jail_set_reserved(&prison0, sem_prison_slot, rsv, &prison0);
+	prison_unlock(&prison0);
+	rsv = NULL;
+	sx_slock(&allprison_lock);
+	TAILQ_FOREACH(pr, &allprison, pr_list) {
+		if (rsv == NULL)
+			rsv = osd_reserve(sem_prison_slot);
+		prison_lock(pr);
+		if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) {
+			(void)osd_jail_set_reserved(pr, sem_prison_slot, rsv,
+			    &prison0);
+			rsv = NULL;
+		}
+		prison_unlock(pr);
+	}
+	if (rsv != NULL)
+		osd_free_reserved(rsv);
+	sx_sunlock(&allprison_lock);
+
 	error = syscall_helper_register(sem_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	error = syscall32_helper_register(sem32_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #endif
 	return (0);
 }
 
 /*
  * Undo seminit(): unregister the system calls, the process-exit hook and
  * the prison OSD slot, then destroy the mutexes and free the pools.
  *
  * Refuses with EBUSY while semtot is non-zero; otherwise returns 0.
  */
 static int
 semunload(void)
 {
 	int i;
 
 	/* XXXKIB */
 	if (semtot != 0)
 		return (EBUSY);
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(sem32_syscalls);
 #endif
 	syscall_helper_unregister(sem_syscalls);
 	EVENTHANDLER_DEREGISTER(process_exit, semexit_tag);
+	if (sem_prison_slot != 0)
+		osd_jail_deregister(sem_prison_slot);
 #ifdef MAC
 	for (i = 0; i < seminfo.semmni; i++)
 		mac_sysvsem_destroy(&sema[i]);
 #endif
 	free(sem, M_SEM);
 	free(sema, M_SEM);
 	free(semu, M_SEM);
 	/* The per-id mutexes must be destroyed before their array is freed. */
 	for (i = 0; i < seminfo.semmni; i++)
 		mtx_destroy(&sema_mtx[i]);
 	free(sema_mtx, M_SEM);
 	mtx_destroy(&sem_mtx);
 	mtx_destroy(&sem_undo_mtx);
 	return (0);
 }
 
 static int
 sysvsem_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		error = seminit();
 		if (error != 0)
 			semunload();
 		break;
 	case MOD_UNLOAD:
 		error = semunload();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t sysvsem_mod = {
 	"sysvsem",
 	&sysvsem_modload,
 	NULL
 };
 
 DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
 MODULE_VERSION(sysvsem, 1);
 
 /*
  * Allocate a new sem_undo structure for a process
  * (returns ptr to structure or NULL if no more room)
  */
 
 static struct sem_undo *
 semu_alloc(struct thread *td)
 {
 	struct sem_undo *suptr;
 
 	SEMUNDO_LOCKASSERT(MA_OWNED);
 	if ((suptr = LIST_FIRST(&semu_free_list)) == NULL)
 		return (NULL);
 	LIST_REMOVE(suptr, un_next);
 	LIST_INSERT_HEAD(&semu_list, suptr, un_next);
 	suptr->un_cnt = 0;
 	suptr->un_proc = td->td_proc;
 	return (suptr);
 }
 
 static int
 semu_try_free(struct sem_undo *suptr)
 {
 
 	SEMUNDO_LOCKASSERT(MA_OWNED);
 
 	if (suptr->un_cnt != 0)
 		return (0);
 	LIST_REMOVE(suptr, un_next);
 	LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
 	return (1);
 }
 
 /*
  * Adjust a particular entry for a particular proc
  */
 
 static int
 semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid,
     int semseq, int semnum, int adjval)
 {
 	struct proc *p = td->td_proc;
 	struct sem_undo *suptr;
 	struct undo *sunptr;
 	int i;
 
 	SEMUNDO_LOCKASSERT(MA_OWNED);
 	/* Look for and remember the sem_undo if the caller doesn't provide
 	   it */
 
 	suptr = *supptr;
 	if (suptr == NULL) {
 		LIST_FOREACH(suptr, &semu_list, un_next) {
 			if (suptr->un_proc == p) {
 				*supptr = suptr;
 				break;
 			}
 		}
 		if (suptr == NULL) {
 			if (adjval == 0)
 				return(0);
 			suptr = semu_alloc(td);
 			if (suptr == NULL)
 				return (ENOSPC);
 			*supptr = suptr;
 		}
 	}
 
 	/*
 	 * Look for the requested entry and adjust it (delete if adjval becomes
 	 * 0).
 	 */
 	sunptr = &suptr->un_ent[0];
 	for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
 		if (sunptr->un_id != semid || sunptr->un_num != semnum)
 			continue;
 		if (adjval != 0) {
 			adjval += sunptr->un_adjval;
 			if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
 				return (ERANGE);
 		}
 		sunptr->un_adjval = adjval;
 		if (sunptr->un_adjval == 0) {
 			suptr->un_cnt--;
 			if (i < suptr->un_cnt)
 				suptr->un_ent[i] =
 				    suptr->un_ent[suptr->un_cnt];
 			if (suptr->un_cnt == 0)
 				semu_try_free(suptr);
 		}
 		return (0);
 	}
 
 	/* Didn't find the right entry - create it */
 	if (adjval == 0)
 		return (0);
 	if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
 		return (ERANGE);
 	if (suptr->un_cnt != seminfo.semume) {
 		sunptr = &suptr->un_ent[suptr->un_cnt];
 		suptr->un_cnt++;
 		sunptr->un_adjval = adjval;
 		sunptr->un_id = semid;
 		sunptr->un_num = semnum;
 		sunptr->un_seq = semseq;
 	} else
 		return (EINVAL);
 	return (0);
 }
 
 static void
 semundo_clear(int semid, int semnum)
 {
 	struct sem_undo *suptr, *suptr1;
 	struct undo *sunptr;
 	int i;
 
 	SEMUNDO_LOCKASSERT(MA_OWNED);
 	LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) {
 		sunptr = &suptr->un_ent[0];
 		for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
 			if (sunptr->un_id != semid)
 				continue;
 			if (semnum == -1 || sunptr->un_num == semnum) {
 				suptr->un_cnt--;
 				if (i < suptr->un_cnt) {
 					suptr->un_ent[i] =
 					    suptr->un_ent[suptr->un_cnt];
 					continue;
 				}
 				semu_try_free(suptr);
 			}
 			if (semnum != -1)
 				break;
 		}
 	}
 }
 
 static int
-semvalid(int semid, struct semid_kernel *semakptr)
+semvalid(int semid, struct prison *rpr, struct semid_kernel *semakptr)
 {
 
 	return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
-	    semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0);
+	    semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ||
+	    sem_prison_cansee(rpr, semakptr) ? EINVAL : 0);
 }
 
+/* Destroy set semidx and compact sem[], never reading beyond semtot. */
+static void
+sem_remove(int semidx, struct ucred *cred)
+{
+	struct semid_kernel *semakptr;
+	int i;
+
+	KASSERT(semidx >= 0 && semidx < seminfo.semmni,
+	    ("semidx out of bounds"));
+	semakptr = &sema[semidx];
+	semakptr->u.sem_perm.cuid = cred ? cred->cr_uid : 0;
+	semakptr->u.sem_perm.uid = cred ? cred->cr_uid : 0;
+	semakptr->u.sem_perm.mode = 0;
+	racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems);
+	crfree(semakptr->cred);
+	semakptr->cred = NULL;
+	SEMUNDO_LOCK();
+	semundo_clear(semidx, -1);
+	SEMUNDO_UNLOCK();
+#ifdef MAC
+	mac_sysvsem_cleanup(semakptr);
+#endif
+	wakeup(semakptr);
+	for (i = 0; i < seminfo.semmni; i++)
+		if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
+		    sema[i].u.sem_base > semakptr->u.sem_base)
+			mtx_lock_flags(&sema_mtx[i], LOP_DUPOK);
+	semtot -= semakptr->u.sem_nsems;	/* bounds the copy below */
+	for (i = semakptr->u.sem_base - sem; i < semtot; i++)
+		sem[i] = sem[i + semakptr->u.sem_nsems];
+	for (i = 0; i < seminfo.semmni; i++) {
+		if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
+		    sema[i].u.sem_base > semakptr->u.sem_base) {
+			sema[i].u.sem_base -= semakptr->u.sem_nsems;
+			mtx_unlock(&sema_mtx[i]);
+		}
+	}
+}
+
+/* Fetch the semaphore root prison recorded in the caller's jail OSD slot. */
+static struct prison *
+sem_find_prison(struct ucred *cred)
+{
+	struct prison *jail = cred->cr_prison, *root;
+
+	prison_lock(jail);
+	root = osd_jail_get(jail, sem_prison_slot);
+	prison_unlock(jail);
+	return (root);
+}
+
+/*
+ * Return 0 when the set has a credential whose prison is rpr itself or one
+ * that prison_ischild() reports as a child of rpr; otherwise EINVAL.
+ */
+static int
+sem_prison_cansee(struct prison *rpr, struct semid_kernel *semakptr)
+{
+	return (semakptr->cred != NULL && (rpr == semakptr->cred->cr_prison ||
+	    prison_ischild(rpr, semakptr->cred->cr_prison)) ? 0 : EINVAL);
+}
+
 /*
  * Note that the user-mode half of this passes a union, not a pointer.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct __semctl_args {
 	int	semid;
 	int	semnum;
 	int	cmd;
 	union	semun *arg;
 };
 #endif
 int
 sys___semctl(struct thread *td, struct __semctl_args *uap)
 {
 	struct semid_ds dsbuf;
 	union semun arg, semun;
 	register_t rval;
 	int error;
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_SET:
 	case IPC_STAT:
 	case GETALL:
 	case SETVAL:
 	case SETALL:
 		error = copyin(uap->arg, &arg, sizeof(arg));
 		if (error)
 			return (error);
 		break;
 	}
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		semun.buf = &dsbuf;
 		break;
 	case IPC_SET:
 		error = copyin(arg.buf, &dsbuf, sizeof(dsbuf));
 		if (error)
 			return (error);
 		semun.buf = &dsbuf;
 		break;
 	case GETALL:
 	case SETALL:
 		semun.array = arg.array;
 		break;
 	case SETVAL:
 		semun.val = arg.val;
 		break;		
 	}
 
 	error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
 	    &rval);
 	if (error)
 		return (error);
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		error = copyout(&dsbuf, arg.buf, sizeof(dsbuf));
 		break;
 	}
 
 	if (error == 0)
 		td->td_retval[0] = rval;
 	return (error);
 }
 
 int
 kern_semctl(struct thread *td, int semid, int semnum, int cmd,
     union semun *arg, register_t *rval)
 {
 	u_short *array;
 	struct ucred *cred = td->td_ucred;
 	int i, error;
+	struct prison *rpr;
 	struct semid_ds *sbuf;
 	struct semid_kernel *semakptr;
 	struct mtx *sema_mtxp;
 	u_short usval, count;
 	int semidx;
 
 	DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n",
 	    semid, semnum, cmd, arg));
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+
+	rpr = sem_find_prison(td->td_ucred);
+	if (rpr == NULL)	/* caller's jail has no sysvsem access */
 		return (ENOSYS);
 
 	array = NULL;
 
 	switch(cmd) {
 	case SEM_STAT:
 		/*
 		 * For this command we assume semid is an array index
 		 * rather than an IPC id.
 		 */
 		if (semid < 0 || semid >= seminfo.semmni)
 			return (EINVAL);
 		semakptr = &sema[semid];
 		sema_mtxp = &sema_mtx[semid];
 		mtx_lock(sema_mtxp);
 		if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
 			error = EINVAL;
 			goto done2;
 		}
+		if ((error = sem_prison_cansee(rpr, semakptr)))
+			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 #ifdef MAC
 		error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
 		if (error != 0)
 			goto done2;
 #endif
 		bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
+		if (cred->cr_prison != semakptr->cred->cr_prison)
+			arg->buf->sem_perm.key = IPC_PRIVATE;
 		*rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
 		mtx_unlock(sema_mtxp);
 		return (0);
 	}
 
 	semidx = IPCID_TO_IX(semid);
 	if (semidx < 0 || semidx >= seminfo.semmni)
 		return (EINVAL);
 
 	semakptr = &sema[semidx];
 	sema_mtxp = &sema_mtx[semidx];
 	if (cmd == IPC_RMID)
 		mtx_lock(&sem_mtx);
 	mtx_lock(sema_mtxp);
+
 #ifdef MAC
 	error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
 	if (error != 0)
 		goto done2;
 #endif
 
 	error = 0;
 	*rval = 0;
 
 	switch (cmd) {
 	case IPC_RMID:
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
 			goto done2;
-		semakptr->u.sem_perm.cuid = cred->cr_uid;
-		semakptr->u.sem_perm.uid = cred->cr_uid;
-		semakptr->u.sem_perm.mode = 0;
-		racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems);
-		crfree(semakptr->cred);
-		semakptr->cred = NULL;
-		SEMUNDO_LOCK();
-		semundo_clear(semidx, -1);
-		SEMUNDO_UNLOCK();
-#ifdef MAC
-		mac_sysvsem_cleanup(semakptr);
-#endif
-		wakeup(semakptr);
-		for (i = 0; i < seminfo.semmni; i++) {
-			if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
-			    sema[i].u.sem_base > semakptr->u.sem_base)
-				mtx_lock_flags(&sema_mtx[i], LOP_DUPOK);
-		}
-		for (i = semakptr->u.sem_base - sem; i < semtot; i++)
-			sem[i] = sem[i + semakptr->u.sem_nsems];
-		for (i = 0; i < seminfo.semmni; i++) {
-			if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
-			    sema[i].u.sem_base > semakptr->u.sem_base) {
-				sema[i].u.sem_base -= semakptr->u.sem_nsems;
-				mtx_unlock(&sema_mtx[i]);
-			}
-		}
-		semtot -= semakptr->u.sem_nsems;
+		sem_remove(semidx, cred);
 		break;
 
 	case IPC_SET:
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
 			goto done2;
 		sbuf = arg->buf;
 		semakptr->u.sem_perm.uid = sbuf->sem_perm.uid;
 		semakptr->u.sem_perm.gid = sbuf->sem_perm.gid;
 		semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode &
 		    ~0777) | (sbuf->sem_perm.mode & 0777);
 		semakptr->u.sem_ctime = time_second;
 		break;
 
 	case IPC_STAT:
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
+		if (cred->cr_prison != semakptr->cred->cr_prison)
+			arg->buf->sem_perm.key = IPC_PRIVATE;
 		break;
 
 	case GETNCNT:
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		*rval = semakptr->u.sem_base[semnum].semncnt;
 		break;
 
 	case GETPID:
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		*rval = semakptr->u.sem_base[semnum].sempid;
 		break;
 
 	case GETVAL:
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		*rval = semakptr->u.sem_base[semnum].semval;
 		break;
 
 	case GETALL:
 		/*
 		 * Unfortunately, callers of this function don't know
 		 * in advance how many semaphores are in this set.
 		 * While we could just allocate the maximum size array
 		 * and pass the actual size back to the caller, that
 		 * won't work for SETALL since we can't copyin() more
 		 * data than the user specified as we may return a
 		 * spurious EFAULT.
 		 * 
 		 * Note that the number of semaphores in a set is
 		 * fixed for the life of that set.  The only way that
 		 * the 'count' could change while are blocked in
 		 * malloc() is if this semaphore set were destroyed
 		 * and a new one created with the same index.
 		 * However, semvalid() will catch that due to the
 		 * sequence number unless exactly 0x8000 (or a
 		 * multiple thereof) semaphore sets for the same index
 		 * are created and destroyed while we are in malloc!
 		 *
 		 */
 		count = semakptr->u.sem_nsems;
 		mtx_unlock(sema_mtxp);		    
 		array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
 		mtx_lock(sema_mtxp);
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		for (i = 0; i < semakptr->u.sem_nsems; i++)
 			array[i] = semakptr->u.sem_base[i].semval;
 		mtx_unlock(sema_mtxp);
 		error = copyout(array, arg->array, count * sizeof(*array));
 		mtx_lock(sema_mtxp);
 		break;
 
 	case GETZCNT:
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		*rval = semakptr->u.sem_base[semnum].semzcnt;
 		break;
 
 	case SETVAL:
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
 			goto done2;
 		if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
 			error = EINVAL;
 			goto done2;
 		}
 		if (arg->val < 0 || arg->val > seminfo.semvmx) {
 			error = ERANGE;
 			goto done2;
 		}
 		semakptr->u.sem_base[semnum].semval = arg->val;
 		SEMUNDO_LOCK();
 		semundo_clear(semidx, semnum);
 		SEMUNDO_UNLOCK();
 		wakeup(semakptr);
 		break;
 
 	case SETALL:
 		/*
 		 * See comment on GETALL for why 'count' shouldn't change
 		 * and why we require a userland buffer.
 		 */
 		count = semakptr->u.sem_nsems;
 		mtx_unlock(sema_mtxp);		    
 		array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
 		error = copyin(arg->array, array, count * sizeof(*array));
 		mtx_lock(sema_mtxp);
 		if (error)
 			break;
-		if ((error = semvalid(semid, semakptr)) != 0)
+		if ((error = semvalid(semid, rpr, semakptr)) != 0)
 			goto done2;
 		KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
 			goto done2;
 		for (i = 0; i < semakptr->u.sem_nsems; i++) {
 			usval = array[i];
 			if (usval > seminfo.semvmx) {
 				error = ERANGE;
 				break;
 			}
 			semakptr->u.sem_base[i].semval = usval;
 		}
 		SEMUNDO_LOCK();
 		semundo_clear(semidx, -1);
 		SEMUNDO_UNLOCK();
 		wakeup(semakptr);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 done2:
 	mtx_unlock(sema_mtxp);
 	if (cmd == IPC_RMID)
 		mtx_unlock(&sem_mtx);
 	if (array != NULL)
 		free(array, M_TEMP);
 	return(error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct semget_args {
 	key_t	key;
 	int	nsems;
 	int	semflg;
 };
 #endif
 int
 sys_semget(struct thread *td, struct semget_args *uap)
 {
 	int semid, error = 0;
 	int key = uap->key;
 	int nsems = uap->nsems;
 	int semflg = uap->semflg;
 	struct ucred *cred = td->td_ucred;
 
 	DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg));
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+
+	if (sem_find_prison(cred) == NULL)
 		return (ENOSYS);
 
 	mtx_lock(&sem_mtx);
 	if (key != IPC_PRIVATE) {
 		for (semid = 0; semid < seminfo.semmni; semid++) {
 			if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) &&
+			    sema[semid].cred != NULL &&
+			    sema[semid].cred->cr_prison == cred->cr_prison &&
 			    sema[semid].u.sem_perm.key == key)
 				break;
 		}
 		if (semid < seminfo.semmni) {
 			DPRINTF(("found public key\n"));
 			if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) {
 				DPRINTF(("not exclusive\n"));
 				error = EEXIST;
 				goto done2;
 			}
 			if ((error = ipcperm(td, &sema[semid].u.sem_perm,
 			    semflg & 0700))) {
 				goto done2;
 			}
 			if (nsems > 0 && sema[semid].u.sem_nsems < nsems) {
 				DPRINTF(("too small\n"));
 				error = EINVAL;
 				goto done2;
 			}
 #ifdef MAC
 			error = mac_sysvsem_check_semget(cred, &sema[semid]);
 			if (error != 0)
 				goto done2;
 #endif
 			goto found;
 		}
 	}
 
 	DPRINTF(("need to allocate the semid_kernel\n"));
 	if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) {
 		if (nsems <= 0 || nsems > seminfo.semmsl) {
 			DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems,
 			    seminfo.semmsl));
 			error = EINVAL;
 			goto done2;
 		}
 		if (nsems > seminfo.semmns - semtot) {
 			DPRINTF((
 			    "not enough semaphores left (need %d, got %d)\n",
 			    nsems, seminfo.semmns - semtot));
 			error = ENOSPC;
 			goto done2;
 		}
 		for (semid = 0; semid < seminfo.semmni; semid++) {
 			if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0)
 				break;
 		}
 		if (semid == seminfo.semmni) {
 			DPRINTF(("no more semid_kernel's available\n"));
 			error = ENOSPC;
 			goto done2;
 		}
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(td->td_proc);
 			error = racct_add(td->td_proc, RACCT_NSEM, nsems);
 			PROC_UNLOCK(td->td_proc);
 			if (error != 0) {
 				error = ENOSPC;
 				goto done2;
 			}
 		}
 #endif
 		DPRINTF(("semid %d is available\n", semid));
 		mtx_lock(&sema_mtx[semid]);
 		KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0,
 		    ("Lost semaphore %d", semid));
 		sema[semid].u.sem_perm.key = key;
 		sema[semid].u.sem_perm.cuid = cred->cr_uid;
 		sema[semid].u.sem_perm.uid = cred->cr_uid;
 		sema[semid].u.sem_perm.cgid = cred->cr_gid;
 		sema[semid].u.sem_perm.gid = cred->cr_gid;
 		sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC;
 		sema[semid].cred = crhold(cred);
 		sema[semid].u.sem_perm.seq =
 		    (sema[semid].u.sem_perm.seq + 1) & 0x7fff;
 		sema[semid].u.sem_nsems = nsems;
 		sema[semid].u.sem_otime = 0;
 		sema[semid].u.sem_ctime = time_second;
 		sema[semid].u.sem_base = &sem[semtot];
 		semtot += nsems;
 		bzero(sema[semid].u.sem_base,
 		    sizeof(sema[semid].u.sem_base[0])*nsems);
 #ifdef MAC
 		mac_sysvsem_create(cred, &sema[semid]);
 #endif
 		mtx_unlock(&sema_mtx[semid]);
 		DPRINTF(("sembase = %p, next = %p\n",
 		    sema[semid].u.sem_base, &sem[semtot]));
 	} else {
 		DPRINTF(("didn't find it and wasn't asked to create it\n"));
 		error = ENOENT;
 		goto done2;
 	}
 
 found:
 	td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm);
 done2:
 	mtx_unlock(&sem_mtx);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct semop_args {
 	int	semid;
 	struct	sembuf *sops;
 	size_t	nsops;
 };
 #endif
 int
 sys_semop(struct thread *td, struct semop_args *uap)
 {
 #define SMALL_SOPS	8
 	struct sembuf small_sops[SMALL_SOPS];
 	int semid = uap->semid;
 	size_t nsops = uap->nsops;
+	struct prison *rpr;
 	struct sembuf *sops;
 	struct semid_kernel *semakptr;
 	struct sembuf *sopptr = NULL;
 	struct sem *semptr = NULL;
 	struct sem_undo *suptr;
 	struct mtx *sema_mtxp;
 	size_t i, j, k;
 	int error;
 	int do_wakeup, do_undos;
 	unsigned short seq;
 
 #ifdef SEM_DEBUG
 	sops = NULL;
 #endif
 	DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops));
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	rpr = sem_find_prison(td->td_ucred);
+	if (rpr == NULL)	/* caller's jail has no sysvsem access */
 		return (ENOSYS);
 
 	semid = IPCID_TO_IX(semid);	/* Convert back to zero origin */
 
 	if (semid < 0 || semid >= seminfo.semmni)
 		return (EINVAL);
 
 	/* Allocate memory for sem_ops */
 	if (nsops <= SMALL_SOPS)
 		sops = small_sops;
 	else if (nsops > seminfo.semopm) {
 		DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm,
 		    nsops));
 		return (E2BIG);
 	} else {
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(td->td_proc);
 			if (nsops >
 			    racct_get_available(td->td_proc, RACCT_NSEMOP)) {
 				PROC_UNLOCK(td->td_proc);
 				return (E2BIG);
 			}
 			PROC_UNLOCK(td->td_proc);
 		}
 #endif
 
 		sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK);
 	}
 	if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) {
 		DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error,
 		    uap->sops, sops, nsops * sizeof(sops[0])));
 		if (sops != small_sops)
-			free(sops, M_SEM);
+			free(sops, M_TEMP);	/* malloc'ed as M_TEMP */
 		return (error);
 	}
 
 	semakptr = &sema[semid];
 	sema_mtxp = &sema_mtx[semid];
 	mtx_lock(sema_mtxp);
 	if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
 		error = EINVAL;
 		goto done2;
 	}
 	seq = semakptr->u.sem_perm.seq;
 	if (seq != IPCID_TO_SEQ(uap->semid)) {
 		error = EINVAL;
 		goto done2;
 	}
+	if ((error = sem_prison_cansee(rpr, semakptr)) != 0)
+		goto done2;
 	/*
 	 * Initial pass thru sops to see what permissions are needed.
 	 * Also perform any checks that don't need repeating on each
 	 * attempt to satisfy the request vector.
 	 */
 	j = 0;		/* permission needed */
 	do_undos = 0;
 	for (i = 0; i < nsops; i++) {
 		sopptr = &sops[i];
 		if (sopptr->sem_num >= semakptr->u.sem_nsems) {
 			error = EFBIG;
 			goto done2;
 		}
 		if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0)
 			do_undos = 1;
 		j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A;
 	}
 
 	if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) {
 		DPRINTF(("error = %d from ipaccess\n", error));
 		goto done2;
 	}
 #ifdef MAC
 	error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j);
 	if (error != 0)
 		goto done2;
 #endif
 
 	/*
 	 * Loop trying to satisfy the vector of requests.
 	 * If we reach a point where we must wait, any requests already
 	 * performed are rolled back and we go to sleep until some other
 	 * process wakes us up.  At this point, we start all over again.
 	 *
 	 * This ensures that from the perspective of other tasks, a set
 	 * of requests is atomic (never partially satisfied).
 	 */
 	for (;;) {
 		do_wakeup = 0;
 		error = 0;	/* error return if necessary */
 
 		for (i = 0; i < nsops; i++) {
 			sopptr = &sops[i];
 			semptr = &semakptr->u.sem_base[sopptr->sem_num];
 
 			DPRINTF((
 			    "semop:  semakptr=%p, sem_base=%p, "
 			    "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n",
 			    semakptr, semakptr->u.sem_base, semptr,
 			    sopptr->sem_num, semptr->semval, sopptr->sem_op,
 			    (sopptr->sem_flg & IPC_NOWAIT) ?
 			    "nowait" : "wait"));
 
 			if (sopptr->sem_op < 0) {
 				if (semptr->semval + sopptr->sem_op < 0) {
 					DPRINTF(("semop:  can't do it now\n"));
 					break;
 				} else {
 					semptr->semval += sopptr->sem_op;
 					if (semptr->semval == 0 &&
 					    semptr->semzcnt > 0)
 						do_wakeup = 1;
 				}
 			} else if (sopptr->sem_op == 0) {
 				if (semptr->semval != 0) {
 					DPRINTF(("semop:  not zero now\n"));
 					break;
 				}
 			} else if (semptr->semval + sopptr->sem_op >
 			    seminfo.semvmx) {
 				error = ERANGE;
 				break;
 			} else {
 				if (semptr->semncnt > 0)
 					do_wakeup = 1;
 				semptr->semval += sopptr->sem_op;
 			}
 		}
 
 		/*
 		 * Did we get through the entire vector?
 		 */
 		if (i >= nsops)
 			goto done;
 
 		/*
 		 * No ... rollback anything that we've already done
 		 */
 		DPRINTF(("semop:  rollback 0 through %d\n", i-1));
 		for (j = 0; j < i; j++)
 			semakptr->u.sem_base[sops[j].sem_num].semval -=
 			    sops[j].sem_op;
 
 		/* If we detected an error, return it */
 		if (error != 0)
 			goto done2;
 
 		/*
 		 * If the request that we couldn't satisfy has the
 		 * NOWAIT flag set then return with EAGAIN.
 		 */
 		if (sopptr->sem_flg & IPC_NOWAIT) {
 			error = EAGAIN;
 			goto done2;
 		}
 
 		if (sopptr->sem_op == 0)
 			semptr->semzcnt++;
 		else
 			semptr->semncnt++;
 
 		DPRINTF(("semop:  good night!\n"));
 		error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH,
 		    "semwait", 0);
 		DPRINTF(("semop:  good morning (error=%d)!\n", error));
 		/* return code is checked below, after sem[nz]cnt-- */
 
 		/*
 		 * Make sure that the semaphore still exists
 		 */
 		seq = semakptr->u.sem_perm.seq;
 		if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
 		    seq != IPCID_TO_SEQ(uap->semid)) {
 			error = EIDRM;
 			goto done2;
 		}
 
 		/*
 		 * Renew the semaphore's pointer after wakeup since
 		 * during msleep sem_base may have been modified and semptr
 		 * is not valid any more
 		 */
 		semptr = &semakptr->u.sem_base[sopptr->sem_num];
 
 		/*
 		 * The semaphore is still alive.  Readjust the count of
 		 * waiting processes.
 		 */
 		if (sopptr->sem_op == 0)
 			semptr->semzcnt--;
 		else
 			semptr->semncnt--;
 
 		/*
 		 * Is it really morning, or was our sleep interrupted?
 		 * (Delayed check of msleep() return code because we
 		 * need to decrement sem[nz]cnt either way.)
 		 */
 		if (error != 0) {
 			error = EINTR;
 			goto done2;
 		}
 		DPRINTF(("semop:  good morning!\n"));
 	}
 
 done:
 	/*
 	 * Process any SEM_UNDO requests.
 	 */
 	if (do_undos) {
 		SEMUNDO_LOCK();
 		suptr = NULL;
 		for (i = 0; i < nsops; i++) {
 			/*
 			 * We only need to deal with SEM_UNDO's for non-zero
 			 * op's.
 			 */
 			int adjval;
 
 			if ((sops[i].sem_flg & SEM_UNDO) == 0)
 				continue;
 			adjval = sops[i].sem_op;
 			if (adjval == 0)
 				continue;
 			error = semundo_adjust(td, &suptr, semid, seq,
 			    sops[i].sem_num, -adjval);
 			if (error == 0)
 				continue;
 
 			/*
 			 * Oh-Oh!  We ran out of either sem_undo's or undo's.
 			 * Rollback the adjustments to this point and then
 			 * rollback the semaphore ups and down so we can return
 			 * with an error with all structures restored.  We
 			 * rollback the undo's in the exact reverse order that
 			 * we applied them.  This guarantees that we won't run
 			 * out of space as we roll things back out.
 			 */
 			for (j = 0; j < i; j++) {
 				k = i - j - 1;
 				if ((sops[k].sem_flg & SEM_UNDO) == 0)
 					continue;
 				adjval = sops[k].sem_op;
 				if (adjval == 0)
 					continue;
 				if (semundo_adjust(td, &suptr, semid, seq,
 				    sops[k].sem_num, adjval) != 0)
 					panic("semop - can't undo undos");
 			}
 
 			for (j = 0; j < nsops; j++)
 				semakptr->u.sem_base[sops[j].sem_num].semval -=
 				    sops[j].sem_op;
 
 			DPRINTF(("error = %d from semundo_adjust\n", error));
 			SEMUNDO_UNLOCK();
 			goto done2;
 		} /* loop through the sops */
 		SEMUNDO_UNLOCK();
 	} /* if (do_undos) */
 
 	/* We're definitely done - set the sempid's and time */
 	for (i = 0; i < nsops; i++) {
 		sopptr = &sops[i];
 		semptr = &semakptr->u.sem_base[sopptr->sem_num];
 		semptr->sempid = td->td_proc->p_pid;
 	}
 	semakptr->u.sem_otime = time_second;
 
 	/*
 	 * Do a wakeup if any semaphore was up'd whilst something was
 	 * sleeping on it.
 	 */
 	if (do_wakeup) {
 		DPRINTF(("semop:  doing wakeup\n"));
 		wakeup(semakptr);
 		DPRINTF(("semop:  back from wakeup\n"));
 	}
 	DPRINTF(("semop:  done\n"));
 	td->td_retval[0] = 0;
 done2:
 	mtx_unlock(sema_mtxp);
 	if (sops != small_sops)
-		free(sops, M_SEM);
+		free(sops, M_TEMP);	/* malloc'ed as M_TEMP */
 	return (error);
 }
 
 /*
  * Go through the undo structures for this process and apply the adjustments to
  * semaphores.
  */
 static void
 semexit_myhook(void *arg, struct proc *p)
 {
 	struct sem_undo *suptr;
 	struct semid_kernel *semakptr;
 	struct mtx *sema_mtxp;
 	int semid, semnum, adjval, ix;
 	unsigned short seq;
 
 	/*
 	 * Go through the chain of undo vectors looking for one
 	 * associated with this process.
 	 */
 	SEMUNDO_LOCK();
 	LIST_FOREACH(suptr, &semu_list, un_next) {
 		if (suptr->un_proc == p)
 			break;
 	}
 	if (suptr == NULL) {
 		SEMUNDO_UNLOCK();
 		return;
 	}
 	LIST_REMOVE(suptr, un_next);
 
 	DPRINTF(("proc @%p has undo structure with %d entries\n", p,
 	    suptr->un_cnt));
 
 	/*
 	 * If there are any active undo elements then process them.
 	 */
 	if (suptr->un_cnt > 0) {
 		SEMUNDO_UNLOCK();
 		for (ix = 0; ix < suptr->un_cnt; ix++) {
 			semid = suptr->un_ent[ix].un_id;
 			semnum = suptr->un_ent[ix].un_num;
 			adjval = suptr->un_ent[ix].un_adjval;
 			seq = suptr->un_ent[ix].un_seq;
 			semakptr = &sema[semid];
 			sema_mtxp = &sema_mtx[semid];
 
 			mtx_lock(sema_mtxp);
 			if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
 			    (semakptr->u.sem_perm.seq != seq)) {
 				mtx_unlock(sema_mtxp);
 				continue;
 			}
 			if (semnum >= semakptr->u.sem_nsems)
 				panic("semexit - semnum out of range");
 
 			DPRINTF((
 			    "semexit:  %p id=%d num=%d(adj=%d) ; sem=%d\n",
 			    suptr->un_proc, suptr->un_ent[ix].un_id,
 			    suptr->un_ent[ix].un_num,
 			    suptr->un_ent[ix].un_adjval,
 			    semakptr->u.sem_base[semnum].semval));
 
 			if (adjval < 0 && semakptr->u.sem_base[semnum].semval <
 			    -adjval)
 				semakptr->u.sem_base[semnum].semval = 0;
 			else
 				semakptr->u.sem_base[semnum].semval += adjval;
 
 			wakeup(semakptr);
 			DPRINTF(("semexit:  back from wakeup\n"));
 			mtx_unlock(sema_mtxp);
 		}
 		SEMUNDO_LOCK();
 	}
 
 	/*
 	 * Deallocate the undo vector.
 	 */
 	DPRINTF(("removing vector\n"));
 	suptr->un_proc = NULL;
 	suptr->un_cnt = 0;
 	LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
 	SEMUNDO_UNLOCK();
 }
 
 static int
 sysctl_sema(SYSCTL_HANDLER_ARGS)
 {
+	struct semid_kernel blank, masked, *out;
+	struct ucred *cred = req->td->td_ucred;
+	struct prison *rpr;
+	struct sbuf sb;
+	int error, i;
 
-	return (SYSCTL_OUT(req, sema,
-	    sizeof(struct semid_kernel) * seminfo.semmni));
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+
+	rpr = sem_find_prison(cred);
+	sbuf_new_for_sysctl(&sb, NULL,
+	    sizeof(struct semid_kernel) * seminfo.semmni, req);
+	bzero(&blank, sizeof(blank));
+
+	/* Emit one record per slot; sets the caller may not see are zeroed. */
+	for (i = 0; i < seminfo.semmni; i++) {
+		out = &sema[i];
+		if ((out->u.sem_perm.mode & SEM_ALLOC) == 0 || rpr == NULL ||
+		    sem_prison_cansee(rpr, out) != 0) {
+			out = &blank;
+		} else if (cred->cr_prison != out->cred->cr_prison) {
+			/* Visible but owned by another jail: hide its key. */
+			bcopy(out, &masked, sizeof(masked));
+			masked.u.sem_perm.key = IPC_PRIVATE;
+			out = &masked;
+		}
+		sbuf_bcat(&sb, out, sizeof(*out));
+	}
+
+	error = sbuf_finish(&sb);
+	sbuf_delete(&sb);
+	return (error);
 }
 
+/*
+ * Validate a requested "sysvsem" jail parameter: enabling semaphores is
+ * refused with EPERM when the parent jail has no semaphore root.
+ */
+static int
+sem_prison_check(void *obj, void *data)
+{
+	struct vfsoptlist *opts = data;
+	struct prison *pr = obj;
+	struct prison *parent_root;
+	int mode, rc;
+
+	/*
+	 * sysvsem is a jailsys integer.
+	 * It must be "disable" if the parent jail is disabled.
+	 */
+	rc = vfs_copyopt(opts, "sysvsem", &mode, sizeof(mode));
+	if (rc == ENOENT)
+		return (0);		/* nothing to validate */
+	if (rc != 0)
+		return (rc);
+	if (mode == JAIL_SYS_DISABLE)
+		return (0);
+	if (mode != JAIL_SYS_NEW && mode != JAIL_SYS_INHERIT)
+		return (EINVAL);
+
+	/* NEW and INHERIT both require the parent to have a root. */
+	prison_lock(pr->pr_parent);
+	parent_root = osd_jail_get(pr->pr_parent, sem_prison_slot);
+	prison_unlock(pr->pr_parent);
+	if (parent_root == NULL)
+		return (EPERM);
+	return (0);
+}
+
+static int
+sem_prison_set(void *obj, void *data)
+{
+	struct prison *pr = obj;
+	struct prison *tpr, *orpr, *nrpr, *trpr;
+	struct vfsoptlist *opts = data;
+	void *rsv;
+	int jsys, descend;
+
+	/*
+	 * sysvsem controls which jail is the root of the associated sems (this
+	 * jail or same as the parent), or if the feature is available at all.
+	 */
+	if (vfs_copyopt(opts, "sysvsem", &jsys, sizeof(jsys)) == ENOENT)
+		jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0)
+		    ? JAIL_SYS_INHERIT
+		    : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0)
+		    ? JAIL_SYS_DISABLE
+		    : -1;	/* no relevant option: leave pr untouched */
+	if (jsys == JAIL_SYS_DISABLE) {
+		prison_lock(pr);
+		orpr = osd_jail_get(pr, sem_prison_slot);
+		if (orpr != NULL)
+			osd_jail_del(pr, sem_prison_slot);
+		prison_unlock(pr);
+		if (orpr != NULL) {
+			if (orpr == pr)	/* pr was its own root */
+				sem_prison_cleanup(pr);
+			/* Disable all child jails as well. */
+			FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
+				prison_lock(tpr);
+				trpr = osd_jail_get(tpr, sem_prison_slot);
+				if (trpr != NULL) {
+					osd_jail_del(tpr, sem_prison_slot);
+					prison_unlock(tpr);
+					if (trpr == tpr)
+						sem_prison_cleanup(tpr);
+				} else {
+					prison_unlock(tpr);
+					descend = 0;
+				}
+			}
+		}
+	} else if (jsys != -1) {
+		if (jsys == JAIL_SYS_NEW)
+			nrpr = pr;
+		else {
+			prison_lock(pr->pr_parent);
+			nrpr = osd_jail_get(pr->pr_parent, sem_prison_slot);
+			prison_unlock(pr->pr_parent);
+		}
+		rsv = osd_reserve(sem_prison_slot);	/* taken before locking pr */
+		prison_lock(pr);
+		orpr = osd_jail_get(pr, sem_prison_slot);
+		if (orpr != nrpr)
+			(void)osd_jail_set_reserved(pr, sem_prison_slot, rsv,
+			    nrpr);
+		else
+			osd_free_reserved(rsv);	/* root unchanged */
+		prison_unlock(pr);
+		if (orpr != nrpr) {
+			if (orpr == pr)	/* pr was its own root */
+				sem_prison_cleanup(pr);
+			if (orpr != NULL) {
+				/* Change child jails matching the old root. */
+				FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
+					prison_lock(tpr);
+					trpr = osd_jail_get(tpr,
+					    sem_prison_slot);
+					if (trpr == orpr) {
+						(void)osd_jail_set(tpr,
+						    sem_prison_slot, nrpr);
+						prison_unlock(tpr);
+						if (trpr == tpr)
+							sem_prison_cleanup(tpr);
+					} else {
+						prison_unlock(tpr);
+						descend = 0;
+					}
+				}
+			}
+		}
+	}
+
+	return (0);
+}
+
+static int
+sem_prison_get(void *obj, void *data)
+{
+	struct vfsoptlist *optlist = data;
+	struct prison *jail = obj;
+	struct prison *root;
+	int jsys, rc;
+
+	/* Report sysvsem according to the value in this jail's OSD slot. */
+	prison_lock(jail);
+	root = osd_jail_get(jail, sem_prison_slot);
+	prison_unlock(jail);
+	jsys = (root == NULL) ? JAIL_SYS_DISABLE :
+	    (root == jail) ? JAIL_SYS_NEW : JAIL_SYS_INHERIT;
+	rc = vfs_setopt(optlist, "sysvsem", &jsys, sizeof(jsys));
+	if (rc != ENOENT)
+		return (rc);
+	return (0);
+}
+
+static int
+sem_prison_remove(void *obj, void *data __unused)
+{
+	struct prison *jail = obj;
+	struct prison *root;
+
+	prison_lock(jail);
+	root = osd_jail_get(jail, sem_prison_slot);
+	prison_unlock(jail);
+	if (root == jail)	/* slot value is this very jail */
+		sem_prison_cleanup(jail);
+	return (0);
+}
+
+static void
+sem_prison_cleanup(struct prison *pr)
+{
+	int i;
+
+	/* Remove every allocated semaphore set owned by this jail. */
+	mtx_lock(&sem_mtx);
+	for (i = 0; i < seminfo.semmni; i++) {
+		if ((sema[i].u.sem_perm.mode & SEM_ALLOC) == 0 ||
+		    sema[i].cred == NULL || sema[i].cred->cr_prison != pr)
+			continue;
+		mtx_lock(&sema_mtx[i]);
+		sem_remove(i, NULL);
+		mtx_unlock(&sema_mtx[i]);
+	}
+	mtx_unlock(&sem_mtx);
+}
+
+SYSCTL_JAIL_PARAM_SYS_NODE(sysvsem, CTLFLAG_RW, "SYSV semaphores");
+
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
 static sy_call_t *semcalls[] = {
 	(sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget,
 	(sy_call_t *)sys_semop
 };
 
 /*
  * Entry point for all SEM calls.
  */
 int
 sys_semsys(td, uap)
 	struct thread *td;
 	/* XXX actually varargs. */
 	struct semsys_args /* {
 		int	which;
 		int	a2;
 		int	a3;
 		int	a4;
 		int	a5;
 	} */ *uap;
 {
 	int error;
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
-		return (ENOSYS);
 	if (uap->which < 0 || uap->which >= nitems(semcalls))
 		return (EINVAL);
 	error = (*semcalls[uap->which])(td, &uap->a2);
 	return (error);
 }
 
 #ifndef CP
 #define CP(src, dst, fld)	do { (dst).fld = (src).fld; } while (0)
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd7___semctl_args {
 	int	semid;
 	int	semnum;
 	int	cmd;
 	union	semun_old *arg;
 };
 #endif
 int
 freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap)
 {
 	struct semid_ds_old dsold;
 	struct semid_ds dsbuf;
 	union semun_old arg;
 	union semun semun;
 	register_t rval;
 	int error;
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_SET:
 	case IPC_STAT:
 	case GETALL:
 	case SETVAL:
 	case SETALL:
 		error = copyin(uap->arg, &arg, sizeof(arg));
 		if (error)
 			return (error);
 		break;
 	}
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		semun.buf = &dsbuf;
 		break;
 	case IPC_SET:
 		error = copyin(arg.buf, &dsold, sizeof(dsold));
 		if (error)
 			return (error);
 		ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm);
 		CP(dsold, dsbuf, sem_base);
 		CP(dsold, dsbuf, sem_nsems);
 		CP(dsold, dsbuf, sem_otime);
 		CP(dsold, dsbuf, sem_ctime);
 		semun.buf = &dsbuf;
 		break;
 	case GETALL:
 	case SETALL:
 		semun.array = arg.array;
 		break;
 	case SETVAL:
 		semun.val = arg.val;
 		break;		
 	}
 
 	error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
 	    &rval);
 	if (error)
 		return (error);
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		bzero(&dsold, sizeof(dsold));
 		ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm);
 		CP(dsbuf, dsold, sem_base);
 		CP(dsbuf, dsold, sem_nsems);
 		CP(dsbuf, dsold, sem_otime);
 		CP(dsbuf, dsold, sem_ctime);
 		error = copyout(&dsold, arg.buf, sizeof(dsold));
 		break;
 	}
 
 	if (error == 0)
 		td->td_retval[0] = rval;
 	return (error);
 }
 
 #endif /* COMPAT_FREEBSD{4,5,6,7} */
 
 #ifdef COMPAT_FREEBSD32
 
 int
 freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap)
 {
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 	switch (uap->which) {
 	case 0:
 		return (freebsd7_freebsd32_semctl(td,
 		    (struct freebsd7_freebsd32_semctl_args *)&uap->a2));
 	default:
 		return (sys_semsys(td, (struct semsys_args *)uap));
 	}
 #else
 	return (nosys(td, NULL));
 #endif
 }
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 int
 freebsd7_freebsd32_semctl(struct thread *td,
     struct freebsd7_freebsd32_semctl_args *uap)
 {
 	struct semid_ds32_old dsbuf32;
 	struct semid_ds dsbuf;
 	union semun semun;
 	union semun32 arg;
 	register_t rval;
 	int error;
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_SET:
 	case IPC_STAT:
 	case GETALL:
 	case SETVAL:
 	case SETALL:
 		error = copyin(uap->arg, &arg, sizeof(arg));
 		if (error)
 			return (error);		
 		break;
 	}
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		semun.buf = &dsbuf;
 		break;
 	case IPC_SET:
 		error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
 		if (error)
 			return (error);
 		freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
 		PTRIN_CP(dsbuf32, dsbuf, sem_base);
 		CP(dsbuf32, dsbuf, sem_nsems);
 		CP(dsbuf32, dsbuf, sem_otime);
 		CP(dsbuf32, dsbuf, sem_ctime);
 		semun.buf = &dsbuf;
 		break;
 	case GETALL:
 	case SETALL:
 		semun.array = PTRIN(arg.array);
 		break;
 	case SETVAL:
 		semun.val = arg.val;
 		break;
 	}
 
 	error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
 	    &rval);
 	if (error)
 		return (error);
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		bzero(&dsbuf32, sizeof(dsbuf32));
 		freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
 		PTROUT_CP(dsbuf, dsbuf32, sem_base);
 		CP(dsbuf, dsbuf32, sem_nsems);
 		CP(dsbuf, dsbuf32, sem_otime);
 		CP(dsbuf, dsbuf32, sem_ctime);
 		error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
 		break;
 	}
 
 	if (error == 0)
 		td->td_retval[0] = rval;
 	return (error);
 }
 #endif
 
 int
 freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap)
 {
 	struct semid_ds32 dsbuf32;
 	struct semid_ds dsbuf;
 	union semun semun;
 	union semun32 arg;
 	register_t rval;
 	int error;
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_SET:
 	case IPC_STAT:
 	case GETALL:
 	case SETVAL:
 	case SETALL:
 		error = copyin(uap->arg, &arg, sizeof(arg));
 		if (error)
 			return (error);		
 		break;
 	}
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		semun.buf = &dsbuf;
 		break;
 	case IPC_SET:
 		error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
 		if (error)
 			return (error);
 		freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
 		PTRIN_CP(dsbuf32, dsbuf, sem_base);
 		CP(dsbuf32, dsbuf, sem_nsems);
 		CP(dsbuf32, dsbuf, sem_otime);
 		CP(dsbuf32, dsbuf, sem_ctime);
 		semun.buf = &dsbuf;
 		break;
 	case GETALL:
 	case SETALL:
 		semun.array = PTRIN(arg.array);
 		break;
 	case SETVAL:
 		semun.val = arg.val;
 		break;		
 	}
 
 	error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
 	    &rval);
 	if (error)
 		return (error);
 
 	switch (uap->cmd) {
 	case SEM_STAT:
 	case IPC_STAT:
 		bzero(&dsbuf32, sizeof(dsbuf32));
 		freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
 		PTROUT_CP(dsbuf, dsbuf32, sem_base);
 		CP(dsbuf, dsbuf32, sem_nsems);
 		CP(dsbuf, dsbuf32, sem_otime);
 		CP(dsbuf, dsbuf32, sem_ctime);
 		error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
 		break;
 	}
 
 	if (error == 0)
 		td->td_retval[0] = rval;
 	return (error);
 }
 
 #endif /* COMPAT_FREEBSD32 */
Index: head/sys/kern/sysv_shm.c
===================================================================
--- head/sys/kern/sysv_shm.c	(revision 298584)
+++ head/sys/kern/sysv_shm.c	(revision 298585)
@@ -1,1364 +1,1655 @@
 /*	$NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $	*/
 /*-
  * Copyright (c) 1994 Adam Glass and Charles Hannum.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Adam Glass and Charles
  *	Hannum.
  * 4. The names of the authors may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Copyright (c) 2003-2005 McAfee, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project in part by McAfee
  * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
  * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
  * program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_sysvipc.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/sysctl.h>
 #include <sys/shm.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
+#include <sys/sbuf.h>
 #include <sys/stat.h>
 #include <sys/syscall.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_object.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
 FEATURE(sysv_shm, "System V shared memory segments support");
 
 static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
 
 static int shmget_allocate_segment(struct thread *td,
     struct shmget_args *uap, int mode);
 static int shmget_existing(struct thread *td, struct shmget_args *uap,
     int mode, int segnum);
 
 #define	SHMSEG_FREE     	0x0200
 #define	SHMSEG_REMOVED  	0x0400
 #define	SHMSEG_ALLOCATED	0x0800
 
 static int shm_last_free, shm_nused, shmalloced;
 vm_size_t shm_committed;
-static struct shmid_kernel	*shmsegs;
+static struct shmid_kernel *shmsegs;
+static unsigned shm_prison_slot;
 
 struct shmmap_state {
 	vm_offset_t va;
 	int shmid;
 };
 
 static void shm_deallocate_segment(struct shmid_kernel *);
-static int shm_find_segment_by_key(key_t);
-static struct shmid_kernel *shm_find_segment(int, bool);
+static int shm_find_segment_by_key(struct prison *, key_t);
+static struct shmid_kernel *shm_find_segment(struct prison *, int, bool);
 static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *);
 static void shmrealloc(void);
 static int shminit(void);
 static int sysvshm_modload(struct module *, int, void *);
 static int shmunload(void);
 static void shmexit_myhook(struct vmspace *vm);
 static void shmfork_myhook(struct proc *p1, struct proc *p2);
 static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS);
+static void shm_remove(struct shmid_kernel *, int);
+static struct prison *shm_find_prison(struct ucred *);
+static int shm_prison_cansee(struct prison *, struct shmid_kernel *);
+static int shm_prison_check(void *, void *);
+static int shm_prison_set(void *, void *);
+static int shm_prison_get(void *, void *);
+static int shm_prison_remove(void *, void *);
+static void shm_prison_cleanup(struct prison *);
 
 /*
  * Tuneable values.
  */
 #ifndef SHMMAXPGS
 #define	SHMMAXPGS	131072	/* Note: sysv shared memory is swap backed. */
 #endif
 #ifndef SHMMAX
 #define	SHMMAX	(SHMMAXPGS*PAGE_SIZE)
 #endif
 #ifndef SHMMIN
 #define	SHMMIN	1
 #endif
 #ifndef SHMMNI
 #define	SHMMNI	192
 #endif
 #ifndef SHMSEG
 #define	SHMSEG	128
 #endif
 #ifndef SHMALL
 #define	SHMALL	(SHMMAXPGS)
 #endif
 
 struct	shminfo shminfo = {
 	.shmmax = SHMMAX,
 	.shmmin = SHMMIN,
 	.shmmni = SHMMNI,
 	.shmseg = SHMSEG,
 	.shmall = SHMALL
 };
 
 static int shm_use_phys;
 static int shm_allow_removed = 1;
 
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RWTUN, &shminfo.shmmax, 0,
     "Maximum shared memory segment size");
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RWTUN, &shminfo.shmmin, 0,
     "Minimum shared memory segment size");
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
     "Number of shared memory identifiers");
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
     "Number of segments per process");
 SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RWTUN, &shminfo.shmall, 0,
     "Maximum number of pages available for shared memory");
 SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RWTUN,
     &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core");
 SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RWTUN,
     &shm_allow_removed, 0,
     "Enable/Disable attachment to attached segments marked for removal");
 SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_shmsegs, "",
     "Current number of shared memory segments allocated");
 
 static struct sx sysvshmsx;
 #define	SYSVSHM_LOCK()		sx_xlock(&sysvshmsx)
 #define	SYSVSHM_UNLOCK()	sx_xunlock(&sysvshmsx)
 #define	SYSVSHM_ASSERT_LOCKED()	sx_assert(&sysvshmsx, SA_XLOCKED)
 
 static int
-shm_find_segment_by_key(key_t key)
+shm_find_segment_by_key(struct prison *pr, key_t key)
 {
 	int i;
 
 	for (i = 0; i < shmalloced; i++)
 		if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) &&
+		    shmsegs[i].cred != NULL &&
+		    shmsegs[i].cred->cr_prison == pr &&
 		    shmsegs[i].u.shm_perm.key == key)
 			return (i);
 	return (-1);
 }
 
 /*
  * Finds segment either by shmid if is_shmid is true, or by segnum if
  * is_shmid is false.
  */
 static struct shmid_kernel *
-shm_find_segment(int arg, bool is_shmid)
+shm_find_segment(struct prison *rpr, int arg, bool is_shmid)
 {
 	struct shmid_kernel *shmseg;
 	int segnum;
 
 	segnum = is_shmid ? IPCID_TO_IX(arg) : arg;
 	if (segnum < 0 || segnum >= shmalloced)
 		return (NULL);
 	shmseg = &shmsegs[segnum];
 	if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
 	    (!shm_allow_removed &&
 	    (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) ||
-	    (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg)))
+	    (is_shmid && shmseg->u.shm_perm.seq != IPCID_TO_SEQ(arg)) ||
+	    shm_prison_cansee(rpr, shmseg) != 0)	/* 0 means visible */
 		return (NULL);
 	return (shmseg);
 }
 
 static void
 shm_deallocate_segment(struct shmid_kernel *shmseg)
 {
 	vm_size_t size;
 
 	SYSVSHM_ASSERT_LOCKED();
 
 	vm_object_deallocate(shmseg->object);
 	shmseg->object = NULL;
 	size = round_page(shmseg->u.shm_segsz);
 	shm_committed -= btoc(size);
 	shm_nused--;
 	shmseg->u.shm_perm.mode = SHMSEG_FREE;
 #ifdef MAC
 	mac_sysvshm_cleanup(shmseg);
 #endif
 	racct_sub_cred(shmseg->cred, RACCT_NSHM, 1);
 	racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size);
 	crfree(shmseg->cred);
 	shmseg->cred = NULL;
 }
 
 static int
 shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s)
 {
 	struct shmid_kernel *shmseg;
 	int segnum, result;
 	vm_size_t size;
 
 	SYSVSHM_ASSERT_LOCKED();
 	segnum = IPCID_TO_IX(shmmap_s->shmid);
 	KASSERT(segnum >= 0 && segnum < shmalloced,
 	    ("segnum %d shmalloced %d", segnum, shmalloced));
 
 	shmseg = &shmsegs[segnum];
 	size = round_page(shmseg->u.shm_segsz);
 	result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size);
 	if (result != KERN_SUCCESS)
 		return (EINVAL);
 	shmmap_s->shmid = -1;
 	shmseg->u.shm_dtime = time_second;
 	if ((--shmseg->u.shm_nattch <= 0) &&
 	    (shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) {
 		shm_deallocate_segment(shmseg);
 		shm_last_free = segnum;
 	}
 	return (0);
 }
 
+static void
+shm_remove(struct shmid_kernel *shmseg, int segnum)
+{
+
+	shmseg->u.shm_perm.mode |= SHMSEG_REMOVED;
+	shmseg->u.shm_perm.key = IPC_PRIVATE;
+	if (shmseg->u.shm_nattch > 0)	/* still attached: only mark it */
+		return;
+	shm_deallocate_segment(shmseg);
+	shm_last_free = segnum;
+}
+
+static struct prison *
+shm_find_prison(struct ucred *cred)
+{
+	struct prison *pr, *rpr;
+
+	pr = cred->cr_prison;
+	prison_lock(pr);
+	rpr = osd_jail_get(pr, shm_prison_slot);
+	prison_unlock(pr);
+	return (rpr);	/* NULL makes the callers fail with ENOSYS */
+}
+
 static int
+shm_prison_cansee(struct prison *rpr, struct shmid_kernel *shmseg)
+{
+
+	if (shmseg->cred != NULL &&
+	    (rpr == shmseg->cred->cr_prison ||
+	    prison_ischild(rpr, shmseg->cred->cr_prison)))
+		return (0);	/* visible */
+	return (EINVAL);	/* hidden: result is an errno, not a bool */
+}
+
+static int
 kern_shmdt_locked(struct thread *td, const void *shmaddr)
 {
 	struct proc *p = td->td_proc;
 	struct shmmap_state *shmmap_s;
 #ifdef MAC
 	struct shmid_kernel *shmsegptr;
 	int error;
 #endif
 	int i;
 
 	SYSVSHM_ASSERT_LOCKED();
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	if (shm_find_prison(td->td_ucred) == NULL)
 		return (ENOSYS);
 	shmmap_s = p->p_vmspace->vm_shm;
  	if (shmmap_s == NULL)
 		return (EINVAL);
 	for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
 		if (shmmap_s->shmid != -1 &&
 		    shmmap_s->va == (vm_offset_t)shmaddr) {
 			break;
 		}
 	}
 	if (i == shminfo.shmseg)
 		return (EINVAL);
 #ifdef MAC
 	shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)];
 	error = mac_sysvshm_check_shmdt(td->td_ucred, shmsegptr);
 	if (error != 0)
 		return (error);
 #endif
 	return (shm_delete_mapping(p->p_vmspace, shmmap_s));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct shmdt_args {
 	const void *shmaddr;
 };
 #endif
 int
 sys_shmdt(struct thread *td, struct shmdt_args *uap)
 {
 	int error;
 
 	SYSVSHM_LOCK();
 	error = kern_shmdt_locked(td, uap->shmaddr);
 	SYSVSHM_UNLOCK();
 	return (error);
 }
 
 static int
 kern_shmat_locked(struct thread *td, int shmid, const void *shmaddr,
     int shmflg)
 {
+	struct prison *rpr;
 	struct proc *p = td->td_proc;
 	struct shmid_kernel *shmseg;
 	struct shmmap_state *shmmap_s;
 	vm_offset_t attach_va;
 	vm_prot_t prot;
 	vm_size_t size;
 	int error, i, rv;
 
 	SYSVSHM_ASSERT_LOCKED();
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	rpr = shm_find_prison(td->td_ucred);
+	if (rpr == NULL)
 		return (ENOSYS);
 	shmmap_s = p->p_vmspace->vm_shm;
 	if (shmmap_s == NULL) {
 		shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state),
 		    M_SHM, M_WAITOK);
 		for (i = 0; i < shminfo.shmseg; i++)
 			shmmap_s[i].shmid = -1;
 		KASSERT(p->p_vmspace->vm_shm == NULL, ("raced"));
 		p->p_vmspace->vm_shm = shmmap_s;
 	}
-	shmseg = shm_find_segment(shmid, true);
+	shmseg = shm_find_segment(rpr, shmid, true);
 	if (shmseg == NULL)
 		return (EINVAL);
 	error = ipcperm(td, &shmseg->u.shm_perm,
 	    (shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
 	if (error != 0)
 		return (error);
 #ifdef MAC
 	error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg);
 	if (error != 0)
 		return (error);
 #endif
 	for (i = 0; i < shminfo.shmseg; i++) {
 		if (shmmap_s->shmid == -1)
 			break;
 		shmmap_s++;
 	}
 	if (i >= shminfo.shmseg)
 		return (EMFILE);
 	size = round_page(shmseg->u.shm_segsz);
 	prot = VM_PROT_READ;
 	if ((shmflg & SHM_RDONLY) == 0)
 		prot |= VM_PROT_WRITE;
 	if (shmaddr != NULL) {
 		if ((shmflg & SHM_RND) != 0)
 			attach_va = rounddown2((vm_offset_t)shmaddr, SHMLBA);
 		else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0)
 			attach_va = (vm_offset_t)shmaddr;
 		else
 			return (EINVAL);
 	} else {
 		/*
 		 * This is just a hint to vm_map_find() about where to
 		 * put it.
 		 */
 		attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
 		    lim_max(td, RLIMIT_DATA));
 	}
 
 	vm_object_reference(shmseg->object);
 	rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object, 0, &attach_va,
 	    size, 0, shmaddr != NULL ? VMFS_NO_SPACE : VMFS_OPTIMAL_SPACE,
 	    prot, prot, MAP_INHERIT_SHARE | MAP_PREFAULT_PARTIAL);
 	if (rv != KERN_SUCCESS) {
 		vm_object_deallocate(shmseg->object);
 		return (ENOMEM);
 	}
 
 	shmmap_s->va = attach_va;
 	shmmap_s->shmid = shmid;
 	shmseg->u.shm_lpid = p->p_pid;
 	shmseg->u.shm_atime = time_second;
 	shmseg->u.shm_nattch++;
 	td->td_retval[0] = attach_va;
 	return (error);
 }
 
 int
 kern_shmat(struct thread *td, int shmid, const void *shmaddr, int shmflg)
 {
 	int error;
 
 	SYSVSHM_LOCK();
 	error = kern_shmat_locked(td, shmid, shmaddr, shmflg);
 	SYSVSHM_UNLOCK();
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct shmat_args {
 	int shmid;
 	const void *shmaddr;
 	int shmflg;
 };
 #endif
 int
 sys_shmat(struct thread *td, struct shmat_args *uap)
 {
 
 	return (kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg));
 }
 
 static int
 kern_shmctl_locked(struct thread *td, int shmid, int cmd, void *buf,
     size_t *bufsz)
 {
+	struct prison *rpr;
 	struct shmid_kernel *shmseg;
 	struct shmid_ds *shmidp;
 	struct shm_info shm_info;
 	int error;
 
 	SYSVSHM_ASSERT_LOCKED();
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	rpr = shm_find_prison(td->td_ucred);
+	if (rpr == NULL)
 		return (ENOSYS);
 
 	switch (cmd) {
 	/*
 	 * It is possible that kern_shmctl is being called from the Linux ABI
 	 * layer, in which case, we will need to implement IPC_INFO.  It should
 	 * be noted that other shmctl calls will be funneled through here for
 	 * Linix binaries as well.
 	 *
 	 * NB: The Linux ABI layer will convert this data to structure(s) more
 	 * consistent with the Linux ABI.
 	 */
 	case IPC_INFO:
 		memcpy(buf, &shminfo, sizeof(shminfo));
 		if (bufsz)
 			*bufsz = sizeof(shminfo);
 		td->td_retval[0] = shmalloced;
 		return (0);
 	case SHM_INFO: {
 		shm_info.used_ids = shm_nused;
 		shm_info.shm_rss = 0;	/*XXX where to get from ? */
 		shm_info.shm_tot = 0;	/*XXX where to get from ? */
 		shm_info.shm_swp = 0;	/*XXX where to get from ? */
 		shm_info.swap_attempts = 0;	/*XXX where to get from ? */
 		shm_info.swap_successes = 0;	/*XXX where to get from ? */
 		memcpy(buf, &shm_info, sizeof(shm_info));
 		if (bufsz != NULL)
 			*bufsz = sizeof(shm_info);
 		td->td_retval[0] = shmalloced;
 		return (0);
 	}
 	}
-	shmseg = shm_find_segment(shmid, cmd != SHM_STAT);
+	shmseg = shm_find_segment(rpr, shmid, cmd != SHM_STAT);
 	if (shmseg == NULL)
 		return (EINVAL);
 #ifdef MAC
 	error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd);
 	if (error != 0)
 		return (error);
 #endif
 	switch (cmd) {
 	case SHM_STAT:
 	case IPC_STAT:
+		shmidp = (struct shmid_ds *)buf;
 		error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
 		if (error != 0)
 			return (error);
-		memcpy(buf, &shmseg->u, sizeof(struct shmid_ds));
+		memcpy(shmidp, &shmseg->u, sizeof(struct shmid_ds));
+		if (td->td_ucred->cr_prison != shmseg->cred->cr_prison)
+			shmidp->shm_perm.key = IPC_PRIVATE;
 		if (bufsz != NULL)
 			*bufsz = sizeof(struct shmid_ds);
 		if (cmd == SHM_STAT) {
 			td->td_retval[0] = IXSEQ_TO_IPCID(shmid,
 			    shmseg->u.shm_perm);
 		}
 		break;
 	case IPC_SET:
 		shmidp = (struct shmid_ds *)buf;
 		error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
 		if (error != 0)
 			return (error);
 		shmseg->u.shm_perm.uid = shmidp->shm_perm.uid;
 		shmseg->u.shm_perm.gid = shmidp->shm_perm.gid;
 		shmseg->u.shm_perm.mode =
 		    (shmseg->u.shm_perm.mode & ~ACCESSPERMS) |
 		    (shmidp->shm_perm.mode & ACCESSPERMS);
 		shmseg->u.shm_ctime = time_second;
 		break;
 	case IPC_RMID:
 		error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
 		if (error != 0)
 			return (error);
-		shmseg->u.shm_perm.key = IPC_PRIVATE;
-		shmseg->u.shm_perm.mode |= SHMSEG_REMOVED;
-		if (shmseg->u.shm_nattch <= 0) {
-			shm_deallocate_segment(shmseg);
-			shm_last_free = IPCID_TO_IX(shmid);
-		}
+		shm_remove(shmseg, IPCID_TO_IX(shmid));
 		break;
 #if 0
 	case SHM_LOCK:
 	case SHM_UNLOCK:
 #endif
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 int
 kern_shmctl(struct thread *td, int shmid, int cmd, void *buf, size_t *bufsz)
 {
 	int error;
 
 	SYSVSHM_LOCK();
 	error = kern_shmctl_locked(td, shmid, cmd, buf, bufsz);
 	SYSVSHM_UNLOCK();
 	return (error);
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct shmctl_args {
 	int shmid;
 	int cmd;
 	struct shmid_ds *buf;
 };
 #endif
 int
 sys_shmctl(struct thread *td, struct shmctl_args *uap)
 {
 	int error;
 	struct shmid_ds buf;
 	size_t bufsz;
 
 	/*
 	 * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support
 	 * Linux binaries.  If we see the call come through the FreeBSD ABI,
 	 * return an error back to the user since we do not to support this.
 	 */
 	if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
 	    uap->cmd == SHM_STAT)
 		return (EINVAL);
 
 	/* IPC_SET needs to copyin the buffer before calling kern_shmctl */
 	if (uap->cmd == IPC_SET) {
 		if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds))))
 			goto done;
 	}
 
 	error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
 	if (error)
 		goto done;
 
 	/* Cases in which we need to copyout */
 	switch (uap->cmd) {
 	case IPC_STAT:
 		error = copyout(&buf, uap->buf, bufsz);
 		break;
 	}
 
 done:
 	if (error) {
 		/* Invalidate the return value */
 		td->td_retval[0] = -1;
 	}
 	return (error);
 }
 
 
 static int
 shmget_existing(struct thread *td, struct shmget_args *uap, int mode,
     int segnum)
 {
 	struct shmid_kernel *shmseg;
 #ifdef MAC
 	int error;
 #endif
 
 	SYSVSHM_ASSERT_LOCKED();
 	KASSERT(segnum >= 0 && segnum < shmalloced,
 	    ("segnum %d shmalloced %d", segnum, shmalloced));
 	shmseg = &shmsegs[segnum];
 	if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
 		return (EEXIST);
 #ifdef MAC
 	error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, uap->shmflg);
 	if (error != 0)
 		return (error);
 #endif
 	if (uap->size != 0 && uap->size > shmseg->u.shm_segsz)
 		return (EINVAL);
 	td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
 	return (0);
 }
 
 static int
 shmget_allocate_segment(struct thread *td, struct shmget_args *uap, int mode)
 {
 	struct ucred *cred = td->td_ucred;
 	struct shmid_kernel *shmseg;
 	vm_object_t shm_object;
 	int i, segnum;
 	size_t size;
 
 	SYSVSHM_ASSERT_LOCKED();
 
 	if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax)
 		return (EINVAL);
 	if (shm_nused >= shminfo.shmmni) /* Any shmids left? */
 		return (ENOSPC);
 	size = round_page(uap->size);
 	if (shm_committed + btoc(size) > shminfo.shmall)
 		return (ENOMEM);
 	if (shm_last_free < 0) {
 		shmrealloc();	/* Maybe expand the shmsegs[] array. */
 		for (i = 0; i < shmalloced; i++)
 			if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE)
 				break;
 		if (i == shmalloced)
 			return (ENOSPC);
 		segnum = i;
 	} else  {
 		segnum = shm_last_free;
 		shm_last_free = -1;
 	}
 	KASSERT(segnum >= 0 && segnum < shmalloced,
 	    ("segnum %d shmalloced %d", segnum, shmalloced));
 	shmseg = &shmsegs[segnum];
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(td->td_proc);
 		if (racct_add(td->td_proc, RACCT_NSHM, 1)) {
 			PROC_UNLOCK(td->td_proc);
 			return (ENOSPC);
 		}
 		if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) {
 			racct_sub(td->td_proc, RACCT_NSHM, 1);
 			PROC_UNLOCK(td->td_proc);
 			return (ENOMEM);
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 #endif
 
 	/*
 	 * We make sure that we have allocated a pager before we need
 	 * to.
 	 */
 	shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP,
 	    0, size, VM_PROT_DEFAULT, 0, cred);
 	if (shm_object == NULL) {
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(td->td_proc);
 			racct_sub(td->td_proc, RACCT_NSHM, 1);
 			racct_sub(td->td_proc, RACCT_SHMSIZE, size);
 			PROC_UNLOCK(td->td_proc);
 		}
 #endif
 		return (ENOMEM);
 	}
 	shm_object->pg_color = 0;
 	VM_OBJECT_WLOCK(shm_object);
 	vm_object_clear_flag(shm_object, OBJ_ONEMAPPING);
 	vm_object_set_flag(shm_object, OBJ_COLORED | OBJ_NOSPLIT);
 	VM_OBJECT_WUNLOCK(shm_object);
 
 	shmseg->object = shm_object;
 	shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid;
 	shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid;
 	shmseg->u.shm_perm.mode = (mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
 	shmseg->u.shm_perm.key = uap->key;
 	shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff;
 	shmseg->cred = crhold(cred);
 	shmseg->u.shm_segsz = uap->size;
 	shmseg->u.shm_cpid = td->td_proc->p_pid;
 	shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0;
 	shmseg->u.shm_atime = shmseg->u.shm_dtime = 0;
 #ifdef MAC
 	mac_sysvshm_create(cred, shmseg);
 #endif
 	shmseg->u.shm_ctime = time_second;
 	shm_committed += btoc(size);
 	shm_nused++;
 	td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
 
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct shmget_args {
 	key_t key;
 	size_t size;
 	int shmflg;
 };
 #endif
 int
 sys_shmget(struct thread *td, struct shmget_args *uap)
 {
 	int segnum, mode;
 	int error;
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	if (shm_find_prison(td->td_ucred) == NULL)
 		return (ENOSYS);
 	mode = uap->shmflg & ACCESSPERMS;
 	SYSVSHM_LOCK();
 	if (uap->key == IPC_PRIVATE) {
 		error = shmget_allocate_segment(td, uap, mode);
 	} else {
-		segnum = shm_find_segment_by_key(uap->key);
+		segnum = shm_find_segment_by_key(td->td_ucred->cr_prison,
+		    uap->key);
 		if (segnum >= 0)
 			error = shmget_existing(td, uap, mode, segnum);
 		else if ((uap->shmflg & IPC_CREAT) == 0)
 			error = ENOENT;
 		else
 			error = shmget_allocate_segment(td, uap, mode);
 	}
 	SYSVSHM_UNLOCK();
 	return (error);
 }
 
 static void
 shmfork_myhook(struct proc *p1, struct proc *p2)
 {
 	struct shmmap_state *shmmap_s;
 	size_t size;
 	int i;
 
 	SYSVSHM_LOCK();
 	size = shminfo.shmseg * sizeof(struct shmmap_state);
 	shmmap_s = malloc(size, M_SHM, M_WAITOK);
 	bcopy(p1->p_vmspace->vm_shm, shmmap_s, size);
 	p2->p_vmspace->vm_shm = shmmap_s;
 	for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
 		if (shmmap_s->shmid != -1) {
 			KASSERT(IPCID_TO_IX(shmmap_s->shmid) >= 0 &&
 			    IPCID_TO_IX(shmmap_s->shmid) < shmalloced,
 			    ("segnum %d shmalloced %d",
 			    IPCID_TO_IX(shmmap_s->shmid), shmalloced));
 			shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++;
 		}
 	}
 	SYSVSHM_UNLOCK();
 }
 
 static void
 shmexit_myhook(struct vmspace *vm)
 {
 	struct shmmap_state *base, *shm;
 	int i;
 
 	base = vm->vm_shm;
 	if (base != NULL) {
 		vm->vm_shm = NULL;
 		SYSVSHM_LOCK();
 		for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) {
 			if (shm->shmid != -1)
 				shm_delete_mapping(vm, shm);
 		}
 		SYSVSHM_UNLOCK();
 		free(base, M_SHM);
 	}
 }
 
 static void
 shmrealloc(void)
 {
 	struct shmid_kernel *newsegs;
 	int i;
 
 	SYSVSHM_ASSERT_LOCKED();
 
 	if (shmalloced >= shminfo.shmmni)
 		return;
 
 	newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK);
 	for (i = 0; i < shmalloced; i++)
 		bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0]));
 	for (; i < shminfo.shmmni; i++) {
 		newsegs[i].u.shm_perm.mode = SHMSEG_FREE;
 		newsegs[i].u.shm_perm.seq = 0;
 #ifdef MAC
 		mac_sysvshm_init(&newsegs[i]);
 #endif
 	}
 	free(shmsegs, M_SHM);
 	shmsegs = newsegs;
 	shmalloced = shminfo.shmmni;
 }
 
 static struct syscall_helper_data shm_syscalls[] = {
 	SYSCALL_INIT_HELPER(shmat),
 	SYSCALL_INIT_HELPER(shmctl),
 	SYSCALL_INIT_HELPER(shmdt),
 	SYSCALL_INIT_HELPER(shmget),
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 	SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl),
 #endif
 #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 	SYSCALL_INIT_HELPER(shmsys),
 #endif
 	SYSCALL_INIT_LAST
 };
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_ipc.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/freebsd32/freebsd32_util.h>
 
 static struct syscall_helper_data shm32_syscalls[] = {
 	SYSCALL32_INIT_HELPER_COMPAT(shmat),
 	SYSCALL32_INIT_HELPER_COMPAT(shmdt),
 	SYSCALL32_INIT_HELPER_COMPAT(shmget),
 	SYSCALL32_INIT_HELPER(freebsd32_shmsys),
 	SYSCALL32_INIT_HELPER(freebsd32_shmctl),
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 	SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl),
 #endif
 	SYSCALL_INIT_LAST
 };
 #endif
 
 static int
 shminit(void)
 {
+	struct prison *pr;
+	void *rsv;
 	int i, error;
+	osd_method_t methods[PR_MAXMETHOD] = {
+	    [PR_METHOD_CHECK] =		shm_prison_check,
+	    [PR_METHOD_SET] =		shm_prison_set,
+	    [PR_METHOD_GET] =		shm_prison_get,
+	    [PR_METHOD_REMOVE] =	shm_prison_remove,
+	};
 
 #ifndef BURN_BRIDGES
 	if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0)
 		printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n");
 #endif
 	if (shminfo.shmmax == SHMMAX) {
 		/* Initialize shmmax dealing with possible overflow. */
 		for (i = PAGE_SIZE; i != 0; i--) {
 			shminfo.shmmax = shminfo.shmall * i;
 			if ((shminfo.shmmax / shminfo.shmall) == (u_long)i)
 				break;
 		}
 	}
 	shmalloced = shminfo.shmmni;
 	shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK);
 	for (i = 0; i < shmalloced; i++) {
 		shmsegs[i].u.shm_perm.mode = SHMSEG_FREE;
 		shmsegs[i].u.shm_perm.seq = 0;
 #ifdef MAC
 		mac_sysvshm_init(&shmsegs[i]);
 #endif
 	}
 	shm_last_free = 0;
 	shm_nused = 0;
 	shm_committed = 0;
 	sx_init(&sysvshmsx, "sysvshmsx");
 	shmexit_hook = &shmexit_myhook;
 	shmfork_hook = &shmfork_myhook;
 
+	/* Set current prisons according to their allow.sysvipc. */
+	shm_prison_slot = osd_jail_register(NULL, methods);
+	rsv = osd_reserve(shm_prison_slot);
+	prison_lock(&prison0);
+	(void)osd_jail_set_reserved(&prison0, shm_prison_slot, rsv, &prison0);
+	prison_unlock(&prison0);
+	rsv = NULL;
+	sx_slock(&allprison_lock);
+	TAILQ_FOREACH(pr, &allprison, pr_list) {
+		if (rsv == NULL)
+			rsv = osd_reserve(shm_prison_slot);
+		prison_lock(pr);
+		if ((pr->pr_allow & PR_ALLOW_SYSVIPC) && pr->pr_ref > 0) {
+			(void)osd_jail_set_reserved(pr, shm_prison_slot, rsv,
+			    &prison0);
+			rsv = NULL;
+		}
+		prison_unlock(pr);
+	}
+	if (rsv != NULL)
+		osd_free_reserved(rsv);
+	sx_sunlock(&allprison_lock);
+
 	error = syscall_helper_register(shm_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	error = syscall32_helper_register(shm32_syscalls, SY_THR_STATIC_KLD);
 	if (error != 0)
 		return (error);
 #endif
 	return (0);
 }
 
 static int
 shmunload(void)
 {
 	int i;
 
 	if (shm_nused > 0)
 		return (EBUSY);
 
 #ifdef COMPAT_FREEBSD32
 	syscall32_helper_unregister(shm32_syscalls);
 #endif
 	syscall_helper_unregister(shm_syscalls);
+	if (shm_prison_slot != 0)
+		osd_jail_deregister(shm_prison_slot);
 
 	for (i = 0; i < shmalloced; i++) {
 #ifdef MAC
 		mac_sysvshm_destroy(&shmsegs[i]);
 #endif
 		/*
 		 * Objects might be still mapped into the processes
 		 * address spaces.  Actual free would happen on the
 		 * last mapping destruction.
 		 */
 		if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE)
 			vm_object_deallocate(shmsegs[i].object);
 	}
 	free(shmsegs, M_SHM);
 	shmexit_hook = NULL;
 	shmfork_hook = NULL;
 	sx_destroy(&sysvshmsx);
 	return (0);
 }
 
 static int
 sysctl_shmsegs(SYSCTL_HANDLER_ARGS)
 {
-	int error;
+	struct prison *rpr;
+	struct sbuf sb;
+	struct shmid_kernel tmp, empty;
+	struct shmid_kernel *shmseg;
+	int error, i;
 
 	SYSVSHM_LOCK();
-	error = SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0]));
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		goto done;
+	rpr = shm_find_prison(req->td->td_ucred);
+	sbuf_new_for_sysctl(&sb, NULL, shmalloced * sizeof(shmsegs[0]), req);
+
+	bzero(&empty, sizeof(empty));
+	empty.u.shm_perm.mode = SHMSEG_FREE;
+	for (i = 0; i < shmalloced; i++) {
+		shmseg = &shmsegs[i];
+		if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
+		    rpr == NULL || shm_prison_cansee(rpr, &shmsegs[i]) != 0) {
+			shmseg = &empty;
+		} else if (req->td->td_ucred->cr_prison !=
+		    shmseg->cred->cr_prison) {
+			bcopy(shmseg, &tmp, sizeof(tmp));
+			shmseg = &tmp;
+			shmseg->u.shm_perm.key = IPC_PRIVATE;
+		}
+
+		sbuf_bcat(&sb, shmseg, sizeof(*shmseg));
+	}
+	error = sbuf_finish(&sb);
+	sbuf_delete(&sb);
+
+done:
 	SYSVSHM_UNLOCK();
 	return (error);
 }
 
+/*
+ * PR_METHOD_CHECK handler (see the method table in shminit()): validate a
+ * requested "sysvshm" jail parameter before it is applied.
+ *
+ * sysvshm is a jailsys integer.  "disable" is always acceptable; "new" and
+ * "inherit" are only acceptable when the parent jail has a root prison
+ * recorded in its shm_prison_slot OSD value, i.e. the parent is not itself
+ * disabled.
+ *
+ * Returns 0 if the parameter is absent or acceptable, the vfs_copyopt()
+ * error if the option could not be read, EPERM if the parent is disabled,
+ * and EINVAL for any other value.
+ */
+static int
+shm_prison_check(void *obj, void *data)
+{
+	struct prison *pr = obj;
+	struct prison *parent_root;
+	struct vfsoptlist *opts = data;
+	int error, jsys;
+
+	error = vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys));
+	if (error == ENOENT)
+		return (0);		/* Parameter not supplied. */
+	if (error != 0)
+		return (error);
+
+	if (jsys == JAIL_SYS_DISABLE)
+		return (0);
+	if (jsys != JAIL_SYS_NEW && jsys != JAIL_SYS_INHERIT)
+		return (EINVAL);
+
+	/* Enabling sysvshm requires that the parent has it enabled too. */
+	prison_lock(pr->pr_parent);
+	parent_root = osd_jail_get(pr->pr_parent, shm_prison_slot);
+	prison_unlock(pr->pr_parent);
+	return (parent_root == NULL ? EPERM : 0);
+}
+
+/*
+ * PR_METHOD_SET handler (see the method table in shminit()): apply the
+ * "sysvshm" parameter or, when it is absent, the legacy allow.sysvipc /
+ * allow.nosysvipc flags, by updating the root prison stored in the jail's
+ * shm_prison_slot OSD value.
+ *
+ *   disable - drop the slot from this jail and from every descendant that
+ *             still has one;
+ *   new     - make this jail the root of its own segments;
+ *   inherit - use whatever root the parent jail uses.
+ *
+ * Whenever a jail stops being its own root, the segments whose credentials
+ * belong to it are removed.  shm_prison_cleanup() walks the global
+ * shmsegs[] table, which shmrealloc() may replace and free under the
+ * SYSVSHM lock, so every call below is made with that lock held (as
+ * shm_prison_remove() does) and only after the prison mutex is dropped.
+ *
+ * Always returns 0.
+ */
+static int
+shm_prison_set(void *obj, void *data)
+{
+	struct prison *pr = obj;
+	struct prison *tpr, *orpr, *nrpr, *trpr;
+	struct vfsoptlist *opts = data;
+	void *rsv;
+	int jsys, descend;
+
+	/*
+	 * sysvshm controls which jail is the root of the associated segments
+	 * (this jail or same as the parent), or if the feature is available
+	 * at all.  A value of -1 means "nothing requested, leave unchanged".
+	 */
+	if (vfs_copyopt(opts, "sysvshm", &jsys, sizeof(jsys)) == ENOENT)
+		jsys = vfs_flagopt(opts, "allow.sysvipc", NULL, 0)
+		    ? JAIL_SYS_INHERIT
+		    : vfs_flagopt(opts, "allow.nosysvipc", NULL, 0)
+		    ? JAIL_SYS_DISABLE
+		    : -1;
+	if (jsys == JAIL_SYS_DISABLE) {
+		prison_lock(pr);
+		orpr = osd_jail_get(pr, shm_prison_slot);
+		if (orpr != NULL)
+			osd_jail_del(pr, shm_prison_slot);
+		prison_unlock(pr);
+		if (orpr != NULL) {
+			if (orpr == pr) {
+				SYSVSHM_LOCK();
+				shm_prison_cleanup(pr);
+				SYSVSHM_UNLOCK();
+			}
+			/* Disable all child jails as well. */
+			FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
+				prison_lock(tpr);
+				trpr = osd_jail_get(tpr, shm_prison_slot);
+				if (trpr != NULL) {
+					osd_jail_del(tpr, shm_prison_slot);
+					prison_unlock(tpr);
+					if (trpr == tpr) {
+						SYSVSHM_LOCK();
+						shm_prison_cleanup(tpr);
+						SYSVSHM_UNLOCK();
+					}
+				} else {
+					prison_unlock(tpr);
+					descend = 0;
+				}
+			}
+		}
+	} else if (jsys != -1) {
+		if (jsys == JAIL_SYS_NEW)
+			nrpr = pr;
+		else {
+			prison_lock(pr->pr_parent);
+			nrpr = osd_jail_get(pr->pr_parent, shm_prison_slot);
+			prison_unlock(pr->pr_parent);
+		}
+		/* Reserve OSD storage before taking the prison mutex. */
+		rsv = osd_reserve(shm_prison_slot);
+		prison_lock(pr);
+		orpr = osd_jail_get(pr, shm_prison_slot);
+		if (orpr != nrpr)
+			(void)osd_jail_set_reserved(pr, shm_prison_slot, rsv,
+			    nrpr);
+		else
+			osd_free_reserved(rsv);
+		prison_unlock(pr);
+		if (orpr != nrpr) {
+			if (orpr == pr) {
+				SYSVSHM_LOCK();
+				shm_prison_cleanup(pr);
+				SYSVSHM_UNLOCK();
+			}
+			if (orpr != NULL) {
+				/* Change child jails matching the old root. */
+				FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
+					prison_lock(tpr);
+					trpr = osd_jail_get(tpr,
+					    shm_prison_slot);
+					if (trpr == orpr) {
+						(void)osd_jail_set(tpr,
+						    shm_prison_slot, nrpr);
+						prison_unlock(tpr);
+						if (trpr == tpr) {
+							SYSVSHM_LOCK();
+							shm_prison_cleanup(tpr);
+							SYSVSHM_UNLOCK();
+						}
+					} else {
+						prison_unlock(tpr);
+						descend = 0;
+					}
+				}
+			}
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * PR_METHOD_GET handler (see the method table in shminit()): report this
+ * jail's "sysvshm" setting.
+ *
+ * The value is derived from the root prison recorded in the jail's
+ * shm_prison_slot OSD value: none maps to "disable", the jail itself maps
+ * to "new", and any other prison maps to "inherit".  ENOENT from
+ * vfs_setopt() (presumably: "sysvshm" was not among the requested
+ * options) is treated as success; any other error is returned.
+ */
+static int
+shm_prison_get(void *obj, void *data)
+{
+	struct prison *pr = obj;
+	struct prison *root;
+	struct vfsoptlist *opts = data;
+	int error, jsys;
+
+	prison_lock(pr);
+	root = osd_jail_get(pr, shm_prison_slot);
+	prison_unlock(pr);
+
+	if (root == NULL)
+		jsys = JAIL_SYS_DISABLE;
+	else if (root == pr)
+		jsys = JAIL_SYS_NEW;
+	else
+		jsys = JAIL_SYS_INHERIT;
+
+	error = vfs_setopt(opts, "sysvshm", &jsys, sizeof(jsys));
+	return (error == ENOENT ? 0 : error);
+}
+
+/*
+ * PR_METHOD_REMOVE handler (see the method table in shminit()): when the
+ * jail is the root of its own segments, remove the segments that belong
+ * to it.  The SYSVSHM lock is held across both the root lookup and the
+ * cleanup.  Always returns 0.
+ */
+static int
+shm_prison_remove(void *obj, void *data __unused)
+{
+	struct prison *pr = obj;
+	struct prison *root;
+
+	SYSVSHM_LOCK();
+
+	/* The OSD slot is read under the prison mutex. */
+	prison_lock(pr);
+	root = osd_jail_get(pr, shm_prison_slot);
+	prison_unlock(pr);
+
+	if (root == pr) {
+		shm_prison_cleanup(pr);
+	}
+
+	SYSVSHM_UNLOCK();
+	return (0);
+}
+
+/*
+ * Remove every allocated segment whose credential belongs to prison pr.
+ *
+ * This walks the global shmsegs[] table, so the caller should hold the
+ * SYSVSHM lock (as shm_prison_remove() does).
+ */
+static void
+shm_prison_cleanup(struct prison *pr)
+{
+	struct shmid_kernel *seg;
+	int segnum;
+
+	for (segnum = 0; segnum < shmalloced; segnum++) {
+		seg = &shmsegs[segnum];
+
+		/* Skip free slots. */
+		if ((seg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0)
+			continue;
+		/* Skip segments owned by some other jail. */
+		if (seg->cred == NULL || seg->cred->cr_prison != pr)
+			continue;
+
+		shm_remove(seg, segnum);
+	}
+}
+
+SYSCTL_JAIL_PARAM_SYS_NODE(sysvshm, CTLFLAG_RW, "SYSV shared memory");
+
 #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 struct oshmid_ds {
 	struct	ipc_perm_old shm_perm;	/* operation perms */
 	int	shm_segsz;		/* size of segment (bytes) */
 	u_short	shm_cpid;		/* pid, creator */
 	u_short	shm_lpid;		/* pid, last operation */
 	short	shm_nattch;		/* no. of current attaches */
 	time_t	shm_atime;		/* last attach time */
 	time_t	shm_dtime;		/* last detach time */
 	time_t	shm_ctime;		/* last change time */
 	void	*shm_handle;		/* internal handle for shm segment */
 };
 
 struct oshmctl_args {
 	int shmid;
 	int cmd;
 	struct oshmid_ds *ubuf;
 };
 
 static int
 oshmctl(struct thread *td, struct oshmctl_args *uap)
 {
 #ifdef COMPAT_43
 	int error = 0;
+	struct prison *rpr;
 	struct shmid_kernel *shmseg;
 	struct oshmid_ds outbuf;
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
+	rpr = shm_find_prison(td->td_ucred);
+	if (rpr == NULL)
 		return (ENOSYS);
 	if (uap->cmd != IPC_STAT) {
 		return (freebsd7_shmctl(td,
 		    (struct freebsd7_shmctl_args *)uap));
 	}
 	SYSVSHM_LOCK();
-	shmseg = shm_find_segment(uap->shmid, true);
+	shmseg = shm_find_segment(rpr, uap->shmid, true);
 	if (shmseg == NULL) {
 		SYSVSHM_UNLOCK();
 		return (EINVAL);
 	}
 	error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
 	if (error != 0) {
 		SYSVSHM_UNLOCK();
 		return (error);
 	}
 #ifdef MAC
 	error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd);
 	if (error != 0) {
 		SYSVSHM_UNLOCK();
 		return (error);
 	}
 #endif
 	ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm);
 	outbuf.shm_segsz = shmseg->u.shm_segsz;
 	outbuf.shm_cpid = shmseg->u.shm_cpid;
 	outbuf.shm_lpid = shmseg->u.shm_lpid;
 	outbuf.shm_nattch = shmseg->u.shm_nattch;
 	outbuf.shm_atime = shmseg->u.shm_atime;
 	outbuf.shm_dtime = shmseg->u.shm_dtime;
 	outbuf.shm_ctime = shmseg->u.shm_ctime;
 	outbuf.shm_handle = shmseg->object;
 	SYSVSHM_UNLOCK();
 	return (copyout(&outbuf, uap->ubuf, sizeof(outbuf)));
 #else
 	return (EINVAL);
 #endif
 }
 
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
 static sy_call_t *shmcalls[] = {
 	(sy_call_t *)sys_shmat, (sy_call_t *)oshmctl,
 	(sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget,
 	(sy_call_t *)freebsd7_shmctl
 };
 
 #ifndef _SYS_SYSPROTO_H_
 /* XXX actually varargs. */
 struct shmsys_args {
 	int	which;
 	int	a2;
 	int	a3;
 	int	a4;
 };
 #endif
 int
 sys_shmsys(struct thread *td, struct shmsys_args *uap)
 {
 
-	if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
-		return (ENOSYS);
 	if (uap->which < 0 || uap->which >= nitems(shmcalls))
 		return (EINVAL);
 	return ((*shmcalls[uap->which])(td, &uap->a2));
 }
 
 #endif	/* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */
 
 #ifdef COMPAT_FREEBSD32
 
 int
 freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap)
 {
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 	switch (uap->which) {
 	case 0:	{	/* shmat */
 		struct shmat_args ap;
 
 		ap.shmid = uap->a2;
 		ap.shmaddr = PTRIN(uap->a3);
 		ap.shmflg = uap->a4;
 		return (sysent[SYS_shmat].sy_call(td, &ap));
 	}
 	case 2: {	/* shmdt */
 		struct shmdt_args ap;
 
 		ap.shmaddr = PTRIN(uap->a2);
 		return (sysent[SYS_shmdt].sy_call(td, &ap));
 	}
 	case 3: {	/* shmget */
 		struct shmget_args ap;
 
 		ap.key = uap->a2;
 		ap.size = uap->a3;
 		ap.shmflg = uap->a4;
 		return (sysent[SYS_shmget].sy_call(td, &ap));
 	}
 	case 4: {	/* shmctl */
 		struct freebsd7_freebsd32_shmctl_args ap;
 
 		ap.shmid = uap->a2;
 		ap.cmd = uap->a3;
 		ap.buf = PTRIN(uap->a4);
 		return (freebsd7_freebsd32_shmctl(td, &ap));
 	}
 	case 1:		/* oshmctl */
 	default:
 		return (EINVAL);
 	}
 #else
 	return (nosys(td, NULL));
 #endif
 }
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 int
 freebsd7_freebsd32_shmctl(struct thread *td,
     struct freebsd7_freebsd32_shmctl_args *uap)
 {
 	int error;
 	union {
 		struct shmid_ds shmid_ds;
 		struct shm_info shm_info;
 		struct shminfo shminfo;
 	} u;
 	union {
 		struct shmid_ds32_old shmid_ds32;
 		struct shm_info32 shm_info32;
 		struct shminfo32 shminfo32;
 	} u32;
 	size_t sz;
 
 	if (uap->cmd == IPC_SET) {
 		if ((error = copyin(uap->buf, &u32.shmid_ds32,
 		    sizeof(u32.shmid_ds32))))
 			goto done;
 		freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm,
 		    &u.shmid_ds.shm_perm);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
 	}
 
 	error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
 	if (error)
 		goto done;
 
 	/* Cases in which we need to copyout */
 	switch (uap->cmd) {
 	case IPC_INFO:
 		CP(u.shminfo, u32.shminfo32, shmmax);
 		CP(u.shminfo, u32.shminfo32, shmmin);
 		CP(u.shminfo, u32.shminfo32, shmmni);
 		CP(u.shminfo, u32.shminfo32, shmseg);
 		CP(u.shminfo, u32.shminfo32, shmall);
 		error = copyout(&u32.shminfo32, uap->buf,
 		    sizeof(u32.shminfo32));
 		break;
 	case SHM_INFO:
 		CP(u.shm_info, u32.shm_info32, used_ids);
 		CP(u.shm_info, u32.shm_info32, shm_rss);
 		CP(u.shm_info, u32.shm_info32, shm_tot);
 		CP(u.shm_info, u32.shm_info32, shm_swp);
 		CP(u.shm_info, u32.shm_info32, swap_attempts);
 		CP(u.shm_info, u32.shm_info32, swap_successes);
 		error = copyout(&u32.shm_info32, uap->buf,
 		    sizeof(u32.shm_info32));
 		break;
 	case SHM_STAT:
 	case IPC_STAT:
 		freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm,
 		    &u32.shmid_ds32.shm_perm);
 		if (u.shmid_ds.shm_segsz > INT32_MAX)
 			u32.shmid_ds32.shm_segsz = INT32_MAX;
 		else
 			CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
 		u32.shmid_ds32.shm_internal = 0;
 		error = copyout(&u32.shmid_ds32, uap->buf,
 		    sizeof(u32.shmid_ds32));
 		break;
 	}
 
 done:
 	if (error) {
 		/* Invalidate the return value */
 		td->td_retval[0] = -1;
 	}
 	return (error);
 }
 #endif
 
 int
 freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap)
 {
 	int error;
 	union {
 		struct shmid_ds shmid_ds;
 		struct shm_info shm_info;
 		struct shminfo shminfo;
 	} u;
 	union {
 		struct shmid_ds32 shmid_ds32;
 		struct shm_info32 shm_info32;
 		struct shminfo32 shminfo32;
 	} u32;
 	size_t sz;
 
 	if (uap->cmd == IPC_SET) {
 		if ((error = copyin(uap->buf, &u32.shmid_ds32,
 		    sizeof(u32.shmid_ds32))))
 			goto done;
 		freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm,
 		    &u.shmid_ds.shm_perm);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
 		CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
 	}
 
 	error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
 	if (error)
 		goto done;
 
 	/* Cases in which we need to copyout */
 	switch (uap->cmd) {
 	case IPC_INFO:
 		CP(u.shminfo, u32.shminfo32, shmmax);
 		CP(u.shminfo, u32.shminfo32, shmmin);
 		CP(u.shminfo, u32.shminfo32, shmmni);
 		CP(u.shminfo, u32.shminfo32, shmseg);
 		CP(u.shminfo, u32.shminfo32, shmall);
 		error = copyout(&u32.shminfo32, uap->buf,
 		    sizeof(u32.shminfo32));
 		break;
 	case SHM_INFO:
 		CP(u.shm_info, u32.shm_info32, used_ids);
 		CP(u.shm_info, u32.shm_info32, shm_rss);
 		CP(u.shm_info, u32.shm_info32, shm_tot);
 		CP(u.shm_info, u32.shm_info32, shm_swp);
 		CP(u.shm_info, u32.shm_info32, swap_attempts);
 		CP(u.shm_info, u32.shm_info32, swap_successes);
 		error = copyout(&u32.shm_info32, uap->buf,
 		    sizeof(u32.shm_info32));
 		break;
 	case SHM_STAT:
 	case IPC_STAT:
 		freebsd32_ipcperm_out(&u.shmid_ds.shm_perm,
 		    &u32.shmid_ds32.shm_perm);
 		if (u.shmid_ds.shm_segsz > INT32_MAX)
 			u32.shmid_ds32.shm_segsz = INT32_MAX;
 		else
 			CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
 		CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
 		error = copyout(&u32.shmid_ds32, uap->buf,
 		    sizeof(u32.shmid_ds32));
 		break;
 	}
 
 done:
 	if (error) {
 		/* Invalidate the return value */
 		td->td_retval[0] = -1;
 	}
 	return (error);
 }
 #endif
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 
 #ifndef CP
 #define CP(src, dst, fld)	do { (dst).fld = (src).fld; } while (0)
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd7_shmctl_args {
 	int shmid;
 	int cmd;
 	struct shmid_ds_old *buf;
 };
 #endif
 int
 freebsd7_shmctl(struct thread *td, struct freebsd7_shmctl_args *uap)
 {
 	int error;
 	struct shmid_ds_old old;
 	struct shmid_ds buf;
 	size_t bufsz;
 
 	/*
 	 * The only reason IPC_INFO, SHM_INFO, SHM_STAT exists is to support
 	 * Linux binaries.  If we see the call come through the FreeBSD ABI,
 	 * return an error back to the user since we do not to support this.
 	 */
 	if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
 	    uap->cmd == SHM_STAT)
 		return (EINVAL);
 
 	/* IPC_SET needs to copyin the buffer before calling kern_shmctl */
 	if (uap->cmd == IPC_SET) {
 		if ((error = copyin(uap->buf, &old, sizeof(old))))
 			goto done;
 		ipcperm_old2new(&old.shm_perm, &buf.shm_perm);
 		CP(old, buf, shm_segsz);
 		CP(old, buf, shm_lpid);
 		CP(old, buf, shm_cpid);
 		CP(old, buf, shm_nattch);
 		CP(old, buf, shm_atime);
 		CP(old, buf, shm_dtime);
 		CP(old, buf, shm_ctime);
 	}
 
 	error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
 	if (error)
 		goto done;
 
 	/* Cases in which we need to copyout */
 	switch (uap->cmd) {
 	case IPC_STAT:
 		ipcperm_new2old(&buf.shm_perm, &old.shm_perm);
 		if (buf.shm_segsz > INT_MAX)
 			old.shm_segsz = INT_MAX;
 		else
 			CP(buf, old, shm_segsz);
 		CP(buf, old, shm_lpid);
 		CP(buf, old, shm_cpid);
 		if (buf.shm_nattch > SHRT_MAX)
 			old.shm_nattch = SHRT_MAX;
 		else
 			CP(buf, old, shm_nattch);
 		CP(buf, old, shm_atime);
 		CP(buf, old, shm_dtime);
 		CP(buf, old, shm_ctime);
 		old.shm_internal = NULL;
 		error = copyout(&old, uap->buf, sizeof(old));
 		break;
 	}
 
 done:
 	if (error) {
 		/* Invalidate the return value */
 		td->td_retval[0] = -1;
 	}
 	return (error);
 }
 
 #endif	/* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
 	   COMPAT_FREEBSD7 */
 
 static int
 sysvshm_modload(struct module *module, int cmd, void *arg)
 {
 	int error = 0;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		error = shminit();
 		if (error != 0)
 			shmunload();
 		break;
 	case MOD_UNLOAD:
 		error = shmunload();
 		break;
 	case MOD_SHUTDOWN:
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t sysvshm_mod = {
 	"sysvshm",
 	&sysvshm_modload,
 	NULL
 };
 
 DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST);
 MODULE_VERSION(sysvshm, 1);
Index: head/usr.sbin/jail/jail.8
===================================================================
--- head/usr.sbin/jail/jail.8	(revision 298584)
+++ head/usr.sbin/jail/jail.8	(revision 298585)
@@ -1,1330 +1,1353 @@
 .\" Copyright (c) 2000, 2003 Robert N. M. Watson
 .\" Copyright (c) 2008-2012 James Gritton
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
 .Dd April 25, 2016
 .Dt JAIL 8
 .Os
 .Sh NAME
 .Nm jail
 .Nd "manage system jails"
 .Sh SYNOPSIS
 .Nm
 .Op Fl dhilqv
 .Op Fl J Ar jid_file
 .Op Fl u Ar username
 .Op Fl U Ar username
 .Op Fl cmr
 .Ar param Ns = Ns Ar value ...
 .Op Cm command Ns = Ns Ar command ...
 .Nm
 .Op Fl dqv
 .Op Fl f Ar conf_file
 .Op Fl p Ar limit
 .Op Fl cmr
 .Op Ar jail
 .Nm
 .Op Fl qv
 .Op Fl f Ar conf_file
 .Op Fl rR
 .Op Cm * | Ar jail ...
 .Nm
 .Op Fl dhilqv
 .Op Fl J Ar jid_file
 .Op Fl u Ar username
 .Op Fl U Ar username
 .Op Fl n Ar jailname
 .Op Fl s Ar securelevel
 .Op Ar path hostname [ Ar ip Ns [ Ns Ar ,... Ns ]] Ar command ...
 .Sh DESCRIPTION
 The
 .Nm
 utility creates new jails, or modifies or removes existing jails.
 A jail
 .Pq or Dq prison
 is specified via parameters on the command line, or in the
 .Xr jail.conf 5
 file.
 .Pp
 At least one of the options
 .Fl c ,
 .Fl m
 or
 .Fl r
 must be specified.
 These options are used alone or in combination to describe the operation to
 perform:
 .Bl -tag -width indent
 .It Fl c
 Create a new jail.
 The jail
 .Va jid
 and
 .Va name
 parameters (if specified on the command line)
 must not refer to an existing jail.
 .It Fl m
 Modify an existing jail.
 One of the
 .Va jid
 or
 .Va name
 parameters must exist and refer to an existing jail.
 Some parameters may not be changed on a running jail.
 .It Fl r
 Remove the
 .Ar jail
 specified by jid or name.
 All jailed processes are killed, and all jails that are
 children of this jail are also
 removed.
 .It Fl rc
 Restart an existing jail.
 The jail is first removed and then re-created, as if
 .Dq Nm Fl r
 and
 .Dq Nm Fl c
 were run in succession.
 .It Fl cm
 Create a jail if it does not exist, or modify the jail if it does exist.
 .It Fl mr
 Modify an existing jail.
 The jail may be restarted if necessary to modify parameters than could
 not otherwise be changed.
 .It Fl cmr
 Create a jail if it doesn't exist, or modify (and possibly restart) the
 jail if it does exist.
 .El
 .Pp
 Other available options are:
 .Bl -tag -width indent
 .It Fl d
 Allow making changes to a dying jail, equivalent to the
 .Va allow.dying
 parameter.
 .It Fl f Ar conf_file
 Use configuration file
 .Ar conf_file
 instead of the default
 .Pa /etc/jail.conf .
 .It Fl h
 Resolve the
 .Va host.hostname
 parameter (or
 .Va hostname )
 and add all IP addresses returned by the resolver
 to the list of addresses for this jail.
 This is equivalent to the
 .Va ip_hostname
 parameter.
 .It Fl i
 Output (only) the jail identifier of the newly created jail(s).
 This implies the
 .Fl q
 option.
 .It Fl J Ar jid_file
 Write a
 .Ar jid_file
 file, containing the parameters used to start the jail.
 .It Fl l
 Run commands in a clean environment.
 This is deprecated and is equivalent to the exec.clean parameter.
 .It Fl n Ar jailname
 Set the jail's name.
 This is deprecated and is equivalent to the
 .Va name
 parameter.
 .It Fl p Ar limit
 Limit the number of commands from
 .Va  exec.*
 that can run simultaneously.
 .It Fl q
 Suppress the message printed whenever a jail is created, modified or removed.
 Only error messages will be printed.
 .It Fl R
 A variation of the
 .Fl r
 option that removes an existing jail without using the configuration file.
 No removal-related parameters for this jail will be used \(em the jail will
 simply be removed.
 .It Fl s Ar securelevel
 Set the
 .Va kern.securelevel
 MIB entry to the specified value inside the newly created jail.
 This is deprecated and is equivalent to the
 .Va securelevel
 parameter.
 .It Fl u Ar username
 The user name from host environment as whom jailed commands should run.
 This is deprecated and is equivalent to the
 .Va exec.jail_user
 and
 .Va exec.system_jail_user
 parameters.
 .It Fl U Ar username
 The user name from the jailed environment as whom jailed commands should run.
 This is deprecated and is equivalent to the
 .Va exec.jail_user
 parameter.
 .It Fl v
 Print a message on every operation, such as running commands and
 mounting filesystems.
 .El
 .Pp
 If no arguments are given after the options, the operation (except
 remove) will be performed on all jails specified in the
 .Xr jail.conf 5
 file.
 A single argument of a jail name will operate only on the specified jail.
 The
 .Fl r
 and
 .Fl R
 options can also remove running jails that aren't in the
 .Xr jail.conf 5
 file, specified by name or jid.
 .Pp
 An argument of
 .Dq *
 is a wildcard that will operate on all jails, regardless of whether
 they appear in
 .Xr jail.conf 5 ;
 this is the surest way for
 .Fl r
 to remove all jails.
 If hierarchical jails exist, a partial-matching wildcard definition may
 be specified.
 For example, an argument of
 .Dq foo.*
 would apply to jails with names like
 .Dq foo.bar
 and
 .Dq foo.bar.baz .
 .Pp
 A jail may be specified with parameters directly on the command line.
 In this case, the
 .Xr jail.conf 5
 file will not be used.
 For backward compatibility, the command line may also have four fixed
 parameters, without names:
 .Ar path ,
 .Ar hostname ,
 .Ar ip ,
 and
 .Ar command .
 This mode will always create a new jail, and the
 .Fl c
 and
 .Fl m
 options do not apply (and must not be present).
 .Ss Jail Parameters
 Parameters in the
 .Xr jail.conf 5
 file, or on the command line, are generally of the form
 .Dq name=value .
 Some parameters are boolean, and do not have a value but are set by the
 name alone with or without a
 .Dq no
 prefix, e.g.
 .Va persist
 or
 .Va nopersist .
 They can also be given the values
 .Dq true
 and
 .Dq false .
 Other parameters may have more than one value, specified as a
 comma-separated list or with
 .Dq +=
 in the configuration file (see
 .Xr jail.conf 5
 for details).
 .Pp
 The
 .Nm
 utility recognizes two classes of parameters.
 There are the true jail
 parameters that are passed to the kernel when the jail is created,
 which can be seen with
 .Xr jls 8 ,
 and can (usually) be changed with
 .Dq Nm Fl m .
 Then there are pseudo-parameters that are only used by
 .Nm
 itself.
 .Pp
 Jails have a set of core parameters, and kernel modules can add their own
 jail parameters.
 The current set of available parameters can be retrieved via
 .Dq Nm sysctl Fl d Va security.jail.param .
 Any parameters not set will be given default values, often based on the
 current environment.
 The core parameters are:
 .Bl -tag -width indent
 .It Va jid
 The jail identifier.
 This will be assigned automatically to a new jail (or can be explicitly
 set), and can be used to identify the jail for later modification, or
 for such commands as
 .Xr jls 8
 or
 .Xr jexec 8 .
 .It Va name
 The jail name.
 This is an arbitrary string that identifies a jail (except it may not
 contain a
 .Sq \&. ) .
 Like the
 .Va jid ,
 it can be passed to later
 .Nm
 commands, or to
 .Xr jls 8
 or
 .Xr jexec 8 .
 If no
 .Va name
 is supplied, a default is assumed that is the same as the
 .Va jid .
 The
 .Va name
 parameter is implied by the
 .Xr jail.conf 5
 file format, and need not be explicitly set when using the configuration
 file.
 .It Va path
 The directory which is to be the root of the jail.
 Any commands run inside the jail, either by
 .Nm
 or from
 .Xr jexec 8 ,
 are run from this directory.
 .It Va ip4.addr
 A list of IPv4 addresses assigned to the jail.
 If this is set, the jail is restricted to using only these addresses.
 Any attempts to use other addresses fail, and attempts to use wildcard
 addresses silently use the jailed address instead.
 For IPv4 the first address given will be used as the source address
 when source address selection on unbound sockets cannot find a better
 match.
 It is only possible to start multiple jails with the same IP address
 if none of the jails has more than this single overlapping IP address
 assigned to itself.
 .It Va ip4.saddrsel
 A boolean option to change the aforementioned behaviour and disable
 IPv4 source address selection for the jail in favour of the primary
 IPv4 address of the jail.
 Source address selection is enabled by default for all jails and the
 .Va ip4.nosaddrsel
 setting of a parent jail is not inherited for any child jails.
 .It Va ip4
 Control the availability of IPv4 addresses.
 Possible values are
 .Dq inherit
 to allow unrestricted access to all system addresses,
 .Dq new
 to restrict addresses via
 .Va ip4.addr ,
 and
 .Dq disable
 to stop the jail from using IPv4 entirely.
 Setting the
 .Va ip4.addr
 parameter implies a value of
 .Dq new .
 .It Va ip6.addr , Va ip6.saddrsel , Va ip6
 A set of IPv6 options for the jail, the counterparts to
 .Va ip4.addr ,
 .Va ip4.saddrsel
 and
 .Va ip4
 above.
 .It Va vnet
 Create the jail with its own virtual network stack,
 with its own network interfaces, addresses, routing table, etc.
 The kernel must have been compiled with the
 .Sy VIMAGE option
 for this to be available.
 Possible values are
 .Dq inherit
 to use the system network stack, possibly with restricted IP addresses,
 and
 .Dq new
 to create a new network stack.
 .It Va host.hostname
 The hostname of the jail.
 Other similar parameters are
 .Va host.domainname ,
 .Va host.hostuuid
 and
 .Va host.hostid .
 .It Va host
 Set the origin of hostname and related information.
 Possible values are
 .Dq inherit
 to use the system information and
 .Dq new
 for the jail to use the information from the above fields.
 Setting any of the above fields implies a value of
 .Dq new .
 .It Va securelevel
 The value of the jail's
 .Va kern.securelevel
 sysctl.
 A jail never has a lower securelevel than its parent system, but by
 setting this parameter it may have a higher one.
 If the system securelevel is changed, any jail securelevels will be at
 least as secure.
 .It Va devfs_ruleset
 The number of the devfs ruleset that is enforced for mounting devfs in
 this jail.
 A value of zero (default) means no ruleset is enforced.
 Descendant jails inherit the parent jail's devfs ruleset enforcement.
 Mounting devfs inside a jail is possible only if the
 .Va allow.mount
 and
 .Va allow.mount.devfs
 permissions are effective and
 .Va enforce_statfs
 is set to a value lower than 2.
 Devfs rules and rulesets cannot be viewed or modified from inside a jail.
 .Pp
 NOTE: It is important that only appropriate device nodes in devfs be
 exposed to a jail; access to disk devices in the jail may permit processes
 in the jail to bypass the jail sandboxing by modifying files outside of
 the jail.
 See
 .Xr devfs 8
 for information on how to use devfs rules to limit access to entries
 in the per-jail devfs.
 A simple devfs ruleset for jails is available as ruleset #4 in
 .Pa /etc/defaults/devfs.rules .
 .It Va children.max
 The number of child jails allowed to be created by this jail (or by
 other jails under this jail).
 This limit is zero by default, indicating the jail is not allowed to
 create child jails.
 See the
 .Sx "Hierarchical Jails"
 section for more information.
 .It Va children.cur
 The number of descendants of this jail, including its own child jails
 and any jails created under them.
 .It Va enforce_statfs
 This determines what information processes in a jail are able to get
 about mount points.
 It affects the behaviour of the following syscalls:
 .Xr statfs 2 ,
 .Xr fstatfs 2 ,
 .Xr getfsstat 2 ,
 and
 .Xr fhstatfs 2
 (as well as similar compatibility syscalls).
 When set to 0, all mount points are available without any restrictions.
 When set to 1, only mount points below the jail's chroot directory are
 visible.
 In addition to that, the path to the jail's chroot directory is removed
 from the front of their pathnames.
 When set to 2 (default), the above syscalls can operate only on a mount-point
 where the jail's chroot directory is located.
 .It Va persist
 Setting this boolean parameter allows a jail to exist without any
 processes.
 Normally, a command is run as part of jail creation, and then the jail
 is destroyed as its last process exits.
 A new jail must have either the
 .Va persist
 parameter or
 .Va exec.start
 or
 .Va command
 pseudo-parameter set.
 .It Va cpuset.id
 The ID of the cpuset associated with this jail (read-only).
 .It Va dying
 This is true if the jail is in the process of shutting down (read-only).
 .It Va parent
 The
 .Va jid
 of the parent of this jail, or zero if this is a top-level jail
 (read-only).
 .It Va osrelease
 The string for the jail's
 .Va kern.osrelease
 sysctl and uname -r.
 .It Va osreldate
 The number for the jail's
 .Va kern.osreldate
 and uname -K.
 .It Va allow.*
 Some restrictions of the jail environment may be set on a per-jail
 basis.
 With the exception of
 .Va allow.set_hostname ,
 these boolean parameters are off by default.
 .Bl -tag -width indent
 .It Va allow.set_hostname
 The jail's hostname may be changed via
 .Xr hostname 1
 or
 .Xr sethostname 3 .
 .It Va allow.sysvipc
 A process within the jail has access to System V IPC primitives.
-In the current jail implementation, System V primitives share a single
-namespace across the host and jail environments, meaning that processes
-within a jail would be able to communicate with (and potentially interfere
-with) processes outside of the jail, and in other jails.
+This is deprecated in favor of the per-module parameters (see below).
+When this parameter is set, it is equivalent to setting
+.Va sysvmsg ,
+.Va sysvsem ,
+and
+.Va sysvshm
+all to
+.Dq inherit .
 .It Va allow.raw_sockets
 The jail root is allowed to create raw sockets.
 Setting this parameter allows utilities like
 .Xr ping 8
 and
 .Xr traceroute 8
 to operate inside the jail.
 If this is set, the source IP addresses are enforced to comply
 with the IP address bound to the jail, regardless of whether or not
 the
 .Dv IP_HDRINCL
 flag has been set on the socket.
 Since raw sockets can be used to configure and interact with various
 network subsystems, extra caution should be used where privileged access
 to jails is given out to untrusted parties.
 .It Va allow.chflags
 Normally, privileged users inside a jail are treated as unprivileged by
 .Xr chflags 2 .
 When this parameter is set, such users are treated as privileged, and
 may manipulate system file flags subject to the usual constraints on
 .Va kern.securelevel .
 .It Va allow.mount
 Privileged users inside the jail will be able to mount and unmount file
 system types marked as jail-friendly.
 The
 .Xr lsvfs 1
 command can be used to find file system types available for mount from
 within a jail.
 This permission is effective only if
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.devfs
 Privileged users inside the jail will be able to mount and unmount the
 devfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 The devfs ruleset should be restricted from the default by using the
 .Va devfs_ruleset
 option.
 .It Va allow.mount.fdescfs
 Privileged users inside the jail will be able to mount and unmount the
 fdescfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.nullfs
 Privileged users inside the jail will be able to mount and unmount the
 nullfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.procfs
 Privileged users inside the jail will be able to mount and unmount the
 procfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.linprocfs
 Privileged users inside the jail will be able to mount and unmount the
 linprocfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.linsysfs
 Privileged users inside the jail will be able to mount and unmount the
 linsysfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.tmpfs
 Privileged users inside the jail will be able to mount and unmount the
 tmpfs file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 .It Va allow.mount.zfs
 Privileged users inside the jail will be able to mount and unmount the
 ZFS file system.
 This permission is effective only together with
 .Va allow.mount
 and only when
 .Va enforce_statfs
 is set to a value lower than 2.
 See
 .Xr zfs 8
 for information on how to configure the ZFS filesystem to operate from
 within a jail.
 .It Va allow.quotas
 The jail root may administer quotas on the jail's filesystem(s).
 This includes filesystems that the jail may share with other jails or
 with non-jailed parts of the system.
 .It Va allow.socket_af
 Sockets within a jail are normally restricted to IPv4, IPv6, local
 (UNIX), and route.
 This allows access to other protocol stacks that
 have not had jail functionality added to them.
 .El
 .El
 .Pp
 Kernel modules may add their own parameters, which only exist when the
 module is loaded.
 These are typically headed under a parameter named after the module,
 with values of
 .Dq inherit
 to give the jail full use of the module,
 .Dq new
 to encapsulate the jail in some module-specific way,
 and
 .Dq disable
 to make the module unavailable to the jail.
 There also may be other parameters to define jail behavior within the module.
 Module-specific parameters include:
 .Bl -tag -width indent
 .It Va linux
 Determine how a jail's Linux emulation environment appears.
 A value of
 .Dq inherit
 will keep the same environment, and
 .Dq new
 will give the jail its own environment (still originally inherited when
 the jail is created).
 .It Va linux.osname , linux.osrelease , linux.oss_version
 The Linux OS name, OS release, and OSS version associated with this jail.
+.It Va sysvmsg
+Allow access to SYSV IPC message primitives.
+If set to
+.Dq inherit ,
+all IPC objects on the system are visible to this jail, whether they
+were created by the jail itself, the base system, or other jails.
+If set to
+.Dq new ,
+the jail will have its own key namespace, and can only see the objects
+that it has created;
+the system (or parent jail) has access to the jail's objects, but not to
+its keys.
+If set to
+.Dq disable ,
+the jail cannot perform any sysvmsg-related system calls.
+.It Va sysvsem , sysvshm
+Allow access to SYSV IPC semaphore and shared memory primitives, in the
+same manner as
+.Va sysvmsg .
 .El
 .Pp
 There are pseudo-parameters that are not passed to the kernel, but are
 used by
 .Nm
 to set up the jail environment, often by running specified commands
 when jails are created or removed.
 The
 .Va exec.*
 command parameters are
 .Xr sh 1
 command lines that are run in either the system or jail environment.
 They may be given multiple values, which would run the specified
 commands in sequence.
 All commands must succeed (return a zero exit status), or the jail will
 not be created or removed, as appropriate.
 .Pp
 The pseudo-parameters are:
 .Bl -tag -width indent
 .It Va exec.prestart
 Command(s) to run in the system environment before a jail is created.
 .It Va exec.start
 Command(s) to run in the jail environment when a jail is created.
 A typical command to run is
 .Dq sh /etc/rc .
 .It Va command
 A synonym for
 .Va exec.start
 for use when specifying a jail directly on the command line.
 Unlike other parameters whose value is a single string,
 .Va command
 uses the remainder of the
 .Nm
 command line as its own arguments.
 .It Va exec.poststart
 Command(s) to run in the system environment after a jail is created,
 and after any
 .Va exec.start
 commands have completed.
 .It Va exec.prestop
 Command(s) to run in the system environment before a jail is removed.
 .It Va exec.stop
 Command(s) to run in the jail environment before a jail is removed,
 and after any
 .Va exec.prestop
 commands have completed.
 A typical command to run is
 .Dq sh /etc/rc.shutdown .
 .It Va exec.poststop
 Command(s) to run in the system environment after a jail is removed.
 .It Va exec.clean
 Run commands in a clean environment.
 The environment is discarded except for
 .Ev HOME , SHELL , TERM
 and
 .Ev USER .
 .Ev HOME
 and
 .Ev SHELL
 are set to the target login's default values.
 .Ev USER
 is set to the target login.
 .Ev TERM
 is imported from the current environment.
 The environment variables from the login class capability database for the
 target login are also set.
 .It Va exec.jail_user
 The user to run commands as, when running in the jail environment.
 The default is to run the commands as the current user.
 .It Va exec.system_jail_user
 This boolean option looks for the
 .Va exec.jail_user
 in the system
 .Xr passwd 5
 file, instead of in the jail's file.
 .It Va exec.system_user
 The user to run commands as, when running in the system environment.
 The default is to run the commands as the current user.
 .It Va exec.timeout
 The maximum amount of time to wait for a command to complete, in
 seconds.
 If a command is still running after this timeout has passed,
 the jail will not be created or removed, as appropriate.
 .It Va exec.consolelog
 A file to direct command output (stdout and stderr) to.
 .It Va exec.fib
 The FIB (routing table) to set when running commands inside the jail.
 .It Va stop.timeout
 The maximum amount of time to wait for a jail's processes to exit
 after sending them a
 .Dv SIGTERM
 signal (which happens after the
 .Va exec.stop
 commands have completed).
 After this many seconds have passed, the jail will be removed, which
 will kill any remaining processes.
 If this is set to zero, no
 .Dv SIGTERM
 is sent and the jail is immediately removed.
 The default is 10 seconds.
 .It Va interface
 A network interface to add the jail's IP addresses
 .Va ( ip4.addr
 and
 .Va ip6.addr )
 to.
 An alias for each address will be added to the interface before the
 jail is created, and will be removed from the interface after the
 jail is removed.
 .It Va ip4.addr
 In addition to the IP addresses that are passed to the kernel, an
 interface, netmask and additional parameters (as supported by
 .Xr ifconfig 8 Ns )
 may also be specified, in the form
 .Dq Ar interface Ns | Ns Ar ip-address Ns / Ns Ar netmask param ... .
 If an interface is given before the IP address, an alias for the address
 will be added to that interface, as it is with the
 .Va interface
 parameter.
 If a netmask in either dotted-quad or CIDR form is given
 after an IP address, it will be used when adding the IP alias.
 If additional parameters are specified then they will also be used when
 adding the IP alias.
 .It Va ip6.addr
 In addition to the IP addresses that are passed to the kernel,
 an interface, prefix and additional parameters (as supported by
 .Xr ifconfig 8 Ns )
 may also be specified, in the form
 .Dq Ar interface Ns | Ns Ar ip-address Ns / Ns Ar prefix param ... .
 .It Va vnet.interface
 A network interface to give to a vnet-enabled jail after it is created.
 The interface will automatically be released when the jail is removed.
 .It Va ip_hostname
 Resolve the
 .Va host.hostname
 parameter and add all IP addresses returned by the resolver
 to the list of addresses
 .Po Va ip4.addr
 or
 .Va ip6.addr Pc
 for this jail.
 This may affect default address selection for outgoing IPv4 connections
 from jails.
 The address first returned by the resolver for each address family
 will be used as the primary address.
 .It Va mount
 A filesystem to mount before creating the jail (and to unmount after
 removing it), given as a single
 .Xr fstab 5
 line.
 .It Va mount.fstab
 An
 .Xr fstab 5
 format file containing filesystems to mount before creating a jail.
 .It Va mount.devfs
 Mount a
 .Xr devfs 5
 filesystem on the chrooted
 .Pa /dev
 directory, and apply the ruleset in the
 .Va devfs_ruleset
 parameter (or a default of ruleset 4: devfsrules_jail)
 to restrict the devices visible inside the jail.
 .It Va mount.fdescfs
 Mount a
 .Xr fdescfs 5
 filesystem on the chrooted
 .Pa /dev/fd
 directory.
 .It Va mount.procfs
 Mount a
 .Xr procfs 5
 filesystem on the chrooted
 .Pa /proc
 directory.
 .It Va allow.dying
 Allow making changes to a
 .Va dying
 jail.
 .It Va depend
 Specify a jail (or jails) that this jail depends on.
 Any such jails must be fully created, up to the last
 .Va exec.poststart
 command, before any action will be taken to create this jail.
 When jails are removed the opposite is true:
 this jail must be fully removed, up to the last
 .Va exec.poststop
 command, before the jail(s) it depends on are stopped.
 .El
 .Sh EXAMPLES
 Jails are typically set up using one of two philosophies: either to
 constrain a specific application (possibly running with privilege), or
 to create a
 .Dq "virtual system image"
 running a variety of daemons and services.
 In both cases, a fairly complete file system install of
 .Fx
 is
 required, so as to provide the necessary command line tools, daemons,
 libraries, application configuration files, etc.
 However, for a virtual server configuration, a fair amount of
 additional work is required so as to replace the
 .Dq boot
 process.
 This manual page documents the configuration steps necessary to support
 either of these philosophies, although the configuration steps may need to be
 refined based on local requirements.
 .Ss "Setting up a Jail Directory Tree"
 To set up a jail directory tree containing an entire
 .Fx
 distribution, the following
 .Xr sh 1
 command script can be used:
 .Bd -literal
 D=/here/is/the/jail
 cd /usr/src
 mkdir -p $D
 make world DESTDIR=$D
 make distribution DESTDIR=$D
 .Ed
 .Pp
 In many cases this example would put far more in the jail than needed.
 In the other extreme case a jail might contain only one file:
 the executable to be run in the jail.
 .Pp
 We recommend experimentation, and caution that it is a lot easier to
 start with a
 .Dq fat
 jail and remove things until it stops working,
 than it is to start with a
 .Dq thin
 jail and add things until it works.
 .Ss "Setting Up a Jail"
 Do what was described in
 .Sx "Setting up a Jail Directory Tree"
 to build the jail directory tree.
 For the sake of this example, we will
 assume you built it in
 .Pa /data/jail/testjail ,
 for a jail named
 .Dq testjail .
 Substitute below as needed with your
 own directory, IP address, and hostname.
 .Ss "Setting up the Host Environment"
 First, set up the real system's environment to be
 .Dq jail-friendly .
 For consistency, we will refer to the parent box as the
 .Dq "host environment" ,
 and to the jailed virtual machine as the
 .Dq "jail environment" .
 Since jails are implemented using IP aliases, one of the first things to do
 is to disable IP services on the host system that listen on all local
 IP addresses for a service.
 If a network service is present in the host environment that binds all
 available IP addresses rather than specific IP addresses, it may service
 requests sent to jail IP addresses if the jail did not bind the port.
 This means changing
 .Xr inetd 8
 to only listen on the
 appropriate IP address, and so forth.
 Add the following to
 .Pa /etc/rc.conf
 in the host environment:
 .Bd -literal -offset indent
 sendmail_enable="NO"
 inetd_flags="-wW -a 192.0.2.23"
 rpcbind_enable="NO"
 .Ed
 .Pp
 .Li 192.0.2.23
 is the native IP address for the host system, in this example.
 Daemons that run out of
 .Xr inetd 8
 can be easily configured to use only the specified host IP address.
 Other daemons
 will need to be manually configured \(em for some this is possible through
 .Xr rc.conf 5
 flags entries; for others it is necessary to modify per-application
 configuration files, or to recompile the application.
 The following frequently deployed services must have their individual
 configuration files modified to limit the application to listening
 to a specific IP address:
 .Pp
 To configure
 .Xr sshd 8 ,
 it is necessary to modify
 .Pa /etc/ssh/sshd_config .
 .Pp
 To configure
 .Xr sendmail 8 ,
 it is necessary to modify
 .Pa /etc/mail/sendmail.cf .
 .Pp
 For
 .Xr named 8 ,
 it is necessary to modify
 .Pa /etc/namedb/named.conf .
 .Pp
 In addition, a number of services must be recompiled in order to run
 them in the host environment.
 This includes most applications providing services using
 .Xr rpc 3 ,
 such as
 .Xr rpcbind 8 ,
 .Xr nfsd 8 ,
 and
 .Xr mountd 8 .
 In general, applications for which it is not possible to specify which
 IP address to bind should not be run in the host environment unless they
 should also service requests sent to jail IP addresses.
 Attempting to serve
 NFS from the host environment may also cause confusion, and cannot be
 easily reconfigured to use only specific IPs, as some NFS services are
 hosted directly from the kernel.
 Any third-party network software running
 in the host environment should also be checked and configured so that it
 does not bind all IP addresses, which would result in those services also
 appearing to be offered by the jail environments.
 .Pp
 Once
 these daemons have been disabled or fixed in the host environment, it is
 best to reboot so that all daemons are in a known state, to reduce the
 potential for confusion later (such as finding that when you send mail
 to a jail, and its sendmail is down, the mail is delivered to the host,
 etc.).
 .Ss "Configuring the Jail"
 Start any jail for the first time without configuring the network
 interface so that you can clean it up a little and set up accounts.
 As
 with any machine (virtual or not), you will need to set a root password, time
 zone, etc.
 Some of these steps apply only if you intend to run a full virtual server
 inside the jail; others apply both for constraining a particular application
 or for running a virtual server.
 .Pp
 Start a shell in the jail:
 .Bd -literal -offset indent
 jail -c path=/data/jail/testjail mount.devfs \\
 	host.hostname=testhostname ip4.addr=192.0.2.100 \\
 	command=/bin/sh
 .Ed
 .Pp
 Assuming no errors, you will end up with a shell prompt within the jail.
 You can now run
 .Pa /usr/sbin/sysinstall
 and do the post-install configuration to set various configuration options,
 or perform these actions manually by editing
 .Pa /etc/rc.conf ,
 etc.
 .Pp
 .Bl -bullet -offset indent -compact
 .It
 Configure
 .Pa /etc/resolv.conf
 so that name resolution within the jail will work correctly.
 .It
 Run
 .Xr newaliases 1
 to quell
 .Xr sendmail 8
 warnings.
 .It
 Set a root password, probably different from the real host system.
 .It
 Set the timezone.
 .It
 Add accounts for users in the jail environment.
 .It
 Install any packages the environment requires.
 .El
 .Pp
 You may also want to perform any package-specific configuration (web servers,
 SSH servers, etc), patch up
 .Pa /etc/syslog.conf
 so it logs as you would like, etc.
 If you are not using a virtual server, you may wish to modify
 .Xr syslogd 8
 in the host environment to listen on the syslog socket in the jail
 environment; in this example, the syslog socket would be stored in
 .Pa /data/jail/testjail/var/run/log .
 .Pp
 Exit from the shell, and the jail will be shut down.
 .Ss "Starting the Jail"
 You are now ready to restart the jail and bring up the environment with
 all of its daemons and other programs.
 Create an entry for the jail in
 .Pa /etc/jail.conf :
 .Bd -literal -offset indent
 testjail {
 	path = /data/jail/testjail;
 	mount.devfs;
 	host.hostname = testhostname;
 	ip4.addr = 192.0.2.100;
 	interface = ed0;
 	exec.start = "/bin/sh /etc/rc";
 	exec.stop = "/bin/sh /etc/rc.shutdown";
 }
 .Ed
 .Pp
 To start a virtual server environment,
 .Pa /etc/rc
 is run to launch various daemons and services, and
 .Pa /etc/rc.shutdown
 is run to shut them down when the jail is removed.
 If you are running a single application in the jail,
 substitute the command used to start the application for
 .Dq /bin/sh /etc/rc ;
 there may be some script available to cleanly shut down the application,
 or it may be sufficient to go without a stop command, and have
 .Nm
 send
 .Dv SIGTERM
 to the application.
 .Pp
 Start the jail by running:
 .Bd -literal -offset indent
 jail -c testjail
 .Ed
 .Pp
 A few warnings may be produced; however, it should all work properly.
 You should be able to see
 .Xr inetd 8 ,
 .Xr syslogd 8 ,
 and other processes running within the jail using
 .Xr ps 1 ,
 with the
 .Ql J
 flag appearing beside jailed processes.
 To see an active list of jails, use
 .Xr jls 8 .
 If
 .Xr sshd 8
 is enabled in the jail environment, you should be able to
 .Xr ssh 1
 to the hostname or IP address of the jailed environment, and log
 in using the accounts you created previously.
 .Pp
 It is possible to have jails started at boot time.
 Please refer to the
 .Dq jail_*
 variables in
 .Xr rc.conf 5
 for more information.
 .Ss "Managing the Jail"
 Normal machine shutdown commands, such as
 .Xr halt 8 ,
 .Xr reboot 8 ,
 and
 .Xr shutdown 8 ,
 cannot be used successfully within the jail.
 To kill all processes from within a jail, you may use one of the
 following commands, depending on what you want to accomplish:
 .Bd -literal -offset indent
 kill -TERM -1
 kill -KILL -1
 .Ed
 .Pp
 This will send the
 .Dv SIGTERM
 or
 .Dv SIGKILL
 signals to all processes in the jail \(em be careful not to run this from
 the host environment!
 Once all of the jail's processes have died, unless the jail was created
 with the
 .Va persist
 parameter, the jail will be removed.
 Depending on
 the intended use of the jail, you may also want to run
 .Pa /etc/rc.shutdown
 from within the jail.
 .Pp
 To shut down the jail from the outside, simply remove it with
 .Nm
 .Fl r ,
 which will run any commands specified by
 .Va exec.stop ,
 and then send
 .Dv SIGTERM
 and eventually
 .Dv SIGKILL
 to any remaining jailed processes.
 .Pp
 The
 .Pa /proc/ Ns Ar pid Ns Pa /status
 file contains, as its last field, the name of the jail in which the
 process runs, or
 .Dq Li -
 to indicate that the process is not running within a jail.
 The
 .Xr ps 1
 command also shows a
 .Ql J
 flag for processes in a jail.
 .Pp
 You can also list/kill processes based on their jail ID.
 To show processes and their jail ID, use the following command:
 .Pp
 .Dl "ps ax -o pid,jid,args"
 .Pp
 To show and then kill processes in jail number 3 use the following commands:
 .Bd -literal -offset indent
 pgrep -lfj 3
 pkill -j 3
 .Ed
 or:
 .Pp
 .Dl "killall -j 3"
 .Ss "Jails and File Systems"
 It is not possible to
 .Xr mount 8
 or
 .Xr umount 8
 any file system inside a jail unless the file system is marked
 jail-friendly, the jail's
 .Va allow.mount
 parameter is set, and the jail's
 .Va enforce_statfs
 parameter is lower than 2.
 .Pp
 Multiple jails sharing the same file system can influence each other.
 For example, a user in one jail can fill the file system,
 leaving no space for processes in the other jail.
 Trying to use
 .Xr quota 1
 to prevent this will not work either, as the file system quotas
 are not aware of jails but only look at the user and group IDs.
 This means the same user ID in two jails share a single file
 system quota.
 One would need to use one file system per jail to make this work.
 .Ss "Sysctl MIB Entries"
 The read-only entry
 .Va security.jail.jailed
 can be used to determine if a process is running inside a jail (value
 is one) or not (value is zero).
 .Pp
 The variable
 .Va security.jail.max_af_ips
 determines how many addresses per address family a jail may have.
 The default is 255.
 .Pp
 Some MIB variables have per-jail settings.
 Changes to these variables by a jailed process do not affect the host
 environment, only the jail environment.
 These variables are
 .Va kern.securelevel ,
 .Va kern.hostname ,
 .Va kern.domainname ,
 .Va kern.hostid ,
 and
 .Va kern.hostuuid .
 .Ss "Hierarchical Jails"
 By setting a jail's
 .Va children.max
 parameter, processes within a jail may be able to create jails of their own.
 These child jails are kept in a hierarchy, with jails only able to see and/or
 modify the jails they created (or those jails' children).
 Each jail has a read-only
 .Va parent
 parameter, containing the
 .Va jid
 of the jail that created it; a
 .Va jid
 of 0 indicates the jail is a child of the current jail (or is a top-level
 jail if the current process isn't jailed).
 .Pp
 Jailed processes are not allowed to confer greater permissions than they
 themselves are given, e.g., if a jail is created with
 .Va allow.nomount ,
 it is not able to create a jail with
 .Va allow.mount
 set.
 Similarly, such restrictions as
 .Va ip4.addr
 and
 .Va securelevel
 may not be bypassed in child jails.
 .Pp
 A child jail may in turn create its own child jails if its own
 .Va children.max
 parameter is set (remember it is zero by default).
 These jails are visible to and can be modified by their parent and all
 ancestors.
 .Pp
 Jail names reflect this hierarchy, with a full name being an MIB-type string
 separated by dots.
 For example, if a base system process creates a jail
 .Dq foo ,
 and a process under that jail creates another jail
 .Dq bar ,
 then the second jail will be seen as
 .Dq foo.bar
 in the base system (though it is only seen as
 .Dq bar
 to any processes inside jail
 .Dq foo ) .
 Jids on the other hand exist in a single space, and each jail must have a
 unique jid.
 .Pp
 Like the names, a child jail's
 .Va path
 appears relative to its creator's own
 .Va path .
 This is by virtue of the child jail being created in the chrooted
 environment of the first jail.
 .Sh SEE ALSO
 .Xr killall 1 ,
 .Xr lsvfs 1 ,
 .Xr newaliases 1 ,
 .Xr pgrep 1 ,
 .Xr pkill 1 ,
 .Xr ps 1 ,
 .Xr quota 1 ,
 .Xr jail_set 2 ,
 .Xr devfs 5 ,
 .Xr fdescfs 5 ,
 .Xr jail.conf 5 ,
 .Xr linprocfs 5 ,
 .Xr linsysfs 5 ,
 .Xr procfs 5 ,
 .Xr rc.conf 5 ,
 .Xr sysctl.conf 5 ,
 .Xr chroot 8 ,
 .Xr devfs 8 ,
 .Xr halt 8 ,
 .Xr ifconfig 8 ,
 .Xr inetd 8 ,
 .Xr jexec 8 ,
 .Xr jls 8 ,
 .Xr mount 8 ,
 .Xr named 8 ,
 .Xr reboot 8 ,
 .Xr rpcbind 8 ,
 .Xr sendmail 8 ,
 .Xr shutdown 8 ,
 .Xr sysctl 8 ,
 .Xr syslogd 8 ,
 .Xr umount 8
 .Sh HISTORY
 The
 .Nm
 utility appeared in
 .Fx 4.0 .
 Hierarchical/extensible jails were introduced in
 .Fx 8.0 .
 The configuration file was introduced in
 .Fx 9.1 .
 .Sh AUTHORS
 .An -nosplit
 The jail feature was written by
 .An Poul-Henning Kamp
 for R&D Associates
 who contributed it to
 .Fx .
 .Pp
 .An Robert Watson
 wrote the extended documentation, found a few bugs, added
 a few new features, and cleaned up the userland jail environment.
 .Pp
 .An Bjoern A. Zeeb
 added multi-IP jail support for IPv4 and IPv6 based on a patch
 originally done by
 .An Pawel Jakub Dawidek
 for IPv4.
 .Pp
 .An James Gritton
 added the extensible jail parameters, hierarchical jails,
 and the configuration file.
 .Sh BUGS
 It might be a good idea to add an
 address alias flag such that daemons listening on all IPs
 .Pq Dv INADDR_ANY
 will not bind on that address, which would facilitate building a safe
 host environment such that host daemons do not impose on services offered
 from within jails.
 Currently, the simplest answer is to minimize services
 offered on the host, possibly limiting it to services offered from
 .Xr inetd 8
 which is easily configurable.
 .Sh NOTES
 Great care should be taken when managing directories visible within the jail.
 For example, if a jailed process has its current working directory set to a
 directory that is moved out of the jail's chroot, then the process may gain
 access to the file space outside of the jail.
 It is recommended that directories always be copied, rather than moved, out
 of a jail.
 .Pp
 In addition, there are several ways in which an unprivileged user
 outside the jail can cooperate with a privileged user inside the jail
 and thereby obtain elevated privileges in the host environment.
 Most of these attacks can be mitigated by ensuring that the jail root
 is not accessible to unprivileged users in the host environment.
 Regardless, as a general rule, untrusted users with privileged access
 to a jail should not be given access to the host environment.