Index: head/bin/sh/miscbltin.c =================================================================== --- head/bin/sh/miscbltin.c (revision 296161) +++ head/bin/sh/miscbltin.c (revision 296162) @@ -1,529 +1,532 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Kenneth Almquist. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint #if 0 static char sccsid[] = "@(#)miscbltin.c 8.4 (Berkeley) 5/4/95"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); /* * Miscellaneous builtins. */ #include #include #include #include #include #include #include #include #include #include "shell.h" #include "options.h" #include "var.h" #include "output.h" #include "memalloc.h" #include "error.h" #include "mystring.h" #include "syntax.h" #include "trap.h" #undef eflag int readcmd(int, char **); int umaskcmd(int, char **); int ulimitcmd(int, char **); /* * The read builtin. The -r option causes backslashes to be treated like * ordinary characters. * * This uses unbuffered input, which may be avoidable in some cases. 
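 *
 * Illustrative usage (editor's sketch, not part of the original source):
 *
 *	read -p 'name: ' -r -t 30 first rest
 *
 * prints the prompt when stdin is a tty (-p), keeps backslashes literal
 * (-r), and gives up after 30 seconds (-t); the select() path below then
 * returns 128 plus SIGALRM, or 128 plus the pending signal's number.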
* * Note that if IFS=' :' then read x y should work so that: * 'a b' x='a', y='b' * ' a b ' x='a', y='b' * ':b' x='', y='b' * ':' x='', y='' * '::' x='', y='' * ': :' x='', y='' * ':::' x='', y='::' * ':b c:' x='', y='b c:' */ int readcmd(int argc __unused, char **argv __unused) { char **ap; int backslash; char c; int rflag; char *prompt; const char *ifs; char *p; int startword; int status; int i; int is_ifs; int saveall = 0; ptrdiff_t lastnonifs, lastnonifsws; struct timeval tv; char *tvptr; fd_set ifds; ssize_t nread; int sig; rflag = 0; prompt = NULL; tv.tv_sec = -1; tv.tv_usec = 0; while ((i = nextopt("erp:t:")) != '\0') { switch(i) { case 'p': prompt = shoptarg; break; case 'e': break; case 'r': rflag = 1; break; case 't': tv.tv_sec = strtol(shoptarg, &tvptr, 0); if (tvptr == shoptarg) error("timeout value"); switch(*tvptr) { case 0: case 's': break; case 'h': tv.tv_sec *= 60; /* FALLTHROUGH */ case 'm': tv.tv_sec *= 60; break; default: error("timeout unit"); } break; } } if (prompt && isatty(0)) { out2str(prompt); flushall(); } if (*(ap = argptr) == NULL) error("arg count"); if ((ifs = bltinlookup("IFS", 1)) == NULL) ifs = " \t\n"; if (tv.tv_sec >= 0) { /* * Wait for something to become available. */ FD_ZERO(&ifds); FD_SET(0, &ifds); status = select(1, &ifds, NULL, NULL, &tv); /* * If there's nothing ready, return an error. */ if (status <= 0) { sig = pendingsig; return (128 + (sig != 0 ? sig : SIGALRM)); } } status = 0; startword = 2; backslash = 0; STARTSTACKSTR(p); lastnonifs = lastnonifsws = -1; for (;;) { nread = read(STDIN_FILENO, &c, 1); if (nread == -1) { if (errno == EINTR) { sig = pendingsig; if (sig == 0) continue; status = 128 + sig; break; } warning("read error: %s", strerror(errno)); status = 2; break; } else if (nread != 1) { status = 1; break; } if (c == '\0') continue; CHECKSTRSPACE(1, p); if (backslash) { backslash = 0; if (c != '\n') { startword = 0; lastnonifs = lastnonifsws = p - stackblock(); USTPUTC(c, p); } continue; } if (!rflag && c == '\\') { backslash++; continue; } if (c == '\n') break; if (strchr(ifs, c)) is_ifs = strchr(" \t\n", c) ? 1 : 2; else is_ifs = 0; if (startword != 0) { if (is_ifs == 1) { /* Ignore leading IFS whitespace */ if (saveall) USTPUTC(c, p); continue; } if (is_ifs == 2 && startword == 1) { /* Only one non-whitespace IFS per word */ startword = 2; if (saveall) { lastnonifsws = p - stackblock(); USTPUTC(c, p); } continue; } } if (is_ifs == 0) { /* append this character to the current variable */ startword = 0; if (saveall) /* Not just a spare terminator */ saveall++; lastnonifs = lastnonifsws = p - stackblock(); USTPUTC(c, p); continue; } /* end of variable... 
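 * (an IFS character was read: the accumulated field is assigned to the
 * next variable below, unless only the last variable remains, in which
 * case it must absorb the rest of the line)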
*/ startword = is_ifs; if (ap[1] == NULL) { /* Last variable needs all IFS chars */ saveall++; if (is_ifs == 2) lastnonifsws = p - stackblock(); USTPUTC(c, p); continue; } STACKSTRNUL(p); setvar(*ap, stackblock(), 0); ap++; STARTSTACKSTR(p); lastnonifs = lastnonifsws = -1; } STACKSTRNUL(p); /* * Remove trailing IFS chars: always remove whitespace, don't remove * non-whitespace unless it was naked */ if (saveall <= 1) lastnonifsws = lastnonifs; stackblock()[lastnonifsws + 1] = '\0'; setvar(*ap, stackblock(), 0); /* Set any remaining args to "" */ while (*++ap != NULL) setvar(*ap, "", 0); return status; } int umaskcmd(int argc __unused, char **argv __unused) { char *ap; int mask; int i; int symbolic_mode = 0; while ((i = nextopt("S")) != '\0') { symbolic_mode = 1; } INTOFF; mask = umask(0); umask(mask); INTON; if ((ap = *argptr) == NULL) { if (symbolic_mode) { char u[4], g[4], o[4]; i = 0; if ((mask & S_IRUSR) == 0) u[i++] = 'r'; if ((mask & S_IWUSR) == 0) u[i++] = 'w'; if ((mask & S_IXUSR) == 0) u[i++] = 'x'; u[i] = '\0'; i = 0; if ((mask & S_IRGRP) == 0) g[i++] = 'r'; if ((mask & S_IWGRP) == 0) g[i++] = 'w'; if ((mask & S_IXGRP) == 0) g[i++] = 'x'; g[i] = '\0'; i = 0; if ((mask & S_IROTH) == 0) o[i++] = 'r'; if ((mask & S_IWOTH) == 0) o[i++] = 'w'; if ((mask & S_IXOTH) == 0) o[i++] = 'x'; o[i] = '\0'; out1fmt("u=%s,g=%s,o=%s\n", u, g, o); } else { out1fmt("%.4o\n", mask); } } else { if (is_digit(*ap)) { mask = 0; do { if (*ap >= '8' || *ap < '0') error("Illegal number: %s", *argptr); mask = (mask << 3) + (*ap - '0'); } while (*++ap != '\0'); umask(mask); } else { void *set; INTOFF; if ((set = setmode (ap)) == 0) error("Illegal number: %s", ap); mask = getmode (set, ~mask & 0777); umask(~mask & 0777); free(set); INTON; } } return 0; } /* * ulimit builtin * * This code, originally by Doug Gwyn, Doug Kingston, Eric Gisin, and * Michael Rendell was ripped from pdksh 5.0.8 and hacked for use with * ash by J.T. Conklin. * * Public domain. 
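 *
 * Illustrative usage of this builtin (editor's sketch, not original text):
 *
 *	ulimit -a	# list every resource with its flag and units
 *	ulimit -H -n	# print the hard limit on open files
 *	ulimit -n 1024	# with no -H/-S, set both soft and hard limits
 *
 * The option letters map to the limits[] table below; this revision also
 * adds an RLIMIT_UMTXP ("umtxp") row there for the process-shared pthread
 * objects introduced by this commit.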
*/ struct limits { const char *name; const char *units; int cmd; int factor; /* multiply by to get rlim_{cur,max} values */ char option; }; static const struct limits limits[] = { #ifdef RLIMIT_CPU { "cpu time", "seconds", RLIMIT_CPU, 1, 't' }, #endif #ifdef RLIMIT_FSIZE { "file size", "512-blocks", RLIMIT_FSIZE, 512, 'f' }, #endif #ifdef RLIMIT_DATA { "data seg size", "kbytes", RLIMIT_DATA, 1024, 'd' }, #endif #ifdef RLIMIT_STACK { "stack size", "kbytes", RLIMIT_STACK, 1024, 's' }, #endif #ifdef RLIMIT_CORE { "core file size", "512-blocks", RLIMIT_CORE, 512, 'c' }, #endif #ifdef RLIMIT_RSS { "max memory size", "kbytes", RLIMIT_RSS, 1024, 'm' }, #endif #ifdef RLIMIT_MEMLOCK { "locked memory", "kbytes", RLIMIT_MEMLOCK, 1024, 'l' }, #endif #ifdef RLIMIT_NPROC { "max user processes", (char *)0, RLIMIT_NPROC, 1, 'u' }, #endif #ifdef RLIMIT_NOFILE { "open files", (char *)0, RLIMIT_NOFILE, 1, 'n' }, #endif #ifdef RLIMIT_VMEM { "virtual mem size", "kbytes", RLIMIT_VMEM, 1024, 'v' }, #endif #ifdef RLIMIT_SWAP { "swap limit", "kbytes", RLIMIT_SWAP, 1024, 'w' }, #endif #ifdef RLIMIT_SBSIZE { "sbsize", "bytes", RLIMIT_SBSIZE, 1, 'b' }, #endif #ifdef RLIMIT_NPTS { "pseudo-terminals", (char *)0, RLIMIT_NPTS, 1, 'p' }, #endif #ifdef RLIMIT_KQUEUES { "kqueues", (char *)0, RLIMIT_KQUEUES, 1, 'k' }, #endif +#ifdef RLIMIT_UMTXP + { "umtxp", (char *)0, RLIMIT_UMTXP, 1, 'o' }, +#endif { (char *) 0, (char *)0, 0, 0, '\0' } }; enum limithow { SOFT = 0x1, HARD = 0x2 }; static void printlimit(enum limithow how, const struct rlimit *limit, const struct limits *l) { rlim_t val = 0; if (how & SOFT) val = limit->rlim_cur; else if (how & HARD) val = limit->rlim_max; if (val == RLIM_INFINITY) out1str("unlimited\n"); else { val /= l->factor; out1fmt("%jd\n", (intmax_t)val); } } int ulimitcmd(int argc __unused, char **argv __unused) { rlim_t val = 0; enum limithow how = SOFT | HARD; const struct limits *l; int set, all = 0; int optc, what; struct rlimit limit; what = 'f'; while ((optc = nextopt("HSatfdsmcnuvlbpwk")) != '\0') switch (optc) { case 'H': how = HARD; break; case 'S': how = SOFT; break; case 'a': all = 1; break; default: what = optc; } for (l = limits; l->name && l->option != what; l++) ; if (!l->name) error("internal error (%c)", what); set = *argptr ? 
1 : 0; if (set) { char *p = *argptr; if (all || argptr[1]) error("too many arguments"); if (strcmp(p, "unlimited") == 0) val = RLIM_INFINITY; else { char *end; uintmax_t uval; if (*p < '0' || *p > '9') error("bad number"); errno = 0; uval = strtoumax(p, &end, 10); if (errno != 0 || *end != '\0') error("bad number"); if (uval > UINTMAX_MAX / l->factor) error("bad number"); uval *= l->factor; val = (rlim_t)uval; if (val < 0 || (uintmax_t)val != uval || val == RLIM_INFINITY) error("bad number"); } } if (all) { for (l = limits; l->name; l++) { char optbuf[40]; if (getrlimit(l->cmd, &limit) < 0) error("can't get limit: %s", strerror(errno)); if (l->units) snprintf(optbuf, sizeof(optbuf), "(%s, -%c) ", l->units, l->option); else snprintf(optbuf, sizeof(optbuf), "(-%c) ", l->option); out1fmt("%-18s %18s ", l->name, optbuf); printlimit(how, &limit, l); } return 0; } if (getrlimit(l->cmd, &limit) < 0) error("can't get limit: %s", strerror(errno)); if (set) { if (how & SOFT) limit.rlim_cur = val; if (how & HARD) limit.rlim_max = val; if (setrlimit(l->cmd, &limit) < 0) error("bad limit: %s", strerror(errno)); } else printlimit(how, &limit, l); return 0; } Index: head/include/pthread.h =================================================================== --- head/include/pthread.h (revision 296161) +++ head/include/pthread.h (revision 296162) @@ -1,339 +1,339 @@ /* * Copyright (c) 1993, 1994 by Chris Provenzano, proven@mit.edu * Copyright (c) 1995-1998 by John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Chris Provenzano. * 4. The name of Chris Provenzano may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY CHRIS PROVENZANO ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL CHRIS PROVENZANO BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _PTHREAD_H_ #define _PTHREAD_H_ /* * Header files. */ #include #include #include #include #include #include #include /* * Run-time invariant values: */ #define PTHREAD_DESTRUCTOR_ITERATIONS 4 #define PTHREAD_KEYS_MAX 256 #define PTHREAD_STACK_MIN __MINSIGSTKSZ #define PTHREAD_THREADS_MAX __ULONG_MAX #define PTHREAD_BARRIER_SERIAL_THREAD -1 /* * Flags for threads and thread attributes. 
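 *
 * (Editor's aside, not original header text: as of this revision libthr
 * honors the PTHREAD_PROCESS_SHARED value defined further below, so a
 * sequence such as
 *
 *	pthread_barrierattr_t ba;
 *	pthread_barrierattr_init(&ba);
 *	pthread_barrierattr_setpshared(&ba, PTHREAD_PROCESS_SHARED);
 *
 * now succeeds where it previously returned EINVAL; see the
 * thr_barrierattr.c hunk later in this diff.)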
*/ #define PTHREAD_DETACHED 0x1 #define PTHREAD_SCOPE_SYSTEM 0x2 #define PTHREAD_INHERIT_SCHED 0x4 #define PTHREAD_NOFLOAT 0x8 #define PTHREAD_CREATE_DETACHED PTHREAD_DETACHED #define PTHREAD_CREATE_JOINABLE 0 #define PTHREAD_SCOPE_PROCESS 0 #define PTHREAD_EXPLICIT_SCHED 0 /* - * Flags for read/write lock attributes + * Values for process shared/private attributes. */ #define PTHREAD_PROCESS_PRIVATE 0 #define PTHREAD_PROCESS_SHARED 1 /* * Flags for cancelling threads */ #define PTHREAD_CANCEL_ENABLE 0 #define PTHREAD_CANCEL_DISABLE 1 #define PTHREAD_CANCEL_DEFERRED 0 #define PTHREAD_CANCEL_ASYNCHRONOUS 2 #define PTHREAD_CANCELED ((void *) 1) /* * Flags for once initialization. */ #define PTHREAD_NEEDS_INIT 0 #define PTHREAD_DONE_INIT 1 /* * Static once initialization values. */ #define PTHREAD_ONCE_INIT { PTHREAD_NEEDS_INIT, NULL } /* * Static initialization values. */ #define PTHREAD_MUTEX_INITIALIZER NULL #define PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP ((pthread_mutex_t)1) #define PTHREAD_COND_INITIALIZER NULL #define PTHREAD_RWLOCK_INITIALIZER NULL /* * Default attribute arguments (draft 4, deprecated). */ #ifndef PTHREAD_KERNEL #define pthread_condattr_default NULL #define pthread_mutexattr_default NULL #define pthread_attr_default NULL #endif #define PTHREAD_PRIO_NONE 0 #define PTHREAD_PRIO_INHERIT 1 #define PTHREAD_PRIO_PROTECT 2 /* * Mutex types (Single UNIX Specification, Version 2, 1997). * * Note that a mutex attribute with one of the following types: * * PTHREAD_MUTEX_NORMAL * PTHREAD_MUTEX_RECURSIVE * * will deviate from POSIX specified semantics. */ enum pthread_mutextype { PTHREAD_MUTEX_ERRORCHECK = 1, /* Default POSIX mutex */ PTHREAD_MUTEX_RECURSIVE = 2, /* Recursive mutex */ PTHREAD_MUTEX_NORMAL = 3, /* No error checking */ PTHREAD_MUTEX_ADAPTIVE_NP = 4, /* Adaptive mutex, spins briefly before blocking on lock */ PTHREAD_MUTEX_TYPE_MAX }; #define PTHREAD_MUTEX_DEFAULT PTHREAD_MUTEX_ERRORCHECK struct _pthread_cleanup_info { __uintptr_t pthread_cleanup_pad[8]; }; /* * Thread function prototype definitions: */ __BEGIN_DECLS int pthread_atfork(void (*)(void), void (*)(void), void (*)(void)); int pthread_attr_destroy(pthread_attr_t *) __nonnull(1); int pthread_attr_getstack(const pthread_attr_t * __restrict, void ** __restrict, size_t * __restrict) __nonnull_all; int pthread_attr_getstacksize(const pthread_attr_t *, size_t *) __nonnull_all; int pthread_attr_getguardsize(const pthread_attr_t *, size_t *); int pthread_attr_getstackaddr(const pthread_attr_t *, void **); int pthread_attr_getdetachstate(const pthread_attr_t *, int *) __nonnull_all; int pthread_attr_init(pthread_attr_t *) __nonnull(1); int pthread_attr_setstacksize(pthread_attr_t *, size_t) __nonnull(1); int pthread_attr_setguardsize(pthread_attr_t *, size_t) __nonnull(1); int pthread_attr_setstack(pthread_attr_t *, void *, size_t) __nonnull(1); int pthread_attr_setstackaddr(pthread_attr_t *, void *); int pthread_attr_setdetachstate(pthread_attr_t *, int) __nonnull(1); int pthread_barrier_destroy(pthread_barrier_t *); int pthread_barrier_init(pthread_barrier_t *, const pthread_barrierattr_t *, unsigned); int pthread_barrier_wait(pthread_barrier_t *); int pthread_barrierattr_destroy(pthread_barrierattr_t *); int pthread_barrierattr_getpshared(const pthread_barrierattr_t *, int *); int pthread_barrierattr_init(pthread_barrierattr_t *) __nonnull(1); int pthread_barrierattr_setpshared(pthread_barrierattr_t *, int); #define pthread_cleanup_push(cleanup_routine, cleanup_arg) \ { \ struct _pthread_cleanup_info 
__cleanup_info__; \ __pthread_cleanup_push_imp(cleanup_routine, cleanup_arg,\ &__cleanup_info__); \ { #define pthread_cleanup_pop(execute) \ (void)0; \ } \ __pthread_cleanup_pop_imp(execute); \ } int pthread_condattr_destroy(pthread_condattr_t *) __nonnull(1); int pthread_condattr_getclock(const pthread_condattr_t *, clockid_t *) __nonnull_all; int pthread_condattr_getpshared(const pthread_condattr_t *, int *) __nonnull_all; int pthread_condattr_init(pthread_condattr_t *) __nonnull(1); int pthread_condattr_setclock(pthread_condattr_t *, clockid_t) __nonnull(1); int pthread_condattr_setpshared(pthread_condattr_t *, int) __nonnull(1); int pthread_cond_broadcast(pthread_cond_t *) __nonnull(1); int pthread_cond_destroy(pthread_cond_t *) __nonnull(1); int pthread_cond_init(pthread_cond_t *, const pthread_condattr_t *) __nonnull(1); int pthread_cond_signal(pthread_cond_t *) __nonnull(1); int pthread_cond_timedwait(pthread_cond_t *, pthread_mutex_t *__mutex, const struct timespec *) __nonnull_all __requires_exclusive(*__mutex); int pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *__mutex) __nonnull_all __requires_exclusive(*__mutex); int pthread_create(pthread_t *, const pthread_attr_t *, void *(*) (void *), void *) __nonnull(1) __nonnull(3); int pthread_detach(pthread_t); int pthread_equal(pthread_t, pthread_t); void pthread_exit(void *) __dead2; void *pthread_getspecific(pthread_key_t); int pthread_getcpuclockid(pthread_t, clockid_t *) __nonnull(2); int pthread_join(pthread_t, void **); int pthread_key_create(pthread_key_t *, void (*) (void *)) __nonnull(1); int pthread_key_delete(pthread_key_t); int pthread_mutexattr_init(pthread_mutexattr_t *) __nonnull(1); int pthread_mutexattr_destroy(pthread_mutexattr_t *) __nonnull(1); int pthread_mutexattr_getpshared(const pthread_mutexattr_t *, int *) __nonnull_all; int pthread_mutexattr_gettype(pthread_mutexattr_t *, int *) __nonnull_all; int pthread_mutexattr_settype(pthread_mutexattr_t *, int) __nonnull(1); int pthread_mutexattr_setpshared(pthread_mutexattr_t *, int) __nonnull(1); int pthread_mutex_destroy(pthread_mutex_t *__mutex) __nonnull(1) __requires_unlocked(*__mutex); int pthread_mutex_init(pthread_mutex_t *__mutex, const pthread_mutexattr_t *) __nonnull(1) __requires_unlocked(*__mutex); int pthread_mutex_lock(pthread_mutex_t *__mutex) __nonnull(1) __locks_exclusive(*__mutex); int pthread_mutex_trylock(pthread_mutex_t *__mutex) __nonnull(1) __trylocks_exclusive(0, *__mutex); int pthread_mutex_timedlock(pthread_mutex_t *__mutex, const struct timespec *) __nonnull_all __trylocks_exclusive(0, *__mutex); int pthread_mutex_unlock(pthread_mutex_t *__mutex) __nonnull(1) __unlocks(*__mutex); int pthread_once(pthread_once_t *, void (*) (void)) __nonnull_all; int pthread_rwlock_destroy(pthread_rwlock_t *__rwlock) __nonnull(1) __requires_unlocked(*__rwlock); int pthread_rwlock_init(pthread_rwlock_t *__rwlock, const pthread_rwlockattr_t *) __nonnull(1) __requires_unlocked(*__rwlock); int pthread_rwlock_rdlock(pthread_rwlock_t *__rwlock) __nonnull(1) __locks_shared(*__rwlock); int pthread_rwlock_timedrdlock(pthread_rwlock_t *__rwlock, const struct timespec *) __nonnull_all __trylocks_shared(0, *__rwlock); int pthread_rwlock_timedwrlock(pthread_rwlock_t *__rwlock, const struct timespec *) __nonnull_all __trylocks_exclusive(0, *__rwlock); int pthread_rwlock_tryrdlock(pthread_rwlock_t *__rwlock) __nonnull(1) __trylocks_shared(0, *__rwlock); int pthread_rwlock_trywrlock(pthread_rwlock_t *__rwlock) __nonnull(1) __trylocks_exclusive(0, *__rwlock); int 
pthread_rwlock_unlock(pthread_rwlock_t *__rwlock) __nonnull(1) __unlocks(*__rwlock); int pthread_rwlock_wrlock(pthread_rwlock_t *__rwlock) __nonnull(1) __locks_exclusive(*__rwlock); int pthread_rwlockattr_destroy(pthread_rwlockattr_t *) __nonnull(1); int pthread_rwlockattr_getkind_np(const pthread_rwlockattr_t *, int *); int pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *, int *) __nonnull_all; int pthread_rwlockattr_init(pthread_rwlockattr_t *) __nonnull(1); int pthread_rwlockattr_setkind_np(pthread_rwlockattr_t *, int); int pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int) __nonnull(1); pthread_t pthread_self(void); int pthread_setspecific(pthread_key_t, const void *); int pthread_spin_init(pthread_spinlock_t *__spin, int) __requires_unlocked(*__spin); int pthread_spin_destroy(pthread_spinlock_t *__spin) __requires_unlocked(*__spin); int pthread_spin_lock(pthread_spinlock_t *__spin) __locks_exclusive(*__spin); int pthread_spin_trylock(pthread_spinlock_t *__spin) __trylocks_exclusive(0, *__spin); int pthread_spin_unlock(pthread_spinlock_t *__spin) __unlocks(*__spin); int pthread_cancel(pthread_t); int pthread_setcancelstate(int, int *); int pthread_setcanceltype(int, int *); void pthread_testcancel(void); #if __BSD_VISIBLE int pthread_getprio(pthread_t); int pthread_setprio(pthread_t, int); void pthread_yield(void); #endif int pthread_mutexattr_getprioceiling(pthread_mutexattr_t *, int *); int pthread_mutexattr_setprioceiling(pthread_mutexattr_t *, int); int pthread_mutex_getprioceiling(pthread_mutex_t *, int *); int pthread_mutex_setprioceiling(pthread_mutex_t *, int, int *); int pthread_mutexattr_getprotocol(pthread_mutexattr_t *, int *); int pthread_mutexattr_setprotocol(pthread_mutexattr_t *, int); int pthread_attr_getinheritsched(const pthread_attr_t *, int *); int pthread_attr_getschedparam(const pthread_attr_t *, struct sched_param *) __nonnull_all; int pthread_attr_getschedpolicy(const pthread_attr_t *, int *) __nonnull_all; int pthread_attr_getscope(const pthread_attr_t *, int *) __nonnull_all; int pthread_attr_setinheritsched(pthread_attr_t *, int); int pthread_attr_setschedparam(pthread_attr_t *, const struct sched_param *) __nonnull(1) __nonnull(2); int pthread_attr_setschedpolicy(pthread_attr_t *, int) __nonnull(1); int pthread_attr_setscope(pthread_attr_t *, int) __nonnull(1); int pthread_getschedparam(pthread_t pthread, int *, struct sched_param *) __nonnull(2) __nonnull(3); int pthread_setschedparam(pthread_t, int, const struct sched_param *) __nonnull(3); #if __XSI_VISIBLE int pthread_getconcurrency(void); int pthread_setconcurrency(int); #endif void __pthread_cleanup_push_imp(void (*)(void *), void *, struct _pthread_cleanup_info *); void __pthread_cleanup_pop_imp(int); __END_DECLS #endif Index: head/include/unistd.h =================================================================== --- head/include/unistd.h (revision 296161) +++ head/include/unistd.h (revision 296162) @@ -1,592 +1,592 @@ /*- * Copyright (c) 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)unistd.h 8.12 (Berkeley) 4/27/95 * $FreeBSD$ */ #ifndef _UNISTD_H_ #define _UNISTD_H_ #include #include /* XXX adds too much pollution. */ #include #include #include #ifndef _GID_T_DECLARED typedef __gid_t gid_t; #define _GID_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif #ifndef _SSIZE_T_DECLARED typedef __ssize_t ssize_t; #define _SSIZE_T_DECLARED #endif #ifndef _UID_T_DECLARED typedef __uid_t uid_t; #define _UID_T_DECLARED #endif #ifndef _USECONDS_T_DECLARED typedef __useconds_t useconds_t; #define _USECONDS_T_DECLARED #endif #define STDIN_FILENO 0 /* standard input file descriptor */ #define STDOUT_FILENO 1 /* standard output file descriptor */ #define STDERR_FILENO 2 /* standard error file descriptor */ #if __XSI_VISIBLE || __POSIX_VISIBLE >= 200112 #define F_ULOCK 0 /* unlock locked section */ #define F_LOCK 1 /* lock a section for exclusive use */ #define F_TLOCK 2 /* test and lock a section for exclusive use */ #define F_TEST 3 /* test a section for locks by other procs */ #endif /* * POSIX options and option groups we unconditionally do or don't * implement. This list includes those options which are exclusively * implemented (or not) in user mode. Please keep this list in * alphabetical order. * * Anything which is defined as zero below **must** have an * implementation for the corresponding sysconf() which is able to * determine conclusively whether or not the feature is supported. * Anything which is defined as other than -1 below **must** have * complete headers, types, and function declarations as specified by * the POSIX standard; however, if the relevant sysconf() function * returns -1, the functions may be stubbed out. 
*/ #define _POSIX_BARRIERS 200112L #define _POSIX_CPUTIME 200112L #define _POSIX_READER_WRITER_LOCKS 200112L #define _POSIX_REGEXP 1 #define _POSIX_SHELL 1 #define _POSIX_SPAWN 200112L #define _POSIX_SPIN_LOCKS 200112L #define _POSIX_THREAD_ATTR_STACKADDR 200112L #define _POSIX_THREAD_ATTR_STACKSIZE 200112L #define _POSIX_THREAD_CPUTIME 200112L #define _POSIX_THREAD_PRIO_INHERIT 200112L #define _POSIX_THREAD_PRIO_PROTECT 200112L #define _POSIX_THREAD_PRIORITY_SCHEDULING 200112L -#define _POSIX_THREAD_PROCESS_SHARED -1 +#define _POSIX_THREAD_PROCESS_SHARED 200112L #define _POSIX_THREAD_SAFE_FUNCTIONS -1 #define _POSIX_THREAD_SPORADIC_SERVER -1 #define _POSIX_THREADS 200112L #define _POSIX_TRACE -1 #define _POSIX_TRACE_EVENT_FILTER -1 #define _POSIX_TRACE_INHERIT -1 #define _POSIX_TRACE_LOG -1 #define _POSIX2_C_BIND 200112L /* mandatory */ #define _POSIX2_C_DEV -1 /* need c99 utility */ #define _POSIX2_CHAR_TERM 1 #define _POSIX2_FORT_DEV -1 /* need fort77 utility */ #define _POSIX2_FORT_RUN 200112L #define _POSIX2_LOCALEDEF -1 #define _POSIX2_PBS -1 #define _POSIX2_PBS_ACCOUNTING -1 #define _POSIX2_PBS_CHECKPOINT -1 #define _POSIX2_PBS_LOCATE -1 #define _POSIX2_PBS_MESSAGE -1 #define _POSIX2_PBS_TRACK -1 #define _POSIX2_SW_DEV -1 /* XXX ??? */ #define _POSIX2_UPE 200112L #define _V6_ILP32_OFF32 -1 #define _V6_ILP32_OFFBIG 0 #define _V6_LP64_OFF64 0 #define _V6_LPBIG_OFFBIG -1 #if __XSI_VISIBLE #define _XOPEN_CRYPT -1 /* XXX ??? */ #define _XOPEN_ENH_I18N -1 /* mandatory in XSI */ #define _XOPEN_LEGACY -1 #define _XOPEN_REALTIME -1 #define _XOPEN_REALTIME_THREADS -1 #define _XOPEN_UNIX -1 #endif /* Define the POSIX.2 version we target for compliance. */ #define _POSIX2_VERSION 199212L /* * POSIX-style system configuration variable accessors (for the * sysconf function). The kernel does not directly implement the * sysconf() interface; rather, a C library stub translates references * to sysconf() into calls to sysctl() using a giant switch statement. * Those that are marked `user' are implemented entirely in the C * library and never query the kernel. pathconf() is implemented * directly by the kernel so those are not defined here. 
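 *
 * (Editor's note: _POSIX_THREAD_PROCESS_SHARED flips from -1 to 200112L
 * in this revision. A program can confirm the option at run time before
 * relying on process-shared pthread objects, e.g.
 *
 *	long ps = sysconf(_SC_THREAD_PROCESS_SHARED);
 *	int have_pshared = ps > 0;
 *
 * using the _SC_THREAD_PROCESS_SHARED key defined below.)
 */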
*/ #define _SC_ARG_MAX 1 #define _SC_CHILD_MAX 2 #define _SC_CLK_TCK 3 #define _SC_NGROUPS_MAX 4 #define _SC_OPEN_MAX 5 #define _SC_JOB_CONTROL 6 #define _SC_SAVED_IDS 7 #define _SC_VERSION 8 #define _SC_BC_BASE_MAX 9 /* user */ #define _SC_BC_DIM_MAX 10 /* user */ #define _SC_BC_SCALE_MAX 11 /* user */ #define _SC_BC_STRING_MAX 12 /* user */ #define _SC_COLL_WEIGHTS_MAX 13 /* user */ #define _SC_EXPR_NEST_MAX 14 /* user */ #define _SC_LINE_MAX 15 /* user */ #define _SC_RE_DUP_MAX 16 /* user */ #define _SC_2_VERSION 17 /* user */ #define _SC_2_C_BIND 18 /* user */ #define _SC_2_C_DEV 19 /* user */ #define _SC_2_CHAR_TERM 20 /* user */ #define _SC_2_FORT_DEV 21 /* user */ #define _SC_2_FORT_RUN 22 /* user */ #define _SC_2_LOCALEDEF 23 /* user */ #define _SC_2_SW_DEV 24 /* user */ #define _SC_2_UPE 25 /* user */ #define _SC_STREAM_MAX 26 /* user */ #define _SC_TZNAME_MAX 27 /* user */ #if __POSIX_VISIBLE >= 199309 #define _SC_ASYNCHRONOUS_IO 28 #define _SC_MAPPED_FILES 29 #define _SC_MEMLOCK 30 #define _SC_MEMLOCK_RANGE 31 #define _SC_MEMORY_PROTECTION 32 #define _SC_MESSAGE_PASSING 33 #define _SC_PRIORITIZED_IO 34 #define _SC_PRIORITY_SCHEDULING 35 #define _SC_REALTIME_SIGNALS 36 #define _SC_SEMAPHORES 37 #define _SC_FSYNC 38 #define _SC_SHARED_MEMORY_OBJECTS 39 #define _SC_SYNCHRONIZED_IO 40 #define _SC_TIMERS 41 #define _SC_AIO_LISTIO_MAX 42 #define _SC_AIO_MAX 43 #define _SC_AIO_PRIO_DELTA_MAX 44 #define _SC_DELAYTIMER_MAX 45 #define _SC_MQ_OPEN_MAX 46 #define _SC_PAGESIZE 47 #define _SC_RTSIG_MAX 48 #define _SC_SEM_NSEMS_MAX 49 #define _SC_SEM_VALUE_MAX 50 #define _SC_SIGQUEUE_MAX 51 #define _SC_TIMER_MAX 52 #endif #if __POSIX_VISIBLE >= 200112 #define _SC_2_PBS 59 /* user */ #define _SC_2_PBS_ACCOUNTING 60 /* user */ #define _SC_2_PBS_CHECKPOINT 61 /* user */ #define _SC_2_PBS_LOCATE 62 /* user */ #define _SC_2_PBS_MESSAGE 63 /* user */ #define _SC_2_PBS_TRACK 64 /* user */ #define _SC_ADVISORY_INFO 65 #define _SC_BARRIERS 66 /* user */ #define _SC_CLOCK_SELECTION 67 #define _SC_CPUTIME 68 #define _SC_FILE_LOCKING 69 #define _SC_GETGR_R_SIZE_MAX 70 /* user */ #define _SC_GETPW_R_SIZE_MAX 71 /* user */ #define _SC_HOST_NAME_MAX 72 #define _SC_LOGIN_NAME_MAX 73 #define _SC_MONOTONIC_CLOCK 74 #define _SC_MQ_PRIO_MAX 75 #define _SC_READER_WRITER_LOCKS 76 /* user */ #define _SC_REGEXP 77 /* user */ #define _SC_SHELL 78 /* user */ #define _SC_SPAWN 79 /* user */ #define _SC_SPIN_LOCKS 80 /* user */ #define _SC_SPORADIC_SERVER 81 #define _SC_THREAD_ATTR_STACKADDR 82 /* user */ #define _SC_THREAD_ATTR_STACKSIZE 83 /* user */ #define _SC_THREAD_CPUTIME 84 /* user */ #define _SC_THREAD_DESTRUCTOR_ITERATIONS 85 /* user */ #define _SC_THREAD_KEYS_MAX 86 /* user */ #define _SC_THREAD_PRIO_INHERIT 87 /* user */ #define _SC_THREAD_PRIO_PROTECT 88 /* user */ #define _SC_THREAD_PRIORITY_SCHEDULING 89 /* user */ #define _SC_THREAD_PROCESS_SHARED 90 /* user */ #define _SC_THREAD_SAFE_FUNCTIONS 91 /* user */ #define _SC_THREAD_SPORADIC_SERVER 92 /* user */ #define _SC_THREAD_STACK_MIN 93 /* user */ #define _SC_THREAD_THREADS_MAX 94 /* user */ #define _SC_TIMEOUTS 95 /* user */ #define _SC_THREADS 96 /* user */ #define _SC_TRACE 97 /* user */ #define _SC_TRACE_EVENT_FILTER 98 /* user */ #define _SC_TRACE_INHERIT 99 /* user */ #define _SC_TRACE_LOG 100 /* user */ #define _SC_TTY_NAME_MAX 101 /* user */ #define _SC_TYPED_MEMORY_OBJECTS 102 #define _SC_V6_ILP32_OFF32 103 /* user */ #define _SC_V6_ILP32_OFFBIG 104 /* user */ #define _SC_V6_LP64_OFF64 105 /* user */ #define _SC_V6_LPBIG_OFFBIG 106 /* user */ 
#define _SC_IPV6 118 #define _SC_RAW_SOCKETS 119 #define _SC_SYMLOOP_MAX 120 #endif #if __XSI_VISIBLE #define _SC_ATEXIT_MAX 107 /* user */ #define _SC_IOV_MAX 56 #define _SC_PAGE_SIZE _SC_PAGESIZE #define _SC_XOPEN_CRYPT 108 /* user */ #define _SC_XOPEN_ENH_I18N 109 /* user */ #define _SC_XOPEN_LEGACY 110 /* user */ #define _SC_XOPEN_REALTIME 111 #define _SC_XOPEN_REALTIME_THREADS 112 #define _SC_XOPEN_SHM 113 #define _SC_XOPEN_STREAMS 114 #define _SC_XOPEN_UNIX 115 #define _SC_XOPEN_VERSION 116 #define _SC_XOPEN_XCU_VERSION 117 /* user */ #endif #if __BSD_VISIBLE #define _SC_NPROCESSORS_CONF 57 #define _SC_NPROCESSORS_ONLN 58 #define _SC_CPUSET_SIZE 122 #endif /* Extensions found in Solaris and Linux. */ #define _SC_PHYS_PAGES 121 /* Keys for the confstr(3) function. */ #if __POSIX_VISIBLE >= 199209 #define _CS_PATH 1 /* default value of PATH */ #endif #if __POSIX_VISIBLE >= 200112 #define _CS_POSIX_V6_ILP32_OFF32_CFLAGS 2 #define _CS_POSIX_V6_ILP32_OFF32_LDFLAGS 3 #define _CS_POSIX_V6_ILP32_OFF32_LIBS 4 #define _CS_POSIX_V6_ILP32_OFFBIG_CFLAGS 5 #define _CS_POSIX_V6_ILP32_OFFBIG_LDFLAGS 6 #define _CS_POSIX_V6_ILP32_OFFBIG_LIBS 7 #define _CS_POSIX_V6_LP64_OFF64_CFLAGS 8 #define _CS_POSIX_V6_LP64_OFF64_LDFLAGS 9 #define _CS_POSIX_V6_LP64_OFF64_LIBS 10 #define _CS_POSIX_V6_LPBIG_OFFBIG_CFLAGS 11 #define _CS_POSIX_V6_LPBIG_OFFBIG_LDFLAGS 12 #define _CS_POSIX_V6_LPBIG_OFFBIG_LIBS 13 #define _CS_POSIX_V6_WIDTH_RESTRICTED_ENVS 14 #endif __BEGIN_DECLS /* 1003.1-1990 */ void _exit(int) __dead2; int access(const char *, int); unsigned int alarm(unsigned int); int chdir(const char *); int chown(const char *, uid_t, gid_t); int close(int); void closefrom(int); int dup(int); int dup2(int, int); int execl(const char *, const char *, ...) __null_sentinel; int execle(const char *, const char *, ...); int execlp(const char *, const char *, ...) __null_sentinel; int execv(const char *, char * const *); int execve(const char *, char * const *, char * const *); int execvp(const char *, char * const *); pid_t fork(void); long fpathconf(int, int); char *getcwd(char *, size_t); gid_t getegid(void); uid_t geteuid(void); gid_t getgid(void); int getgroups(int, gid_t []); char *getlogin(void); pid_t getpgrp(void); pid_t getpid(void); pid_t getppid(void); uid_t getuid(void); int isatty(int); int link(const char *, const char *); #ifndef _LSEEK_DECLARED #define _LSEEK_DECLARED off_t lseek(int, off_t, int); #endif long pathconf(const char *, int); int pause(void); int pipe(int *); ssize_t read(int, void *, size_t); int rmdir(const char *); int setgid(gid_t); int setpgid(pid_t, pid_t); pid_t setsid(void); int setuid(uid_t); unsigned int sleep(unsigned int); long sysconf(int); pid_t tcgetpgrp(int); int tcsetpgrp(int, pid_t); char *ttyname(int); int ttyname_r(int, char *, size_t); int unlink(const char *); ssize_t write(int, const void *, size_t); /* 1003.2-1992 */ #if __POSIX_VISIBLE >= 199209 || __XSI_VISIBLE size_t confstr(int, char *, size_t); #ifndef _GETOPT_DECLARED #define _GETOPT_DECLARED int getopt(int, char * const [], const char *); extern char *optarg; /* getopt(3) external variables */ extern int optind, opterr, optopt; #endif /* _GETOPT_DECLARED */ #endif /* ISO/IEC 9945-1: 1996 */ #if __POSIX_VISIBLE >= 199506 || __XSI_VISIBLE int fsync(int); /* * ftruncate() was in the POSIX Realtime Extension (it's used for shared * memory), but truncate() was not. 
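 *
 * (Editor's illustration with hypothetical names: the usual shared-memory
 * setup that depends on ftruncate() being declared here:
 *
 *	int fd = shm_open("/my_region", O_RDWR | O_CREAT, 0600);
 *	ftruncate(fd, (off_t)region_size);
 *	void *p = mmap(NULL, region_size, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *
 * "/my_region" and region_size are placeholders, not names from this
 * commit.)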
*/ #ifndef _FTRUNCATE_DECLARED #define _FTRUNCATE_DECLARED int ftruncate(int, off_t); #endif #endif #if __POSIX_VISIBLE >= 199506 int getlogin_r(char *, int); #endif /* 1003.1-2001 */ #if __POSIX_VISIBLE >= 200112 || __XSI_VISIBLE int fchown(int, uid_t, gid_t); ssize_t readlink(const char * __restrict, char * __restrict, size_t); #endif #if __POSIX_VISIBLE >= 200112 int gethostname(char *, size_t); int setegid(gid_t); int seteuid(uid_t); #endif /* 1003.1-2008 */ #if __POSIX_VISIBLE >= 200809 || __XSI_VISIBLE int getsid(pid_t _pid); int fchdir(int); int getpgid(pid_t _pid); int lchown(const char *, uid_t, gid_t); ssize_t pread(int, void *, size_t, off_t); ssize_t pwrite(int, const void *, size_t, off_t); /* See comment at ftruncate() above. */ #ifndef _TRUNCATE_DECLARED #define _TRUNCATE_DECLARED int truncate(const char *, off_t); #endif #endif /* __POSIX_VISIBLE >= 200809 || __XSI_VISIBLE */ #if __POSIX_VISIBLE >= 200809 int faccessat(int, const char *, int, int); int fchownat(int, const char *, uid_t, gid_t, int); int fexecve(int, char *const [], char *const []); int linkat(int, const char *, int, const char *, int); ssize_t readlinkat(int, const char * __restrict, char * __restrict, size_t); int symlinkat(const char *, int, const char *); int unlinkat(int, const char *, int); #endif /* __POSIX_VISIBLE >= 200809 */ /* * symlink() was originally in POSIX.1a, which was withdrawn after * being overtaken by events (1003.1-2001). It was in XPG4.2, and of * course has been in BSD since 4.2. */ #if __POSIX_VISIBLE >= 200112 || __XSI_VISIBLE >= 402 int symlink(const char * __restrict, const char * __restrict); #endif /* X/Open System Interfaces */ #if __XSI_VISIBLE char *crypt(const char *, const char *); /* char *ctermid(char *); */ /* XXX ??? */ int encrypt(char *, int); long gethostid(void); int lockf(int, int, off_t); int nice(int); int setregid(gid_t, gid_t); int setreuid(uid_t, uid_t); #ifndef _SWAB_DECLARED #define _SWAB_DECLARED void swab(const void * __restrict, void * __restrict, ssize_t); #endif /* _SWAB_DECLARED */ void sync(void); #endif /* __XSI_VISIBLE */ #if (__XSI_VISIBLE && __XSI_VISIBLE <= 500) || __BSD_VISIBLE int brk(const void *); int chroot(const char *); int getdtablesize(void); int getpagesize(void) __pure2; char *getpass(const char *); void *sbrk(intptr_t); #endif #if (__XSI_VISIBLE && __XSI_VISIBLE <= 600) || __BSD_VISIBLE char *getwd(char *); /* obsoleted by getcwd() */ useconds_t ualarm(useconds_t, useconds_t); int usleep(useconds_t); pid_t vfork(void) __returns_twice; #endif #if __BSD_VISIBLE struct timeval; /* select(2) */ int acct(const char *); int async_daemon(void); int check_utility_compat(const char *); const char * crypt_get_format(void); int crypt_set_format(const char *); int des_cipher(const char *, char *, long, int); int des_setkey(const char *key); int dup3(int, int, int); int eaccess(const char *, int); void endusershell(void); int exect(const char *, char * const *, char * const *); int execvP(const char *, const char *, char * const *); int feature_present(const char *); char *fflagstostr(u_long); int getdomainname(char *, int); int getgrouplist(const char *, gid_t, gid_t *, int *); int getloginclass(char *, size_t); mode_t getmode(const void *, mode_t); int getosreldate(void); int getpeereid(int, uid_t *, gid_t *); int getresgid(gid_t *, gid_t *, gid_t *); int getresuid(uid_t *, uid_t *, uid_t *); char *getusershell(void); int initgroups(const char *, gid_t); int iruserok(unsigned long, int, const char *, const char *); int iruserok_sa(const void *, 
int, int, const char *, const char *); int issetugid(void); void __FreeBSD_libc_enter_restricted_mode(void); long lpathconf(const char *, int); #ifndef _MKDTEMP_DECLARED char *mkdtemp(char *); #define _MKDTEMP_DECLARED #endif #ifndef _MKNOD_DECLARED int mknod(const char *, mode_t, dev_t); #define _MKNOD_DECLARED #endif #ifndef _MKSTEMP_DECLARED int mkstemp(char *); #define _MKSTEMP_DECLARED #endif int mkstemps(char *, int); #ifndef _MKTEMP_DECLARED char *mktemp(char *); #define _MKTEMP_DECLARED #endif int nfssvc(int, void *); int nlm_syscall(int, int, int, char **); int pipe2(int *, int); int profil(char *, size_t, vm_offset_t, int); int rcmd(char **, int, const char *, const char *, const char *, int *); int rcmd_af(char **, int, const char *, const char *, const char *, int *, int); int rcmdsh(char **, int, const char *, const char *, const char *, const char *); char *re_comp(const char *); int re_exec(const char *); int reboot(int); int revoke(const char *); pid_t rfork(int); pid_t rfork_thread(int, void *, int (*)(void *), void *); int rresvport(int *); int rresvport_af(int *, int); int ruserok(const char *, int, const char *, const char *); #if __BSD_VISIBLE #ifndef _SELECT_DECLARED #define _SELECT_DECLARED int select(int, fd_set *, fd_set *, fd_set *, struct timeval *); #endif #endif int setdomainname(const char *, int); int setgroups(int, const gid_t *); void sethostid(long); int sethostname(const char *, int); #ifndef _SETKEY_DECLARED int setkey(const char *); #define _SETKEY_DECLARED #endif int setlogin(const char *); int setloginclass(const char *); void *setmode(const char *); int setpgrp(pid_t, pid_t); /* obsoleted by setpgid() */ void setproctitle(const char *_fmt, ...) __printf0like(1, 2); int setresgid(gid_t, gid_t, gid_t); int setresuid(uid_t, uid_t, uid_t); int setrgid(gid_t); int setruid(uid_t); void setusershell(void); int strtofflags(char **, u_long *, u_long *); int swapon(const char *); int swapoff(const char *); int syscall(int, ...); off_t __syscall(quad_t, ...); int undelete(const char *); int unwhiteout(const char *); void *valloc(size_t); /* obsoleted by malloc() */ #ifndef _OPTRESET_DECLARED #define _OPTRESET_DECLARED extern int optreset; /* getopt(3) external variable */ #endif #endif /* __BSD_VISIBLE */ __END_DECLS #endif /* !_UNISTD_H_ */ Index: head/lib/libthr/thread/Makefile.inc =================================================================== --- head/lib/libthr/thread/Makefile.inc (revision 296161) +++ head/lib/libthr/thread/Makefile.inc (revision 296162) @@ -1,59 +1,60 @@ # $FreeBSD$ # thr sources .PATH: ${.CURDIR}/thread SRCS+= \ thr_affinity.c \ thr_attr.c \ thr_barrier.c \ thr_barrierattr.c \ thr_cancel.c \ thr_clean.c \ thr_concurrency.c \ thr_cond.c \ thr_condattr.c \ thr_create.c \ thr_ctrdtr.c \ thr_detach.c \ thr_equal.c \ thr_event.c \ thr_exit.c \ thr_fork.c \ thr_getprio.c \ thr_getcpuclockid.c \ thr_getschedparam.c \ thr_getthreadid_np.c \ thr_info.c \ thr_init.c \ thr_join.c \ thr_list.c \ thr_kern.c \ thr_kill.c \ thr_main_np.c \ thr_multi_np.c \ thr_mutex.c \ thr_mutexattr.c \ thr_once.c \ thr_printf.c \ + thr_pshared.c \ thr_pspinlock.c \ thr_resume_np.c \ thr_rtld.c \ thr_rwlock.c \ thr_rwlockattr.c \ thr_self.c \ thr_sem.c \ thr_setprio.c \ thr_setschedparam.c \ thr_sig.c \ thr_single_np.c \ thr_sleepq.c \ thr_spec.c \ thr_spinlock.c \ thr_stack.c \ thr_syscalls.c \ thr_suspend_np.c \ thr_switch_np.c \ thr_symbols.c \ thr_umtx.c \ thr_yield.c Index: head/lib/libthr/thread/thr_barrier.c 
=================================================================== --- head/lib/libthr/thread/thr_barrier.c (revision 296161) +++ head/lib/libthr/thread/thr_barrier.c (revision 296162) @@ -1,135 +1,168 @@ /*- * Copyright (c) 2003 David Xu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_barrier_init, pthread_barrier_init); __weak_reference(_pthread_barrier_wait, pthread_barrier_wait); __weak_reference(_pthread_barrier_destroy, pthread_barrier_destroy); int _pthread_barrier_destroy(pthread_barrier_t *barrier) { - pthread_barrier_t bar; - struct pthread *curthread; + pthread_barrier_t bar; + struct pthread *curthread; + int pshared; if (barrier == NULL || *barrier == NULL) return (EINVAL); + if (*barrier == THR_PSHARED_PTR) { + bar = __thr_pshared_offpage(barrier, 0); + if (bar == NULL) { + *barrier = NULL; + return (0); + } + pshared = 1; + } else { + bar = *barrier; + pshared = 0; + } curthread = _get_curthread(); - bar = *barrier; THR_UMUTEX_LOCK(curthread, &bar->b_lock); if (bar->b_destroying) { THR_UMUTEX_UNLOCK(curthread, &bar->b_lock); return (EBUSY); } bar->b_destroying = 1; do { if (bar->b_waiters > 0) { bar->b_destroying = 0; THR_UMUTEX_UNLOCK(curthread, &bar->b_lock); return (EBUSY); } if (bar->b_refcount != 0) { _thr_ucond_wait(&bar->b_cv, &bar->b_lock, NULL, 0); THR_UMUTEX_LOCK(curthread, &bar->b_lock); } else break; } while (1); bar->b_destroying = 0; THR_UMUTEX_UNLOCK(curthread, &bar->b_lock); *barrier = NULL; - free(bar); + if (pshared) + __thr_pshared_destroy(barrier); + else + free(bar); return (0); } int _pthread_barrier_init(pthread_barrier_t *barrier, - const pthread_barrierattr_t *attr, unsigned count) + const pthread_barrierattr_t *attr, unsigned count) { - pthread_barrier_t bar; + pthread_barrier_t bar; + int pshared; - (void)attr; - if (barrier == NULL || count <= 0) return (EINVAL); - bar = calloc(1, sizeof(struct pthread_barrier)); - if (bar == NULL) - return (ENOMEM); + if (attr == NULL || *attr == NULL || + (*attr)->pshared == PTHREAD_PROCESS_PRIVATE) { + bar = calloc(1, sizeof(struct pthread_barrier)); + if (bar == NULL) + return (ENOMEM); + *barrier = bar; + pshared = 0; + } else { + bar = 
__thr_pshared_offpage(barrier, 1);
+		if (bar == NULL)
+			return (EFAULT);
+		*barrier = THR_PSHARED_PTR;
+		pshared = 1;
+	}
	_thr_umutex_init(&bar->b_lock);
	_thr_ucond_init(&bar->b_cv);
-	bar->b_count = count;
-	*barrier = bar;
-
+	if (pshared) {
+		bar->b_lock.m_flags |= USYNC_PROCESS_SHARED;
+		bar->b_cv.c_flags |= USYNC_PROCESS_SHARED;
+	}
+	bar->b_count = count;
	return (0);
}

int
_pthread_barrier_wait(pthread_barrier_t *barrier)
{
-	struct pthread *curthread = _get_curthread();
+	struct pthread *curthread;
	pthread_barrier_t bar;
	int64_t cycle;
	int ret;

	if (barrier == NULL || *barrier == NULL)
		return (EINVAL);

-	bar = *barrier;
+	if (*barrier == THR_PSHARED_PTR) {
+		bar = __thr_pshared_offpage(barrier, 0);
+		if (bar == NULL)
+			return (EINVAL);
+	} else {
+		bar = *barrier;
+	}
+	curthread = _get_curthread();
	THR_UMUTEX_LOCK(curthread, &bar->b_lock);
	if (++bar->b_waiters == bar->b_count) {
		/* Current thread is the last to reach the barrier */
		bar->b_waiters = 0;
		bar->b_cycle++;
		_thr_ucond_broadcast(&bar->b_cv);
		THR_UMUTEX_UNLOCK(curthread, &bar->b_lock);
		ret = PTHREAD_BARRIER_SERIAL_THREAD;
	} else {
		cycle = bar->b_cycle;
		bar->b_refcount++;
		do {
			_thr_ucond_wait(&bar->b_cv, &bar->b_lock, NULL, 0);
			THR_UMUTEX_LOCK(curthread, &bar->b_lock);
			/* test cycle to avoid bogus wakeup */
		} while (cycle == bar->b_cycle);
		if (--bar->b_refcount == 0 && bar->b_destroying)
			_thr_ucond_broadcast(&bar->b_cv);
		THR_UMUTEX_UNLOCK(curthread, &bar->b_lock);
		ret = 0;
	}
	return (ret);
}

Index: head/lib/libthr/thread/thr_barrierattr.c
===================================================================
--- head/lib/libthr/thread/thr_barrierattr.c	(revision 296161)
+++ head/lib/libthr/thread/thr_barrierattr.c	(revision 296162)
@@ -1,96 +1,94 @@
/*
 * Copyright (c) 2003 David Xu .
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice(s), this list of conditions and the following disclaimer as
 *    the first lines of this file unmodified other than the possible
 *    addition of one or more copyright notices.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice(s), this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
* * $FreeBSD$ */ #include "namespace.h" #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_barrierattr_destroy, pthread_barrierattr_destroy); __weak_reference(_pthread_barrierattr_init, pthread_barrierattr_init); __weak_reference(_pthread_barrierattr_setpshared, pthread_barrierattr_setpshared); __weak_reference(_pthread_barrierattr_getpshared, pthread_barrierattr_getpshared); int _pthread_barrierattr_destroy(pthread_barrierattr_t *attr) { if (attr == NULL || *attr == NULL) return (EINVAL); free(*attr); return (0); } int _pthread_barrierattr_getpshared(const pthread_barrierattr_t *attr, - int *pshared) + int *pshared) { if (attr == NULL || *attr == NULL) return (EINVAL); *pshared = (*attr)->pshared; return (0); } int _pthread_barrierattr_init(pthread_barrierattr_t *attr) { if (attr == NULL) return (EINVAL); if ((*attr = malloc(sizeof(struct pthread_barrierattr))) == NULL) return (ENOMEM); (*attr)->pshared = PTHREAD_PROCESS_PRIVATE; return (0); } int _pthread_barrierattr_setpshared(pthread_barrierattr_t *attr, int pshared) { - if (attr == NULL || *attr == NULL) - return (EINVAL); - - /* Only PTHREAD_PROCESS_PRIVATE is supported. */ - if (pshared != PTHREAD_PROCESS_PRIVATE) + if (attr == NULL || *attr == NULL || + (pshared != PTHREAD_PROCESS_PRIVATE && + pshared != PTHREAD_PROCESS_SHARED)) return (EINVAL); (*attr)->pshared = pshared; return (0); } Index: head/lib/libthr/thread/thr_cond.c =================================================================== --- head/lib/libthr/thread/thr_cond.c (revision 296161) +++ head/lib/libthr/thread/thr_cond.c (revision 296162) @@ -1,488 +1,519 @@ /* * Copyright (c) 2005 David Xu + * Copyright (c) 2015 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include #include "un-namespace.h" #include "thr_private.h" /* * Prototypes */ int __pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); int __pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec * abstime); static int cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr); static int cond_wait_common(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime, int cancel); static int cond_signal_common(pthread_cond_t *cond); static int cond_broadcast_common(pthread_cond_t *cond); /* * Double underscore versions are cancellation points. Single underscore * versions are not and are provided for libc internal usage (which * shouldn't introduce cancellation points). */ __weak_reference(__pthread_cond_wait, pthread_cond_wait); __weak_reference(__pthread_cond_timedwait, pthread_cond_timedwait); __weak_reference(_pthread_cond_init, pthread_cond_init); __weak_reference(_pthread_cond_destroy, pthread_cond_destroy); __weak_reference(_pthread_cond_signal, pthread_cond_signal); __weak_reference(_pthread_cond_broadcast, pthread_cond_broadcast); #define CV_PSHARED(cvp) (((cvp)->__flags & USYNC_PROCESS_SHARED) != 0) +static void +cond_init_body(struct pthread_cond *cvp, const struct pthread_cond_attr *cattr) +{ + + if (cattr == NULL) { + cvp->__clock_id = CLOCK_REALTIME; + } else { + if (cattr->c_pshared) + cvp->__flags |= USYNC_PROCESS_SHARED; + cvp->__clock_id = cattr->c_clockid; + } +} + static int cond_init(pthread_cond_t *cond, const pthread_condattr_t *cond_attr) { - struct pthread_cond *cvp; - int error = 0; + struct pthread_cond *cvp; + const struct pthread_cond_attr *cattr; + int pshared; - if ((cvp = (pthread_cond_t) - calloc(1, sizeof(struct pthread_cond))) == NULL) { - error = ENOMEM; + cattr = cond_attr != NULL ? *cond_attr : NULL; + if (cattr == NULL || cattr->c_pshared == PTHREAD_PROCESS_PRIVATE) { + pshared = 0; + cvp = calloc(1, sizeof(struct pthread_cond)); + if (cvp == NULL) + return (ENOMEM); } else { - /* - * Initialise the condition variable structure: - */ - if (cond_attr == NULL || *cond_attr == NULL) { - cvp->__clock_id = CLOCK_REALTIME; - } else { - if ((*cond_attr)->c_pshared) - cvp->__flags |= USYNC_PROCESS_SHARED; - cvp->__clock_id = (*cond_attr)->c_clockid; - } - *cond = cvp; + pshared = 1; + cvp = __thr_pshared_offpage(cond, 1); + if (cvp == NULL) + return (EFAULT); } - return (error); + + /* + * Initialise the condition variable structure: + */ + cond_init_body(cvp, cattr); + *cond = pshared ? 
THR_PSHARED_PTR : cvp; + return (0); } static int init_static(struct pthread *thread, pthread_cond_t *cond) { int ret; THR_LOCK_ACQUIRE(thread, &_cond_static_lock); if (*cond == NULL) ret = cond_init(cond, NULL); else ret = 0; THR_LOCK_RELEASE(thread, &_cond_static_lock); return (ret); } #define CHECK_AND_INIT_COND \ - if (__predict_false((cvp = (*cond)) <= THR_COND_DESTROYED)) { \ + if (*cond == THR_PSHARED_PTR) { \ + cvp = __thr_pshared_offpage(cond, 0); \ + if (cvp == NULL) \ + return (EINVAL); \ + } else if (__predict_false((cvp = (*cond)) <= THR_COND_DESTROYED)) { \ if (cvp == THR_COND_INITIALIZER) { \ int ret; \ ret = init_static(_get_curthread(), cond); \ if (ret) \ return (ret); \ } else if (cvp == THR_COND_DESTROYED) { \ return (EINVAL); \ } \ cvp = *cond; \ } int _pthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *cond_attr) { *cond = NULL; return (cond_init(cond, cond_attr)); } int _pthread_cond_destroy(pthread_cond_t *cond) { - struct pthread_cond *cvp; - int error = 0; + struct pthread_cond *cvp; + int error; - if ((cvp = *cond) == THR_COND_INITIALIZER) - error = 0; - else if (cvp == THR_COND_DESTROYED) + error = 0; + if (*cond == THR_PSHARED_PTR) { + cvp = __thr_pshared_offpage(cond, 0); + if (cvp != NULL) + __thr_pshared_destroy(cond); + *cond = THR_COND_DESTROYED; + } else if ((cvp = *cond) == THR_COND_INITIALIZER) { + /* nothing */ + } else if (cvp == THR_COND_DESTROYED) { error = EINVAL; - else { + } else { cvp = *cond; *cond = THR_COND_DESTROYED; - - /* - * Free the memory allocated for the condition - * variable structure: - */ free(cvp); } return (error); } /* * Cancellation behavior: * Thread may be canceled at start, if thread is canceled, it means it * did not get a wakeup from pthread_cond_signal(), otherwise, it is * not canceled. * Thread cancellation never cause wakeup from pthread_cond_signal() * to be lost. */ static int cond_wait_kernel(struct pthread_cond *cvp, struct pthread_mutex *mp, const struct timespec *abstime, int cancel) { struct pthread *curthread = _get_curthread(); int recurse; int error, error2 = 0; error = _mutex_cv_detach(mp, &recurse); if (error != 0) return (error); if (cancel) { _thr_cancel_enter2(curthread, 0); error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters, (struct umutex *)&mp->m_lock, abstime, CVWAIT_ABSTIME|CVWAIT_CLOCKID); _thr_cancel_leave(curthread, 0); } else { error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters, (struct umutex *)&mp->m_lock, abstime, CVWAIT_ABSTIME|CVWAIT_CLOCKID); } /* * Note that PP mutex and ROBUST mutex may return * interesting error codes. */ if (error == 0) { error2 = _mutex_cv_lock(mp, recurse); } else if (error == EINTR || error == ETIMEDOUT) { error2 = _mutex_cv_lock(mp, recurse); if (error2 == 0 && cancel) _thr_testcancel(curthread); if (error == EINTR) error = 0; } else { /* We know that it didn't unlock the mutex. */ error2 = _mutex_cv_attach(mp, recurse); if (error2 == 0 && cancel) _thr_testcancel(curthread); } return (error2 != 0 ? error2 : error); } /* * Thread waits in userland queue whenever possible, when thread * is signaled or broadcasted, it is removed from the queue, and * is saved in curthread's defer_waiters[] buffer, but won't be * woken up until mutex is unlocked. 
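 *
 * In outline (an editor's paraphrase of the function below, no new API):
 *
 *	set __has_user_waiters and unlock the mutex;
 *	enqueue curthread on the condition variable's sleep queue;
 *	loop: _thr_sleep() until signaled, canceled, or timed out,
 *	    rechecking wchan to reject spurious wakeups;
 *	on cancellation or timeout, dequeue and recompute
 *	    __has_user_waiters;
 *	finally retake the mutex via _mutex_cv_lock() with the saved
 *	    recursion count.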
*/ static int cond_wait_user(struct pthread_cond *cvp, struct pthread_mutex *mp, const struct timespec *abstime, int cancel) { struct pthread *curthread = _get_curthread(); struct sleepqueue *sq; int recurse; int error; int defered; if (curthread->wchan != NULL) PANIC("thread was already on queue."); if (cancel) _thr_testcancel(curthread); _sleepq_lock(cvp); /* * set __has_user_waiters before unlocking mutex, this allows * us to check it without locking in pthread_cond_signal(). */ cvp->__has_user_waiters = 1; defered = 0; (void)_mutex_cv_unlock(mp, &recurse, &defered); curthread->mutex_obj = mp; _sleepq_add(cvp, curthread); for(;;) { _thr_clear_wake(curthread); _sleepq_unlock(cvp); if (defered) { defered = 0; if ((mp->m_lock.m_owner & UMUTEX_CONTESTED) == 0) (void)_umtx_op_err(&mp->m_lock, UMTX_OP_MUTEX_WAKE2, mp->m_lock.m_flags, 0, 0); } if (curthread->nwaiter_defer > 0) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } if (cancel) { _thr_cancel_enter2(curthread, 0); error = _thr_sleep(curthread, cvp->__clock_id, abstime); _thr_cancel_leave(curthread, 0); } else { error = _thr_sleep(curthread, cvp->__clock_id, abstime); } _sleepq_lock(cvp); if (curthread->wchan == NULL) { error = 0; break; } else if (cancel && SHOULD_CANCEL(curthread)) { sq = _sleepq_lookup(cvp); cvp->__has_user_waiters = _sleepq_remove(sq, curthread); _sleepq_unlock(cvp); curthread->mutex_obj = NULL; _mutex_cv_lock(mp, recurse); if (!THR_IN_CRITICAL(curthread)) _pthread_exit(PTHREAD_CANCELED); else /* this should not happen */ return (0); } else if (error == ETIMEDOUT) { sq = _sleepq_lookup(cvp); cvp->__has_user_waiters = _sleepq_remove(sq, curthread); break; } } _sleepq_unlock(cvp); curthread->mutex_obj = NULL; _mutex_cv_lock(mp, recurse); return (error); } static int cond_wait_common(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime, int cancel) { struct pthread *curthread = _get_curthread(); struct pthread_cond *cvp; struct pthread_mutex *mp; int error; CHECK_AND_INIT_COND - mp = *mutex; + if (*mutex == THR_PSHARED_PTR) { + mp = __thr_pshared_offpage(mutex, 0); + if (mp == NULL) + return (EINVAL); + } else { + mp = *mutex; + } if ((error = _mutex_owned(curthread, mp)) != 0) return (error); if (curthread->attr.sched_policy != SCHED_OTHER || (mp->m_lock.m_flags & (UMUTEX_PRIO_PROTECT|UMUTEX_PRIO_INHERIT| USYNC_PROCESS_SHARED)) != 0 || (cvp->__flags & USYNC_PROCESS_SHARED) != 0) return cond_wait_kernel(cvp, mp, abstime, cancel); else return cond_wait_user(cvp, mp, abstime, cancel); } int _pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { return (cond_wait_common(cond, mutex, NULL, 0)); } int __pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { return (cond_wait_common(cond, mutex, NULL, 1)); } int _pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec * abstime) { if (abstime == NULL || abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) return (EINVAL); return (cond_wait_common(cond, mutex, abstime, 0)); } int __pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime) { if (abstime == NULL || abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) return (EINVAL); return (cond_wait_common(cond, mutex, abstime, 1)); } static int cond_signal_common(pthread_cond_t *cond) { struct pthread *curthread = _get_curthread(); struct pthread *td; struct pthread_cond *cvp; struct pthread_mutex 
*mp; struct sleepqueue *sq; int *waddr; int pshared; /* * If the condition variable is statically initialized, perform dynamic * initialization. */ CHECK_AND_INIT_COND pshared = CV_PSHARED(cvp); _thr_ucond_signal((struct ucond *)&cvp->__has_kern_waiters); if (pshared || cvp->__has_user_waiters == 0) return (0); curthread = _get_curthread(); waddr = NULL; _sleepq_lock(cvp); sq = _sleepq_lookup(cvp); if (sq == NULL) { _sleepq_unlock(cvp); return (0); } td = _sleepq_first(sq); mp = td->mutex_obj; cvp->__has_user_waiters = _sleepq_remove(sq, td); - if (mp->m_owner == curthread) { + if (mp->m_owner == TID(curthread)) { if (curthread->nwaiter_defer >= MAX_DEFER_WAITERS) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } curthread->defer_waiters[curthread->nwaiter_defer++] = &td->wake_addr->value; mp->m_flags |= PMUTEX_FLAG_DEFERED; } else { waddr = &td->wake_addr->value; } _sleepq_unlock(cvp); if (waddr != NULL) _thr_set_wake(waddr); return (0); } struct broadcast_arg { struct pthread *curthread; unsigned int *waddrs[MAX_DEFER_WAITERS]; int count; }; static void drop_cb(struct pthread *td, void *arg) { struct broadcast_arg *ba = arg; struct pthread_mutex *mp; struct pthread *curthread = ba->curthread; mp = td->mutex_obj; - if (mp->m_owner == curthread) { + if (mp->m_owner == TID(curthread)) { if (curthread->nwaiter_defer >= MAX_DEFER_WAITERS) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } curthread->defer_waiters[curthread->nwaiter_defer++] = &td->wake_addr->value; mp->m_flags |= PMUTEX_FLAG_DEFERED; } else { if (ba->count >= MAX_DEFER_WAITERS) { _thr_wake_all(ba->waddrs, ba->count); ba->count = 0; } ba->waddrs[ba->count++] = &td->wake_addr->value; } } static int cond_broadcast_common(pthread_cond_t *cond) { int pshared; struct pthread_cond *cvp; struct sleepqueue *sq; struct broadcast_arg ba; /* * If the condition variable is statically initialized, perform dynamic * initialization. */ CHECK_AND_INIT_COND pshared = CV_PSHARED(cvp); _thr_ucond_broadcast((struct ucond *)&cvp->__has_kern_waiters); if (pshared || cvp->__has_user_waiters == 0) return (0); ba.curthread = _get_curthread(); ba.count = 0; _sleepq_lock(cvp); sq = _sleepq_lookup(cvp); if (sq == NULL) { _sleepq_unlock(cvp); return (0); } _sleepq_drop(sq, drop_cb, &ba); cvp->__has_user_waiters = 0; _sleepq_unlock(cvp); if (ba.count > 0) _thr_wake_all(ba.waddrs, ba.count); return (0); } int _pthread_cond_signal(pthread_cond_t * cond) { return (cond_signal_common(cond)); } int _pthread_cond_broadcast(pthread_cond_t * cond) { return (cond_broadcast_common(cond)); } Index: head/lib/libthr/thread/thr_condattr.c =================================================================== --- head/lib/libthr/thread/thr_condattr.c (revision 296161) +++ head/lib/libthr/thread/thr_condattr.c (revision 296162) @@ -1,124 +1,125 @@ /* * Copyright (c) 1997 John Birrell . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_condattr_init, pthread_condattr_init); __weak_reference(_pthread_condattr_destroy, pthread_condattr_destroy); __weak_reference(_pthread_condattr_getclock, pthread_condattr_getclock); __weak_reference(_pthread_condattr_setclock, pthread_condattr_setclock); __weak_reference(_pthread_condattr_getpshared, pthread_condattr_getpshared); __weak_reference(_pthread_condattr_setpshared, pthread_condattr_setpshared); int _pthread_condattr_init(pthread_condattr_t *attr) { pthread_condattr_t pattr; int ret; if ((pattr = (pthread_condattr_t) malloc(sizeof(struct pthread_cond_attr))) == NULL) { ret = ENOMEM; } else { memcpy(pattr, &_pthread_condattr_default, sizeof(struct pthread_cond_attr)); *attr = pattr; ret = 0; } return (ret); } int _pthread_condattr_destroy(pthread_condattr_t *attr) { int ret; if (attr == NULL || *attr == NULL) { ret = EINVAL; } else { free(*attr); *attr = NULL; ret = 0; } return(ret); } int _pthread_condattr_getclock(const pthread_condattr_t *attr, clockid_t *clock_id) { if (attr == NULL || *attr == NULL) return (EINVAL); *clock_id = (*attr)->c_clockid; return (0); } int _pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock_id) { if (attr == NULL || *attr == NULL) return (EINVAL); if (clock_id != CLOCK_REALTIME && clock_id != CLOCK_VIRTUAL && clock_id != CLOCK_PROF && clock_id != CLOCK_MONOTONIC) { return (EINVAL); } (*attr)->c_clockid = clock_id; return (0); } int _pthread_condattr_getpshared(const pthread_condattr_t *attr, int *pshared) { + if (attr == NULL || *attr == NULL) return (EINVAL); - - *pshared = PTHREAD_PROCESS_PRIVATE; + *pshared = (*attr)->c_pshared; return (0); } int _pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared) { - if (attr == NULL || *attr == NULL) - return (EINVAL); - if (pshared != PTHREAD_PROCESS_PRIVATE) + if (attr == NULL || *attr == NULL || + (pshared != PTHREAD_PROCESS_PRIVATE && + pshared != PTHREAD_PROCESS_SHARED)) return (EINVAL); + (*attr)->c_pshared = pshared; return (0); } Index: head/lib/libthr/thread/thr_create.c =================================================================== --- head/lib/libthr/thread/thr_create.c (revision 296161) +++ head/lib/libthr/thread/thr_create.c (revision 296162) @@ -1,292 +1,292 @@ /* * Copyright (c) 2003 Daniel M. Eischen * Copyright (c) 2005, David Xu * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include #include #include #include #include #include #include "un-namespace.h" #include "libc_private.h" #include "thr_private.h" static int create_stack(struct pthread_attr *pattr); static void thread_start(struct pthread *curthread); __weak_reference(_pthread_create, pthread_create); int _pthread_create(pthread_t * thread, const pthread_attr_t * attr, void *(*start_routine) (void *), void *arg) { struct pthread *curthread, *new_thread; struct thr_param param; struct sched_param sched_param; struct rtprio rtp; - int ret = 0, locked, create_suspended; sigset_t set, oset; - cpuset_t *cpusetp = NULL; - int cpusetsize = 0; - int old_stack_prot; + cpuset_t *cpusetp; + int i, cpusetsize, create_suspended, locked, old_stack_prot, ret; + cpusetp = NULL; + ret = cpusetsize = 0; _thr_check_init(); /* * Tell libc and others now they need lock to protect their data. 
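 *
 * As an aside on the theme of this revision: with the process-shared
 * support added here, a synchronization object placed in shared memory
 * can now be initialized with PTHREAD_PROCESS_SHARED. A minimal usage
 * sketch (error handling omitted; the mmap() placement is only
 * illustrative):
 *
 *	pthread_condattr_t ca;
 *	pthread_cond_t *cvp;
 *
 *	cvp = mmap(NULL, sizeof(*cvp), PROT_READ | PROT_WRITE,
 *	    MAP_SHARED | MAP_ANON, -1, 0);
 *	pthread_condattr_init(&ca);
 *	pthread_condattr_setpshared(&ca, PTHREAD_PROCESS_SHARED);
 *	pthread_cond_init(cvp, &ca);
 *
 * pthread_cond_init() then stores THR_PSHARED_PTR in *cvp and keeps the
 * real object off-page, so any process mapping the same memory resolves
 * it through __thr_pshared_offpage().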
*/ if (_thr_isthreaded() == 0) { _malloc_first_thread(); if (_thr_setthreaded(1)) return (EAGAIN); } curthread = _get_curthread(); if ((new_thread = _thr_alloc(curthread)) == NULL) return (EAGAIN); memset(&param, 0, sizeof(param)); if (attr == NULL || *attr == NULL) /* Use the default thread attributes: */ new_thread->attr = _pthread_attr_default; else { new_thread->attr = *(*attr); cpusetp = new_thread->attr.cpuset; cpusetsize = new_thread->attr.cpusetsize; new_thread->attr.cpuset = NULL; new_thread->attr.cpusetsize = 0; } if (new_thread->attr.sched_inherit == PTHREAD_INHERIT_SCHED) { /* inherit scheduling contention scope */ if (curthread->attr.flags & PTHREAD_SCOPE_SYSTEM) new_thread->attr.flags |= PTHREAD_SCOPE_SYSTEM; else new_thread->attr.flags &= ~PTHREAD_SCOPE_SYSTEM; new_thread->attr.prio = curthread->attr.prio; new_thread->attr.sched_policy = curthread->attr.sched_policy; } new_thread->tid = TID_TERMINATED; old_stack_prot = _rtld_get_stack_prot(); if (create_stack(&new_thread->attr) != 0) { /* Insufficient memory to create a stack: */ _thr_free(curthread, new_thread); return (EAGAIN); } /* * Write a magic value to the thread structure * to help identify valid ones: */ new_thread->magic = THR_MAGIC; new_thread->start_routine = start_routine; new_thread->arg = arg; new_thread->cancel_enable = 1; new_thread->cancel_async = 0; /* Initialize the mutex queue: */ - TAILQ_INIT(&new_thread->mutexq); - TAILQ_INIT(&new_thread->pp_mutexq); + for (i = 0; i < TMQ_NITEMS; i++) + TAILQ_INIT(&new_thread->mq[i]); /* Initialise hooks in the thread structure: */ if (new_thread->attr.suspend == THR_CREATE_SUSPENDED) { new_thread->flags = THR_FLAGS_NEED_SUSPEND; create_suspended = 1; } else { create_suspended = 0; } new_thread->state = PS_RUNNING; if (new_thread->attr.flags & PTHREAD_CREATE_DETACHED) new_thread->flags |= THR_FLAGS_DETACHED; /* Add the new thread. */ new_thread->refcount = 1; _thr_link(curthread, new_thread); /* * Handle the race between __pthread_map_stacks_exec and * thread linkage. */ if (old_stack_prot != _rtld_get_stack_prot()) _thr_stack_fix_protection(new_thread); /* Return the thread pointer early so that the new thread can use it. */ (*thread) = new_thread; if (SHOULD_REPORT_EVENT(curthread, TD_CREATE) || cpusetp != NULL) { THR_THREAD_LOCK(curthread, new_thread); locked = 1; } else locked = 0; param.start_func = (void (*)(void *)) thread_start; param.arg = new_thread; param.stack_base = new_thread->attr.stackaddr_attr; param.stack_size = new_thread->attr.stacksize_attr; param.tls_base = (char *)new_thread->tcb; param.tls_size = sizeof(struct tcb); param.child_tid = &new_thread->tid; param.parent_tid = &new_thread->tid; param.flags = 0; if (new_thread->attr.flags & PTHREAD_SCOPE_SYSTEM) param.flags |= THR_SYSTEM_SCOPE; if (new_thread->attr.sched_inherit == PTHREAD_INHERIT_SCHED) param.rtp = NULL; else { sched_param.sched_priority = new_thread->attr.prio; _schedparam_to_rtp(new_thread->attr.sched_policy, &sched_param, &rtp); param.rtp = &rtp; } /* Schedule the new thread. */ if (create_suspended) { SIGFILLSET(set); SIGDELSET(set, SIGTRAP); __sys_sigprocmask(SIG_SETMASK, &set, &oset); new_thread->sigmask = oset; SIGDELSET(new_thread->sigmask, SIGCANCEL); } ret = thr_new(&param, sizeof(param)); if (ret != 0) { ret = errno; /* * Translate EPROCLIM into the well-known POSIX code EAGAIN.
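 *
 * POSIX does not define EPROCLIM; pthread_create() is specified to fail
 * with EAGAIN when a system-imposed limit on the number of threads would
 * be exceeded, so the kernel's "too many processes" error is folded into
 * that. Condensed, the pattern around this comment is:
 *
 *	ret = thr_new(&param, sizeof(param));
 *	if (ret != 0)
 *		ret = (errno == EPROCLIM) ? EAGAIN : errno;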
*/ if (ret == EPROCLIM) ret = EAGAIN; } if (create_suspended) __sys_sigprocmask(SIG_SETMASK, &oset, NULL); if (ret != 0) { if (!locked) THR_THREAD_LOCK(curthread, new_thread); new_thread->state = PS_DEAD; new_thread->tid = TID_TERMINATED; new_thread->flags |= THR_FLAGS_DETACHED; new_thread->refcount--; if (new_thread->flags & THR_FLAGS_NEED_SUSPEND) { new_thread->cycle++; _thr_umtx_wake(&new_thread->cycle, INT_MAX, 0); } _thr_try_gc(curthread, new_thread); /* thread lock released */ atomic_add_int(&_thread_active_threads, -1); } else if (locked) { if (cpusetp != NULL) { if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, TID(new_thread), cpusetsize, cpusetp)) { ret = errno; /* kill the new thread */ new_thread->force_exit = 1; new_thread->flags |= THR_FLAGS_DETACHED; _thr_try_gc(curthread, new_thread); /* thread lock released */ goto out; } } _thr_report_creation(curthread, new_thread); THR_THREAD_UNLOCK(curthread, new_thread); } out: if (ret) (*thread) = 0; return (ret); } static int create_stack(struct pthread_attr *pattr) { int ret; /* Check if a stack was specified in the thread attributes: */ if ((pattr->stackaddr_attr) != NULL) { pattr->guardsize_attr = 0; pattr->flags |= THR_STACK_USER; ret = 0; } else ret = _thr_stack_alloc(pattr); return (ret); } static void thread_start(struct pthread *curthread) { sigset_t set; if (curthread->attr.suspend == THR_CREATE_SUSPENDED) set = curthread->sigmask; /* * This is used as a serialization point to allow the parent * to report the 'new thread' event to the debugger or tweak the new * thread's attributes before the new thread does real-world work. */ THR_LOCK(curthread); THR_UNLOCK(curthread); if (curthread->force_exit) _pthread_exit(PTHREAD_CANCELED); if (curthread->attr.suspend == THR_CREATE_SUSPENDED) { #if 0 /* Done in THR_UNLOCK() */ _thr_ast(curthread); #endif /* * The parent thread has stored the signal mask for us, * so we should restore it now. */ __sys_sigprocmask(SIG_SETMASK, &set, NULL); } #ifdef _PTHREAD_FORCED_UNWIND curthread->unwind_stackend = (char *)curthread->attr.stackaddr_attr + curthread->attr.stacksize_attr; #endif /* Run the current thread's start routine with argument: */ _pthread_exit(curthread->start_routine(curthread->arg)); /* This point should never be reached. */ PANIC("Thread has resumed after exit"); } Index: head/lib/libthr/thread/thr_init.c =================================================================== --- head/lib/libthr/thread/thr_init.c (revision 296161) +++ head/lib/libthr/thread/thr_init.c (revision 296162) @@ -1,508 +1,512 @@ /* * Copyright (c) 2003 Daniel M. Eischen * Copyright (c) 1995-1998 John Birrell * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by John Birrell. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "un-namespace.h" #include "libc_private.h" #include "thr_private.h" char *_usrstack; struct pthread *_thr_initial; int _libthr_debug; int _thread_event_mask; struct pthread *_thread_last_event; pthreadlist _thread_list = TAILQ_HEAD_INITIALIZER(_thread_list); pthreadlist _thread_gc_list = TAILQ_HEAD_INITIALIZER(_thread_gc_list); int _thread_active_threads = 1; atfork_head _thr_atfork_list = TAILQ_HEAD_INITIALIZER(_thr_atfork_list); struct urwlock _thr_atfork_lock = DEFAULT_URWLOCK; struct pthread_prio _thr_priorities[3] = { {RTP_PRIO_MIN, RTP_PRIO_MAX, 0}, /* FIFO */ {0, 0, 63}, /* OTHER */ {RTP_PRIO_MIN, RTP_PRIO_MAX, 0} /* RR */ }; struct pthread_attr _pthread_attr_default = { .sched_policy = SCHED_OTHER, .sched_inherit = PTHREAD_INHERIT_SCHED, .prio = 0, .suspend = THR_CREATE_RUNNING, .flags = PTHREAD_SCOPE_SYSTEM, .stackaddr_attr = NULL, .stacksize_attr = THR_STACK_DEFAULT, .guardsize_attr = 0, .cpusetsize = 0, .cpuset = NULL }; struct pthread_mutex_attr _pthread_mutexattr_default = { .m_type = PTHREAD_MUTEX_DEFAULT, .m_protocol = PTHREAD_PRIO_NONE, - .m_ceiling = 0 + .m_ceiling = 0, + .m_pshared = PTHREAD_PROCESS_PRIVATE, }; struct pthread_mutex_attr _pthread_mutexattr_adaptive_default = { .m_type = PTHREAD_MUTEX_ADAPTIVE_NP, .m_protocol = PTHREAD_PRIO_NONE, - .m_ceiling = 0 + .m_ceiling = 0, + .m_pshared = PTHREAD_PROCESS_PRIVATE, }; /* Default condition variable attributes: */ struct pthread_cond_attr _pthread_condattr_default = { .c_pshared = PTHREAD_PROCESS_PRIVATE, .c_clockid = CLOCK_REALTIME }; pid_t _thr_pid; int _thr_is_smp = 0; size_t _thr_guard_default; size_t _thr_stack_default = THR_STACK_DEFAULT; size_t _thr_stack_initial = THR_STACK_INITIAL; int _thr_page_size; int _thr_spinloops; int _thr_yieldloops; int _thr_queuefifo = 4; int _gc_count; struct umutex _mutex_static_lock = DEFAULT_UMUTEX; struct umutex _cond_static_lock = DEFAULT_UMUTEX; struct umutex _rwlock_static_lock = DEFAULT_UMUTEX; struct umutex _keytable_lock = DEFAULT_UMUTEX; struct urwlock _thr_list_lock = DEFAULT_URWLOCK; struct umutex _thr_event_lock = DEFAULT_UMUTEX; struct umutex _suspend_all_lock = DEFAULT_UMUTEX; struct pthread *_single_thread; int _suspend_all_cycle; int _suspend_all_waiters; int __pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *); int __pthread_mutex_lock(pthread_mutex_t *); int __pthread_mutex_trylock(pthread_mutex_t *); void _thread_init_hack(void) __attribute__ ((constructor)); static void init_private(void); static void init_main_thread(struct pthread *thread); /* * All weak references used within libc should be 
in this table. * This is so that static libraries will work. */ STATIC_LIB_REQUIRE(_fork); STATIC_LIB_REQUIRE(_pthread_getspecific); STATIC_LIB_REQUIRE(_pthread_key_create); STATIC_LIB_REQUIRE(_pthread_key_delete); STATIC_LIB_REQUIRE(_pthread_mutex_destroy); STATIC_LIB_REQUIRE(_pthread_mutex_init); STATIC_LIB_REQUIRE(_pthread_mutex_lock); STATIC_LIB_REQUIRE(_pthread_mutex_trylock); STATIC_LIB_REQUIRE(_pthread_mutex_unlock); STATIC_LIB_REQUIRE(_pthread_mutexattr_init); STATIC_LIB_REQUIRE(_pthread_mutexattr_destroy); STATIC_LIB_REQUIRE(_pthread_mutexattr_settype); STATIC_LIB_REQUIRE(_pthread_once); STATIC_LIB_REQUIRE(_pthread_setspecific); STATIC_LIB_REQUIRE(_raise); STATIC_LIB_REQUIRE(_sem_destroy); STATIC_LIB_REQUIRE(_sem_getvalue); STATIC_LIB_REQUIRE(_sem_init); STATIC_LIB_REQUIRE(_sem_post); STATIC_LIB_REQUIRE(_sem_timedwait); STATIC_LIB_REQUIRE(_sem_trywait); STATIC_LIB_REQUIRE(_sem_wait); STATIC_LIB_REQUIRE(_sigaction); STATIC_LIB_REQUIRE(_sigprocmask); STATIC_LIB_REQUIRE(_sigsuspend); STATIC_LIB_REQUIRE(_sigtimedwait); STATIC_LIB_REQUIRE(_sigwait); STATIC_LIB_REQUIRE(_sigwaitinfo); STATIC_LIB_REQUIRE(_spinlock); STATIC_LIB_REQUIRE(_spinlock_debug); STATIC_LIB_REQUIRE(_spinunlock); STATIC_LIB_REQUIRE(_thread_init_hack); /* * These are needed when linking statically. All references within * libgcc (and in the future libc) to these routines are weak, but * if they are not (strongly) referenced by the application or other * libraries, then the actual functions will not be loaded. */ STATIC_LIB_REQUIRE(_pthread_once); STATIC_LIB_REQUIRE(_pthread_key_create); STATIC_LIB_REQUIRE(_pthread_key_delete); STATIC_LIB_REQUIRE(_pthread_getspecific); STATIC_LIB_REQUIRE(_pthread_setspecific); STATIC_LIB_REQUIRE(_pthread_mutex_init); STATIC_LIB_REQUIRE(_pthread_mutex_destroy); STATIC_LIB_REQUIRE(_pthread_mutex_lock); STATIC_LIB_REQUIRE(_pthread_mutex_trylock); STATIC_LIB_REQUIRE(_pthread_mutex_unlock); STATIC_LIB_REQUIRE(_pthread_create); /* Pull in all symbols required by libthread_db */ STATIC_LIB_REQUIRE(_thread_state_running); #define DUAL_ENTRY(entry) \ (pthread_func_t)entry, (pthread_func_t)entry static pthread_func_t jmp_table[][2] = { {DUAL_ENTRY(_pthread_atfork)}, /* PJT_ATFORK */ {DUAL_ENTRY(_pthread_attr_destroy)}, /* PJT_ATTR_DESTROY */ {DUAL_ENTRY(_pthread_attr_getdetachstate)}, /* PJT_ATTR_GETDETACHSTATE */ {DUAL_ENTRY(_pthread_attr_getguardsize)}, /* PJT_ATTR_GETGUARDSIZE */ {DUAL_ENTRY(_pthread_attr_getinheritsched)}, /* PJT_ATTR_GETINHERITSCHED */ {DUAL_ENTRY(_pthread_attr_getschedparam)}, /* PJT_ATTR_GETSCHEDPARAM */ {DUAL_ENTRY(_pthread_attr_getschedpolicy)}, /* PJT_ATTR_GETSCHEDPOLICY */ {DUAL_ENTRY(_pthread_attr_getscope)}, /* PJT_ATTR_GETSCOPE */ {DUAL_ENTRY(_pthread_attr_getstackaddr)}, /* PJT_ATTR_GETSTACKADDR */ {DUAL_ENTRY(_pthread_attr_getstacksize)}, /* PJT_ATTR_GETSTACKSIZE */ {DUAL_ENTRY(_pthread_attr_init)}, /* PJT_ATTR_INIT */ {DUAL_ENTRY(_pthread_attr_setdetachstate)}, /* PJT_ATTR_SETDETACHSTATE */ {DUAL_ENTRY(_pthread_attr_setguardsize)}, /* PJT_ATTR_SETGUARDSIZE */ {DUAL_ENTRY(_pthread_attr_setinheritsched)}, /* PJT_ATTR_SETINHERITSCHED */ {DUAL_ENTRY(_pthread_attr_setschedparam)}, /* PJT_ATTR_SETSCHEDPARAM */ {DUAL_ENTRY(_pthread_attr_setschedpolicy)}, /* PJT_ATTR_SETSCHEDPOLICY */ {DUAL_ENTRY(_pthread_attr_setscope)}, /* PJT_ATTR_SETSCOPE */ {DUAL_ENTRY(_pthread_attr_setstackaddr)}, /* PJT_ATTR_SETSTACKADDR */ {DUAL_ENTRY(_pthread_attr_setstacksize)}, /* PJT_ATTR_SETSTACKSIZE */ {DUAL_ENTRY(_pthread_cancel)}, /* PJT_CANCEL */ {DUAL_ENTRY(_pthread_cleanup_pop)}, /* 
PJT_CLEANUP_POP */ {DUAL_ENTRY(_pthread_cleanup_push)}, /* PJT_CLEANUP_PUSH */ {DUAL_ENTRY(_pthread_cond_broadcast)}, /* PJT_COND_BROADCAST */ {DUAL_ENTRY(_pthread_cond_destroy)}, /* PJT_COND_DESTROY */ {DUAL_ENTRY(_pthread_cond_init)}, /* PJT_COND_INIT */ {DUAL_ENTRY(_pthread_cond_signal)}, /* PJT_COND_SIGNAL */ {DUAL_ENTRY(_pthread_cond_timedwait)}, /* PJT_COND_TIMEDWAIT */ {(pthread_func_t)__pthread_cond_wait, (pthread_func_t)_pthread_cond_wait}, /* PJT_COND_WAIT */ {DUAL_ENTRY(_pthread_detach)}, /* PJT_DETACH */ {DUAL_ENTRY(_pthread_equal)}, /* PJT_EQUAL */ {DUAL_ENTRY(_pthread_exit)}, /* PJT_EXIT */ {DUAL_ENTRY(_pthread_getspecific)}, /* PJT_GETSPECIFIC */ {DUAL_ENTRY(_pthread_join)}, /* PJT_JOIN */ {DUAL_ENTRY(_pthread_key_create)}, /* PJT_KEY_CREATE */ {DUAL_ENTRY(_pthread_key_delete)}, /* PJT_KEY_DELETE*/ {DUAL_ENTRY(_pthread_kill)}, /* PJT_KILL */ {DUAL_ENTRY(_pthread_main_np)}, /* PJT_MAIN_NP */ {DUAL_ENTRY(_pthread_mutexattr_destroy)}, /* PJT_MUTEXATTR_DESTROY */ {DUAL_ENTRY(_pthread_mutexattr_init)}, /* PJT_MUTEXATTR_INIT */ {DUAL_ENTRY(_pthread_mutexattr_settype)}, /* PJT_MUTEXATTR_SETTYPE */ {DUAL_ENTRY(_pthread_mutex_destroy)}, /* PJT_MUTEX_DESTROY */ {DUAL_ENTRY(_pthread_mutex_init)}, /* PJT_MUTEX_INIT */ {(pthread_func_t)__pthread_mutex_lock, (pthread_func_t)_pthread_mutex_lock}, /* PJT_MUTEX_LOCK */ {(pthread_func_t)__pthread_mutex_trylock, (pthread_func_t)_pthread_mutex_trylock},/* PJT_MUTEX_TRYLOCK */ {DUAL_ENTRY(_pthread_mutex_unlock)}, /* PJT_MUTEX_UNLOCK */ {DUAL_ENTRY(_pthread_once)}, /* PJT_ONCE */ {DUAL_ENTRY(_pthread_rwlock_destroy)}, /* PJT_RWLOCK_DESTROY */ {DUAL_ENTRY(_pthread_rwlock_init)}, /* PJT_RWLOCK_INIT */ {DUAL_ENTRY(_pthread_rwlock_rdlock)}, /* PJT_RWLOCK_RDLOCK */ {DUAL_ENTRY(_pthread_rwlock_tryrdlock)},/* PJT_RWLOCK_TRYRDLOCK */ {DUAL_ENTRY(_pthread_rwlock_trywrlock)},/* PJT_RWLOCK_TRYWRLOCK */ {DUAL_ENTRY(_pthread_rwlock_unlock)}, /* PJT_RWLOCK_UNLOCK */ {DUAL_ENTRY(_pthread_rwlock_wrlock)}, /* PJT_RWLOCK_WRLOCK */ {DUAL_ENTRY(_pthread_self)}, /* PJT_SELF */ {DUAL_ENTRY(_pthread_setcancelstate)}, /* PJT_SETCANCELSTATE */ {DUAL_ENTRY(_pthread_setcanceltype)}, /* PJT_SETCANCELTYPE */ {DUAL_ENTRY(_pthread_setspecific)}, /* PJT_SETSPECIFIC */ {DUAL_ENTRY(_pthread_sigmask)}, /* PJT_SIGMASK */ {DUAL_ENTRY(_pthread_testcancel)}, /* PJT_TESTCANCEL */ {DUAL_ENTRY(__pthread_cleanup_pop_imp)},/* PJT_CLEANUP_POP_IMP */ {DUAL_ENTRY(__pthread_cleanup_push_imp)},/* PJT_CLEANUP_PUSH_IMP */ {DUAL_ENTRY(_pthread_cancel_enter)}, /* PJT_CANCEL_ENTER */ {DUAL_ENTRY(_pthread_cancel_leave)} /* PJT_CANCEL_LEAVE */ }; static int init_once = 0; /* * For the shared version of the threads library, the above is sufficient. * But for the archive version of the library, we need a little bit more. * Namely, we must arrange for this particular module to be pulled in from * the archive library at link time. To accomplish that, we define and * initialize a variable, "_thread_autoinit_dummy_decl". This variable is * referenced (as an extern) from libc/stdlib/exit.c. This will always * create a need for this module, ensuring that it is present in the * executable. */ extern int _thread_autoinit_dummy_decl; int _thread_autoinit_dummy_decl = 0; void _thread_init_hack(void) { _libpthread_init(NULL); } /* * Threaded process initialization. 
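 *
 * (A note on the jmp_table above: libc ships weak pthread stubs that
 * dispatch through a jump table, and _libpthread_init() copies these
 * entries into it so the real libthr implementations take over once the
 * library is loaded. DUAL_ENTRY() installs the same function in both the
 * cancellation-point and the non-cancellation-point slot; rows such as
 * PJT_COND_WAIT install a distinct pair instead:
 *
 *	{(pthread_func_t)__pthread_cond_wait,	(cancellation point)
 *	 (pthread_func_t)_pthread_cond_wait},	(libc-internal, not a
 *						 cancellation point)
 *
 * matching the double-/single-underscore convention used throughout
 * libthr.)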
* * This is only called under two conditions: * * 1) Some thread routines have detected that the library hasn't yet * been initialized (_thr_initial == NULL && curthread == NULL), or * * 2) An explicit call to reinitialize after a fork (indicated * by curthread != NULL) */ void _libpthread_init(struct pthread *curthread) { int fd, first, dlopened; /* Check if this function has already been called: */ if ((_thr_initial != NULL) && (curthread == NULL)) /* Only initialize the threaded application once. */ return; /* * Check the size of the jump table to make sure it is preset * with the correct number of entries. */ if (sizeof(jmp_table) != (sizeof(pthread_func_t) * PJT_MAX * 2)) PANIC("Thread jump table not properly initialized"); memcpy(__thr_jtable, jmp_table, sizeof(jmp_table)); __thr_interpose_libc(); /* * Check for the special case of this process running as * or in place of init as pid = 1: */ if ((_thr_pid = getpid()) == 1) { /* * Setup a new session for this process which is * assumed to be running as root. */ if (setsid() == -1) PANIC("Can't set session ID"); if (revoke(_PATH_CONSOLE) != 0) PANIC("Can't revoke console"); if ((fd = __sys_openat(AT_FDCWD, _PATH_CONSOLE, O_RDWR)) < 0) PANIC("Can't open console"); if (setlogin("root") == -1) PANIC("Can't set login to root"); if (_ioctl(fd, TIOCSCTTY, (char *) NULL) == -1) PANIC("Can't set controlling terminal"); } /* Initialize pthread private data. */ init_private(); /* Set the initial thread. */ if (curthread == NULL) { first = 1; /* Create and initialize the initial thread. */ curthread = _thr_alloc(NULL); if (curthread == NULL) PANIC("Can't allocate initial thread"); init_main_thread(curthread); } else { first = 0; } /* * Add the thread to the thread list queue. */ THR_LIST_ADD(curthread); _thread_active_threads = 1; /* Setup the thread specific data */ _tcb_set(curthread->tcb); if (first) { _thr_initial = curthread; dlopened = _rtld_is_dlopened(&_thread_autoinit_dummy_decl) != 0; _thr_signal_init(dlopened); if (_thread_event_mask & TD_CREATE) _thr_report_creation(curthread, curthread); /* * Always use our rtld lock implementation. * It is faster because it postpones signal handlers * instead of calling sigprocmask(2). */ _thr_rtld_init(); } } /* * This function and pthread_create() do a lot of the same things. * It'd be nice to consolidate the common stuff in one place. */ static void init_main_thread(struct pthread *thread) { struct sched_param sched_param; + int i; /* Setup the thread attributes. */ thr_self(&thread->tid); thread->attr = _pthread_attr_default; /* * Set up the thread stack. * * Create a red zone below the main stack. All other stacks * are constrained to a maximum size by the parameters * passed to mmap(), but this stack is only limited by * resource limits, so this stack needs an explicitly mapped * red zone to protect the thread stack that is just beyond. */ if (mmap(_usrstack - _thr_stack_initial - _thr_guard_default, _thr_guard_default, 0, MAP_ANON, -1, 0) == MAP_FAILED) PANIC("Cannot allocate red zone for initial thread"); /* * Mark the stack as an application supplied stack so that it * isn't deallocated. * * XXX - I'm not sure it would hurt anything to deallocate * the main thread stack because deallocation doesn't * actually free() it; it just puts it in the free * stack queue for later reuse. 
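 *
 * The red zone itself relies on the mmap() above mapping the guard
 * region with protection 0, i.e. PROT_NONE: any load or store into it
 * faults, so a main-stack overflow trips the guard instead of silently
 * running into the next thread's stack. Spelled out, an equivalent and
 * more explicit form of that call would be:
 *
 *	mmap(_usrstack - _thr_stack_initial - _thr_guard_default,
 *	    _thr_guard_default, PROT_NONE, MAP_ANON, -1, 0);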
*/ thread->attr.stackaddr_attr = _usrstack - _thr_stack_initial; thread->attr.stacksize_attr = _thr_stack_initial; thread->attr.guardsize_attr = _thr_guard_default; thread->attr.flags |= THR_STACK_USER; /* * Write a magic value to the thread structure * to help identify valid ones: */ thread->magic = THR_MAGIC; thread->cancel_enable = 1; thread->cancel_async = 0; - /* Initialize the mutex queue: */ - TAILQ_INIT(&thread->mutexq); - TAILQ_INIT(&thread->pp_mutexq); + /* Initialize the mutex queues */ + for (i = 0; i < TMQ_NITEMS; i++) + TAILQ_INIT(&thread->mq[i]); thread->state = PS_RUNNING; _thr_getscheduler(thread->tid, &thread->attr.sched_policy, &sched_param); thread->attr.prio = sched_param.sched_priority; #ifdef _PTHREAD_FORCED_UNWIND thread->unwind_stackend = _usrstack; #endif /* Others cleared to zero by thr_alloc() */ } static void init_private(void) { struct rlimit rlim; size_t len; int mib[2]; char *env, *env_bigstack, *env_splitstack; _thr_umutex_init(&_mutex_static_lock); _thr_umutex_init(&_cond_static_lock); _thr_umutex_init(&_rwlock_static_lock); _thr_umutex_init(&_keytable_lock); _thr_urwlock_init(&_thr_atfork_lock); _thr_umutex_init(&_thr_event_lock); _thr_umutex_init(&_suspend_all_lock); _thr_once_init(); _thr_spinlock_init(); _thr_list_init(); + __thr_pshared_init(); _thr_wake_addr_init(); _sleepq_init(); _single_thread = NULL; _suspend_all_waiters = 0; /* * Avoid reinitializing some things if they don't need to be, * e.g. after a fork(). */ if (init_once == 0) { /* Find the stack top */ mib[0] = CTL_KERN; mib[1] = KERN_USRSTACK; len = sizeof (_usrstack); if (sysctl(mib, 2, &_usrstack, &len, NULL, 0) == -1) PANIC("Cannot get kern.usrstack from sysctl"); env_bigstack = getenv("LIBPTHREAD_BIGSTACK_MAIN"); env_splitstack = getenv("LIBPTHREAD_SPLITSTACK_MAIN"); if (env_bigstack != NULL || env_splitstack == NULL) { if (getrlimit(RLIMIT_STACK, &rlim) == -1) PANIC("Cannot get stack rlimit"); _thr_stack_initial = rlim.rlim_cur; } len = sizeof(_thr_is_smp); sysctlbyname("kern.smp.cpus", &_thr_is_smp, &len, NULL, 0); _thr_is_smp = (_thr_is_smp > 1); _thr_page_size = getpagesize(); _thr_guard_default = _thr_page_size; _pthread_attr_default.guardsize_attr = _thr_guard_default; _pthread_attr_default.stacksize_attr = _thr_stack_default; env = getenv("LIBPTHREAD_SPINLOOPS"); if (env) _thr_spinloops = atoi(env); env = getenv("LIBPTHREAD_YIELDLOOPS"); if (env) _thr_yieldloops = atoi(env); env = getenv("LIBPTHREAD_QUEUE_FIFO"); if (env) _thr_queuefifo = atoi(env); TAILQ_INIT(&_thr_atfork_list); } init_once = 1; } Index: head/lib/libthr/thread/thr_mutex.c =================================================================== --- head/lib/libthr/thread/thr_mutex.c (revision 296161) +++ head/lib/libthr/thread/thr_mutex.c (revision 296162) @@ -1,798 +1,958 @@ /* * Copyright (c) 1995 John Birrell . * Copyright (c) 2006 David Xu . + * Copyright (c) 2015 The FreeBSD Foundation + * * All rights reserved. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by John Birrell. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "namespace.h" #include #include #include #include #include #include #include #include "un-namespace.h" #include "thr_private.h" -#if defined(_PTHREADS_INVARIANTS) -#define MUTEX_INIT_LINK(m) do { \ - (m)->m_qe.tqe_prev = NULL; \ - (m)->m_qe.tqe_next = NULL; \ -} while (0) -#define MUTEX_ASSERT_IS_OWNED(m) do { \ - if (__predict_false((m)->m_qe.tqe_prev == NULL))\ - PANIC("mutex is not on list"); \ -} while (0) -#define MUTEX_ASSERT_NOT_OWNED(m) do { \ - if (__predict_false((m)->m_qe.tqe_prev != NULL || \ - (m)->m_qe.tqe_next != NULL)) \ - PANIC("mutex is on list"); \ -} while (0) -#else -#define MUTEX_INIT_LINK(m) -#define MUTEX_ASSERT_IS_OWNED(m) -#define MUTEX_ASSERT_NOT_OWNED(m) -#endif - /* * For adaptive mutexes, how many times to spin doing trylock2 * before entering the kernel to block */ #define MUTEX_ADAPTIVE_SPINS 2000 /* * Prototypes */ int __pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *mutex_attr); int __pthread_mutex_trylock(pthread_mutex_t *mutex); int __pthread_mutex_lock(pthread_mutex_t *mutex); int __pthread_mutex_timedlock(pthread_mutex_t *mutex, const struct timespec *abstime); int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void *(calloc_cb)(size_t, size_t)); int _pthread_mutex_getspinloops_np(pthread_mutex_t *mutex, int *count); int _pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count); int __pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count); int _pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count); int _pthread_mutex_getyieldloops_np(pthread_mutex_t *mutex, int *count); int __pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count); static int mutex_self_trylock(pthread_mutex_t); static int mutex_self_lock(pthread_mutex_t, const struct timespec *abstime); static int mutex_unlock_common(struct pthread_mutex *, int, int *); static int mutex_lock_sleep(struct pthread *, pthread_mutex_t, const struct timespec *); __weak_reference(__pthread_mutex_init, pthread_mutex_init); __strong_reference(__pthread_mutex_init, _pthread_mutex_init); __weak_reference(__pthread_mutex_lock, pthread_mutex_lock); __strong_reference(__pthread_mutex_lock, 
_pthread_mutex_lock); __weak_reference(__pthread_mutex_timedlock, pthread_mutex_timedlock); __strong_reference(__pthread_mutex_timedlock, _pthread_mutex_timedlock); __weak_reference(__pthread_mutex_trylock, pthread_mutex_trylock); __strong_reference(__pthread_mutex_trylock, _pthread_mutex_trylock); /* Single underscore versions provided for libc internal usage: */ /* No difference between libc and application usage of these: */ __weak_reference(_pthread_mutex_destroy, pthread_mutex_destroy); __weak_reference(_pthread_mutex_unlock, pthread_mutex_unlock); __weak_reference(_pthread_mutex_getprioceiling, pthread_mutex_getprioceiling); __weak_reference(_pthread_mutex_setprioceiling, pthread_mutex_setprioceiling); __weak_reference(__pthread_mutex_setspinloops_np, pthread_mutex_setspinloops_np); __strong_reference(__pthread_mutex_setspinloops_np, _pthread_mutex_setspinloops_np); __weak_reference(_pthread_mutex_getspinloops_np, pthread_mutex_getspinloops_np); __weak_reference(__pthread_mutex_setyieldloops_np, pthread_mutex_setyieldloops_np); __strong_reference(__pthread_mutex_setyieldloops_np, _pthread_mutex_setyieldloops_np); __weak_reference(_pthread_mutex_getyieldloops_np, pthread_mutex_getyieldloops_np); __weak_reference(_pthread_mutex_isowned_np, pthread_mutex_isowned_np); +static void +mutex_init_link(struct pthread_mutex *m) +{ + +#if defined(_PTHREADS_INVARIANTS) + m->m_qe.tqe_prev = NULL; + m->m_qe.tqe_next = NULL; + m->m_pqe.tqe_prev = NULL; + m->m_pqe.tqe_next = NULL; +#endif +} + +static void +mutex_assert_is_owned(struct pthread_mutex *m) +{ + +#if defined(_PTHREADS_INVARIANTS) + if (__predict_false(m->m_qe.tqe_prev == NULL)) + PANIC("mutex is not on list"); +#endif +} + +static void +mutex_assert_not_owned(struct pthread_mutex *m) +{ + +#if defined(_PTHREADS_INVARIANTS) + if (__predict_false(m->m_qe.tqe_prev != NULL || + m->m_qe.tqe_next != NULL)) + PANIC("mutex is on list"); +#endif +} + static int -mutex_init(pthread_mutex_t *mutex, - const struct pthread_mutex_attr *mutex_attr, - void *(calloc_cb)(size_t, size_t)) +is_pshared_mutex(struct pthread_mutex *m) { - const struct pthread_mutex_attr *attr; - struct pthread_mutex *pmutex; - if (mutex_attr == NULL) { - attr = &_pthread_mutexattr_default; - } else { - attr = mutex_attr; - if (attr->m_type < PTHREAD_MUTEX_ERRORCHECK || - attr->m_type >= PTHREAD_MUTEX_TYPE_MAX) - return (EINVAL); - if (attr->m_protocol < PTHREAD_PRIO_NONE || - attr->m_protocol > PTHREAD_PRIO_PROTECT) - return (EINVAL); - } - if ((pmutex = (pthread_mutex_t) - calloc_cb(1, sizeof(struct pthread_mutex))) == NULL) - return (ENOMEM); + return ((m->m_lock.m_flags & USYNC_PROCESS_SHARED) != 0); +} +static int +mutex_check_attr(const struct pthread_mutex_attr *attr) +{ + + if (attr->m_type < PTHREAD_MUTEX_ERRORCHECK || + attr->m_type >= PTHREAD_MUTEX_TYPE_MAX) + return (EINVAL); + if (attr->m_protocol < PTHREAD_PRIO_NONE || + attr->m_protocol > PTHREAD_PRIO_PROTECT) + return (EINVAL); + return (0); +} + +static void +mutex_init_body(struct pthread_mutex *pmutex, + const struct pthread_mutex_attr *attr) +{ + pmutex->m_flags = attr->m_type; - pmutex->m_owner = NULL; + pmutex->m_owner = 0; pmutex->m_count = 0; pmutex->m_spinloops = 0; pmutex->m_yieldloops = 0; - MUTEX_INIT_LINK(pmutex); - switch(attr->m_protocol) { + mutex_init_link(pmutex); + switch (attr->m_protocol) { case PTHREAD_PRIO_NONE: pmutex->m_lock.m_owner = UMUTEX_UNOWNED; pmutex->m_lock.m_flags = 0; break; case PTHREAD_PRIO_INHERIT: pmutex->m_lock.m_owner = UMUTEX_UNOWNED; pmutex->m_lock.m_flags = 
UMUTEX_PRIO_INHERIT; break; case PTHREAD_PRIO_PROTECT: pmutex->m_lock.m_owner = UMUTEX_CONTESTED; pmutex->m_lock.m_flags = UMUTEX_PRIO_PROTECT; pmutex->m_lock.m_ceilings[0] = attr->m_ceiling; break; } + if (attr->m_pshared == PTHREAD_PROCESS_SHARED) + pmutex->m_lock.m_flags |= USYNC_PROCESS_SHARED; if (PMUTEX_TYPE(pmutex->m_flags) == PTHREAD_MUTEX_ADAPTIVE_NP) { pmutex->m_spinloops = _thr_spinloops ? _thr_spinloops: MUTEX_ADAPTIVE_SPINS; pmutex->m_yieldloops = _thr_yieldloops; } +} +static int +mutex_init(pthread_mutex_t *mutex, + const struct pthread_mutex_attr *mutex_attr, + void *(calloc_cb)(size_t, size_t)) +{ + const struct pthread_mutex_attr *attr; + struct pthread_mutex *pmutex; + int error; + + if (mutex_attr == NULL) { + attr = &_pthread_mutexattr_default; + } else { + attr = mutex_attr; + error = mutex_check_attr(attr); + if (error != 0) + return (error); + } + if ((pmutex = (pthread_mutex_t) + calloc_cb(1, sizeof(struct pthread_mutex))) == NULL) + return (ENOMEM); + mutex_init_body(pmutex, attr); *mutex = pmutex; return (0); } static int init_static(struct pthread *thread, pthread_mutex_t *mutex) { int ret; THR_LOCK_ACQUIRE(thread, &_mutex_static_lock); if (*mutex == THR_MUTEX_INITIALIZER) ret = mutex_init(mutex, &_pthread_mutexattr_default, calloc); else if (*mutex == THR_ADAPTIVE_MUTEX_INITIALIZER) - ret = mutex_init(mutex, &_pthread_mutexattr_adaptive_default, calloc); + ret = mutex_init(mutex, &_pthread_mutexattr_adaptive_default, + calloc); else ret = 0; THR_LOCK_RELEASE(thread, &_mutex_static_lock); return (ret); } static void set_inherited_priority(struct pthread *curthread, struct pthread_mutex *m) { struct pthread_mutex *m2; - m2 = TAILQ_LAST(&curthread->pp_mutexq, mutex_queue); + m2 = TAILQ_LAST(&curthread->mq[TMQ_NORM_PP], mutex_queue); if (m2 != NULL) m->m_lock.m_ceilings[1] = m2->m_lock.m_ceilings[0]; else m->m_lock.m_ceilings[1] = -1; } int __pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *mutex_attr) { - return mutex_init(mutex, mutex_attr ? *mutex_attr : NULL, calloc); + struct pthread_mutex *pmtx; + int ret; + + if (mutex_attr != NULL) { + ret = mutex_check_attr(*mutex_attr); + if (ret != 0) + return (ret); + } + if (mutex_attr == NULL || + (*mutex_attr)->m_pshared == PTHREAD_PROCESS_PRIVATE) { + return (mutex_init(mutex, mutex_attr ? *mutex_attr : NULL, + calloc)); + } + pmtx = __thr_pshared_offpage(mutex, 1); + if (pmtx == NULL) + return (EFAULT); + *mutex = THR_PSHARED_PTR; + mutex_init_body(pmtx, *mutex_attr); + return (0); } /* This function is used internally by malloc. */ int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void *(calloc_cb)(size_t, size_t)) { static const struct pthread_mutex_attr attr = { .m_type = PTHREAD_MUTEX_NORMAL, .m_protocol = PTHREAD_PRIO_NONE, - .m_ceiling = 0 + .m_ceiling = 0, + .m_pshared = PTHREAD_PROCESS_PRIVATE, }; int ret; ret = mutex_init(mutex, &attr, calloc_cb); if (ret == 0) (*mutex)->m_flags |= PMUTEX_FLAG_PRIVATE; return (ret); } -void -_mutex_fork(struct pthread *curthread) +/* + * Fix mutex ownership for child process. + * + * Process-private mutex ownership is transmitted from the forking + * thread to the child process. + * + * Process-shared mutexes should not be inherited because their owner + * is the forking thread, which lives in the parent process; they are + * removed from the owned mutex list. + */ +static void +queue_fork(struct pthread *curthread, struct mutex_queue *q, + struct mutex_queue *qp, uint bit) { struct pthread_mutex *m; - /* - * Fix mutex ownership for child process.
- * note that process shared mutex should not - * be inherited because owner is forking thread - * which is in parent process, they should be - * removed from the owned mutex list, current, - * process shared mutex is not supported, so I - * am not worried. - */ + TAILQ_INIT(q); + TAILQ_FOREACH(m, qp, m_pqe) { + TAILQ_INSERT_TAIL(q, m, m_qe); + m->m_lock.m_owner = TID(curthread) | bit; + m->m_owner = TID(curthread); + } +} - TAILQ_FOREACH(m, &curthread->mutexq, m_qe) - m->m_lock.m_owner = TID(curthread); - TAILQ_FOREACH(m, &curthread->pp_mutexq, m_qe) - m->m_lock.m_owner = TID(curthread) | UMUTEX_CONTESTED; +void +_mutex_fork(struct pthread *curthread) +{ + + queue_fork(curthread, &curthread->mq[TMQ_NORM], + &curthread->mq[TMQ_NORM_PRIV], 0); + queue_fork(curthread, &curthread->mq[TMQ_NORM_PP], + &curthread->mq[TMQ_NORM_PP_PRIV], UMUTEX_CONTESTED); } int _pthread_mutex_destroy(pthread_mutex_t *mutex) { - pthread_mutex_t m; + pthread_mutex_t m, m1; int ret; m = *mutex; if (m < THR_MUTEX_DESTROYED) { ret = 0; } else if (m == THR_MUTEX_DESTROYED) { ret = EINVAL; } else { - if (m->m_owner != NULL) { + if (m == THR_PSHARED_PTR) { + m1 = __thr_pshared_offpage(mutex, 0); + if (m1 != NULL) { + mutex_assert_not_owned(m1); + __thr_pshared_destroy(mutex); + } + *mutex = THR_MUTEX_DESTROYED; + return (0); + } + if (m->m_owner != 0) { ret = EBUSY; } else { *mutex = THR_MUTEX_DESTROYED; - MUTEX_ASSERT_NOT_OWNED(m); + mutex_assert_not_owned(m); free(m); ret = 0; } } return (ret); } -#define ENQUEUE_MUTEX(curthread, m) \ - do { \ - (m)->m_owner = curthread; \ - /* Add to the list of owned mutexes: */ \ - MUTEX_ASSERT_NOT_OWNED((m)); \ - if (((m)->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) \ - TAILQ_INSERT_TAIL(&curthread->mutexq, (m), m_qe);\ - else \ - TAILQ_INSERT_TAIL(&curthread->pp_mutexq, (m), m_qe);\ - } while (0) +static int +mutex_qidx(struct pthread_mutex *m) +{ -#define DEQUEUE_MUTEX(curthread, m) \ - (m)->m_owner = NULL; \ - MUTEX_ASSERT_IS_OWNED(m); \ - if (__predict_true(((m)->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0)) \ - TAILQ_REMOVE(&curthread->mutexq, (m), m_qe); \ - else { \ - TAILQ_REMOVE(&curthread->pp_mutexq, (m), m_qe); \ - set_inherited_priority(curthread, m); \ - } \ - MUTEX_INIT_LINK(m); + if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) + return (TMQ_NORM); + return (TMQ_NORM_PP); +} -#define CHECK_AND_INIT_MUTEX \ - if (__predict_false((m = *mutex) <= THR_MUTEX_DESTROYED)) { \ - if (m == THR_MUTEX_DESTROYED) \ - return (EINVAL); \ - int ret; \ - ret = init_static(_get_curthread(), mutex); \ - if (ret) \ - return (ret); \ - m = *mutex; \ - } +static void +enqueue_mutex(struct pthread *curthread, struct pthread_mutex *m) +{ + int qidx; + m->m_owner = TID(curthread); + /* Add to the list of owned mutexes: */ + mutex_assert_not_owned(m); + qidx = mutex_qidx(m); + TAILQ_INSERT_TAIL(&curthread->mq[qidx], m, m_qe); + if (!is_pshared_mutex(m)) + TAILQ_INSERT_TAIL(&curthread->mq[qidx + 1], m, m_pqe); +} + +static void +dequeue_mutex(struct pthread *curthread, struct pthread_mutex *m) +{ + int qidx; + + m->m_owner = 0; + mutex_assert_is_owned(m); + qidx = mutex_qidx(m); + TAILQ_REMOVE(&curthread->mq[qidx], m, m_qe); + if (!is_pshared_mutex(m)) + TAILQ_REMOVE(&curthread->mq[qidx + 1], m, m_pqe); + if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) != 0) + set_inherited_priority(curthread, m); + mutex_init_link(m); +} + static int -mutex_trylock_common(pthread_mutex_t *mutex) +check_and_init_mutex(pthread_mutex_t *mutex, struct pthread_mutex **m) { - struct pthread *curthread = 
_get_curthread(); - struct pthread_mutex *m = *mutex; + int ret; + + *m = *mutex; + ret = 0; + if (*m == THR_PSHARED_PTR) { + *m = __thr_pshared_offpage(mutex, 0); + if (*m == NULL) + ret = EINVAL; + } else if (__predict_false(*m <= THR_MUTEX_DESTROYED)) { + if (*m == THR_MUTEX_DESTROYED) { + ret = EINVAL; + } else { + ret = init_static(_get_curthread(), mutex); + if (ret == 0) + *m = *mutex; + } + } + return (ret); +} + +int +__pthread_mutex_trylock(pthread_mutex_t *mutex) +{ + struct pthread *curthread; + struct pthread_mutex *m; uint32_t id; int ret; + ret = check_and_init_mutex(mutex, &m); + if (ret != 0) + return (ret); + curthread = _get_curthread(); id = TID(curthread); if (m->m_flags & PMUTEX_FLAG_PRIVATE) THR_CRITICAL_ENTER(curthread); ret = _thr_umutex_trylock(&m->m_lock, id); if (__predict_true(ret == 0)) { - ENQUEUE_MUTEX(curthread, m); - } else if (m->m_owner == curthread) { + enqueue_mutex(curthread, m); + } else if (m->m_owner == id) { ret = mutex_self_trylock(m); } /* else {} */ if (ret && (m->m_flags & PMUTEX_FLAG_PRIVATE)) THR_CRITICAL_LEAVE(curthread); return (ret); } -int -__pthread_mutex_trylock(pthread_mutex_t *mutex) -{ - struct pthread_mutex *m; - - CHECK_AND_INIT_MUTEX - - return (mutex_trylock_common(mutex)); -} - static int mutex_lock_sleep(struct pthread *curthread, struct pthread_mutex *m, const struct timespec *abstime) { uint32_t id, owner; int count; int ret; - if (m->m_owner == curthread) - return mutex_self_lock(m, abstime); - id = TID(curthread); + if (m->m_owner == id) + return (mutex_self_lock(m, abstime)); + /* * For adaptive mutexes, spin for a bit in the expectation * that if the application requests this mutex type then * the lock is likely to be released quickly and it is * faster than entering the kernel */ if (__predict_false( (m->m_lock.m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0)) goto sleep_in_kernel; if (!_thr_is_smp) goto yield_loop; count = m->m_spinloops; while (count--) { owner = m->m_lock.m_owner; if ((owner & ~UMUTEX_CONTESTED) == 0) { if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, id|owner)) { ret = 0; goto done; } } CPU_SPINWAIT; } yield_loop: count = m->m_yieldloops; while (count--) { _sched_yield(); owner = m->m_lock.m_owner; if ((owner & ~UMUTEX_CONTESTED) == 0) { if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, id|owner)) { ret = 0; goto done; } } } sleep_in_kernel: if (abstime == NULL) { ret = __thr_umutex_lock(&m->m_lock, id); } else if (__predict_false( abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)) { ret = EINVAL; } else { ret = __thr_umutex_timedlock(&m->m_lock, id, abstime); } done: if (ret == 0) - ENQUEUE_MUTEX(curthread, m); + enqueue_mutex(curthread, m); return (ret); } static inline int mutex_lock_common(struct pthread_mutex *m, const struct timespec *abstime, int cvattach) { struct pthread *curthread = _get_curthread(); int ret; if (!cvattach && m->m_flags & PMUTEX_FLAG_PRIVATE) THR_CRITICAL_ENTER(curthread); if (_thr_umutex_trylock2(&m->m_lock, TID(curthread)) == 0) { - ENQUEUE_MUTEX(curthread, m); + enqueue_mutex(curthread, m); ret = 0; } else { ret = mutex_lock_sleep(curthread, m, abstime); } if (ret && (m->m_flags & PMUTEX_FLAG_PRIVATE) && !cvattach) THR_CRITICAL_LEAVE(curthread); return (ret); } int __pthread_mutex_lock(pthread_mutex_t *mutex) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; _thr_check_init(); - - CHECK_AND_INIT_MUTEX - - return (mutex_lock_common(m, NULL, 0)); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + ret = mutex_lock_common(m, 
NULL, 0); + return (ret); } int -__pthread_mutex_timedlock(pthread_mutex_t *mutex, const struct timespec *abstime) +__pthread_mutex_timedlock(pthread_mutex_t *mutex, + const struct timespec *abstime) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; _thr_check_init(); - - CHECK_AND_INIT_MUTEX - - return (mutex_lock_common(m, abstime, 0)); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + ret = mutex_lock_common(m, abstime, 0); + return (ret); } int _pthread_mutex_unlock(pthread_mutex_t *mutex) { struct pthread_mutex *mp; - mp = *mutex; + if (*mutex == THR_PSHARED_PTR) { + mp = __thr_pshared_offpage(mutex, 0); + if (mp == NULL) + return (EINVAL); + } else { + mp = *mutex; + } return (mutex_unlock_common(mp, 0, NULL)); } int _mutex_cv_lock(struct pthread_mutex *m, int count) { int error; error = mutex_lock_common(m, NULL, 1); if (error == 0) m->m_count = count; return (error); } int _mutex_cv_unlock(struct pthread_mutex *m, int *count, int *defer) { /* * Clear the count in case this is a recursive mutex. */ *count = m->m_count; m->m_count = 0; (void)mutex_unlock_common(m, 1, defer); return (0); } int _mutex_cv_attach(struct pthread_mutex *m, int count) { struct pthread *curthread = _get_curthread(); - ENQUEUE_MUTEX(curthread, m); + enqueue_mutex(curthread, m); m->m_count = count; return (0); } int _mutex_cv_detach(struct pthread_mutex *mp, int *recurse) { struct pthread *curthread = _get_curthread(); int defered; int error; if ((error = _mutex_owned(curthread, mp)) != 0) return (error); /* * Clear the count in case this is a recursive mutex. */ *recurse = mp->m_count; mp->m_count = 0; - DEQUEUE_MUTEX(curthread, mp); + dequeue_mutex(curthread, mp); /* Will this happen in real-world ? */ if ((mp->m_flags & PMUTEX_FLAG_DEFERED) != 0) { defered = 1; mp->m_flags &= ~PMUTEX_FLAG_DEFERED; } else defered = 0; if (defered) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } return (0); } static int mutex_self_trylock(struct pthread_mutex *m) { int ret; switch (PMUTEX_TYPE(m->m_flags)) { case PTHREAD_MUTEX_ERRORCHECK: case PTHREAD_MUTEX_NORMAL: case PTHREAD_MUTEX_ADAPTIVE_NP: ret = EBUSY; break; case PTHREAD_MUTEX_RECURSIVE: /* Increment the lock count: */ if (m->m_count + 1 > 0) { m->m_count++; ret = 0; } else ret = EAGAIN; break; default: /* Trap invalid mutex types; */ ret = EINVAL; } return (ret); } static int mutex_self_lock(struct pthread_mutex *m, const struct timespec *abstime) { struct timespec ts1, ts2; int ret; switch (PMUTEX_TYPE(m->m_flags)) { case PTHREAD_MUTEX_ERRORCHECK: case PTHREAD_MUTEX_ADAPTIVE_NP: if (abstime) { if (abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) { ret = EINVAL; } else { clock_gettime(CLOCK_REALTIME, &ts1); TIMESPEC_SUB(&ts2, abstime, &ts1); __sys_nanosleep(&ts2, NULL); ret = ETIMEDOUT; } } else { /* * POSIX specifies that mutexes should return * EDEADLK if a recursive lock is detected. */ ret = EDEADLK; } break; case PTHREAD_MUTEX_NORMAL: /* * What SS2 define as a 'normal' mutex. Intentionally * deadlock on attempts to get a lock you already own. 
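 *
 * Summarizing the self-lock semantics implemented by this switch:
 * ERRORCHECK and ADAPTIVE mutexes fail with EDEADLK (or sleep out the
 * timeout and return ETIMEDOUT in the timed variants), NORMAL mutexes
 * deadlock by sleeping forever, and RECURSIVE mutexes just bump
 * m_count. For example (illustrative application code):
 *
 *	pthread_mutexattr_t ma;
 *	pthread_mutex_t m;
 *
 *	pthread_mutexattr_init(&ma);
 *	pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
 *	pthread_mutex_init(&m, &ma);
 *	pthread_mutex_lock(&m);
 *	assert(pthread_mutex_lock(&m) == EDEADLK);
 *
 * With PTHREAD_MUTEX_NORMAL instead, the second lock call would never
 * return.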
*/ ret = 0; if (abstime) { if (abstime->tv_sec < 0 || abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000) { ret = EINVAL; } else { clock_gettime(CLOCK_REALTIME, &ts1); TIMESPEC_SUB(&ts2, abstime, &ts1); __sys_nanosleep(&ts2, NULL); ret = ETIMEDOUT; } } else { ts1.tv_sec = 30; ts1.tv_nsec = 0; for (;;) __sys_nanosleep(&ts1, NULL); } break; case PTHREAD_MUTEX_RECURSIVE: /* Increment the lock count: */ if (m->m_count + 1 > 0) { m->m_count++; ret = 0; } else ret = EAGAIN; break; default: /* Trap invalid mutex types; */ ret = EINVAL; } return (ret); } static int mutex_unlock_common(struct pthread_mutex *m, int cv, int *mtx_defer) { struct pthread *curthread = _get_curthread(); uint32_t id; int defered, error; if (__predict_false(m <= THR_MUTEX_DESTROYED)) { if (m == THR_MUTEX_DESTROYED) return (EINVAL); return (EPERM); } + id = TID(curthread); + /* * Check if the running thread is not the owner of the mutex. */ - if (__predict_false(m->m_owner != curthread)) + if (__predict_false(m->m_owner != id)) return (EPERM); error = 0; - id = TID(curthread); if (__predict_false( PMUTEX_TYPE(m->m_flags) == PTHREAD_MUTEX_RECURSIVE && m->m_count > 0)) { m->m_count--; } else { if ((m->m_flags & PMUTEX_FLAG_DEFERED) != 0) { defered = 1; m->m_flags &= ~PMUTEX_FLAG_DEFERED; } else defered = 0; - DEQUEUE_MUTEX(curthread, m); + dequeue_mutex(curthread, m); error = _thr_umutex_unlock2(&m->m_lock, id, mtx_defer); if (mtx_defer == NULL && defered) { _thr_wake_all(curthread->defer_waiters, curthread->nwaiter_defer); curthread->nwaiter_defer = 0; } } if (!cv && m->m_flags & PMUTEX_FLAG_PRIVATE) THR_CRITICAL_LEAVE(curthread); return (error); } int _pthread_mutex_getprioceiling(pthread_mutex_t *mutex, - int *prioceiling) + int *prioceiling) { struct pthread_mutex *m; - int ret; - m = *mutex; - if ((m <= THR_MUTEX_DESTROYED) || - (m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) - ret = EINVAL; - else { - *prioceiling = m->m_lock.m_ceilings[0]; - ret = 0; + if (*mutex == THR_PSHARED_PTR) { + m = __thr_pshared_offpage(mutex, 0); + if (m == NULL) + return (EINVAL); + } else { + m = *mutex; + if (m <= THR_MUTEX_DESTROYED) + return (EINVAL); } - - return (ret); + if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) + return (EINVAL); + *prioceiling = m->m_lock.m_ceilings[0]; + return (0); } int _pthread_mutex_setprioceiling(pthread_mutex_t *mutex, - int ceiling, int *old_ceiling) + int ceiling, int *old_ceiling) { - struct pthread *curthread = _get_curthread(); + struct pthread *curthread; struct pthread_mutex *m, *m1, *m2; + struct mutex_queue *q, *qp; int ret; - m = *mutex; - if ((m <= THR_MUTEX_DESTROYED) || - (m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) + if (*mutex == THR_PSHARED_PTR) { + m = __thr_pshared_offpage(mutex, 0); + if (m == NULL) + return (EINVAL); + } else { + m = *mutex; + if (m <= THR_MUTEX_DESTROYED) + return (EINVAL); + } + if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); ret = __thr_umutex_set_ceiling(&m->m_lock, ceiling, old_ceiling); if (ret != 0) return (ret); - if (m->m_owner == curthread) { - MUTEX_ASSERT_IS_OWNED(m); + curthread = _get_curthread(); + if (m->m_owner == TID(curthread)) { + mutex_assert_is_owned(m); m1 = TAILQ_PREV(m, mutex_queue, m_qe); m2 = TAILQ_NEXT(m, m_qe); if ((m1 != NULL && m1->m_lock.m_ceilings[0] > (u_int)ceiling) || (m2 != NULL && m2->m_lock.m_ceilings[0] < (u_int)ceiling)) { - TAILQ_REMOVE(&curthread->pp_mutexq, m, m_qe); - TAILQ_FOREACH(m2, &curthread->pp_mutexq, m_qe) { + q = &curthread->mq[TMQ_NORM_PP]; + qp = &curthread->mq[TMQ_NORM_PP_PRIV]; + 
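/*
+		 * Editor's note: the new ceiling may break the ordering
+		 * of the PRIO_PROTECT queue, so m is pulled from the
+		 * full queue (q) and, unless it is process-shared, from
+		 * the private queue (qp), then reinserted below in
+		 * ceiling order.
+		 */
+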
TAILQ_REMOVE(q, m, m_qe); + if (!is_pshared_mutex(m)) + TAILQ_REMOVE(qp, m, m_pqe); + TAILQ_FOREACH(m2, q, m_qe) { if (m2->m_lock.m_ceilings[0] > (u_int)ceiling) { TAILQ_INSERT_BEFORE(m2, m, m_qe); + if (!is_pshared_mutex(m)) { + while (m2 != NULL && + is_pshared_mutex(m2)) { + m2 = TAILQ_PREV(m2, + mutex_queue, m_qe); + } + if (m2 == NULL) { + TAILQ_INSERT_HEAD(qp, + m, m_pqe); + } else { + TAILQ_INSERT_BEFORE(m2, + m, m_pqe); + } + } return (0); } } - TAILQ_INSERT_TAIL(&curthread->pp_mutexq, m, m_qe); + TAILQ_INSERT_TAIL(q, m, m_qe); + if (!is_pshared_mutex(m)) + TAILQ_INSERT_TAIL(qp, m, m_pqe); } } return (0); } int _pthread_mutex_getspinloops_np(pthread_mutex_t *mutex, int *count) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; - CHECK_AND_INIT_MUTEX - - *count = m->m_spinloops; - return (0); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + *count = m->m_spinloops; + return (ret); } int __pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; - CHECK_AND_INIT_MUTEX - - m->m_spinloops = count; - return (0); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + m->m_spinloops = count; + return (ret); } int _pthread_mutex_getyieldloops_np(pthread_mutex_t *mutex, int *count) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; - CHECK_AND_INIT_MUTEX - - *count = m->m_yieldloops; - return (0); + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + *count = m->m_yieldloops; + return (ret); } int __pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count) { - struct pthread_mutex *m; + struct pthread_mutex *m; + int ret; - CHECK_AND_INIT_MUTEX - - m->m_yieldloops = count; + ret = check_and_init_mutex(mutex, &m); + if (ret == 0) + m->m_yieldloops = count; - return (0); + return (ret); } int _pthread_mutex_isowned_np(pthread_mutex_t *mutex) { struct pthread_mutex *m; - m = *mutex; - if (m <= THR_MUTEX_DESTROYED) - return (0); - return (m->m_owner == _get_curthread()); + if (*mutex == THR_PSHARED_PTR) { + m = __thr_pshared_offpage(mutex, 0); + if (m == NULL) + return (0); + } else { + m = *mutex; + if (m <= THR_MUTEX_DESTROYED) + return (0); + } + return (m->m_owner == TID(_get_curthread())); } int _mutex_owned(struct pthread *curthread, const struct pthread_mutex *mp) { if (__predict_false(mp <= THR_MUTEX_DESTROYED)) { if (mp == THR_MUTEX_DESTROYED) return (EINVAL); return (EPERM); } - if (mp->m_owner != curthread) + if (mp->m_owner != TID(curthread)) return (EPERM); return (0); } Index: head/lib/libthr/thread/thr_mutexattr.c =================================================================== --- head/lib/libthr/thread/thr_mutexattr.c (revision 296161) +++ head/lib/libthr/thread/thr_mutexattr.c (revision 296162) @@ -1,255 +1,252 @@ /* * Copyright (c) 1996 Jeffrey Hsu . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Copyright (c) 1997 John Birrell . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include "namespace.h" #include #include #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_mutexattr_init, pthread_mutexattr_init); __weak_reference(_pthread_mutexattr_setkind_np, pthread_mutexattr_setkind_np); __weak_reference(_pthread_mutexattr_getkind_np, pthread_mutexattr_getkind_np); __weak_reference(_pthread_mutexattr_gettype, pthread_mutexattr_gettype); __weak_reference(_pthread_mutexattr_settype, pthread_mutexattr_settype); __weak_reference(_pthread_mutexattr_destroy, pthread_mutexattr_destroy); __weak_reference(_pthread_mutexattr_getpshared, pthread_mutexattr_getpshared); __weak_reference(_pthread_mutexattr_setpshared, pthread_mutexattr_setpshared); __weak_reference(_pthread_mutexattr_getprotocol, pthread_mutexattr_getprotocol); __weak_reference(_pthread_mutexattr_setprotocol, pthread_mutexattr_setprotocol); __weak_reference(_pthread_mutexattr_getprioceiling, pthread_mutexattr_getprioceiling); __weak_reference(_pthread_mutexattr_setprioceiling, pthread_mutexattr_setprioceiling); int _pthread_mutexattr_init(pthread_mutexattr_t *attr) { int ret; pthread_mutexattr_t pattr; if ((pattr = (pthread_mutexattr_t) malloc(sizeof(struct pthread_mutex_attr))) == NULL) { ret = ENOMEM; } else { memcpy(pattr, &_pthread_mutexattr_default, sizeof(struct pthread_mutex_attr)); *attr = pattr; ret = 0; } return (ret); } int _pthread_mutexattr_setkind_np(pthread_mutexattr_t *attr, int kind) { int ret; if (attr == NULL || *attr == NULL) { errno = EINVAL; ret = -1; } else { (*attr)->m_type = kind; ret = 0; } return(ret); } int _pthread_mutexattr_getkind_np(pthread_mutexattr_t attr) { int ret; if (attr == NULL) { errno = EINVAL; ret = -1; } else { ret = attr->m_type; } return(ret); } int _pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type) { int ret; if (attr == NULL || *attr == NULL || type >= PTHREAD_MUTEX_TYPE_MAX) { ret = EINVAL; } else { (*attr)->m_type = type; ret = 0; } return(ret); } int _pthread_mutexattr_gettype(pthread_mutexattr_t *attr, int *type) { int ret; if (attr == NULL || *attr == NULL || (*attr)->m_type >= PTHREAD_MUTEX_TYPE_MAX) { ret = EINVAL; } else { *type = (*attr)->m_type; ret = 0; } return ret; } int _pthread_mutexattr_destroy(pthread_mutexattr_t *attr) { int ret; if (attr == NULL || *attr == NULL) { ret = EINVAL; } else { free(*attr); *attr = NULL; ret = 0; } return(ret); } int _pthread_mutexattr_getpshared(const pthread_mutexattr_t *attr, int *pshared) { if (attr == NULL || *attr == NULL) return (EINVAL); - - *pshared = PTHREAD_PROCESS_PRIVATE; + *pshared = (*attr)->m_pshared; return (0); } int _pthread_mutexattr_setpshared(pthread_mutexattr_t *attr, int pshared) { - if (attr == NULL || *attr == NULL) + if (attr == NULL || *attr == NULL || + (pshared != PTHREAD_PROCESS_PRIVATE && + pshared != PTHREAD_PROCESS_SHARED)) return (EINVAL); - - /* Only PTHREAD_PROCESS_PRIVATE is supported. 
*/ - if (pshared != PTHREAD_PROCESS_PRIVATE) - return (EINVAL); - + (*attr)->m_pshared = pshared; return (0); } int _pthread_mutexattr_getprotocol(pthread_mutexattr_t *mattr, int *protocol) { int ret = 0; if ((mattr == NULL) || (*mattr == NULL)) ret = EINVAL; else *protocol = (*mattr)->m_protocol; return(ret); } int _pthread_mutexattr_setprotocol(pthread_mutexattr_t *mattr, int protocol) { int ret = 0; if ((mattr == NULL) || (*mattr == NULL) || (protocol < PTHREAD_PRIO_NONE) || (protocol > PTHREAD_PRIO_PROTECT)) ret = EINVAL; else { (*mattr)->m_protocol = protocol; (*mattr)->m_ceiling = THR_MAX_RR_PRIORITY; } return(ret); } int _pthread_mutexattr_getprioceiling(pthread_mutexattr_t *mattr, int *prioceiling) { int ret = 0; if ((mattr == NULL) || (*mattr == NULL)) ret = EINVAL; else if ((*mattr)->m_protocol != PTHREAD_PRIO_PROTECT) ret = EINVAL; else *prioceiling = (*mattr)->m_ceiling; return(ret); } int _pthread_mutexattr_setprioceiling(pthread_mutexattr_t *mattr, int prioceiling) { int ret = 0; if ((mattr == NULL) || (*mattr == NULL)) ret = EINVAL; else if ((*mattr)->m_protocol != PTHREAD_PRIO_PROTECT) ret = EINVAL; else (*mattr)->m_ceiling = prioceiling; return(ret); } Index: head/lib/libthr/thread/thr_private.h =================================================================== --- head/lib/libthr/thread/thr_private.h (revision 296161) +++ head/lib/libthr/thread/thr_private.h (revision 296162) @@ -1,941 +1,958 @@ /* * Copyright (C) 2005 Daniel M. Eischen * Copyright (c) 2005 David Xu * Copyright (c) 1995-1998 John Birrell . * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _THR_PRIVATE_H #define _THR_PRIVATE_H /* * Include files. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SYM_FB10(sym) __CONCAT(sym, _fb10) #define SYM_FBP10(sym) __CONCAT(sym, _fbp10) #define WEAK_REF(sym, alias) __weak_reference(sym, alias) #define SYM_COMPAT(sym, impl, ver) __sym_compat(sym, impl, ver) #define SYM_DEFAULT(sym, impl, ver) __sym_default(sym, impl, ver) #define FB10_COMPAT(func, sym) \ WEAK_REF(func, SYM_FB10(sym)); \ SYM_COMPAT(sym, SYM_FB10(sym), FBSD_1.0) #define FB10_COMPAT_PRIVATE(func, sym) \ WEAK_REF(func, SYM_FBP10(sym)); \ SYM_DEFAULT(sym, SYM_FBP10(sym), FBSDprivate_1.0) #include "pthread_md.h" #include "thr_umtx.h" #include "thread_db.h" #ifdef _PTHREAD_FORCED_UNWIND #define _BSD_SOURCE #include #endif typedef TAILQ_HEAD(pthreadlist, pthread) pthreadlist; typedef TAILQ_HEAD(atfork_head, pthread_atfork) atfork_head; TAILQ_HEAD(mutex_queue, pthread_mutex); /* Signal to do cancellation */ #define SIGCANCEL SIGTHR /* * Kernel fatal error handler macro. */ #define PANIC(string) _thread_exit(__FILE__,__LINE__,string) /* Output debug messages like this: */ #define stdout_debug(args...) _thread_printf(STDOUT_FILENO, ##args) #define stderr_debug(args...) _thread_printf(STDERR_FILENO, ##args) #ifdef _PTHREADS_INVARIANTS #define THR_ASSERT(cond, msg) do { \ if (__predict_false(!(cond))) \ PANIC(msg); \ } while (0) #else #define THR_ASSERT(cond, msg) #endif #ifdef PIC # define STATIC_LIB_REQUIRE(name) #else # define STATIC_LIB_REQUIRE(name) __asm (".globl " #name) #endif #define TIMESPEC_ADD(dst, src, val) \ do { \ (dst)->tv_sec = (src)->tv_sec + (val)->tv_sec; \ (dst)->tv_nsec = (src)->tv_nsec + (val)->tv_nsec; \ if ((dst)->tv_nsec >= 1000000000) { \ (dst)->tv_sec++; \ (dst)->tv_nsec -= 1000000000; \ } \ } while (0) #define TIMESPEC_SUB(dst, src, val) \ do { \ (dst)->tv_sec = (src)->tv_sec - (val)->tv_sec; \ (dst)->tv_nsec = (src)->tv_nsec - (val)->tv_nsec; \ if ((dst)->tv_nsec < 0) { \ (dst)->tv_sec--; \ (dst)->tv_nsec += 1000000000; \ } \ } while (0) +/* Magic cookie set for shared pthread locks and cv's pointers */ +#define THR_PSHARED_PTR \ + ((void *)(uintptr_t)((1ULL << (NBBY * sizeof(long) - 1)) | 1)) + /* XXX These values should be same as those defined in pthread.h */ #define THR_MUTEX_INITIALIZER ((struct pthread_mutex *)NULL) #define THR_ADAPTIVE_MUTEX_INITIALIZER ((struct pthread_mutex *)1) #define THR_MUTEX_DESTROYED ((struct pthread_mutex *)2) #define THR_COND_INITIALIZER ((struct pthread_cond *)NULL) #define THR_COND_DESTROYED ((struct pthread_cond *)1) #define THR_RWLOCK_INITIALIZER ((struct pthread_rwlock *)NULL) #define THR_RWLOCK_DESTROYED ((struct pthread_rwlock *)1) #define PMUTEX_FLAG_TYPE_MASK 0x0ff #define PMUTEX_FLAG_PRIVATE 0x100 #define PMUTEX_FLAG_DEFERED 0x200 #define PMUTEX_TYPE(mtxflags) ((mtxflags) & PMUTEX_FLAG_TYPE_MASK) #define MAX_DEFER_WAITERS 50 struct pthread_mutex { /* * Lock for accesses to this structure. */ struct umutex m_lock; int m_flags; - struct pthread *m_owner; + uint32_t m_owner; int m_count; int m_spinloops; int m_yieldloops; /* - * Link for all mutexes a thread currently owns. + * Link for all mutexes a thread currently owns, of the same + * prio type. */ TAILQ_ENTRY(pthread_mutex) m_qe; + /* Link for all private mutexes a thread currently owns. 
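+	 * (Editor's note: process-shared mutexes are linked only via
+	 * m_qe; the is_pshared_mutex() checks in thr_mutex.c keep
+	 * them off this private list.)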
*/ + TAILQ_ENTRY(pthread_mutex) m_pqe; }; struct pthread_mutex_attr { enum pthread_mutextype m_type; int m_protocol; int m_ceiling; + int m_pshared; }; #define PTHREAD_MUTEXATTR_STATIC_INITIALIZER \ { PTHREAD_MUTEX_DEFAULT, PTHREAD_PRIO_NONE, 0, MUTEX_FLAGS_PRIVATE } struct pthread_cond { __uint32_t __has_user_waiters; __uint32_t __has_kern_waiters; __uint32_t __flags; __uint32_t __clock_id; }; struct pthread_cond_attr { int c_pshared; int c_clockid; }; struct pthread_barrier { struct umutex b_lock; struct ucond b_cv; int64_t b_cycle; int b_count; int b_waiters; int b_refcount; int b_destroying; }; struct pthread_barrierattr { int pshared; }; struct pthread_spinlock { struct umutex s_lock; }; /* * Flags for condition variables. */ #define COND_FLAGS_PRIVATE 0x01 #define COND_FLAGS_INITED 0x02 #define COND_FLAGS_BUSY 0x04 /* * Cleanup definitions. */ struct pthread_cleanup { struct pthread_cleanup *prev; void (*routine)(void *); void *routine_arg; int onheap; }; #define THR_CLEANUP_PUSH(td, func, arg) { \ struct pthread_cleanup __cup; \ \ __cup.routine = func; \ __cup.routine_arg = arg; \ __cup.onheap = 0; \ __cup.prev = (td)->cleanup; \ (td)->cleanup = &__cup; #define THR_CLEANUP_POP(td, exec) \ (td)->cleanup = __cup.prev; \ if ((exec) != 0) \ __cup.routine(__cup.routine_arg); \ } struct pthread_atfork { TAILQ_ENTRY(pthread_atfork) qe; void (*prepare)(void); void (*parent)(void); void (*child)(void); }; struct pthread_attr { #define pthread_attr_start_copy sched_policy int sched_policy; int sched_inherit; int prio; int suspend; #define THR_STACK_USER 0x100 /* 0xFF reserved for */ int flags; void *stackaddr_attr; size_t stacksize_attr; size_t guardsize_attr; #define pthread_attr_end_copy cpuset cpuset_t *cpuset; size_t cpusetsize; }; struct wake_addr { struct wake_addr *link; unsigned int value; char pad[12]; }; struct sleepqueue { TAILQ_HEAD(, pthread) sq_blocked; SLIST_HEAD(, sleepqueue) sq_freeq; LIST_ENTRY(sleepqueue) sq_hash; SLIST_ENTRY(sleepqueue) sq_flink; void *sq_wchan; int sq_type; }; /* * Thread creation state attributes. */ #define THR_CREATE_RUNNING 0 #define THR_CREATE_SUSPENDED 1 /* * Miscellaneous definitions. */ #define THR_STACK_DEFAULT (sizeof(void *) / 4 * 1024 * 1024) /* * Maximum size of initial thread's stack. This perhaps deserves to be larger * than the stacks of other threads, since many applications are likely to run * almost entirely on this stack. */ #define THR_STACK_INITIAL (THR_STACK_DEFAULT * 2) /* * Define priorities returned by kernel. */ #define THR_MIN_PRIORITY (_thr_priorities[SCHED_OTHER-1].pri_min) #define THR_MAX_PRIORITY (_thr_priorities[SCHED_OTHER-1].pri_max) #define THR_DEF_PRIORITY (_thr_priorities[SCHED_OTHER-1].pri_default) #define THR_MIN_RR_PRIORITY (_thr_priorities[SCHED_RR-1].pri_min) #define THR_MAX_RR_PRIORITY (_thr_priorities[SCHED_RR-1].pri_max) #define THR_DEF_RR_PRIORITY (_thr_priorities[SCHED_RR-1].pri_default) /* XXX The SCHED_FIFO should have same priority range as SCHED_RR */ #define THR_MIN_FIFO_PRIORITY (_thr_priorities[SCHED_FIFO_1].pri_min) #define THR_MAX_FIFO_PRIORITY (_thr_priorities[SCHED_FIFO-1].pri_max) #define THR_DEF_FIFO_PRIORITY (_thr_priorities[SCHED_FIFO-1].pri_default) struct pthread_prio { int pri_min; int pri_max; int pri_default; }; struct pthread_rwlockattr { int pshared; }; struct pthread_rwlock { struct urwlock lock; - struct pthread *owner; + uint32_t owner; }; /* * Thread states. 
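 * (Editor's note: the 1:1 implementation distinguishes only
 * PS_RUNNING and PS_DEAD.)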
*/ enum pthread_state { PS_RUNNING, PS_DEAD }; struct pthread_specific_elem { const void *data; int seqno; }; struct pthread_key { volatile int allocated; int seqno; void (*destructor)(void *); }; /* * lwpid_t is 32-bit but kernel thr API exports tid as long type * to preserve the ABI for the M:N model in the very early days (r131431). */ #define TID(thread) ((uint32_t) ((thread)->tid)) /* * Thread structure. */ struct pthread { #define _pthread_startzero tid /* Kernel thread id. */ long tid; #define TID_TERMINATED 1 /* * Lock for accesses to this thread structure. */ struct umutex lock; /* Internal condition variable cycle number. */ uint32_t cycle; /* How many low-level locks the thread holds. */ int locklevel; /* * Set to non-zero when this thread has entered a critical * region. We allow for recursive entries into critical regions. */ int critical_count; /* Signal blocked counter. */ int sigblock; /* Queue entry for list of all threads. */ TAILQ_ENTRY(pthread) tle; /* link for all threads in process */ /* Queue entry for GC lists. */ TAILQ_ENTRY(pthread) gcle; /* Hash queue entry. */ LIST_ENTRY(pthread) hle; /* Sleep queue entry */ TAILQ_ENTRY(pthread) wle; /* Threads reference count. */ int refcount; /* * Thread start routine, argument, stack pointer and thread * attributes. */ void *(*start_routine)(void *); void *arg; struct pthread_attr attr; #define SHOULD_CANCEL(thr) \ ((thr)->cancel_pending && (thr)->cancel_enable && \ (thr)->no_cancel == 0) /* Cancellation is enabled */ int cancel_enable; /* Cancellation request is pending */ int cancel_pending; /* Thread is at cancellation point */ int cancel_point; /* Cancellation is temporarily disabled */ int no_cancel; /* Asynchronous cancellation is enabled */ int cancel_async; /* Cancellation is in progress */ int cancelling; /* Thread temporary signal mask. */ sigset_t sigmask; /* Thread should unblock SIGCANCEL. */ int unblock_sigcancel; /* In sigsuspend state */ int in_sigsuspend; /* deferred signal info */ siginfo_t deferred_siginfo; /* signal mask to restore. */ sigset_t deferred_sigmask; /* the sigaction should be used for deferred signal. */ struct sigaction deferred_sigact; /* deferred signal delivery is performed, do not reenter. */ int deferred_run; /* Force new thread to exit. */ int force_exit; /* Thread state: */ enum pthread_state state; /* * Error variable used instead of errno. The function __error() * returns a pointer to this. */ int error; /* * The joiner is the thread that is joining to this thread. The * join status keeps track of a join operation to another thread. */ struct pthread *joiner; /* Miscellaneous flags; only set with scheduling lock held. */ int flags; #define THR_FLAGS_PRIVATE 0x0001 #define THR_FLAGS_NEED_SUSPEND 0x0002 /* thread should be suspended */ #define THR_FLAGS_SUSPENDED 0x0004 /* thread is suspended */ #define THR_FLAGS_DETACHED 0x0008 /* thread is detached */ /* Thread list flags; only set with thread list lock held. */ int tlflags; #define TLFLAGS_GC_SAFE 0x0001 /* thread safe for cleaning */ #define TLFLAGS_IN_TDLIST 0x0002 /* thread in all thread list */ #define TLFLAGS_IN_GCLIST 0x0004 /* thread in gc list */ - /* Queue of currently owned NORMAL or PRIO_INHERIT type mutexes. */ - struct mutex_queue mutexq; + /* + * Queues of the owned mutexes. The private queue must have the + * index of the corresponding full queue plus 1. 
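+	 * (Editor's note: i.e. the private variant of queue q is
+	 * mq[q + 1]; see TMQ_NORM_PRIV == TMQ_NORM + 1 and
+	 * TMQ_NORM_PP_PRIV == TMQ_NORM_PP + 1 below.)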
+ */ +#define TMQ_NORM 0 /* NORMAL or PRIO_INHERIT normal */ +#define TMQ_NORM_PRIV 1 /* NORMAL or PRIO_INHERIT normal priv */ +#define TMQ_NORM_PP 2 /* PRIO_PROTECT normal mutexes */ +#define TMQ_NORM_PP_PRIV 3 /* PRIO_PROTECT normal priv */ +#define TMQ_NITEMS 4 + struct mutex_queue mq[TMQ_NITEMS]; - /* Queue of all owned PRIO_PROTECT mutexes. */ - struct mutex_queue pp_mutexq; - void *ret; struct pthread_specific_elem *specific; int specific_data_count; /* Number rwlocks rdlocks held. */ int rdlock_count; /* * Current locks bitmap for rtld. */ int rtld_bits; /* Thread control block */ struct tcb *tcb; /* Cleanup handlers Link List */ struct pthread_cleanup *cleanup; #ifdef _PTHREAD_FORCED_UNWIND struct _Unwind_Exception ex; void *unwind_stackend; int unwind_disabled; #endif /* * Magic value to help recognize a valid thread structure * from an invalid one: */ #define THR_MAGIC ((u_int32_t) 0xd09ba115) u_int32_t magic; /* Enable event reporting */ int report_events; /* Event mask */ int event_mask; /* Event */ td_event_msg_t event_buf; /* Wait channel */ void *wchan; /* Referenced mutex. */ struct pthread_mutex *mutex_obj; /* Thread will sleep. */ int will_sleep; /* Number of threads deferred. */ int nwaiter_defer; /* Deferred threads from pthread_cond_signal. */ unsigned int *defer_waiters[MAX_DEFER_WAITERS]; #define _pthread_endzero wake_addr struct wake_addr *wake_addr; #define WAKE_ADDR(td) ((td)->wake_addr) /* Sleep queue */ struct sleepqueue *sleepqueue; }; #define THR_SHOULD_GC(thrd) \ ((thrd)->refcount == 0 && (thrd)->state == PS_DEAD && \ ((thrd)->flags & THR_FLAGS_DETACHED) != 0) #define THR_IN_CRITICAL(thrd) \ (((thrd)->locklevel > 0) || \ ((thrd)->critical_count > 0)) #define THR_CRITICAL_ENTER(thrd) \ (thrd)->critical_count++ #define THR_CRITICAL_LEAVE(thrd) \ do { \ (thrd)->critical_count--; \ _thr_ast(thrd); \ } while (0) #define THR_UMUTEX_TRYLOCK(thrd, lck) \ _thr_umutex_trylock((lck), TID(thrd)) #define THR_UMUTEX_LOCK(thrd, lck) \ _thr_umutex_lock((lck), TID(thrd)) #define THR_UMUTEX_TIMEDLOCK(thrd, lck, timo) \ _thr_umutex_timedlock((lck), TID(thrd), (timo)) #define THR_UMUTEX_UNLOCK(thrd, lck) \ _thr_umutex_unlock((lck), TID(thrd)) #define THR_LOCK_ACQUIRE(thrd, lck) \ do { \ (thrd)->locklevel++; \ _thr_umutex_lock(lck, TID(thrd)); \ } while (0) #define THR_LOCK_ACQUIRE_SPIN(thrd, lck) \ do { \ (thrd)->locklevel++; \ _thr_umutex_lock_spin(lck, TID(thrd)); \ } while (0) #ifdef _PTHREADS_INVARIANTS #define THR_ASSERT_LOCKLEVEL(thrd) \ do { \ if (__predict_false((thrd)->locklevel <= 0)) \ _thr_assert_lock_level(); \ } while (0) #else #define THR_ASSERT_LOCKLEVEL(thrd) #endif #define THR_LOCK_RELEASE(thrd, lck) \ do { \ THR_ASSERT_LOCKLEVEL(thrd); \ _thr_umutex_unlock((lck), TID(thrd)); \ (thrd)->locklevel--; \ _thr_ast(thrd); \ } while (0) #define THR_LOCK(curthrd) THR_LOCK_ACQUIRE(curthrd, &(curthrd)->lock) #define THR_UNLOCK(curthrd) THR_LOCK_RELEASE(curthrd, &(curthrd)->lock) #define THR_THREAD_LOCK(curthrd, thr) THR_LOCK_ACQUIRE(curthrd, &(thr)->lock) #define THR_THREAD_UNLOCK(curthrd, thr) THR_LOCK_RELEASE(curthrd, &(thr)->lock) #define THREAD_LIST_RDLOCK(curthrd) \ do { \ (curthrd)->locklevel++; \ _thr_rwl_rdlock(&_thr_list_lock); \ } while (0) #define THREAD_LIST_WRLOCK(curthrd) \ do { \ (curthrd)->locklevel++; \ _thr_rwl_wrlock(&_thr_list_lock); \ } while (0) #define THREAD_LIST_UNLOCK(curthrd) \ do { \ _thr_rwl_unlock(&_thr_list_lock); \ (curthrd)->locklevel--; \ _thr_ast(curthrd); \ } while (0) /* * Macros to insert/remove threads to the all thread list and * 
the gc list. */ #define THR_LIST_ADD(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) == 0) { \ TAILQ_INSERT_HEAD(&_thread_list, thrd, tle); \ _thr_hash_add(thrd); \ (thrd)->tlflags |= TLFLAGS_IN_TDLIST; \ } \ } while (0) #define THR_LIST_REMOVE(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) != 0) { \ TAILQ_REMOVE(&_thread_list, thrd, tle); \ _thr_hash_remove(thrd); \ (thrd)->tlflags &= ~TLFLAGS_IN_TDLIST; \ } \ } while (0) #define THR_GCLIST_ADD(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) == 0) { \ TAILQ_INSERT_HEAD(&_thread_gc_list, thrd, gcle);\ (thrd)->tlflags |= TLFLAGS_IN_GCLIST; \ _gc_count++; \ } \ } while (0) #define THR_GCLIST_REMOVE(thrd) do { \ if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) != 0) { \ TAILQ_REMOVE(&_thread_gc_list, thrd, gcle); \ (thrd)->tlflags &= ~TLFLAGS_IN_GCLIST; \ _gc_count--; \ } \ } while (0) #define THR_REF_ADD(curthread, pthread) { \ THR_CRITICAL_ENTER(curthread); \ pthread->refcount++; \ } while (0) #define THR_REF_DEL(curthread, pthread) { \ pthread->refcount--; \ THR_CRITICAL_LEAVE(curthread); \ } while (0) #define GC_NEEDED() (_gc_count >= 5) #define SHOULD_REPORT_EVENT(curthr, e) \ (curthr->report_events && \ (((curthr)->event_mask | _thread_event_mask ) & e) != 0) extern int __isthreaded; /* * Global variables for the pthread kernel. */ extern char *_usrstack __hidden; extern struct pthread *_thr_initial __hidden; /* For debugger */ extern int _libthr_debug; extern int _thread_event_mask; extern struct pthread *_thread_last_event; /* List of all threads: */ extern pthreadlist _thread_list; /* List of threads needing GC: */ extern pthreadlist _thread_gc_list __hidden; extern int _thread_active_threads; extern atfork_head _thr_atfork_list __hidden; extern struct urwlock _thr_atfork_lock __hidden; /* Default thread attributes: */ extern struct pthread_attr _pthread_attr_default __hidden; /* Default mutex attributes: */ extern struct pthread_mutex_attr _pthread_mutexattr_default __hidden; extern struct pthread_mutex_attr _pthread_mutexattr_adaptive_default __hidden; /* Default condition variable attributes: */ extern struct pthread_cond_attr _pthread_condattr_default __hidden; extern struct pthread_prio _thr_priorities[] __hidden; extern pid_t _thr_pid __hidden; extern int _thr_is_smp __hidden; extern size_t _thr_guard_default __hidden; extern size_t _thr_stack_default __hidden; extern size_t _thr_stack_initial __hidden; extern int _thr_page_size __hidden; extern int _thr_spinloops __hidden; extern int _thr_yieldloops __hidden; extern int _thr_queuefifo __hidden; /* Garbage thread count. */ extern int _gc_count __hidden; extern struct umutex _mutex_static_lock __hidden; extern struct umutex _cond_static_lock __hidden; extern struct umutex _rwlock_static_lock __hidden; extern struct umutex _keytable_lock __hidden; extern struct urwlock _thr_list_lock __hidden; extern struct umutex _thr_event_lock __hidden; extern struct umutex _suspend_all_lock __hidden; extern int _suspend_all_waiters __hidden; extern int _suspend_all_cycle __hidden; extern struct pthread *_single_thread __hidden; /* * Function prototype definitions. 
*/ __BEGIN_DECLS int _thr_setthreaded(int) __hidden; int _mutex_cv_lock(struct pthread_mutex *, int) __hidden; int _mutex_cv_unlock(struct pthread_mutex *, int *, int *) __hidden; int _mutex_cv_attach(struct pthread_mutex *, int) __hidden; int _mutex_cv_detach(struct pthread_mutex *, int *) __hidden; int _mutex_owned(struct pthread *, const struct pthread_mutex *) __hidden; int _mutex_reinit(pthread_mutex_t *) __hidden; void _mutex_fork(struct pthread *curthread) __hidden; void _libpthread_init(struct pthread *) __hidden; struct pthread *_thr_alloc(struct pthread *) __hidden; void _thread_exit(const char *, int, const char *) __hidden __dead2; int _thr_ref_add(struct pthread *, struct pthread *, int) __hidden; void _thr_ref_delete(struct pthread *, struct pthread *) __hidden; void _thr_ref_delete_unlocked(struct pthread *, struct pthread *) __hidden; int _thr_find_thread(struct pthread *, struct pthread *, int) __hidden; void _thr_rtld_init(void) __hidden; void _thr_rtld_postfork_child(void) __hidden; int _thr_stack_alloc(struct pthread_attr *) __hidden; void _thr_stack_free(struct pthread_attr *) __hidden; void _thr_free(struct pthread *, struct pthread *) __hidden; void _thr_gc(struct pthread *) __hidden; void _thread_cleanupspecific(void) __hidden; void _thread_printf(int, const char *, ...) __hidden; void _thr_spinlock_init(void) __hidden; void _thr_cancel_enter(struct pthread *) __hidden; void _thr_cancel_enter2(struct pthread *, int) __hidden; void _thr_cancel_leave(struct pthread *, int) __hidden; void _thr_testcancel(struct pthread *) __hidden; void _thr_signal_block(struct pthread *) __hidden; void _thr_signal_unblock(struct pthread *) __hidden; void _thr_signal_init(int) __hidden; void _thr_signal_deinit(void) __hidden; int _thr_send_sig(struct pthread *, int sig) __hidden; void _thr_list_init(void) __hidden; void _thr_hash_add(struct pthread *) __hidden; void _thr_hash_remove(struct pthread *) __hidden; struct pthread *_thr_hash_find(struct pthread *) __hidden; void _thr_link(struct pthread *, struct pthread *) __hidden; void _thr_unlink(struct pthread *, struct pthread *) __hidden; void _thr_assert_lock_level(void) __hidden __dead2; void _thr_ast(struct pthread *) __hidden; void _thr_once_init(void) __hidden; void _thr_report_creation(struct pthread *curthread, struct pthread *newthread) __hidden; void _thr_report_death(struct pthread *curthread) __hidden; int _thr_getscheduler(lwpid_t, int *, struct sched_param *) __hidden; int _thr_setscheduler(lwpid_t, int, const struct sched_param *) __hidden; void _thr_signal_prefork(void) __hidden; void _thr_signal_postfork(void) __hidden; void _thr_signal_postfork_child(void) __hidden; void _thr_suspend_all_lock(struct pthread *) __hidden; void _thr_suspend_all_unlock(struct pthread *) __hidden; void _thr_try_gc(struct pthread *, struct pthread *) __hidden; int _rtp_to_schedparam(const struct rtprio *rtp, int *policy, struct sched_param *param) __hidden; int _schedparam_to_rtp(int policy, const struct sched_param *param, struct rtprio *rtp) __hidden; void _thread_bp_create(void); void _thread_bp_death(void); int _sched_yield(void); void _pthread_cleanup_push(void (*)(void *), void *); void _pthread_cleanup_pop(int); void _pthread_exit_mask(void *status, sigset_t *mask) __dead2 __hidden; void _pthread_cancel_enter(int maycancel); void _pthread_cancel_leave(int maycancel); /* #include */ #ifdef _SYS_FCNTL_H_ int __sys_fcntl(int, int, ...); int __sys_openat(int, const char *, int, ...); #endif /* #include */ #ifdef _SIGNAL_H_ int 
__sys_kill(pid_t, int); int __sys_sigaction(int, const struct sigaction *, struct sigaction *); int __sys_sigpending(sigset_t *); int __sys_sigprocmask(int, const sigset_t *, sigset_t *); int __sys_sigsuspend(const sigset_t *); int __sys_sigreturn(const ucontext_t *); int __sys_sigaltstack(const struct sigaltstack *, struct sigaltstack *); int __sys_sigwait(const sigset_t *, int *); int __sys_sigtimedwait(const sigset_t *, siginfo_t *, const struct timespec *); int __sys_sigwaitinfo(const sigset_t *set, siginfo_t *info); #endif /* #include */ #ifdef _TIME_H_ int __sys_nanosleep(const struct timespec *, struct timespec *); #endif /* #include */ #ifdef _SYS_UCONTEXT_H_ int __sys_setcontext(const ucontext_t *ucp); int __sys_swapcontext(ucontext_t *oucp, const ucontext_t *ucp); #endif /* #include */ #ifdef _UNISTD_H_ int __sys_close(int); int __sys_fork(void); pid_t __sys_getpid(void); ssize_t __sys_read(int, void *, size_t); void __sys_exit(int); #endif static inline int _thr_isthreaded(void) { return (__isthreaded != 0); } static inline int _thr_is_inited(void) { return (_thr_initial != NULL); } static inline void _thr_check_init(void) { if (_thr_initial == NULL) _libpthread_init(NULL); } struct wake_addr *_thr_alloc_wake_addr(void); void _thr_release_wake_addr(struct wake_addr *); int _thr_sleep(struct pthread *, int, const struct timespec *); void _thr_wake_addr_init(void) __hidden; static inline void _thr_clear_wake(struct pthread *td) { td->wake_addr->value = 0; } static inline int _thr_is_woken(struct pthread *td) { return td->wake_addr->value != 0; } static inline void _thr_set_wake(unsigned int *waddr) { *waddr = 1; _thr_umtx_wake(waddr, INT_MAX, 0); } void _thr_wake_all(unsigned int *waddrs[], int) __hidden; static inline struct pthread * _sleepq_first(struct sleepqueue *sq) { return TAILQ_FIRST(&sq->sq_blocked); } void _sleepq_init(void) __hidden; struct sleepqueue *_sleepq_alloc(void) __hidden; void _sleepq_free(struct sleepqueue *) __hidden; void _sleepq_lock(void *) __hidden; void _sleepq_unlock(void *) __hidden; struct sleepqueue *_sleepq_lookup(void *) __hidden; void _sleepq_add(void *, struct pthread *) __hidden; int _sleepq_remove(struct sleepqueue *, struct pthread *) __hidden; void _sleepq_drop(struct sleepqueue *, void (*cb)(struct pthread *, void *arg), void *) __hidden; int _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex, void *(calloc_cb)(size_t, size_t)); struct dl_phdr_info; void __pthread_cxa_finalize(struct dl_phdr_info *phdr_info); void _thr_tsd_unload(struct dl_phdr_info *phdr_info) __hidden; void _thr_sigact_unload(struct dl_phdr_info *phdr_info) __hidden; void _thr_stack_fix_protection(struct pthread *thrd); int *__error_threaded(void) __hidden; void __thr_interpose_libc(void) __hidden; pid_t __thr_fork(void); int __thr_setcontext(const ucontext_t *ucp); int __thr_sigaction(int sig, const struct sigaction *act, struct sigaction *oact) __hidden; int __thr_sigprocmask(int how, const sigset_t *set, sigset_t *oset); int __thr_sigsuspend(const sigset_t * set); int __thr_sigtimedwait(const sigset_t *set, siginfo_t *info, const struct timespec * timeout); int __thr_sigwait(const sigset_t *set, int *sig); int __thr_sigwaitinfo(const sigset_t *set, siginfo_t *info); int __thr_swapcontext(ucontext_t *oucp, const ucontext_t *ucp); void __thr_map_stacks_exec(void); struct _spinlock; void __thr_spinunlock(struct _spinlock *lck); void __thr_spinlock(struct _spinlock *lck); struct tcb *_tcb_ctor(struct pthread *, int); void _tcb_dtor(struct tcb *); + +void 
__thr_pshared_init(void) __hidden; +void *__thr_pshared_offpage(void *key, int doalloc) __hidden; +void __thr_pshared_destroy(void *key) __hidden; __END_DECLS #endif /* !_THR_PRIVATE_H */ Index: head/lib/libthr/thread/thr_pshared.c =================================================================== --- head/lib/libthr/thread/thr_pshared.c (nonexistent) +++ head/lib/libthr/thread/thr_pshared.c (revision 296162) @@ -0,0 +1,223 @@ +/*- + * Copyright (c) 2015 The FreeBSD Foundation + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include "namespace.h" +#include +#include "un-namespace.h" + +#include "thr_private.h" + +struct psh { + LIST_ENTRY(psh) link; + void *key; + void *val; +}; + +LIST_HEAD(pshared_hash_head, psh); +#define HASH_SIZE 128 +static struct pshared_hash_head pshared_hash[HASH_SIZE]; +#define PSHARED_KEY_HASH(key) (((unsigned long)(key) >> 8) % HASH_SIZE) +/* XXXKIB: lock could be split to per-hash chain, if appears contested */ +static struct urwlock pshared_lock = DEFAULT_URWLOCK; + +void +__thr_pshared_init(void) +{ + int i; + + _thr_urwlock_init(&pshared_lock); + for (i = 0; i < HASH_SIZE; i++) + LIST_INIT(&pshared_hash[i]); +} + +static void +pshared_rlock(struct pthread *curthread) +{ + + curthread->locklevel++; + _thr_rwl_rdlock(&pshared_lock); +} + +static void +pshared_wlock(struct pthread *curthread) +{ + + curthread->locklevel++; + _thr_rwl_wrlock(&pshared_lock); +} + +static void +pshared_unlock(struct pthread *curthread) +{ + + _thr_rwl_unlock(&pshared_lock); + curthread->locklevel--; + _thr_ast(curthread); +} + +static void +pshared_gc(struct pthread *curthread) +{ + struct pshared_hash_head *hd; + struct psh *h, *h1; + int error, i; + + pshared_wlock(curthread); + for (i = 0; i < HASH_SIZE; i++) { + hd = &pshared_hash[i]; + LIST_FOREACH_SAFE(h, hd, link, h1) { + error = _umtx_op(NULL, UMTX_OP_SHM, UMTX_SHM_ALIVE, + h->val, NULL); + if (error == 0) + continue; + LIST_REMOVE(h, link); + munmap(h->val, PAGE_SIZE); + free(h); + } + } + pshared_unlock(curthread); +} + +static void * +pshared_lookup(void *key) +{ + struct pshared_hash_head *hd; + struct psh *h; + + hd = &pshared_hash[PSHARED_KEY_HASH(key)]; + LIST_FOREACH(h, hd, link) { + if (h->key == key) + return (h->val); + } + return (NULL); +} + +static int +pshared_insert(void *key, void **val) +{ + struct pshared_hash_head *hd; + struct psh *h; + + hd = &pshared_hash[PSHARED_KEY_HASH(key)]; + LIST_FOREACH(h, hd, link) { + if (h->key == key) { + if (h->val != *val) { + munmap(*val, PAGE_SIZE); + *val = h->val; + } + return (1); + } + } + + h = malloc(sizeof(*h)); + if (h == NULL) + return (0); + h->key = key; + h->val = *val; + LIST_INSERT_HEAD(hd, h, link); + return (1); +} + +static void * +pshared_remove(void *key) +{ + struct pshared_hash_head *hd; + struct psh *h; + void *val; + + hd = &pshared_hash[PSHARED_KEY_HASH(key)]; + LIST_FOREACH(h, hd, link) { + if (h->key == key) { + LIST_REMOVE(h, link); + val = h->val; + free(h); + return (val); + } + } + return (NULL); +} + +static void +pshared_clean(void *key, void *val) +{ + + if (val != NULL) + munmap(val, PAGE_SIZE); + _umtx_op(NULL, UMTX_OP_SHM, UMTX_SHM_DESTROY, key, NULL); +} + +void * +__thr_pshared_offpage(void *key, int doalloc) +{ + struct pthread *curthread; + void *res; + int fd, ins_done; + + curthread = _get_curthread(); + pshared_rlock(curthread); + res = pshared_lookup(key); + pshared_unlock(curthread); + if (res != NULL) + return (res); + fd = _umtx_op(NULL, UMTX_OP_SHM, doalloc ? 
UMTX_SHM_CREAT : + UMTX_SHM_LOOKUP, key, NULL); + if (fd == -1) + return (NULL); + res = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (res == MAP_FAILED) + return (NULL); + pshared_wlock(curthread); + ins_done = pshared_insert(key, &res); + pshared_unlock(curthread); + if (!ins_done) { + pshared_clean(key, res); + res = NULL; + } + return (res); +} + +void +__thr_pshared_destroy(void *key) +{ + struct pthread *curthread; + void *val; + + curthread = _get_curthread(); + pshared_wlock(curthread); + val = pshared_remove(key); + pshared_unlock(curthread); + pshared_clean(key, val); + pshared_gc(curthread); +} Property changes on: head/lib/libthr/thread/thr_pshared.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/lib/libthr/thread/thr_rwlock.c =================================================================== --- head/lib/libthr/thread/thr_rwlock.c (revision 296161) +++ head/lib/libthr/thread/thr_rwlock.c (revision 296162) @@ -1,324 +1,348 @@ /*- * Copyright (c) 1998 Alex Nash * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include "namespace.h" #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_rwlock_destroy, pthread_rwlock_destroy); __weak_reference(_pthread_rwlock_init, pthread_rwlock_init); __weak_reference(_pthread_rwlock_rdlock, pthread_rwlock_rdlock); __weak_reference(_pthread_rwlock_timedrdlock, pthread_rwlock_timedrdlock); __weak_reference(_pthread_rwlock_tryrdlock, pthread_rwlock_tryrdlock); __weak_reference(_pthread_rwlock_trywrlock, pthread_rwlock_trywrlock); __weak_reference(_pthread_rwlock_unlock, pthread_rwlock_unlock); __weak_reference(_pthread_rwlock_wrlock, pthread_rwlock_wrlock); __weak_reference(_pthread_rwlock_timedwrlock, pthread_rwlock_timedwrlock); #define CHECK_AND_INIT_RWLOCK \ - if (__predict_false((prwlock = (*rwlock)) <= THR_RWLOCK_DESTROYED)) { \ + if (*rwlock == THR_PSHARED_PTR) { \ + prwlock = __thr_pshared_offpage(rwlock, 0); \ + if (prwlock == NULL) \ + return (EINVAL); \ + } else if (__predict_false((prwlock = (*rwlock)) <= \ + THR_RWLOCK_DESTROYED)) { \ if (prwlock == THR_RWLOCK_INITIALIZER) { \ int ret; \ ret = init_static(_get_curthread(), rwlock); \ if (ret) \ return (ret); \ } else if (prwlock == THR_RWLOCK_DESTROYED) { \ return (EINVAL); \ } \ prwlock = *rwlock; \ } /* * Prototypes */ static int -rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr __unused) +rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr) { pthread_rwlock_t prwlock; - prwlock = (pthread_rwlock_t)calloc(1, sizeof(struct pthread_rwlock)); - if (prwlock == NULL) - return (ENOMEM); - *rwlock = prwlock; + if (attr == NULL || *attr == NULL || + (*attr)->pshared == PTHREAD_PROCESS_PRIVATE) { + prwlock = calloc(1, sizeof(struct pthread_rwlock)); + if (prwlock == NULL) + return (ENOMEM); + *rwlock = prwlock; + } else { + prwlock = __thr_pshared_offpage(rwlock, 1); + if (prwlock == NULL) + return (EFAULT); + prwlock->lock.rw_flags |= USYNC_PROCESS_SHARED; + *rwlock = THR_PSHARED_PTR; + } return (0); } int _pthread_rwlock_destroy (pthread_rwlock_t *rwlock) { pthread_rwlock_t prwlock; int ret; prwlock = *rwlock; if (prwlock == THR_RWLOCK_INITIALIZER) ret = 0; else if (prwlock == THR_RWLOCK_DESTROYED) ret = EINVAL; - else { + else if (prwlock == THR_PSHARED_PTR) { *rwlock = THR_RWLOCK_DESTROYED; - + __thr_pshared_destroy(rwlock); + ret = 0; + } else { + *rwlock = THR_RWLOCK_DESTROYED; free(prwlock); ret = 0; } return (ret); } static int init_static(struct pthread *thread, pthread_rwlock_t *rwlock) { int ret; THR_LOCK_ACQUIRE(thread, &_rwlock_static_lock); if (*rwlock == THR_RWLOCK_INITIALIZER) ret = rwlock_init(rwlock, NULL); else ret = 0; THR_LOCK_RELEASE(thread, &_rwlock_static_lock); return (ret); } int -_pthread_rwlock_init (pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr) +_pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr) { + *rwlock = NULL; return (rwlock_init(rwlock, attr)); } static int rwlock_rdlock_common(pthread_rwlock_t *rwlock, const struct timespec *abstime) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int flags; int ret; CHECK_AND_INIT_RWLOCK if (curthread->rdlock_count) { /* * To avoid having to track all the rdlocks held by * a thread or all of the threads that hold a rdlock, * we keep a simple count of all the rdlocks held by * a thread. If a thread holds any rdlocks it is * possible that it is attempting to take a recursive * rdlock. 
If there are blocked writers and precedence * is given to them, then that would result in the thread * deadlocking. So allowing a thread to take the rdlock * when it already has one or more rdlocks avoids the * deadlock. I hope the reader can follow that logic ;-) */ flags = URWLOCK_PREFER_READER; } else { flags = 0; } /* * POSIX said the validity of the abstimeout parameter need * not be checked if the lock can be immediately acquired. */ ret = _thr_rwlock_tryrdlock(&prwlock->lock, flags); if (ret == 0) { curthread->rdlock_count++; return (ret); } if (__predict_false(abstime && (abstime->tv_nsec >= 1000000000 || abstime->tv_nsec < 0))) return (EINVAL); for (;;) { /* goto kernel and lock it */ ret = __thr_rwlock_rdlock(&prwlock->lock, flags, abstime); if (ret != EINTR) break; /* if interrupted, try to lock it in userland again. */ if (_thr_rwlock_tryrdlock(&prwlock->lock, flags) == 0) { ret = 0; break; } } if (ret == 0) curthread->rdlock_count++; return (ret); } int _pthread_rwlock_rdlock (pthread_rwlock_t *rwlock) { return (rwlock_rdlock_common(rwlock, NULL)); } int _pthread_rwlock_timedrdlock (pthread_rwlock_t *rwlock, const struct timespec *abstime) { return (rwlock_rdlock_common(rwlock, abstime)); } int _pthread_rwlock_tryrdlock (pthread_rwlock_t *rwlock) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int flags; int ret; CHECK_AND_INIT_RWLOCK if (curthread->rdlock_count) { /* * To avoid having to track all the rdlocks held by * a thread or all of the threads that hold a rdlock, * we keep a simple count of all the rdlocks held by * a thread. If a thread holds any rdlocks it is * possible that it is attempting to take a recursive * rdlock. If there are blocked writers and precedence * is given to them, then that would result in the thread * deadlocking. So allowing a thread to take the rdlock * when it already has one or more rdlocks avoids the * deadlock. I hope the reader can follow that logic ;-) */ flags = URWLOCK_PREFER_READER; } else { flags = 0; } ret = _thr_rwlock_tryrdlock(&prwlock->lock, flags); if (ret == 0) curthread->rdlock_count++; return (ret); } int _pthread_rwlock_trywrlock (pthread_rwlock_t *rwlock) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int ret; CHECK_AND_INIT_RWLOCK ret = _thr_rwlock_trywrlock(&prwlock->lock); if (ret == 0) - prwlock->owner = curthread; + prwlock->owner = TID(curthread); return (ret); } static int rwlock_wrlock_common (pthread_rwlock_t *rwlock, const struct timespec *abstime) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int ret; CHECK_AND_INIT_RWLOCK /* * POSIX said the validity of the abstimeout parameter need * not be checked if the lock can be immediately acquired. */ ret = _thr_rwlock_trywrlock(&prwlock->lock); if (ret == 0) { - prwlock->owner = curthread; + prwlock->owner = TID(curthread); return (ret); } if (__predict_false(abstime && - (abstime->tv_nsec >= 1000000000 || abstime->tv_nsec < 0))) + (abstime->tv_nsec >= 1000000000 || abstime->tv_nsec < 0))) return (EINVAL); for (;;) { /* goto kernel and lock it */ ret = __thr_rwlock_wrlock(&prwlock->lock, abstime); if (ret == 0) { - prwlock->owner = curthread; + prwlock->owner = TID(curthread); break; } if (ret != EINTR) break; /* if interrupted, try to lock it in userland again. 
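	 * (Editor's note: EINTR means a signal interrupted the kernel
	 * sleep; retrying the userland trylock first picks up a
	 * release that happened in the meantime and avoids reentering
	 * the kernel needlessly.)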
*/ if (_thr_rwlock_trywrlock(&prwlock->lock) == 0) { ret = 0; - prwlock->owner = curthread; + prwlock->owner = TID(curthread); break; } } return (ret); } int _pthread_rwlock_wrlock (pthread_rwlock_t *rwlock) { return (rwlock_wrlock_common (rwlock, NULL)); } int _pthread_rwlock_timedwrlock (pthread_rwlock_t *rwlock, const struct timespec *abstime) { return (rwlock_wrlock_common (rwlock, abstime)); } int -_pthread_rwlock_unlock (pthread_rwlock_t *rwlock) +_pthread_rwlock_unlock(pthread_rwlock_t *rwlock) { struct pthread *curthread = _get_curthread(); pthread_rwlock_t prwlock; int ret; int32_t state; - prwlock = *rwlock; + if (*rwlock == THR_PSHARED_PTR) { + prwlock = __thr_pshared_offpage(rwlock, 0); + if (prwlock == NULL) + return (EINVAL); + } else { + prwlock = *rwlock; + } if (__predict_false(prwlock <= THR_RWLOCK_DESTROYED)) return (EINVAL); state = prwlock->lock.rw_state; if (state & URWLOCK_WRITE_OWNER) { - if (__predict_false(prwlock->owner != curthread)) + if (__predict_false(prwlock->owner != TID(curthread))) return (EPERM); - prwlock->owner = NULL; + prwlock->owner = 0; } ret = _thr_rwlock_unlock(&prwlock->lock); if (ret == 0 && (state & URWLOCK_WRITE_OWNER) == 0) curthread->rdlock_count--; return (ret); } Index: head/lib/libthr/thread/thr_rwlockattr.c =================================================================== --- head/lib/libthr/thread/thr_rwlockattr.c (revision 296161) +++ head/lib/libthr/thread/thr_rwlockattr.c (revision 296162) @@ -1,99 +1,91 @@ /*- * Copyright (c) 1998 Alex Nash * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ */ #include "namespace.h" #include #include #include #include "un-namespace.h" #include "thr_private.h" __weak_reference(_pthread_rwlockattr_destroy, pthread_rwlockattr_destroy); __weak_reference(_pthread_rwlockattr_getpshared, pthread_rwlockattr_getpshared); __weak_reference(_pthread_rwlockattr_init, pthread_rwlockattr_init); __weak_reference(_pthread_rwlockattr_setpshared, pthread_rwlockattr_setpshared); int _pthread_rwlockattr_destroy(pthread_rwlockattr_t *rwlockattr) { pthread_rwlockattr_t prwlockattr; if (rwlockattr == NULL) - return(EINVAL); - + return (EINVAL); prwlockattr = *rwlockattr; - if (prwlockattr == NULL) - return(EINVAL); - + return (EINVAL); free(prwlockattr); - - return(0); + return (0); } int _pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *rwlockattr, - int *pshared) + int *pshared) { - *pshared = (*rwlockattr)->pshared; - return(0); + *pshared = (*rwlockattr)->pshared; + return (0); } int _pthread_rwlockattr_init(pthread_rwlockattr_t *rwlockattr) { pthread_rwlockattr_t prwlockattr; if (rwlockattr == NULL) - return(EINVAL); + return (EINVAL); - prwlockattr = (pthread_rwlockattr_t) - malloc(sizeof(struct pthread_rwlockattr)); - + prwlockattr = malloc(sizeof(struct pthread_rwlockattr)); if (prwlockattr == NULL) - return(ENOMEM); + return (ENOMEM); - prwlockattr->pshared = PTHREAD_PROCESS_PRIVATE; - *rwlockattr = prwlockattr; - - return(0); + prwlockattr->pshared = PTHREAD_PROCESS_PRIVATE; + *rwlockattr = prwlockattr; + return (0); } int _pthread_rwlockattr_setpshared(pthread_rwlockattr_t *rwlockattr, int pshared) { - /* Only PTHREAD_PROCESS_PRIVATE is supported. */ - if (pshared != PTHREAD_PROCESS_PRIVATE) - return(EINVAL); + if (pshared != PTHREAD_PROCESS_PRIVATE && + pshared != PTHREAD_PROCESS_SHARED) + return (EINVAL); (*rwlockattr)->pshared = pshared; - - return(0); + return (0); } Index: head/sys/kern/kern_resource.c =================================================================== --- head/sys/kern/kern_resource.c (revision 296161) +++ head/sys/kern/kern_resource.c (revision 296162) @@ -1,1434 +1,1441 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures"); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct rwlock uihashtbl_lock; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp); static int donice(struct thread *td, struct proc *chgp, int n); static struct uidinfo *uilookup(uid_t uid); static void ruxagg_locked(struct rusage_ext *rux, struct thread *td); /* * Resource controls and accounting. */ #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif int sys_getpriority(struct thread *td, register struct getpriority_args *uap) { struct proc *p; struct pgrp *pg; int error, low; error = 0; low = PRIO_MAX + 1; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) low = td->td_proc->p_nice; else { p = pfind(uap->who); if (p == NULL) break; if (p_cansee(td, p) == 0) low = p->p_nice; PROC_UNLOCK(p); } break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = td->td_proc->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0 && p->p_ucred->cr_uid == uap->who) { if (p->p_nice < low) low = p->p_nice; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (low == PRIO_MAX + 1 && error == 0) error = ESRCH; td->td_retval[0] = low; return (error); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif int sys_setpriority(struct thread *td, struct setpriority_args *uap) { struct proc *curp, *p; struct pgrp *pg; int found = 0, error = 0; curp = td->td_proc; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) { PROC_LOCK(curp); error = donice(td, curp, uap->prio); PROC_UNLOCK(curp); } else { p = pfind(uap->who); if (p == NULL) break; error = p_cansee(td, p); if (error == 0) error = donice(td, p, uap->prio); PROC_UNLOCK(p); } found++; break; case PRIO_PGRP: sx_slock(&proctree_lock); if (uap->who == 0) { pg = curp->p_pgrp; PGRP_LOCK(pg); } else { pg = pgfind(uap->who); if (pg == NULL) { sx_sunlock(&proctree_lock); break; } } 
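[For PRIO_PGRP and PRIO_USER, sys_getpriority() above scans every matching process and reports the lowest, i.e. most favourable, nice value; a who argument of 0 means the caller's own process, process group, or uid. A short userspace sketch using the standard call; since -1 is a valid nice value, errno must be cleared before the call and tested afterwards.]

#include <sys/resource.h>
#include <errno.h>

/*
 * Usage sketch for the aggregation above: PRIO_PGRP with who == 0
 * reports the lowest nice value in the caller's process group.
 */
int
lowest_nice_in_pgrp(void)
{
	int nice;

	errno = 0;
	nice = getpriority(PRIO_PGRP, 0);
	if (nice == -1 && errno != 0)
		return (-1);		/* e.g. ESRCH */
	return (nice);
}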
sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pg->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p_cansee(td, p) == 0) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } PGRP_UNLOCK(pg); break; case PRIO_USER: if (uap->who == 0) uap->who = td->td_ucred->cr_uid; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && p->p_ucred->cr_uid == uap->who && p_cansee(td, p) == 0) { error = donice(td, p, uap->prio); found++; } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); break; default: error = EINVAL; break; } if (found == 0 && error == 0) error = ESRCH; return (error); } /* * Set "nice" for a (whole) process. */ static int donice(struct thread *td, struct proc *p, int n) { int error; PROC_LOCK_ASSERT(p, MA_OWNED); if ((error = p_cansched(td, p))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0) return (EACCES); sched_nice(p, n); return (0); } static int unprivileged_idprio; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_idprio, CTLFLAG_RW, &unprivileged_idprio, 0, "Allow non-root users to set an idle priority"); /* * Set realtime priority for LWP. */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_thread_args { int function; lwpid_t lwpid; struct rtprio *rtp; }; #endif int sys_rtprio_thread(struct thread *td, struct rtprio_thread_args *uap) { struct proc *p; struct rtprio rtp; struct thread *td1; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->lwpid == 0 || uap->lwpid == td->td_tid) { p = td->td_proc; td1 = td; PROC_LOCK(p); } else { /* Only look up thread in current process */ td1 = tdfind(uap->lwpid, curproc->p_pid); if (td1 == NULL) return (ESRCH); p = td1->td_proc; } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; pri_to_rtp(td1, &rtp); PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* Disallow setting rtprio in most cases if not superuser. */ /* * Realtime priority has to be restricted for reasons which * should be obvious. However, for idleprio processes, there is * a potential for system deadlock if an idleprio process gains * a lock on a resource that other processes need (and the * idleprio process can't run due to a CPU-bound normal * process). Fix me! XXX * * This problem is not only related to idleprio process. * A user level program can obtain a file lock and hold it * indefinitely. Additionally, without idleprio processes it is * still conceivable that a program with low priority will never * get to run. In short, allowing this feature might make it * easier to lock a resource indefinitely, but it is not the * only thing that makes it possible. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME || (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && unprivileged_idprio == 0)) { error = priv_check(td, PRIV_SCHED_RTPRIO); if (error) break; } error = rtp_to_pri(&rtp, td1); break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } /* * Set realtime priority. 
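[As the comments in sys_rtprio_thread() above explain, entering the realtime class, and by default the idle class too, requires PRIV_SCHED_RTPRIO unless the security.bsd.unprivileged_idprio sysctl is enabled. A minimal userspace sketch of the corresponding call; rtprio_thread(2) with lwpid 0 targets the calling thread.]

#include <sys/types.h>
#include <sys/rtprio.h>

/*
 * Usage sketch: move the calling thread into the idle class at the
 * weakest priority.  Per the checks above, this fails with EPERM for
 * unprivileged users unless unprivileged_idprio has been enabled.
 */
int
make_self_idle(void)
{
	struct rtprio rtp;

	rtp.type = RTP_PRIO_IDLE;
	rtp.prio = RTP_PRIO_MAX;
	return (rtprio_thread(RTP_SET, 0, &rtp));
}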
*/ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif int sys_rtprio(struct thread *td, register struct rtprio_args *uap) { struct proc *p; struct thread *tdp; struct rtprio rtp; int cierror, error; /* Perform copyin before acquiring locks if needed. */ if (uap->function == RTP_SET) cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); else cierror = 0; if (uap->pid == 0) { p = td->td_proc; PROC_LOCK(p); } else { p = pfind(uap->pid); if (p == NULL) return (ESRCH); } switch (uap->function) { case RTP_LOOKUP: if ((error = p_cansee(td, p))) break; /* * Return OUR priority if no pid specified, * or if one is, report the highest priority * in the process. There isn't much more you can do as * there is only room to return a single priority. * Note: specifying our own pid is not the same * as leaving it zero. */ if (uap->pid == 0) { pri_to_rtp(td, &rtp); } else { struct rtprio rtp2; rtp.type = RTP_PRIO_IDLE; rtp.prio = RTP_PRIO_MAX; FOREACH_THREAD_IN_PROC(p, tdp) { pri_to_rtp(tdp, &rtp2); if (rtp2.type < rtp.type || (rtp2.type == rtp.type && rtp2.prio < rtp.prio)) { rtp.type = rtp2.type; rtp.prio = rtp2.prio; } } } PROC_UNLOCK(p); return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_cansched(td, p)) || (error = cierror)) break; /* * Disallow setting rtprio in most cases if not superuser. * See the comment in sys_rtprio_thread about idprio * threads holding a lock. */ if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME || (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_IDLE && !unprivileged_idprio)) { error = priv_check(td, PRIV_SCHED_RTPRIO); if (error) break; } /* * If we are setting our own priority, set just our * thread but if we are doing another process, * do all the threads on that process. If we * specify our own pid we do the latter. 
*/ if (uap->pid == 0) { error = rtp_to_pri(&rtp, td); } else { FOREACH_THREAD_IN_PROC(p, td) { if ((error = rtp_to_pri(&rtp, td)) != 0) break; } } break; default: error = EINVAL; break; } PROC_UNLOCK(p); return (error); } int rtp_to_pri(struct rtprio *rtp, struct thread *td) { u_char newpri, oldclass, oldpri; switch (RTP_PRIO_BASE(rtp->type)) { case RTP_PRIO_REALTIME: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_REALTIME + rtp->prio; break; case RTP_PRIO_NORMAL: if (rtp->prio > (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) return (EINVAL); newpri = PRI_MIN_TIMESHARE + rtp->prio; break; case RTP_PRIO_IDLE: if (rtp->prio > RTP_PRIO_MAX) return (EINVAL); newpri = PRI_MIN_IDLE + rtp->prio; break; default: return (EINVAL); } thread_lock(td); oldclass = td->td_pri_class; sched_class(td, rtp->type); /* XXX fix */ oldpri = td->td_user_pri; sched_user_prio(td, newpri); if (td->td_user_pri != oldpri && (oldclass != RTP_PRIO_NORMAL || td->td_pri_class != RTP_PRIO_NORMAL)) sched_prio(td, td->td_user_pri); if (TD_ON_UPILOCK(td) && oldpri != newpri) { critical_enter(); thread_unlock(td); umtx_pi_adjust(td, oldpri); critical_exit(); } else thread_unlock(td); return (0); } void pri_to_rtp(struct thread *td, struct rtprio *rtp) { thread_lock(td); switch (PRI_BASE(td->td_pri_class)) { case PRI_REALTIME: rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME; break; case PRI_TIMESHARE: rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE; break; case PRI_IDLE: rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE; break; default: break; } rtp->type = td->td_pri_class; thread_unlock(td); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int osetrlimit(struct thread *td, register struct osetrlimit_args *uap) { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; error = kern_setrlimit(td, uap->which, &lim); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif int ogetrlimit(struct thread *td, register struct ogetrlimit_args *uap) { struct orlimit olim; struct rlimit rl; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rl); /* * XXX would be more correct to convert only RLIM_INFINITY to the * old RLIM_INFINITY and fail with EOVERFLOW for other larger * values. Most 64->32 and 32->16 conversions, including not * unimportant ones of uids are even more broken than what we * do here (they blindly truncate). We don't do this correctly * here since we have little experience with EOVERFLOW yet. * Elsewhere, getuid() can't fail... */ olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur; olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max; error = copyout(&olim, uap->rlp, sizeof(olim)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct __setrlimit_args { u_int which; struct rlimit *rlp; }; #endif int sys_setrlimit(struct thread *td, register struct __setrlimit_args *uap) { struct rlimit alim; int error; if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit)))) return (error); error = kern_setrlimit(td, uap->which, &alim); return (error); } static void lim_cb(void *arg) { struct rlimit rlim; struct thread *td; struct proc *p; p = arg; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if the process exceeds its cpu resource allocation. 
If * it reaches the max, arrange to kill the process in ast(). */ if (p->p_cpulimit == RLIM_INFINITY) return; PROC_STATLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); } PROC_STATUNLOCK(p); if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) { lim_rlimit_proc(p, RLIMIT_CPU, &rlim); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim.rlim_max) p->p_cpulimit += 5; kern_psignal(p, SIGXCPU); } } if ((p->p_flag & P_WEXIT) == 0) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); } int kern_setrlimit(struct thread *td, u_int which, struct rlimit *limp) { return (kern_proc_setrlimit(td, td->td_proc, which, limp)); } int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp) { struct plimit *newlim, *oldlim; register struct rlimit *alimp; struct rlimit oldssiz; int error; if (which >= RLIM_NLIMITS) return (EINVAL); /* * Preserve historical bugs by treating negative limits as unsigned. */ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; oldssiz.rlim_cur = 0; newlim = lim_alloc(); PROC_LOCK(p); oldlim = p->p_limit; alimp = &oldlim->pl_rlimit[which]; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) { PROC_UNLOCK(p); lim_free(newlim); return (error); } if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; lim_copy(newlim, oldlim); alimp = &newlim->pl_rlimit[which]; switch (which) { case RLIMIT_CPU: if (limp->rlim_cur != RLIM_INFINITY && p->p_cpulimit == RLIM_INFINITY) callout_reset_sbt(&p->p_limco, SBT_1S, 0, lim_cb, p, C_PREL(1)); p->p_cpulimit = limp->rlim_cur; break; case RLIMIT_DATA: if (limp->rlim_cur > maxdsiz) limp->rlim_cur = maxdsiz; if (limp->rlim_max > maxdsiz) limp->rlim_max = maxdsiz; break; case RLIMIT_STACK: if (limp->rlim_cur > maxssiz) limp->rlim_cur = maxssiz; if (limp->rlim_max > maxssiz) limp->rlim_max = maxssiz; oldssiz = *alimp; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(&oldssiz, RLIMIT_STACK); break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; if (limp->rlim_cur < 1) limp->rlim_cur = 1; if (limp->rlim_max < 1) limp->rlim_max = 1; break; } if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(limp, which); *alimp = *limp; p->p_limit = newlim; PROC_UPDATE_COW(p); PROC_UNLOCK(p); lim_free(oldlim); if (which == RLIMIT_STACK && /* * Skip calls from exec_new_vmspace(), done when stack is * not mapped yet. */ (td != curthread || (p->p_flag & P_INEXEC) == 0)) { /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. 
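[The lim_cb() callout above implements a graduated RLIMIT_CPU policy: once accumulated CPU time passes the soft limit the process gets SIGXCPU and p_cpulimit is stretched by 5 seconds at a time, so the signal repeats roughly every 5 CPU-seconds; reaching the hard limit kills the process outright. A userspace sketch that exercises this, using only standard calls.]

#include <sys/resource.h>
#include <signal.h>
#include <unistd.h>

static void
xcpu_handler(int sig)
{
	(void)sig;
	/* write(2) is async-signal-safe. */
	(void)write(2, "soft CPU limit exceeded\n", 24);
}

/* Arm a soft/hard CPU-time limit pair for the lim_cb() policy above. */
void
arm_cpu_limit(rlim_t soft, rlim_t hard)
{
	struct rlimit rl = { .rlim_cur = soft, .rlim_max = hard };

	(void)signal(SIGXCPU, xcpu_handler);
	(void)setrlimit(RLIMIT_CPU, &rl);
}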
*/ if (limp->rlim_cur != oldssiz.rlim_cur) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; if (limp->rlim_cur > oldssiz.rlim_cur) { prot = p->p_sysent->sv_stackprot; size = limp->rlim_cur - oldssiz.rlim_cur; addr = p->p_sysent->sv_usrstack - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = oldssiz.rlim_cur - limp->rlim_cur; addr = p->p_sysent->sv_usrstack - oldssiz.rlim_cur; } addr = trunc_page(addr); size = round_page(size); (void)vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, FALSE); } } return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int sys_getrlimit(struct thread *td, register struct __getrlimit_args *uap) { struct rlimit rlim; int error; if (uap->which >= RLIM_NLIMITS) return (EINVAL); lim_rlimit(td, uap->which, &rlim); error = copyout(&rlim, uap->rlp, sizeof(struct rlimit)); return (error); } /* * Transform the running time and tick information for children of proc p * into user and system time usage. */ void calccru(struct proc *p, struct timeval *up, struct timeval *sp) { PROC_LOCK_ASSERT(p, MA_OWNED); calcru1(p, &p->p_crux, up, sp); } /* * Transform the running time and tick information in proc p into user * and system time usage. If appropriate, include the current time slice * on this CPU. */ void calcru(struct proc *p, struct timeval *up, struct timeval *sp) { struct thread *td; uint64_t runtime, u; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_STATLOCK_ASSERT(p, MA_OWNED); /* * If we are getting stats for the current process, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ td = curthread; if (td->td_proc == p) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } /* Make sure the per-thread stats are current. */ FOREACH_THREAD_IN_PROC(p, td) { if (td->td_incruntime == 0) continue; ruxagg(p, td); } calcru1(p, &p->p_rux, up, sp); } /* Collect resource usage for a single thread. */ void rufetchtd(struct thread *td, struct rusage *ru) { struct proc *p; uint64_t runtime, u; p = td->td_proc; PROC_STATLOCK_ASSERT(p, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * If we are getting stats for the current thread, then add in the * stats that this thread has accumulated in its current time slice. * We reset the thread and CPU state as if we had performed a context * switch right here. */ if (td == curthread) { u = cpu_ticks(); runtime = u - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, u); } ruxagg(p, td); *ru = td->td_ru; calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime); } static void calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up, struct timeval *sp) { /* {user, system, interrupt, total} {ticks, usec}: */ uint64_t ut, uu, st, su, it, tt, tu; ut = ruxp->rux_uticks; st = ruxp->rux_sticks; it = ruxp->rux_iticks; tt = ut + st + it; if (tt == 0) { /* Avoid divide by zero */ st = 1; tt = 1; } tu = cputick2usec(ruxp->rux_runtime); if ((int64_t)tu < 0) { /* XXX: this should be an assert /phk */ printf("calcru: negative runtime of %jd usec for pid %d (%s)\n", (intmax_t)tu, p->p_pid, p->p_comm); tu = ruxp->rux_tu; } if (tu >= ruxp->rux_tu) { /* * The normal case, time increased. * Enforce monotonicity of bucketed numbers. 
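[In compressed form, the bucketing calcru1() performs: the total runtime tu, in microseconds, is split between user and system time in proportion to the sampled statclock ticks, and each bucket is clamped so successive calls never report time running backwards. A simplified model; the backwards-clock special cases handled later in calcru1() are omitted.]

#include <stdint.h>

/*
 * Simplified model of calcru1(): split tu by tick ratio, keeping each
 * bucket monotonic.  Interrupt ticks (it) absorb the remainder.
 */
void
split_runtime(uint64_t tu, uint64_t ut, uint64_t st, uint64_t it,
    uint64_t *uu, uint64_t *su)
{
	uint64_t tt;

	tt = ut + st + it;
	if (tt == 0) {			/* avoid divide by zero */
		st = 1;
		tt = 1;
	}
	if ((tu * ut) / tt > *uu)
		*uu = (tu * ut) / tt;	/* monotonic user time */
	if ((tu * st) / tt > *su)
		*su = (tu * st) / tt;	/* monotonic system time */
}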
*/ uu = (tu * ut) / tt; if (uu < ruxp->rux_uu) uu = ruxp->rux_uu; su = (tu * st) / tt; if (su < ruxp->rux_su) su = ruxp->rux_su; } else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) { /* * When we calibrate the cputicker, it is not uncommon to * see the presumably fixed frequency increase slightly over * time as a result of thermal stabilization and NTP * discipline (of the reference clock). We therefore ignore * a bit of backwards slop because we expect to catch up * shortly. We use a 3 microsecond limit to catch low * counts and a 1% limit for high counts. */ uu = ruxp->rux_uu; su = ruxp->rux_su; tu = ruxp->rux_tu; } else { /* tu < ruxp->rux_tu */ /* * What happened here was likely that a laptop, which ran at * a reduced clock frequency at boot, kicked into high gear. * The wisdom of spamming this message in that case is * dubious, but it might also be indicative of something * serious, so lets keep it and hope laptops can be made * more truthful about their CPU speed via ACPI. */ printf("calcru: runtime went backwards from %ju usec " "to %ju usec for pid %d (%s)\n", (uintmax_t)ruxp->rux_tu, (uintmax_t)tu, p->p_pid, p->p_comm); uu = (tu * ut) / tt; su = (tu * st) / tt; } ruxp->rux_uu = uu; ruxp->rux_su = su; ruxp->rux_tu = tu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif int sys_getrusage(register struct thread *td, register struct getrusage_args *uap) { struct rusage ru; int error; error = kern_getrusage(td, uap->who, &ru); if (error == 0) error = copyout(&ru, uap->rusage, sizeof(struct rusage)); return (error); } int kern_getrusage(struct thread *td, int who, struct rusage *rup) { struct proc *p; int error; error = 0; p = td->td_proc; PROC_LOCK(p); switch (who) { case RUSAGE_SELF: rufetchcalc(p, rup, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_CHILDREN: *rup = p->p_stats->p_cru; calccru(p, &rup->ru_utime, &rup->ru_stime); break; case RUSAGE_THREAD: PROC_STATLOCK(p); thread_lock(td); rufetchtd(td, rup); thread_unlock(td); PROC_STATUNLOCK(p); break; default: error = EINVAL; } PROC_UNLOCK(p); return (error); } void rucollect(struct rusage *ru, struct rusage *ru2) { long *ip, *ip2; int i; if (ru->ru_maxrss < ru2->ru_maxrss) ru->ru_maxrss = ru2->ru_maxrss; ip = &ru->ru_first; ip2 = &ru2->ru_first; for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) *ip++ += *ip2++; } void ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, struct rusage_ext *rux2) { rux->rux_runtime += rux2->rux_runtime; rux->rux_uticks += rux2->rux_uticks; rux->rux_sticks += rux2->rux_sticks; rux->rux_iticks += rux2->rux_iticks; rux->rux_uu += rux2->rux_uu; rux->rux_su += rux2->rux_su; rux->rux_tu += rux2->rux_tu; rucollect(ru, ru2); } /* * Aggregate tick counts into the proc's rusage_ext. */ static void ruxagg_locked(struct rusage_ext *rux, struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); PROC_STATLOCK_ASSERT(td->td_proc, MA_OWNED); rux->rux_runtime += td->td_incruntime; rux->rux_uticks += td->td_uticks; rux->rux_sticks += td->td_sticks; rux->rux_iticks += td->td_iticks; } void ruxagg(struct proc *p, struct thread *td) { thread_lock(td); ruxagg_locked(&p->p_rux, td); ruxagg_locked(&td->td_rux, td); td->td_incruntime = 0; td->td_uticks = 0; td->td_iticks = 0; td->td_sticks = 0; thread_unlock(td); } /* * Update the rusage_ext structure and fetch a valid aggregate rusage * for proc p if storage for one is supplied. 
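[kern_getrusage() above dispatches on RUSAGE_SELF, RUSAGE_CHILDREN, and RUSAGE_THREAD; for the per-thread case the kernel first folds the thread's pending tick counts in through ruxagg()/rufetchtd(). Userspace usage is the ordinary getrusage(2) call:]

#include <sys/resource.h>

/*
 * Usage sketch: RUSAGE_THREAD reports the calling thread alone, after
 * the kernel aggregates its pending ticks as shown above.
 */
long
thread_user_usec(void)
{
	struct rusage ru;

	if (getrusage(RUSAGE_THREAD, &ru) != 0)
		return (-1);
	return (ru.ru_utime.tv_sec * 1000000L + ru.ru_utime.tv_usec);
}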
*/ void rufetch(struct proc *p, struct rusage *ru) { struct thread *td; PROC_STATLOCK_ASSERT(p, MA_OWNED); *ru = p->p_ru; if (p->p_numthreads > 0) { FOREACH_THREAD_IN_PROC(p, td) { ruxagg(p, td); rucollect(ru, &td->td_ru); } } } /* * Atomically perform a rufetch and a calcru together. * Consumers, can safely assume the calcru is executed only once * rufetch is completed. */ void rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up, struct timeval *sp) { PROC_STATLOCK(p); rufetch(p, ru); calcru(p, up, sp); PROC_STATUNLOCK(p); } /* * Allocate a new resource limits structure and initialize its * reference count and mutex pointer. */ struct plimit * lim_alloc() { struct plimit *limp; limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK); refcount_init(&limp->pl_refcnt, 1); return (limp); } struct plimit * lim_hold(struct plimit *limp) { refcount_acquire(&limp->pl_refcnt); return (limp); } void lim_fork(struct proc *p1, struct proc *p2) { PROC_LOCK_ASSERT(p1, MA_OWNED); PROC_LOCK_ASSERT(p2, MA_OWNED); p2->p_limit = lim_hold(p1->p_limit); callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0); if (p1->p_cpulimit != RLIM_INFINITY) callout_reset_sbt(&p2->p_limco, SBT_1S, 0, lim_cb, p2, C_PREL(1)); } void lim_free(struct plimit *limp) { if (refcount_release(&limp->pl_refcnt)) free((void *)limp, M_PLIMIT); } /* * Make a copy of the plimit structure. * We share these structures copy-on-write after fork. */ void lim_copy(struct plimit *dst, struct plimit *src) { KASSERT(dst->pl_refcnt <= 1, ("lim_copy to shared limit")); bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit)); } /* * Return the hard limit for a particular system resource. The * which parameter specifies the index into the rlimit array. */ rlim_t lim_max(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_max); } rlim_t lim_max_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_max); } /* * Return the current (soft) limit for a particular system resource. * The which parameter which specifies the index into the rlimit array */ rlim_t lim_cur(struct thread *td, int which) { struct rlimit rl; lim_rlimit(td, which, &rl); return (rl.rlim_cur); } rlim_t lim_cur_proc(struct proc *p, int which) { struct rlimit rl; lim_rlimit_proc(p, which, &rl); return (rl.rlim_cur); } /* * Return a copy of the entire rlimit structure for the system limit * specified by 'which' in the rlimit structure pointed to by 'rlp'. */ void lim_rlimit(struct thread *td, int which, struct rlimit *rlp) { struct proc *p = td->td_proc; MPASS(td == curthread); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = td->td_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void lim_rlimit_proc(struct proc *p, int which, struct rlimit *rlp) { PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(which >= 0 && which < RLIM_NLIMITS, ("request for invalid resource limit")); *rlp = p->p_limit->pl_rlimit[which]; if (p->p_sysent->sv_fixlimit != NULL) p->p_sysent->sv_fixlimit(rlp, which); } void uihashinit() { uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash); rw_init(&uihashtbl_lock, "uidinfo hash"); } /* * Look up a uidinfo struct for the parameter uid. * uihashtbl_lock must be locked. * Increase refcount on uidinfo struct returned. 
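[lim_alloc(), lim_hold(), lim_copy(), and lim_free() above implement copy-on-write sharing of the resource-limit table across fork(): children share one reference-counted plimit, and kern_proc_setrlimit() clones it before editing. A minimal model of the scheme; the kernel uses refcount(9) under the process lock, while this sketch uses C11 atomics and elides error handling.]

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct limtab {
	atomic_int	refcnt;
	long		rlim[16];
};

struct limtab *
limtab_hold(struct limtab *l)
{
	atomic_fetch_add(&l->refcnt, 1);
	return (l);		/* what lim_fork() does for the child */
}

void
limtab_free(struct limtab *l)
{
	if (atomic_fetch_sub(&l->refcnt, 1) == 1)
		free(l);	/* last reference gone */
}

struct limtab *
limtab_set(struct limtab *old, int which, long val)
{
	struct limtab *new = malloc(sizeof(*new));

	memcpy(new->rlim, old->rlim, sizeof(new->rlim));
	atomic_init(&new->refcnt, 1);
	new->rlim[which] = val;	/* edit only the private copy */
	limtab_free(old);	/* drop the shared reference */
	return (new);
}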
*/ static struct uidinfo * uilookup(uid_t uid) { struct uihashhead *uipp; struct uidinfo *uip; rw_assert(&uihashtbl_lock, RA_LOCKED); uipp = UIHASH(uid); LIST_FOREACH(uip, uipp, ui_hash) if (uip->ui_uid == uid) { uihold(uip); break; } return (uip); } /* * Find or allocate a struct uidinfo for a particular uid. * Returns with uidinfo struct referenced. * uifree() should be called on a struct uidinfo when released. */ struct uidinfo * uifind(uid_t uid) { struct uidinfo *new_uip, *uip; rw_rlock(&uihashtbl_lock); uip = uilookup(uid); rw_runlock(&uihashtbl_lock); if (uip != NULL) return (uip); new_uip = malloc(sizeof(*new_uip), M_UIDINFO, M_WAITOK | M_ZERO); racct_create(&new_uip->ui_racct); refcount_init(&new_uip->ui_ref, 1); new_uip->ui_uid = uid; mtx_init(&new_uip->ui_vmsize_mtx, "ui_vmsize", NULL, MTX_DEF); rw_wlock(&uihashtbl_lock); /* * There's a chance someone created our uidinfo while we * were in malloc and not holding the lock, so we have to * make sure we don't insert a duplicate uidinfo. */ if ((uip = uilookup(uid)) == NULL) { LIST_INSERT_HEAD(UIHASH(uid), new_uip, ui_hash); rw_wunlock(&uihashtbl_lock); uip = new_uip; } else { rw_wunlock(&uihashtbl_lock); racct_destroy(&new_uip->ui_racct); mtx_destroy(&new_uip->ui_vmsize_mtx); free(new_uip, M_UIDINFO); } return (uip); } /* * Place another refcount on a uidinfo struct. */ void uihold(struct uidinfo *uip) { refcount_acquire(&uip->ui_ref); } /*- * Since uidinfo structs have a long lifetime, we use an * opportunistic refcounting scheme to avoid locking the lookup hash * for each release. * * If the refcount hits 0, we need to free the structure, * which means we need to lock the hash. * Optimal case: * After locking the struct and lowering the refcount, if we find * that we don't need to free, simply unlock and return. * Suboptimal case: * If refcount lowering results in need to free, bump the count * back up, lose the lock and acquire the locks in the proper * order to try again. */ void uifree(struct uidinfo *uip) { int old; /* Prepare for optimal case. */ old = uip->ui_ref; if (old > 1 && atomic_cmpset_int(&uip->ui_ref, old, old - 1)) return; /* Prepare for suboptimal case. */ rw_wlock(&uihashtbl_lock); if (refcount_release(&uip->ui_ref) == 0) { rw_wunlock(&uihashtbl_lock); return; } racct_destroy(&uip->ui_racct); LIST_REMOVE(uip, ui_hash); rw_wunlock(&uihashtbl_lock); if (uip->ui_sbsize != 0) printf("freeing uidinfo: uid = %d, sbsize = %ld\n", uip->ui_uid, uip->ui_sbsize); if (uip->ui_proccnt != 0) printf("freeing uidinfo: uid = %d, proccnt = %ld\n", uip->ui_uid, uip->ui_proccnt); if (uip->ui_vmsize != 0) printf("freeing uidinfo: uid = %d, swapuse = %lld\n", uip->ui_uid, (unsigned long long)uip->ui_vmsize); mtx_destroy(&uip->ui_vmsize_mtx); free(uip, M_UIDINFO); } #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3) { struct uidinfo *uip; struct uihashhead *uih; rw_rlock(&uihashtbl_lock); if (pre != NULL) (pre)(); for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) { LIST_FOREACH(uip, uih, ui_hash) { (callback)(uip->ui_racct, arg2, arg3); } } if (post != NULL) (post)(); rw_runlock(&uihashtbl_lock); } #endif static inline int chglimit(struct uidinfo *uip, long *limit, int diff, rlim_t max, const char *name) { /* Don't allow them to exceed max, but allow subtraction. 
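[The uifree() comment above describes the scheme: the hash write lock is only needed when the final reference goes away, so an ordinary release is done with a lock-free compare-and-swap. A compact model of that fast path in C11 atomics; a return of 0 means the caller may be dropping the last reference and must take the write lock and retry, exactly as uifree() does.]

#include <stdatomic.h>

int
ref_release_fast(atomic_int *ref)
{
	int old;

	old = atomic_load(ref);
	while (old > 1) {
		/* On failure, old is reloaded and the loop retries. */
		if (atomic_compare_exchange_weak(ref, &old, old - 1))
			return (1);	/* released without locking */
	}
	return (0);
}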
*/ if (diff > 0 && max != 0) { if (atomic_fetchadd_long(limit, (long)diff) + diff > max) { atomic_subtract_long(limit, (long)diff); return (0); } } else { atomic_add_long(limit, (long)diff); if (*limit < 0) printf("negative %s for uid = %d\n", name, uip->ui_uid); } return (1); } /* * Change the count associated with number of processes * a given user is using. When 'max' is 0, don't enforce a limit */ int chgproccnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_proccnt, diff, max, "proccnt")); } /* * Change the total socket buffer size a user has used. */ int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, rlim_t max) { int diff, rv; diff = to - *hiwat; if (diff > 0 && max == 0) { rv = 0; } else { rv = chglimit(uip, &uip->ui_sbsize, diff, max, "sbsize"); if (rv != 0) *hiwat = to; } return (rv); } /* * Change the count associated with number of pseudo-terminals * a given user is using. When 'max' is 0, don't enforce a limit */ int chgptscnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_ptscnt, diff, max, "ptscnt")); } int chgkqcnt(struct uidinfo *uip, int diff, rlim_t max) { return (chglimit(uip, &uip->ui_kqcnt, diff, max, "kqcnt")); } + +int +chgumtxcnt(struct uidinfo *uip, int diff, rlim_t max) +{ + + return (chglimit(uip, &uip->ui_umtxcnt, diff, max, "umtxcnt")); +} Index: head/sys/kern/kern_umtx.c =================================================================== --- head/sys/kern/kern_umtx.c (revision 296161) +++ head/sys/kern/kern_umtx.c (revision 296162) @@ -1,3856 +1,4178 @@ /*- + * Copyright (c) 2015 The FreeBSD Foundation * Copyright (c) 2004, David Xu * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * + * Portions of this software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
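[chglimit() charges optimistically: it adds first, then backs the addition out if the post-add total overshoots max; max == 0 means unlimited, and decrements always succeed. The chgumtxcnt() wrapper added by this commit reuses the pattern for a new per-uid count of process-shared umtx objects. A standalone model with C11 atomics:]

#include <stdatomic.h>

int
charge(atomic_long *count, long diff, long max)
{

	if (diff > 0 && max != 0) {
		/* fetch_add returns the old value; old + diff is the total. */
		if (atomic_fetch_add(count, diff) + diff > max) {
			atomic_fetch_sub(count, diff);
			return (0);	/* over the limit, charge refused */
		}
		return (1);
	}
	atomic_fetch_add(count, diff);	/* subtraction always allowed */
	return (1);
}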
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_umtx_profiling.h" #include #include +#include +#include +#include #include #include #include +#include #include #include #include +#include +#include +#include #include #include #include #include #include #include #include #include +#include #include #include +#include + #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #endif #define _UMUTEX_TRY 1 #define _UMUTEX_WAIT 2 #ifdef UMTX_PROFILING #define UPROF_PERC_BIGGER(w, f, sw, sf) \ (((w) > (sw)) || ((w) == (sw) && (f) > (sf))) #endif /* Priority inheritance mutex info. */ struct umtx_pi { /* Owner thread */ struct thread *pi_owner; /* Reference count */ int pi_refcount; /* List entry to link umtx holding by thread */ TAILQ_ENTRY(umtx_pi) pi_link; /* List entry in hash */ TAILQ_ENTRY(umtx_pi) pi_hashlink; /* List for waiters */ TAILQ_HEAD(,umtx_q) pi_blocked; /* Identify a userland lock object */ struct umtx_key pi_key; }; /* A userland synchronous object user. */ struct umtx_q { /* Linked list for the hash. */ TAILQ_ENTRY(umtx_q) uq_link; /* Umtx key. */ struct umtx_key uq_key; /* Umtx flags. */ int uq_flags; #define UQF_UMTXQ 0x0001 /* The thread waits on. */ struct thread *uq_thread; /* * Blocked on PI mutex. read can use chain lock * or umtx_lock, write must have both chain lock and * umtx_lock being hold. */ struct umtx_pi *uq_pi_blocked; /* On blocked list */ TAILQ_ENTRY(umtx_q) uq_lockq; /* Thread contending with us */ TAILQ_HEAD(,umtx_pi) uq_pi_contested; /* Inherited priority from PP mutex */ u_char uq_inherited_pri; /* Spare queue ready to be reused */ struct umtxq_queue *uq_spare_queue; /* The queue we on */ struct umtxq_queue *uq_cur_queue; }; TAILQ_HEAD(umtxq_head, umtx_q); /* Per-key wait-queue */ struct umtxq_queue { struct umtxq_head head; struct umtx_key key; LIST_ENTRY(umtxq_queue) link; int length; }; LIST_HEAD(umtxq_list, umtxq_queue); /* Userland lock object's wait-queue chain */ struct umtxq_chain { /* Lock for this chain. */ struct mtx uc_lock; /* List of sleep queues. */ struct umtxq_list uc_queue[2]; #define UMTX_SHARED_QUEUE 0 #define UMTX_EXCLUSIVE_QUEUE 1 LIST_HEAD(, umtxq_queue) uc_spare_queue; /* Busy flag */ char uc_busy; /* Chain lock waiters */ int uc_waiters; /* All PI in the list */ TAILQ_HEAD(,umtx_pi) uc_pi_list; #ifdef UMTX_PROFILING u_int length; u_int max_length; #endif }; #define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED) /* * Don't propagate time-sharing priority, there is a security reason, * a user can simply introduce PI-mutex, let thread A lock the mutex, * and let another thread B block on the mutex, because B is * sleeping, its priority will be boosted, this causes A's priority to * be boosted via priority propagating too and will never be lowered even * if it is using 100%CPU, this is unfair to other processes. */ #define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\ PRI_MAX_TIMESHARE : (td)->td_user_pri) #define GOLDEN_RATIO_PRIME 2654404609U #define UMTX_CHAINS 512 #define UMTX_SHIFTS (__WORD_BIT - 9) #define GET_SHARE(flags) \ (((flags) & USYNC_PROCESS_SHARED) == 0 ? 
THREAD_SHARE : PROCESS_SHARE) #define BUSY_SPINS 200 struct abs_timeout { int clockid; struct timespec cur; struct timespec end; }; static uma_zone_t umtx_pi_zone; static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS]; static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory"); static int umtx_pi_allocated; static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); #ifdef UMTX_PROFILING static long max_length; SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length"); static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats"); #endif +static void umtx_shm_init(void); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); static struct umtxq_chain *umtxq_getchain(struct umtx_key *key); static void umtxq_lock(struct umtx_key *key); static void umtxq_unlock(struct umtx_key *key); static void umtxq_busy(struct umtx_key *key); static void umtxq_unbusy(struct umtx_key *key); static void umtxq_insert_queue(struct umtx_q *uq, int q); static void umtxq_remove_queue(struct umtx_q *uq, int q); static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *); static int umtxq_count(struct umtx_key *key); static struct umtx_pi *umtx_pi_alloc(int); static void umtx_pi_free(struct umtx_pi *pi); static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags); static void umtx_thread_cleanup(struct thread *td); static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); #define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE) #define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE) #define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE) static struct mtx umtx_lock; #ifdef UMTX_PROFILING static void umtx_init_profiling(void) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < UMTX_CHAINS; ++i) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "umtx hash stats"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL); } } static int sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS) { char buf[512]; struct sbuf sb; struct umtxq_chain *uc; u_int fract, i, j, tot, whole; u_int sf0, sf1, sf2, sf3, sf4; u_int si0, si1, si2, si3, si4; u_int sw0, sw1, sw2, sw3, sw4; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < 2; i++) { tot = 0; for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); tot += uc->max_length; mtx_unlock(&uc->uc_lock); } if (tot == 0) sbuf_printf(&sb, "%u) Empty ", i); else { sf0 = sf1 = sf2 = sf3 = sf4 = 0; si0 = si1 = si2 = si3 = si4 = 0; sw0 = sw1 = sw2 = sw3 = sw4 = 0; for (j = 0; j < UMTX_CHAINS; j++) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); whole = uc->max_length * 100; mtx_unlock(&uc->uc_lock); fract = (whole % tot) * 100; if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) { sf0 = fract; si0 = j; sw0 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw1, sf1)) { sf1 = fract; si1 = j; sw1 
= whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw2, sf2)) { sf2 = fract; si2 = j; sw2 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw3, sf3)) { sf3 = fract; si3 = j; sw3 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw4, sf4)) { sf4 = fract; si4 = j; sw4 = whole; } } sbuf_printf(&sb, "queue %u:\n", i); sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot, sf0 / tot, si0); sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot, sf1 / tot, si1); sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot, sf2 / tot, si2); sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot, sf3 / tot, si3); sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot, sf4 / tot, si4); } } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } static int sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS) { struct umtxq_chain *uc; u_int i, j; int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (clear != 0) { for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); uc->length = 0; uc->max_length = 0; mtx_unlock(&uc->uc_lock); } } } return (0); } SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics"); SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length"); #endif static void umtxq_sysinit(void *arg __unused) { int i, j; umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL, MTX_DEF | MTX_DUPOK); LIST_INIT(&umtxq_chains[i][j].uc_queue[0]); LIST_INIT(&umtxq_chains[i][j].uc_queue[1]); LIST_INIT(&umtxq_chains[i][j].uc_spare_queue); TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list); umtxq_chains[i][j].uc_busy = 0; umtxq_chains[i][j].uc_waiters = 0; #ifdef UMTX_PROFILING umtxq_chains[i][j].length = 0; umtxq_chains[i][j].max_length = 0; #endif } } #ifdef UMTX_PROFILING umtx_init_profiling(); #endif mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); + umtx_shm_init(); } struct umtx_q * umtxq_alloc(void) { struct umtx_q *uq; uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO); uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO); TAILQ_INIT(&uq->uq_spare_queue->head); TAILQ_INIT(&uq->uq_pi_contested); uq->uq_inherited_pri = PRI_MAX; return (uq); } void umtxq_free(struct umtx_q *uq) { MPASS(uq->uq_spare_queue != NULL); free(uq->uq_spare_queue, M_UMTX); free(uq, M_UMTX); } static inline void umtxq_hash(struct umtx_key *key) { unsigned n = (uintptr_t)key->info.both.a + key->info.both.b; key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS; } static inline struct umtxq_chain * umtxq_getchain(struct umtx_key *key) { if (key->type <= TYPE_SEM) return (&umtxq_chains[1][key->hash]); return (&umtxq_chains[0][key->hash]); } /* * Lock a chain. */ static inline void umtxq_lock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_lock(&uc->uc_lock); } /* * Unlock a chain. 
*/ static inline void umtxq_unlock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_unlock(&uc->uc_lock); } /* * Set chain to busy state when following operation * may be blocked (kernel mutex can not be used). */ static inline void umtxq_busy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); if (uc->uc_busy) { #ifdef SMP if (smp_cpus > 1) { int count = BUSY_SPINS; if (count > 0) { umtxq_unlock(key); while (uc->uc_busy && --count > 0) cpu_spinwait(); umtxq_lock(key); } } #endif while (uc->uc_busy) { uc->uc_waiters++; msleep(uc, &uc->uc_lock, 0, "umtxqb", 0); uc->uc_waiters--; } } uc->uc_busy = 1; } /* * Unbusy a chain. */ static inline void umtxq_unbusy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); KASSERT(uc->uc_busy != 0, ("not busy")); uc->uc_busy = 0; if (uc->uc_waiters) wakeup_one(uc); } static inline void umtxq_unbusy_unlocked(struct umtx_key *key) { umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key); } static struct umtxq_queue * umtxq_queue_lookup(struct umtx_key *key, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); LIST_FOREACH(uh, &uc->uc_queue[q], link) { if (umtx_key_match(&uh->key, key)) return (uh); } return (NULL); } static inline void umtxq_insert_queue(struct umtx_q *uq, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue")); uh = umtxq_queue_lookup(&uq->uq_key, q); if (uh != NULL) { LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link); } else { uh = uq->uq_spare_queue; uh->key = uq->uq_key; LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link); #ifdef UMTX_PROFILING uc->length++; if (uc->length > uc->max_length) { uc->max_length = uc->length; if (uc->max_length > max_length) max_length = uc->max_length; } #endif } uq->uq_spare_queue = NULL; TAILQ_INSERT_TAIL(&uh->head, uq, uq_link); uh->length++; uq->uq_flags |= UQF_UMTXQ; uq->uq_cur_queue = uh; return; } static inline void umtxq_remove_queue(struct umtx_q *uq, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (uq->uq_flags & UQF_UMTXQ) { uh = uq->uq_cur_queue; TAILQ_REMOVE(&uh->head, uq, uq_link); uh->length--; uq->uq_flags &= ~UQF_UMTXQ; if (TAILQ_EMPTY(&uh->head)) { KASSERT(uh->length == 0, ("inconsistent umtxq_queue length")); #ifdef UMTX_PROFILING uc->length--; #endif LIST_REMOVE(uh, link); } else { uh = LIST_FIRST(&uc->uc_spare_queue); KASSERT(uh != NULL, ("uc_spare_queue is empty")); LIST_REMOVE(uh, link); } uq->uq_spare_queue = uh; uq->uq_cur_queue = NULL; } } /* * Check if there are multiple waiters */ static int umtxq_count(struct umtx_key *key) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) return (uh->length); return (0); } /* * Check if there are multiple PI waiters and returns first * waiter. 
*/ static int umtxq_count_pi(struct umtx_key *key, struct umtx_q **first) { struct umtxq_chain *uc; struct umtxq_queue *uh; *first = NULL; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) { *first = TAILQ_FIRST(&uh->head); return (uh->length); } return (0); } static int umtxq_check_susp(struct thread *td) { struct proc *p; int error; /* * The check for TDF_NEEDSUSPCHK is racy, but it is enough to * eventually break the lockstep loop. */ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); error = 0; p = td->td_proc; PROC_LOCK(p); if (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) { if (p->p_flag & P_SINGLE_EXIT) error = EINTR; else error = ERESTART; } PROC_UNLOCK(p); return (error); } /* * Wake up threads waiting on an userland object. */ static int umtxq_signal_queue(struct umtx_key *key, int n_wake, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; struct umtx_q *uq; int ret; ret = 0; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); uh = umtxq_queue_lookup(key, q); if (uh != NULL) { while ((uq = TAILQ_FIRST(&uh->head)) != NULL) { umtxq_remove_queue(uq, q); wakeup(uq); if (++ret >= n_wake) return (ret); } } return (ret); } /* * Wake up specified thread. */ static inline void umtxq_signal_thread(struct umtx_q *uq) { struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); umtxq_remove(uq); wakeup(uq); } static inline int tstohz(const struct timespec *tsp) { struct timeval tv; TIMESPEC_TO_TIMEVAL(&tv, tsp); return tvtohz(&tv); } static void abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute, const struct timespec *timeout) { timo->clockid = clockid; if (!absolute) { kern_clock_gettime(curthread, clockid, &timo->end); timo->cur = timo->end; timespecadd(&timo->end, timeout); } else { timo->end = *timeout; kern_clock_gettime(curthread, clockid, &timo->cur); } } static void abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime) { abs_timeout_init(timo, umtxtime->_clockid, (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout); } static inline void abs_timeout_update(struct abs_timeout *timo) { kern_clock_gettime(curthread, timo->clockid, &timo->cur); } static int abs_timeout_gethz(struct abs_timeout *timo) { struct timespec tts; if (timespeccmp(&timo->end, &timo->cur, <=)) return (-1); tts = timo->end; timespecsub(&tts, &timo->cur); return (tstohz(&tts)); } /* * Put thread into sleep state, before sleeping, check if * thread was removed from umtx queue. */ static inline int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime) { struct umtxq_chain *uc; int error, timo; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); for (;;) { if (!(uq->uq_flags & UQF_UMTXQ)) return (0); if (abstime != NULL) { timo = abs_timeout_gethz(abstime); if (timo < 0) return (ETIMEDOUT); } else timo = 0; error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo); if (error != EWOULDBLOCK) { umtxq_lock(&uq->uq_key); break; } if (abstime != NULL) abs_timeout_update(abstime); umtxq_lock(&uq->uq_key); } return (error); } /* * Convert userspace address into unique logical address. 
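[The abs_timeout helpers above convert a relative timeout into an absolute deadline on the requested clock exactly once, so that a sleep interrupted by signals can recompute the remaining time instead of accumulating drift. A userspace model of the same bookkeeping:]

#include <time.h>

struct deadline {
	clockid_t	clockid;
	struct timespec	end;
};

void
deadline_init(struct deadline *d, clockid_t clockid, int absolute,
    const struct timespec *timeout)
{

	d->clockid = clockid;
	if (absolute) {
		d->end = *timeout;
		return;
	}
	(void)clock_gettime(clockid, &d->end);
	d->end.tv_sec += timeout->tv_sec;
	d->end.tv_nsec += timeout->tv_nsec;
	if (d->end.tv_nsec >= 1000000000L) {	/* normalize */
		d->end.tv_sec++;
		d->end.tv_nsec -= 1000000000L;
	}
}

int
deadline_expired(const struct deadline *d)
{
	struct timespec now;

	(void)clock_gettime(d->clockid, &now);
	return (now.tv_sec > d->end.tv_sec ||
	    (now.tv_sec == d->end.tv_sec && now.tv_nsec >= d->end.tv_nsec));
}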
*/ int umtx_key_get(const void *addr, int type, int share, struct umtx_key *key) { struct thread *td = curthread; vm_map_t map; vm_map_entry_t entry; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; key->type = type; if (share == THREAD_SHARE) { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } else { MPASS(share == PROCESS_SHARE || share == AUTO_SHARE); map = &td->td_proc->p_vmspace->vm_map; if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE, &entry, &key->info.shared.object, &pindex, &prot, &wired) != KERN_SUCCESS) { return (EFAULT); } if ((share == PROCESS_SHARE) || (share == AUTO_SHARE && VM_INHERIT_SHARE == entry->inheritance)) { key->shared = 1; key->info.shared.offset = (vm_offset_t)addr - entry->start + entry->offset; vm_object_reference(key->info.shared.object); } else { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } vm_map_lookup_done(map, entry); } umtxq_hash(key); return (0); } /* * Release key. */ void umtx_key_release(struct umtx_key *key) { if (key->shared) vm_object_deallocate(key->info.shared.object); } /* * Fetch and compare value, sleep on the address if value is not changed. */ static int do_wait(struct thread *td, void *addr, u_long id, struct _umtx_time *timeout, int compat32, int is_private) { struct abs_timeout timo; struct umtx_q *uq; u_long tmp; uint32_t tmp32; int error = 0; uq = td->td_umtxq; if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); if (compat32 == 0) { error = fueword(addr, &tmp); if (error != 0) error = EFAULT; } else { error = fueword32(addr, &tmp32); if (error == 0) tmp = tmp32; else error = EFAULT; } umtxq_lock(&uq->uq_key); if (error == 0) { if (tmp == id) error = umtxq_sleep(uq, "uwait", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else umtxq_remove(uq); } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { umtxq_remove(uq); } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } /* * Wake up threads sleeping on the specified address. */ int kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private) { struct umtx_key key; int ret; if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0) return (ret); umtxq_lock(&key); umtxq_signal(&key, n_wake); umtxq_unlock(&key); umtx_key_release(&key); return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int mode) { struct abs_timeout timo; struct umtx_q *uq; uint32_t owner, old, id; int error, rv; id = td->td_tid; uq = td->td_umtxq; error = 0; if (timeout != NULL) abs_timeout_init2(&timo, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { rv = fueword32(&m->m_owner, &owner); if (rv == -1) return (EFAULT); if (mode == _UMUTEX_WAIT) { if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED) return (0); } else { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) return (EFAULT); /* The acquire succeeded. 
*/ if (owner == UMUTEX_UNOWNED) return (0); /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) return (EFAULT); if (owner == UMUTEX_CONTESTED) return (0); rv = umtxq_check_susp(td); if (rv != 0) return (rv); /* If this failed the lock has changed, restart. */ continue; } } if (mode == _UMUTEX_TRY) return (EBUSY); /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtxn", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = umtxq_check_susp(td); } return (0); } /* * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old, id; int error; int count; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ error = casueword32(&m->m_owner, owner, &old, count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); umtxq_lock(&key); umtxq_signal(&key,1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Check if the mutex is available and wake up a waiter, * only for simple mutex. 
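[do_lock_normal() and do_unlock_normal() above are the kernel half of a futex-style protocol: userland tries to CAS the owner word directly and only enters the kernel when that fails, at which point the kernel ORs in UMUTEX_CONTESTED so the eventual unlocker knows a waiter must be woken. A sketch of the userland half; sys_lock() and sys_unlock() are hypothetical stand-ins for the _umtx_op(2) slow paths.]

#include <stdatomic.h>
#include <stdint.h>

#define	UNOWNED		0u
#define	CONTESTED	0x80000000u	/* UMUTEX_CONTESTED analogue */

void	sys_lock(_Atomic uint32_t *m);		/* hypothetical */
void	sys_unlock(_Atomic uint32_t *m);	/* hypothetical */

void
mutex_lock(_Atomic uint32_t *m, uint32_t tid)
{
	uint32_t old = UNOWNED;

	/* Fast path: CAS UNOWNED -> tid; any other value forces the
	 * kernel, which sets CONTESTED and sleeps. */
	if (!atomic_compare_exchange_strong(m, &old, tid))
		sys_lock(m);
}

void
mutex_unlock(_Atomic uint32_t *m, uint32_t tid)
{
	uint32_t old = tid;

	/* Fails once CONTESTED is set (word is tid | CONTESTED), so a
	 * sleeping waiter is always woken via the slow path. */
	if (!atomic_compare_exchange_strong(m, &old, UNOWNED))
		sys_unlock(m);
}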
*/ static int do_wake_umutex(struct thread *td, struct umutex *m) { struct umtx_key key; uint32_t owner; uint32_t flags; int error; int count; error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != 0) return (0); error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); if (count <= 1) { error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, UMUTEX_UNOWNED); if (error == -1) error = EFAULT; } umtxq_lock(&key); if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } /* * Check if the mutex has waiters and tries to fix contention bit. */ static int do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old; int type; int error; int count; switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: type = TYPE_NORMAL_UMUTEX; break; case UMUTEX_PRIO_INHERIT: type = TYPE_PI_UMUTEX; break; case UMUTEX_PRIO_PROTECT: type = TYPE_PP_UMUTEX; break; default: return (EINVAL); } if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0) return (error); owner = 0; umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * Only repair contention bit if there is a waiter, this means the mutex * is still being referenced by userland code, otherwise don't update * any memory. */ if (count > 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } else if (count == 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } umtxq_lock(&key); if (error == EFAULT) { umtxq_signal(&key, INT_MAX); } else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static inline struct umtx_pi * umtx_pi_alloc(int flags) { struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); } static inline void umtx_pi_free(struct umtx_pi *pi) { uma_zfree(umtx_pi_zone, pi); atomic_add_int(&umtx_pi_allocated, -1); } /* * Adjust the thread's position on a pi_state after its priority has been * changed. */ static int umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td) { struct umtx_q *uq, *uq1, *uq2; struct thread *td1; mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); uq = td->td_umtxq; /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. 
*/ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq); uq2 = TAILQ_NEXT(uq, uq_lockq); if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) || (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { td1 = uq1->uq_thread; MPASS(td1->td_proc->p_magic == P_MAGIC); if (UPRI(td1) > UPRI(td)) break; } if (uq1 == NULL) TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); else TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); } return (1); } static struct umtx_pi * umtx_pi_next(struct umtx_pi *pi) { struct umtx_q *uq_owner; if (pi->pi_owner == NULL) return (NULL); uq_owner = pi->pi_owner->td_umtxq; if (uq_owner == NULL) return (NULL); return (uq_owner->uq_pi_blocked); } /* * Floyd's Cycle-Finding Algorithm. */ static bool umtx_pi_check_loop(struct umtx_pi *pi) { struct umtx_pi *pi1; /* fast iterator */ mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (false); pi1 = pi; for (;;) { pi = umtx_pi_next(pi); if (pi == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; if (pi == pi1) return (true); } return (false); } /* * Propagate priority when a thread is blocked on POSIX * PI mutex. */ static void umtx_propagate_priority(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; int pri; mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) return; if (umtx_pi_check_loop(pi)) return; for (;;) { td = pi->pi_owner; if (td == NULL || td == curthread) return; MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); thread_lock(td); if (td->td_lend_user_pri > pri) sched_lend_user_prio(td, pri); else { thread_unlock(td); break; } thread_unlock(td); /* * Pick up the lock that td is blocked on. */ uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) break; /* Resort td on the list if needed. */ umtx_pi_adjust_thread(pi, td); } } /* * Unpropagate priority for a PI mutex when a thread blocked on * it is interrupted by signal or resumed by others. */ static void umtx_repropagate_priority(struct umtx_pi *pi) { struct umtx_q *uq, *uq_owner; struct umtx_pi *pi2; int pri; mtx_assert(&umtx_lock, MA_OWNED); if (umtx_pi_check_loop(pi)) return; while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; uq_owner = pi->pi_owner->td_umtxq; TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) { uq = TAILQ_FIRST(&pi2->pi_blocked); if (uq != NULL) { if (pri > UPRI(uq->uq_thread)) pri = UPRI(uq->uq_thread); } } if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; thread_lock(pi->pi_owner); sched_lend_user_prio(pi->pi_owner, pri); thread_unlock(pi->pi_owner); if ((pi = uq_owner->uq_pi_blocked) != NULL) umtx_pi_adjust_thread(pi, uq_owner->uq_thread); } } /* * Insert a PI mutex into owned list. */ static void umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; mtx_assert(&umtx_lock, MA_OWNED); if (pi->pi_owner != NULL) panic("pi_owner != NULL"); pi->pi_owner = owner; TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link); } /* * Disown a PI mutex, and remove it from the owned list. */ static void umtx_pi_disown(struct umtx_pi *pi) { mtx_assert(&umtx_lock, MA_OWNED); TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); pi->pi_owner = NULL; } /* * Claim ownership of a PI mutex. 
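 *
 * First, a self-contained rendition of the tortoise-and-hare walk that
 * umtx_pi_check_loop() above performs along the pi_owner -> uq_pi_blocked
 * edges; struct node and its next pointer are illustrative stand-ins for
 * that chain.
 */

struct node { struct node *next; };

static bool
node_chain_has_loop(struct node *slow)
{
        struct node *fast;              /* fast iterator, as pi1 above */

        if (slow == NULL)
                return (false);
        fast = slow;
        for (;;) {
                slow = slow->next;              /* one edge per round */
                if (slow == NULL)
                        break;
                fast = fast->next;              /* two edges per round */
                if (fast == NULL)
                        break;
                fast = fast->next;
                if (fast == NULL)
                        break;
                if (slow == fast)       /* the chain closed on itself */
                        return (true);
        }
        return (false);
}

/*
 * Claim ownership of a PI mutex.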
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
        struct umtx_q *uq;

        mtx_lock(&umtx_lock);
        if (pi->pi_owner == owner) {
                mtx_unlock(&umtx_lock);
                return (0);
        }

        if (pi->pi_owner != NULL) {
                /*
                 * Userland may have already messed up the mutex, sigh.
                 */
                mtx_unlock(&umtx_lock);
                return (EPERM);
        }
        umtx_pi_setowner(pi, owner);
        uq = TAILQ_FIRST(&pi->pi_blocked);
        if (uq != NULL) {
                int pri;

                pri = UPRI(uq->uq_thread);
                thread_lock(owner);
                if (pri < UPRI(owner))
                        sched_lend_user_prio(owner, pri);
                thread_unlock(owner);
        }
        mtx_unlock(&umtx_lock);
        return (0);
}

/*
 * Adjust a thread's position in the wait queue of the PI mutex it is
 * blocked on; this may set off a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
        struct umtx_q *uq;
        struct umtx_pi *pi;

        uq = td->td_umtxq;
        mtx_lock(&umtx_lock);
        /*
         * Pick up the lock that td is blocked on.
         */
        pi = uq->uq_pi_blocked;
        if (pi != NULL) {
                umtx_pi_adjust_thread(pi, td);
                umtx_repropagate_priority(pi);
        }
        mtx_unlock(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
    const char *wmesg, struct abs_timeout *timo)
{
        struct umtxq_chain *uc;
        struct thread *td, *td1;
        struct umtx_q *uq1;
        int pri;
        int error = 0;

        td = uq->uq_thread;
        KASSERT(td == curthread, ("inconsistent uq_thread"));
        uc = umtxq_getchain(&uq->uq_key);
        UMTXQ_LOCKED_ASSERT(uc);
        KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
        umtxq_insert(uq);
        mtx_lock(&umtx_lock);
        if (pi->pi_owner == NULL) {
                mtx_unlock(&umtx_lock);
                /* XXX Only look up thread in current process. */
                td1 = tdfind(owner, curproc->p_pid);
                mtx_lock(&umtx_lock);
                if (td1 != NULL) {
                        if (pi->pi_owner == NULL)
                                umtx_pi_setowner(pi, td1);
                        PROC_UNLOCK(td1->td_proc);
                }
        }

        TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
                pri = UPRI(uq1->uq_thread);
                if (pri > UPRI(td))
                        break;
        }

        if (uq1 != NULL)
                TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
        else
                TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

        uq->uq_pi_blocked = pi;
        thread_lock(td);
        td->td_flags |= TDF_UPIBLOCKED;
        thread_unlock(td);
        umtx_propagate_priority(td);
        mtx_unlock(&umtx_lock);
        umtxq_unbusy(&uq->uq_key);

        error = umtxq_sleep(uq, wmesg, timo);
        umtxq_remove(uq);

        mtx_lock(&umtx_lock);
        uq->uq_pi_blocked = NULL;
        thread_lock(td);
        td->td_flags &= ~TDF_UPIBLOCKED;
        thread_unlock(td);
        TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
        umtx_repropagate_priority(pi);
        mtx_unlock(&umtx_lock);
        umtxq_unlock(&uq->uq_key);

        return (error);
}

/*
 * Increment the reference count for a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
        struct umtxq_chain *uc;

        uc = umtxq_getchain(&pi->pi_key);
        UMTXQ_LOCKED_ASSERT(uc);
        pi->pi_refcount++;
}

/*
 * Decrease the reference count for a PI mutex; if the counter drops to
 * zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
        struct umtxq_chain *uc;

        uc = umtxq_getchain(&pi->pi_key);
        UMTXQ_LOCKED_ASSERT(uc);
        KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
        if (--pi->pi_refcount == 0) {
                mtx_lock(&umtx_lock);
                if (pi->pi_owner != NULL)
                        umtx_pi_disown(pi);
                KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
                        ("blocked queue not empty"));
                mtx_unlock(&umtx_lock);
                TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
                umtx_pi_free(pi);
        }
}

/*
 * Find a PI mutex in the hash table.
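 *
 * For orientation, the userland face of the PI machinery above (hedged:
 * this is the POSIX layer that FreeBSD's libthr builds on top of
 * do_lock_pi()/do_unlock_pi(); a mutex created this way carries
 * UMUTEX_PRIO_INHERIT in m_flags, so its contested paths land here):
 */

#include <pthread.h>

static int
make_pi_mutex(pthread_mutex_t *m)
{
        pthread_mutexattr_t attr;
        int error;

        error = pthread_mutexattr_init(&attr);
        if (error == 0)
                error = pthread_mutexattr_setprotocol(&attr,
                    PTHREAD_PRIO_INHERIT);
        if (error == 0)
                error = pthread_mutex_init(m, &attr);
        pthread_mutexattr_destroy(&attr);
        return (error);
}

/*
 * Find a PI mutex in the hash table.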
*/ static struct umtx_pi * umtx_pi_lookup(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_pi *pi; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) { if (umtx_key_match(&pi->pi_key, key)) { return (pi); } } return (NULL); } /* * Insert a PI mutex into hash table. */ static inline void umtx_pi_insert(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink); } /* * Lock a PI mutex. */ static int do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq; struct umtx_pi *pi, *new_pi; uint32_t id, owner, old; int error, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi == NULL) { new_pi = umtx_pi_alloc(M_NOWAIT); if (new_pi == NULL) { umtxq_unlock(&uq->uq_key); new_pi = umtx_pi_alloc(M_WAITOK); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi != NULL) { umtx_pi_free(new_pi); new_pi = NULL; } } if (new_pi != NULL) { new_pi->pi_key = uq->uq_key; umtx_pi_insert(new_pi); pi = new_pi; } } umtx_pi_ref(pi); umtxq_unlock(&uq->uq_key); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) { error = 0; break; } /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); error = umtx_pi_claim(pi, td); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); if (error != 0) { /* * Since we're going to return an * error, restore the m_owner to its * previous, unowned state to avoid * compounding the problem. */ (void)casuword32(&m->m_owner, id | UMUTEX_CONTESTED, UMUTEX_CONTESTED); } break; } error = umtxq_check_susp(td); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } if ((owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } umtxq_lock(&uq->uq_key); /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ if (old == owner) { error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED, "umtxpi", timeout == NULL ? 
NULL : &timo); if (error != 0) continue; } else { umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } error = umtxq_check_susp(td); if (error != 0) break; } umtxq_lock(&uq->uq_key); umtx_pi_unref(pi); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PI mutex. */ static int do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; struct umtx_q *uq_first, *uq_first2, *uq_me; struct umtx_pi *pi, *pi2; uint32_t owner, old, id; int error; int count; int pri; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count_pi(&key, &uq_first); if (uq_first != NULL) { mtx_lock(&umtx_lock); pi = uq_first->uq_pi_blocked; KASSERT(pi != NULL, ("pi == NULL?")); if (pi->pi_owner != td) { mtx_unlock(&umtx_lock); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); /* userland messed the mutex */ return (EPERM); } uq_me = td->td_umtxq; umtx_pi_disown(pi); /* get highest priority thread which is still sleeping. */ uq_first = TAILQ_FIRST(&pi->pi_blocked); while (uq_first != NULL && (uq_first->uq_flags & UQF_UMTXQ) == 0) { uq_first = TAILQ_NEXT(uq_first, uq_lockq); } pri = PRI_MAX; TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) { uq_first2 = TAILQ_FIRST(&pi2->pi_blocked); if (uq_first2 != NULL) { if (pri > UPRI(uq_first2->uq_thread)) pri = UPRI(uq_first2->uq_thread); } } thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); if (uq_first) umtxq_signal_thread(uq_first); } else { pi = umtx_pi_lookup(&key); /* * A umtx_pi can exist if a signal or timeout removed the * last waiter from the umtxq, but there is still * a thread in do_lock_pi() holding the umtx_pi. */ if (pi != NULL) { /* * The umtx_pi can be unowned, such as when a thread * has just entered do_lock_pi(), allocated the * umtx_pi, and unlocked the umtxq. * If the current thread owns it, it must disown it. */ mtx_lock(&umtx_lock); if (pi->pi_owner == td) umtx_pi_disown(pi); mtx_unlock(&umtx_lock); } } umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ error = casueword32(&m->m_owner, owner, &old, count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED); umtxq_unbusy_unlocked(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Lock a PP mutex. 
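 *
 * The ceiling arithmetic below is easy to misread, so here is the same
 * translation as a hedged standalone helper: m_ceilings[0] holds an
 * rtprio(2)-style value (0 is the strongest), which do_lock_pp() inverts
 * and rebases onto the kernel's global priority scale.  RTP_PRIO_MAX is
 * from <sys/rtprio.h>, PRI_MIN_REALTIME from <sys/priority.h>; the
 * helper name is illustrative.
 */

static int
pp_ceiling_to_global_pri(uint32_t ceiling)
{
        uint32_t inv;

        /* Invert; an out-of-range ceiling wraps above RTP_PRIO_MAX,
         * the same unsigned trick the EINVAL check below relies on. */
        inv = RTP_PRIO_MAX - ceiling;
        if (inv > RTP_PRIO_MAX)
                return (-1);    /* caller would map this to EINVAL */
        return (PRI_MIN_REALTIME + inv);
}

/*
 * Lock a PP mutex.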
*/ static int do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t ceiling; uint32_t owner, id; int error, pri, old_inherited_pri, su, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); for (;;) { old_inherited_pri = uq->uq_inherited_pri; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &ceiling); if (rv == -1) { error = EFAULT; goto out; } ceiling = RTP_PRIO_MAX - ceiling; if (ceiling > RTP_PRIO_MAX) { error = EINVAL; goto out; } mtx_lock(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { mtx_unlock(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); thread_unlock(td); } mtx_unlock(&umtx_lock); rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { error = 0; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } if (error != 0) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } out: umtxq_unbusy_unlocked(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PP mutex. */ static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t owner, id; uint32_t rceiling; int error, pri, new_inherited_pri, su; id = td->td_tid; uq = td->td_umtxq; su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); /* * Make sure we own this mtx. 
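         * (The m_owner word carries the owner's tid in its low bits, so
         * only the owning thread may drop a PP mutex; the rceiling value
         * read from m_ceilings[1] below then selects the priority to
         * restore, with (uint32_t)-1 meaning "no boost", i.e. PRI_MAX.)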
*/ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t)); if (error != 0) return (error); if (rceiling == -1) new_inherited_pri = PRI_MAX; else { rceiling = RTP_PRIO_MAX - rceiling; if (rceiling > RTP_PRIO_MAX) return (EINVAL); new_inherited_pri = PRI_MIN_REALTIME + rceiling; } if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); /* * For priority protected mutex, always set unlocked state * to UMUTEX_CONTESTED, so that userland always enters kernel * to lock the mutex, it is necessary because thread priority * has to be adjusted for such mutex. */ error = suword32(&m->m_owner, UMUTEX_CONTESTED); umtxq_lock(&key); if (error == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); if (error == -1) error = EFAULT; else { mtx_lock(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } umtx_key_release(&key); return (error); } static int do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling, uint32_t *old_ceiling) { struct umtx_q *uq; uint32_t save_ceiling; uint32_t owner, id; uint32_t flags; int error, rv; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); if ((flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); if (ceiling > RTP_PRIO_MAX) return (EINVAL); id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); for (;;) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &save_ceiling); if (rv == -1) { error = EFAULT; break; } rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { suword32(&m->m_ceilings[0], ceiling); suword32(&m->m_owner, UMUTEX_CONTESTED); error = 0; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { suword32(&m->m_ceilings[0], ceiling); error = 0; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", NULL); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); if (error == 0) umtxq_signal(&uq->uq_key, INT_MAX); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0 && old_ceiling != NULL) suword32(old_ceiling, save_ceiling); return (error); } /* * Lock a userland POSIX mutex. 
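 *
 * do_set_ceiling() above is reachable from userland through the POSIX
 * wrapper shown below (hedged: libthr routes this through
 * UMTX_OP_SET_CEILING, and it only makes sense for PTHREAD_PRIO_PROTECT
 * mutexes; bump_ceiling() is an illustrative name):
 */

#include <pthread.h>
#include <stdio.h>

static void
bump_ceiling(pthread_mutex_t *m, int new_ceiling)
{
        int old;

        if (pthread_mutex_setprioceiling(m, new_ceiling, &old) == 0)
                printf("ceiling raised from %d to %d\n", old, new_ceiling);
}

/*
 * Lock a userland POSIX mutex.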
 */
static int
do_lock_umutex(struct thread *td, struct umutex *m,
    struct _umtx_time *timeout, int mode)
{
        uint32_t flags;
        int error;

        error = fueword32(&m->m_flags, &flags);
        if (error == -1)
                return (EFAULT);

        switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
        case 0:
                error = do_lock_normal(td, m, flags, timeout, mode);
                break;
        case UMUTEX_PRIO_INHERIT:
                error = do_lock_pi(td, m, flags, timeout, mode);
                break;
        case UMUTEX_PRIO_PROTECT:
                error = do_lock_pp(td, m, flags, timeout, mode);
                break;
        default:
                return (EINVAL);
        }
        if (timeout == NULL) {
                if (error == EINTR && mode != _UMUTEX_WAIT)
                        error = ERESTART;
        } else {
                /* Timed-locking is not restarted. */
                if (error == ERESTART)
                        error = EINTR;
        }
        return (error);
}

/*
 * Unlock a userland POSIX mutex.
 */
static int
do_unlock_umutex(struct thread *td, struct umutex *m)
{
        uint32_t flags;
        int error;

        error = fueword32(&m->m_flags, &flags);
        if (error == -1)
                return (EFAULT);

        switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
        case 0:
                return (do_unlock_normal(td, m, flags));
        case UMUTEX_PRIO_INHERIT:
                return (do_unlock_pi(td, m, flags));
        case UMUTEX_PRIO_PROTECT:
                return (do_unlock_pp(td, m, flags));
        }

        return (EINVAL);
}

static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
    struct timespec *timeout, u_long wflags)
{
        struct abs_timeout timo;
        struct umtx_q *uq;
        uint32_t flags, clockid, hasw;
        int error;

        uq = td->td_umtxq;
        error = fueword32(&cv->c_flags, &flags);
        if (error == -1)
                return (EFAULT);
        error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
        if (error != 0)
                return (error);

        if ((wflags & CVWAIT_CLOCKID) != 0) {
                error = fueword32(&cv->c_clockid, &clockid);
                if (error == -1) {
                        umtx_key_release(&uq->uq_key);
                        return (EFAULT);
                }
                if (clockid < CLOCK_REALTIME ||
                    clockid >= CLOCK_THREAD_CPUTIME_ID) {
                        /* hmm, only HW clock id will work. */
                        umtx_key_release(&uq->uq_key);
                        return (EINVAL);
                }
        } else {
                clockid = CLOCK_REALTIME;
        }

        umtxq_lock(&uq->uq_key);
        umtxq_busy(&uq->uq_key);
        umtxq_insert(uq);
        umtxq_unlock(&uq->uq_key);

        /*
         * Set c_has_waiters to 1 before releasing the user mutex; also,
         * don't modify the cache line when it is unnecessary.
         */
        error = fueword32(&cv->c_has_waiters, &hasw);
        if (error == 0 && hasw == 0)
                suword32(&cv->c_has_waiters, 1);

        umtxq_unbusy_unlocked(&uq->uq_key);

        error = do_unlock_umutex(td, m);

        if (timeout != NULL)
                abs_timeout_init(&timo, clockid,
                    ((wflags & CVWAIT_ABSTIME) != 0), timeout);

        umtxq_lock(&uq->uq_key);
        if (error == 0) {
                error = umtxq_sleep(uq, "ucond", timeout == NULL ?
                    NULL : &timo);
        }

        if ((uq->uq_flags & UQF_UMTXQ) == 0)
                error = 0;
        else {
                /*
                 * This must be a timeout, an interruption by a signal, or
                 * a spurious wakeup; clear the c_has_waiters flag when
                 * necessary.
                 */
                umtxq_busy(&uq->uq_key);
                if ((uq->uq_flags & UQF_UMTXQ) != 0) {
                        int oldlen = uq->uq_cur_queue->length;
                        umtxq_remove(uq);
                        if (oldlen == 1) {
                                umtxq_unlock(&uq->uq_key);
                                suword32(&cv->c_has_waiters, 0);
                                umtxq_lock(&uq->uq_key);
                        }
                }
                umtxq_unbusy(&uq->uq_key);
                if (error == ERESTART)
                        error = EINTR;
        }

        umtxq_unlock(&uq->uq_key);
        umtx_key_release(&uq->uq_key);
        return (error);
}

/*
 * Signal a userland condition variable.
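 *
 * Before the signal side, a waiter-side sketch matching do_cv_wait()
 * above.  Hedged: libthr's real path adds cancellation and error
 * handling; a zero-filled struct ucond is assumed to be a valid idle
 * condvar, and umutex_lock() is the illustrative helper from the sketch
 * after do_unlock_normal().  The argument mapping mirrors
 * __umtx_op_cv_wait() later in this file: obj is the ucond, uaddr1 the
 * umutex, val the CVWAIT_* flags, uaddr2 the optional timespec.
 */

#include <sys/types.h>
#include <sys/umtx.h>

static int
ucond_wait(struct ucond *cv, struct umutex *m)
{
        int error;

        /* The kernel queues this thread, sets c_has_waiters and drops
         * the umutex; relocking after wakeup is userland's job. */
        error = _umtx_op(cv, UMTX_OP_CV_WAIT, 0, m, NULL);
        umutex_lock(m);
        return (error);
}

/*
 * Signal a userland condition variable.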
*/ static int do_cv_signal(struct thread *td, struct ucond *cv) { struct umtx_key key; int error, cnt, nwake; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); nwake = umtxq_signal(&key, 1); if (cnt <= nwake) { umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_lock(&key); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static int do_cv_broadcast(struct thread *td, struct ucond *cv) { struct umtx_key key; int error; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_signal(&key, INT_MAX); umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_unbusy_unlocked(&key); umtx_key_release(&key); return (error); } static int do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, wrflags; int32_t state, oldstate; int32_t blocked_readers; int error, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); wrflags = URWLOCK_WRITE_OWNER; if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER)) wrflags |= URWLOCK_WRITE_WAITERS; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } /* try to lock it */ while (!(state & wrflags)) { if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) { umtx_key_release(&uq->uq_key); return (EAGAIN); } rv = casueword32(&rwlock->rw_state, state, &oldstate, state + 1); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } error = umtxq_check_susp(td); if (error != 0) break; state = oldstate; } if (error) break; /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; /* set read contention bit */ while (error == 0 && (state & wrflags) && !(state & URWLOCK_READ_WAITERS)) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } /* state is changed while setting flags, restart */ if (!(state & wrflags)) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: /* contention bit is set, before sleeping, increase read waiter count */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers+1); while (state & wrflags) { umtxq_lock(&uq->uq_key); umtxq_insert(uq); 
umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "urdlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } /* decrease read waiter count, and may clear read contention bit */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers-1); if (blocked_readers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error = umtxq_check_susp(td); } } umtxq_unbusy_unlocked(&uq->uq_key); if (error != 0) break; } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int32_t blocked_writers; int32_t blocked_readers; int error, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); blocked_readers = 0; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_OWNER); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error) { if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) && blocked_readers != 0) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } break; } /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0 && ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) && (state & URWLOCK_WRITE_WAITERS) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers+1); while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) { umtxq_lock(&uq->uq_key); umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE); 
umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "uwrlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers-1); if (blocked_writers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error = umtxq_check_susp(td); /* * We are leaving the URWLOCK_WRITE_WAITERS * behind, but this should not harm the * correctness. */ if (error != 0) break; } rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } } else blocked_readers = 0; umtxq_unbusy_unlocked(&uq->uq_key); } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_unlock(struct thread *td, struct urwlock *rwlock) { struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int error, rv, q, count; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); error = fueword32(&rwlock->rw_state, &state); if (error == -1) { error = EFAULT; goto out; } if (state & URWLOCK_WRITE_OWNER) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_OWNER); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (!(oldstate & URWLOCK_WRITE_OWNER)) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else if (URWLOCK_READER_COUNT(state) != 0) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state - 1); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (URWLOCK_READER_COUNT(oldstate) == 0) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else { error = EPERM; goto out; } count = 0; if (!(flags & URWLOCK_PREFER_READER)) { if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } else if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } } else { if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } else if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } } if (count) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, count, q); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } out: umtx_key_release(&uq->uq_key); return (error); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, count, count1; int error, rv; uq = td->td_umtxq; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, 
timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = casueword32(&sem->_has_waiters, 0, &count1, 1); if (rv == 0) rv = fueword32(&sem->_count, &count); if (rv == -1 || count != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (rv == -1 ? EFAULT : 0); } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem_wake(struct thread *td, struct _usem *sem) { struct umtx_key key; int error, cnt; uint32_t flags; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { umtxq_signal(&key, 1); /* * Check if count is greater than 0, this means the memory is * still being referenced by user code, so we can safely * update _has_waiters flag. */ if (cnt == 1) { umtxq_unlock(&key); error = suword32(&sem->_has_waiters, 0); umtxq_lock(&key); if (error == -1) error = EFAULT; } } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #endif static int do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t count, flags; int error, rv; uq = td->td_umtxq; flags = fuword32(&sem->_flags); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = fueword32(&sem->_count, &count); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } for (;;) { if (USEM_COUNT(count) != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (0); } if (count == USEM_HAS_WAITERS) break; rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } if (count == 0) break; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. 
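 *
 * A hedged userland counterpart to do_sem2_wait()/do_sem2_wake():
 * roughly what libc's sem_wait()/sem_post() reduce to on a struct
 * _usem2.  USEM_COUNT() and USEM_HAS_WAITERS come from <sys/umtx.h>;
 * the _Atomic casts on the volatile _count word are, again, a
 * sketch-level liberty.
 */

#include <sys/types.h>
#include <sys/umtx.h>
#include <stdatomic.h>

static void
usem2_wait(struct _usem2 *sem)
{
        uint32_t c;

        for (;;) {
                c = atomic_load((_Atomic uint32_t *)&sem->_count);
                /* Fast path: grab a token without entering the kernel. */
                while (USEM_COUNT(c) != 0) {
                        if (atomic_compare_exchange_weak(
                            (_Atomic uint32_t *)&sem->_count, &c, c - 1))
                                return;
                }
                /* The kernel sets USEM_HAS_WAITERS and sleeps while the
                 * count stays zero, exactly the loop in do_sem2_wait(). */
                (void)_umtx_op(sem, UMTX_OP_SEM2_WAIT, 0, NULL, NULL);
        }
}

static void
usem2_post(struct _usem2 *sem)
{
        uint32_t c;

        c = atomic_fetch_add((_Atomic uint32_t *)&sem->_count, 1);
        if ((c & USEM_HAS_WAITERS) != 0)
                (void)_umtx_op(sem, UMTX_OP_SEM2_WAKE, 0, NULL, NULL);
}

/*
 * Signal a userland semaphore.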
*/ static int do_sem2_wake(struct thread *td, struct _usem2 *sem) { struct umtx_key key; int error, cnt, rv; uint32_t count, flags; rv = fueword32(&sem->_flags, &flags); if (rv == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { umtxq_signal(&key, 1); /* * If this was the last sleeping thread, clear the waiters * flag in _count. */ if (cnt == 1) { umtxq_unlock(&key); rv = fueword32(&sem->_count, &count); while (rv != -1 && count & USEM_HAS_WAITERS) rv = casueword32(&sem->_count, count, &count, count & ~USEM_HAS_WAITERS); if (rv == -1) error = EFAULT; umtxq_lock(&key); } } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } inline int umtx_copyin_timeout(const void *addr, struct timespec *tsp) { int error; error = copyin(addr, tsp, sizeof(struct timespec)); if (error == 0) { if (tsp->tv_sec < 0 || tsp->tv_nsec >= 1000000000 || tsp->tv_nsec < 0) error = EINVAL; } return (error); } static inline int umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp) { int error; if (size <= sizeof(struct timespec)) { tp->_clockid = CLOCK_REALTIME; tp->_flags = 0; error = copyin(addr, &tp->_timeout, sizeof(struct timespec)); } else error = copyin(addr, tp, sizeof(struct _umtx_time)); if (error != 0) return (error); if (tp->_timeout.tv_sec < 0 || tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0) return (EINVAL); return (0); } static int __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap) { return (EOPNOTSUPP); } static int __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_wait(td, uap->obj, uap->val, tm_p, 0, 0); } static int __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_wait(td, uap->obj, uap->val, tm_p, 1, 0); } static int __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_wait(td, uap->obj, uap->val, tm_p, 1, 1); } static int __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 0)); } #define BATCH_SIZE 128 static int __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap) { int count = uap->val; void *uaddrs[BATCH_SIZE]; char **upp = (char **)uap->obj; int tocopy; int error = 0; int i, pos = 0; while (count > 0) { tocopy = count; if (tocopy > BATCH_SIZE) tocopy = BATCH_SIZE; error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, uaddrs[i], INT_MAX, 1); count -= tocopy; pos += tocopy; } return (error); } static int __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 1)); } static int __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap) { struct 
_umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_lock_umutex(td, uap->obj, tm_p, 0); } static int __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap) { return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY); } static int __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT); } static int __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap) { return do_wake_umutex(td, uap->obj); } static int __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap) { return do_unlock_umutex(td, uap->obj); } static int __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap) { return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1); } static int __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap) { return do_cv_signal(td, uap->obj); } static int __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap) { return do_cv_broadcast(td, uap->obj); } static int __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap) { return do_rw_unlock(td, uap->obj); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). 
*/ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem_wake(td, uap->obj)); } #endif static int __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_wake2_umutex(td, uap->obj, uap->val)); } static int __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem2_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem2_wake(td, uap->obj)); } +#define USHM_OBJ_UMTX(o) \ + ((struct umtx_shm_obj_list *)(&(o)->umtx_data)) + +#define USHMF_REG_LINKED 0x0001 +#define USHMF_OBJ_LINKED 0x0002 +struct umtx_shm_reg { + TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link; + LIST_ENTRY(umtx_shm_reg) ushm_obj_link; + struct umtx_key ushm_key; + struct ucred *ushm_cred; + struct shmfd *ushm_obj; + u_int ushm_refcnt; + u_int ushm_flags; +}; + +LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg); +TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg); + +static uma_zone_t umtx_shm_reg_zone; +static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS]; +static struct mtx umtx_shm_lock; +static struct umtx_shm_reg_head umtx_shm_reg_delfree = + TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree); + +static void umtx_shm_free_reg(struct umtx_shm_reg *reg); + +static void +umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused) +{ + struct umtx_shm_reg_head d; + struct umtx_shm_reg *reg, *reg1; + + TAILQ_INIT(&d); + mtx_lock(&umtx_shm_lock); + TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link); + mtx_unlock(&umtx_shm_lock); + TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) { + TAILQ_REMOVE(&d, reg, ushm_reg_link); + umtx_shm_free_reg(reg); + } +} + +static struct task umtx_shm_reg_delfree_task = + TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL); + +static struct umtx_shm_reg * +umtx_shm_find_reg_locked(const struct umtx_key *key) +{ + struct umtx_shm_reg *reg; + struct umtx_shm_reg_head *reg_head; + + KASSERT(key->shared, ("umtx_p_find_rg: private key")); + mtx_assert(&umtx_shm_lock, MA_OWNED); + reg_head = &umtx_shm_registry[key->hash]; + TAILQ_FOREACH(reg, reg_head, ushm_reg_link) { + KASSERT(reg->ushm_key.shared, + ("non-shared key on reg %p %d", reg, reg->ushm_key.shared)); + if (reg->ushm_key.info.shared.object == + key->info.shared.object && + reg->ushm_key.info.shared.offset == + key->info.shared.offset) { + KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM")); + KASSERT(reg->ushm_refcnt > 0, + ("reg %p refcnt 0 onlist", reg)); + KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0, + ("reg %p not linked", reg)); + reg->ushm_refcnt++; + return (reg); + } + } + return (NULL); +} + +static struct umtx_shm_reg * +umtx_shm_find_reg(const struct umtx_key *key) +{ + struct umtx_shm_reg *reg; + + mtx_lock(&umtx_shm_lock); + reg = umtx_shm_find_reg_locked(key); + mtx_unlock(&umtx_shm_lock); + return (reg); +} + +static void +umtx_shm_free_reg(struct umtx_shm_reg *reg) +{ + + chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0); + crfree(reg->ushm_cred); + shm_drop(reg->ushm_obj); + 
uma_zfree(umtx_shm_reg_zone, reg); +} + +static bool +umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force) +{ + bool res; + + mtx_assert(&umtx_shm_lock, MA_OWNED); + KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg)); + reg->ushm_refcnt--; + res = reg->ushm_refcnt == 0; + if (res || force) { + if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) { + TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash], + reg, ushm_reg_link); + reg->ushm_flags &= ~USHMF_REG_LINKED; + } + if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) { + LIST_REMOVE(reg, ushm_obj_link); + reg->ushm_flags &= ~USHMF_OBJ_LINKED; + } + } + return (res); +} + +static void +umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force) +{ + vm_object_t object; + bool dofree; + + if (force) { + object = reg->ushm_obj->shm_object; + VM_OBJECT_WLOCK(object); + object->flags |= OBJ_UMTXDEAD; + VM_OBJECT_WUNLOCK(object); + } + mtx_lock(&umtx_shm_lock); + dofree = umtx_shm_unref_reg_locked(reg, force); + mtx_unlock(&umtx_shm_lock); + if (dofree) + umtx_shm_free_reg(reg); +} + +void +umtx_shm_object_init(vm_object_t object) +{ + + LIST_INIT(USHM_OBJ_UMTX(object)); +} + +void +umtx_shm_object_terminated(vm_object_t object) +{ + struct umtx_shm_reg *reg, *reg1; + bool dofree; + + dofree = false; + mtx_lock(&umtx_shm_lock); + LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) { + if (umtx_shm_unref_reg_locked(reg, true)) { + TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, + ushm_reg_link); + dofree = true; + } + } + mtx_unlock(&umtx_shm_lock); + if (dofree) + taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task); +} + +static int +umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, + struct umtx_shm_reg **res) +{ + struct umtx_shm_reg *reg, *reg1; + struct ucred *cred; + int error; + + reg = umtx_shm_find_reg(key); + if (reg != NULL) { + *res = reg; + return (0); + } + cred = td->td_ucred; + if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP))) + return (ENOMEM); + reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); + reg->ushm_refcnt = 1; + bcopy(key, ®->ushm_key, sizeof(*key)); + reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR); + reg->ushm_cred = crhold(cred); + error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); + if (error != 0) { + umtx_shm_free_reg(reg); + return (error); + } + mtx_lock(&umtx_shm_lock); + reg1 = umtx_shm_find_reg_locked(key); + if (reg1 != NULL) { + mtx_unlock(&umtx_shm_lock); + umtx_shm_free_reg(reg); + *res = reg1; + return (0); + } + reg->ushm_refcnt++; + TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link); + LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg, + ushm_obj_link); + reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED; + mtx_unlock(&umtx_shm_lock); + *res = reg; + return (0); +} + +static int +umtx_shm_alive(struct thread *td, void *addr) +{ + vm_map_t map; + vm_map_entry_t entry; + vm_object_t object; + vm_pindex_t pindex; + vm_prot_t prot; + int res, ret; + boolean_t wired; + + map = &td->td_proc->p_vmspace->vm_map; + res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry, + &object, &pindex, &prot, &wired); + if (res != KERN_SUCCESS) + return (EFAULT); + if (object == NULL) + ret = EINVAL; + else + ret = (object->flags & OBJ_UMTXDEAD) != 0 ? 
ENOTTY : 0; + vm_map_lookup_done(map, entry); + return (ret); +} + +static void +umtx_shm_init(void) +{ + int i; + + umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF); + for (i = 0; i < nitems(umtx_shm_registry); i++) + TAILQ_INIT(&umtx_shm_registry[i]); +} + +static int +umtx_shm(struct thread *td, void *addr, u_int flags) +{ + struct umtx_key key; + struct umtx_shm_reg *reg; + struct file *fp; + int error, fd; + + if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP | + UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1) + return (EINVAL); + if ((flags & UMTX_SHM_ALIVE) != 0) + return (umtx_shm_alive(td, addr)); + error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key); + if (error != 0) + return (error); + KASSERT(key.shared == 1, ("non-shared key")); + if ((flags & UMTX_SHM_CREAT) != 0) { + error = umtx_shm_create_reg(td, &key, ®); + } else { + reg = umtx_shm_find_reg(&key); + if (reg == NULL) + error = ESRCH; + } + umtx_key_release(&key); + if (error != 0) + return (error); + KASSERT(reg != NULL, ("no reg")); + if ((flags & UMTX_SHM_DESTROY) != 0) { + umtx_shm_unref_reg(reg, true); + } else { +#if 0 +#ifdef MAC + error = mac_posixshm_check_open(td->td_ucred, + reg->ushm_obj, FFLAGS(O_RDWR)); + if (error == 0) +#endif + error = shm_access(reg->ushm_obj, td->td_ucred, + FFLAGS(O_RDWR)); + if (error == 0) +#endif + error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL); + if (error == 0) { + shm_hold(reg->ushm_obj); + finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj, + &shm_ops); + td->td_retval[0] = fd; + fdrop(fp, td); + } + } + umtx_shm_unref_reg(reg, false); + return (error); +} + +static int +__umtx_op_shm(struct thread *td, struct _umtx_op_args *uap) +{ + + return (umtx_shm(td, uap->uaddr1, uap->val)); +} + typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap); static const _umtx_op_func op_table[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, [UMTX_OP_WAIT] = __umtx_op_wait, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_uint, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, + [UMTX_OP_SHM] = __umtx_op_shm, }; int sys__umtx_op(struct thread *td, struct _umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table)) return (*op_table[uap->op])(td, uap); return (EINVAL); } #ifdef COMPAT_FREEBSD32 struct timespec32 { int32_t tv_sec; int32_t tv_nsec; }; struct umtx_time32 { struct 
timespec32 timeout; uint32_t flags; uint32_t clockid; }; static inline int umtx_copyin_timeout32(void *addr, struct timespec *tsp) { struct timespec32 ts32; int error; error = copyin(addr, &ts32, sizeof(struct timespec32)); if (error == 0) { if (ts32.tv_sec < 0 || ts32.tv_nsec >= 1000000000 || ts32.tv_nsec < 0) error = EINVAL; else { tsp->tv_sec = ts32.tv_sec; tsp->tv_nsec = ts32.tv_nsec; } } return (error); } static inline int umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp) { struct umtx_time32 t32; int error; t32.clockid = CLOCK_REALTIME; t32.flags = 0; if (size <= sizeof(struct timespec32)) error = copyin(addr, &t32.timeout, sizeof(struct timespec32)); else error = copyin(addr, &t32, sizeof(struct umtx_time32)); if (error != 0) return (error); if (t32.timeout.tv_sec < 0 || t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0) return (EINVAL); tp->_timeout.tv_sec = t32.timeout.tv_sec; tp->_timeout.tv_nsec = t32.timeout.tv_nsec; tp->_flags = t32.flags; tp->_clockid = t32.clockid; return (0); } static int __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_wait(td, uap->obj, uap->val, tm_p, 1, 0); } static int __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_lock_umutex(td, uap->obj, tm_p, 0); } static int __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT); } static int __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). 
 */
        if (uap->uaddr2 == NULL) {
                error = do_rw_wrlock(td, uap->obj, 0);
        } else {
                error = umtx_copyin_umtx_time32(uap->uaddr2,
                    (size_t)uap->uaddr1, &timeout);
                if (error != 0)
                        return (error);
                error = do_rw_wrlock(td, uap->obj, &timeout);
        }
        return (error);
}

static int
__umtx_op_wait_uint_private_compat32(struct thread *td,
    struct _umtx_op_args *uap)
{
        struct _umtx_time *tm_p, timeout;
        int error;

        if (uap->uaddr2 == NULL)
                tm_p = NULL;
        else {
                error = umtx_copyin_umtx_time32(uap->uaddr2,
                    (size_t)uap->uaddr1, &timeout);
                if (error != 0)
                        return (error);
                tm_p = &timeout;
        }
        return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
}

#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
static int
__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
        struct _umtx_time *tm_p, timeout;
        int error;

        /* Allow a null timespec (wait forever). */
        if (uap->uaddr2 == NULL)
                tm_p = NULL;
        else {
                error = umtx_copyin_umtx_time32(uap->uaddr2,
                    (size_t)uap->uaddr1, &timeout);
                if (error != 0)
                        return (error);
                tm_p = &timeout;
        }
        return (do_sem_wait(td, uap->obj, tm_p));
}
#endif

static int
__umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
        struct _umtx_time *tm_p, timeout;
        int error;

        /* Allow a null timespec (wait forever). */
        if (uap->uaddr2 == NULL)
                tm_p = NULL;
        else {
                error = umtx_copyin_umtx_time32(uap->uaddr2,
                    (size_t)uap->uaddr1, &timeout);
                if (error != 0)
                        return (error);
                tm_p = &timeout;
        }
        return (do_sem2_wait(td, uap->obj, tm_p));
}

static int
__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
{
        int count = uap->val;
        uint32_t uaddrs[BATCH_SIZE];
        uint32_t **upp = (uint32_t **)uap->obj;
        int tocopy;
        int error = 0;
        int i, pos = 0;

        while (count > 0) {
                tocopy = count;
                if (tocopy > BATCH_SIZE)
                        tocopy = BATCH_SIZE;
                error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
                if (error != 0)
                        break;
                for (i = 0; i < tocopy; ++i)
                        kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
                            INT_MAX, 1);
                count -= tocopy;
                pos += tocopy;
        }
        return (error);
}

static const _umtx_op_func op_table_compat32[] = {
        [UMTX_OP_RESERVED0]     = __umtx_op_unimpl,
        [UMTX_OP_RESERVED1]     = __umtx_op_unimpl,
        [UMTX_OP_WAIT]          = __umtx_op_wait_compat32,
        [UMTX_OP_WAKE]          = __umtx_op_wake,
        [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex,
        [UMTX_OP_MUTEX_LOCK]    = __umtx_op_lock_umutex_compat32,
        [UMTX_OP_MUTEX_UNLOCK]  = __umtx_op_unlock_umutex,
        [UMTX_OP_SET_CEILING]   = __umtx_op_set_ceiling,
        [UMTX_OP_CV_WAIT]       = __umtx_op_cv_wait_compat32,
        [UMTX_OP_CV_SIGNAL]     = __umtx_op_cv_signal,
        [UMTX_OP_CV_BROADCAST]  = __umtx_op_cv_broadcast,
        [UMTX_OP_WAIT_UINT]     = __umtx_op_wait_compat32,
        [UMTX_OP_RW_RDLOCK]     = __umtx_op_rw_rdlock_compat32,
        [UMTX_OP_RW_WRLOCK]     = __umtx_op_rw_wrlock_compat32,
        [UMTX_OP_RW_UNLOCK]     = __umtx_op_rw_unlock,
        [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
        [UMTX_OP_WAKE_PRIVATE]  = __umtx_op_wake_private,
        [UMTX_OP_MUTEX_WAIT]    = __umtx_op_wait_umutex_compat32,
        [UMTX_OP_MUTEX_WAKE]    = __umtx_op_wake_umutex,
#if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
        [UMTX_OP_SEM_WAIT]      = __umtx_op_sem_wait_compat32,
        [UMTX_OP_SEM_WAKE]      = __umtx_op_sem_wake,
#else
        [UMTX_OP_SEM_WAIT]      = __umtx_op_unimpl,
        [UMTX_OP_SEM_WAKE]      = __umtx_op_unimpl,
#endif
        [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private32,
        [UMTX_OP_MUTEX_WAKE2]   = __umtx_op_wake2_umutex,
        [UMTX_OP_SEM2_WAIT]     = __umtx_op_sem2_wait_compat32,
        [UMTX_OP_SEM2_WAKE]     = __umtx_op_sem2_wake,
+       [UMTX_OP_SHM]           = __umtx_op_shm,
};

int
freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
{
        if ((unsigned)uap->op <
nitems(op_table_compat32)) { return (*op_table_compat32[uap->op])(td, (struct _umtx_op_args *)uap); } return (EINVAL); } #endif void umtx_thread_init(struct thread *td) { td->td_umtxq = umtxq_alloc(); td->td_umtxq->uq_thread = td; } void umtx_thread_fini(struct thread *td) { umtxq_free(td->td_umtxq); } /* * It will be called when new thread is created, e.g fork(). */ void umtx_thread_alloc(struct thread *td) { struct umtx_q *uq; uq = td->td_umtxq; uq->uq_inherited_pri = PRI_MAX; KASSERT(uq->uq_flags == 0, ("uq_flags != 0")); KASSERT(uq->uq_thread == td, ("uq_thread != td")); KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL")); KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty")); } /* * exec() hook. */ static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, struct image_params *imgp __unused) { umtx_thread_cleanup(curthread); } /* * thread_exit() hook. */ void umtx_thread_exit(struct thread *td) { umtx_thread_cleanup(td); } /* * clean up umtx data. */ static void umtx_thread_cleanup(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; if ((uq = td->td_umtxq) == NULL) return; mtx_lock(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } mtx_unlock(&umtx_lock); thread_lock(td); sched_lend_user_prio(td, PRI_MAX); thread_unlock(td); } Index: head/sys/kern/uipc_shm.c =================================================================== --- head/sys/kern/uipc_shm.c (revision 296161) +++ head/sys/kern/uipc_shm.c (revision 296162) @@ -1,1085 +1,1080 @@ /*- * Copyright (c) 2006, 2011 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for shared swap-backed anonymous memory objects via * shm_open(2) and shm_unlink(2). While most of the implementation is * here, vm_mmap.c contains mapping logic changes. * * TODO: * * (1) Need to export data to a userland tool via a sysctl. Should ipcs(1) * and ipcrm(1) be expanded or should new tools to manage both POSIX * kernel semaphores and POSIX shared memory be written? * * (2) Add support for this file type to fstat(1). * * (3) Resource limits? Does this need its own resource limits or are the * existing limits in mmap(2) sufficient? 
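
Since this file carries the whole kernel side of the interface, a quick refresher on the userland contract it serves (standard POSIX usage, nothing new in this change):

#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static int
make_shared_region(size_t len, void **out)
{
	void *p;
	int fd;

	fd = shm_open("/example", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		return (-1);
	/* ftruncate() on a shm fd ends up in shm_dotruncate() below. */
	if (ftruncate(fd, len) == -1 ||
	    (p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
	    fd, 0)) == MAP_FAILED) {
		close(fd);
		return (-1);
	}
	close(fd);	/* the mapping keeps the object referenced */
	*out = p;
	return (0);
}
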
*/ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct shm_mapping { char *sm_path; Fnv32_t sm_fnv; struct shmfd *sm_shmfd; LIST_ENTRY(shm_mapping) sm_link; }; static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor"); static LIST_HEAD(, shm_mapping) *shm_dictionary; static struct sx shm_dict_lock; static struct mtx shm_timestamp_lock; static u_long shm_hash; static struct unrhdr *shm_ino_unr; static dev_t shm_dev_ino; #define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash]) -static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); -static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); static void shm_init(void *arg); -static void shm_drop(struct shmfd *shmfd); -static struct shmfd *shm_hold(struct shmfd *shmfd); static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd); static struct shmfd *shm_lookup(char *path, Fnv32_t fnv); static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred); -static int shm_dotruncate(struct shmfd *shmfd, off_t length); static fo_rdwr_t shm_read; static fo_rdwr_t shm_write; static fo_truncate_t shm_truncate; static fo_stat_t shm_stat; static fo_close_t shm_close; static fo_chmod_t shm_chmod; static fo_chown_t shm_chown; static fo_seek_t shm_seek; static fo_fill_kinfo_t shm_fill_kinfo; static fo_mmap_t shm_mmap; /* File descriptor operations. */ -static struct fileops shm_ops = { +struct fileops shm_ops = { .fo_read = shm_read, .fo_write = shm_write, .fo_truncate = shm_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = shm_stat, .fo_close = shm_close, .fo_chmod = shm_chmod, .fo_chown = shm_chown, .fo_sendfile = vn_sendfile, .fo_seek = shm_seek, .fo_fill_kinfo = shm_fill_kinfo, .fo_mmap = shm_mmap, .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE }; FEATURE(posix_shm, "POSIX shared memory"); static int uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio) { vm_page_t m; vm_pindex_t idx; size_t tlen; int error, offset, rv; idx = OFF_TO_IDX(uio->uio_offset); offset = uio->uio_offset & PAGE_MASK; tlen = MIN(PAGE_SIZE - offset, len); VM_OBJECT_WLOCK(obj); /* * Read I/O without either a corresponding resident page or swap * page: use zero_region. This is intended to avoid instantiating * pages on read from a sparse region. */ if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL && !vm_pager_has_page(obj, idx, NULL, NULL)) { VM_OBJECT_WUNLOCK(obj); return (uiomove(__DECONST(void *, zero_region), tlen, uio)); } /* * Parallel reads of the page content from disk are prevented * by exclusive busy. * * Although the tmpfs vnode lock is held here, it is * nonetheless safe to sleep waiting for a free page. The * pageout daemon does not need to acquire the tmpfs vnode * lock to page out tobj's pages because tobj is a OBJT_SWAP * type object. 
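
The zero_region short-circuit above is user-visible: reading a hole in a swap-backed object returns zeroes without instantiating any pages. A small demonstration (a sketch; error handling collapsed into assert for brevity):

#include <sys/mman.h>
#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void
read_from_hole(void)
{
	static char buf[4096], zero[4096];
	int fd;

	fd = shm_open(SHM_ANON, O_RDWR, 0600);
	assert(fd != -1);
	assert(ftruncate(fd, 16 * 1024 * 1024) == 0);	/* sparse: no pages yet */
	assert(pread(fd, buf, sizeof(buf), 0) == (ssize_t)sizeof(buf));
	memset(zero, 0, sizeof(zero));
	assert(memcmp(buf, zero, sizeof(buf)) == 0);	/* served from zero_region */
	close(fd);
}
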
*/ m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL); if (m->valid != VM_PAGE_BITS_ALL) { if (vm_pager_has_page(obj, idx, NULL, NULL)) { rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL); if (rv != VM_PAGER_OK) { printf( "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n", obj, idx, m->valid, rv); vm_page_lock(m); vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); return (EIO); } } else vm_page_zero_invalid(m, TRUE); } vm_page_xunbusy(m); vm_page_lock(m); vm_page_hold(m); if (m->queue == PQ_NONE) { vm_page_deactivate(m); } else { /* Requeue to maintain LRU ordering. */ vm_page_requeue(m); } vm_page_unlock(m); VM_OBJECT_WUNLOCK(obj); error = uiomove_fromphys(&m, offset, tlen, uio); if (uio->uio_rw == UIO_WRITE && error == 0) { VM_OBJECT_WLOCK(obj); vm_page_dirty(m); vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(obj); } vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); return (error); } int uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio) { ssize_t resid; size_t len; int error; error = 0; while ((resid = uio->uio_resid) > 0) { if (obj_size <= uio->uio_offset) break; len = MIN(obj_size - uio->uio_offset, resid); if (len == 0) break; error = uiomove_object_page(obj, len, uio); if (error != 0 || resid == uio->uio_resid) break; } return (error); } static int shm_seek(struct file *fp, off_t offset, int whence, struct thread *td) { struct shmfd *shmfd; off_t foffset; int error; shmfd = fp->f_data; foffset = foffset_lock(fp, 0); error = 0; switch (whence) { case L_INCR: if (foffset < 0 || (offset > 0 && foffset > OFF_MAX - offset)) { error = EOVERFLOW; break; } offset += foffset; break; case L_XTND: if (offset > 0 && shmfd->shm_size > OFF_MAX - offset) { error = EOVERFLOW; break; } offset += shmfd->shm_size; break; case L_SET: break; default: error = EINVAL; } if (error == 0) { if (offset < 0 || offset > shmfd->shm_size) error = EINVAL; else td->td_uretoff.tdu_off = offset; } foffset_unlock(fp, offset, error != 0 ? 
FOF_NOUPDATE : 0); return (error); } static int shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct shmfd *shmfd; void *rl_cookie; int error; shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif foffset_lock_uio(fp, uio, flags); if ((flags & FOF_OFFSET) == 0) { rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX, &shmfd->shm_mtx); } else { rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset, uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx); } error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio); rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx); foffset_unlock_uio(fp, uio, flags); return (error); } static int shm_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif return (shm_dotruncate(shmfd, length)); } static int shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; #ifdef MAC int error; #endif shmfd = fp->f_data; #ifdef MAC error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a memory file * descriptor. */ bzero(sb, sizeof(*sb)); sb->st_blksize = PAGE_SIZE; sb->st_size = shmfd->shm_size; sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize; mtx_lock(&shm_timestamp_lock); sb->st_atim = shmfd->shm_atime; sb->st_ctim = shmfd->shm_ctime; sb->st_mtim = shmfd->shm_mtime; sb->st_birthtim = shmfd->shm_birthtime; sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */ sb->st_uid = shmfd->shm_uid; sb->st_gid = shmfd->shm_gid; mtx_unlock(&shm_timestamp_lock); sb->st_dev = shm_dev_ino; sb->st_ino = shmfd->shm_ino; return (0); } static int shm_close(struct file *fp, struct thread *td) { struct shmfd *shmfd; shmfd = fp->f_data; fp->f_data = NULL; shm_drop(shmfd); return (0); } -static int +int shm_dotruncate(struct shmfd *shmfd, off_t length) { vm_object_t object; vm_page_t m; vm_pindex_t idx, nobjsize; vm_ooffset_t delta; int base, rv; object = shmfd->shm_object; VM_OBJECT_WLOCK(object); if (length == shmfd->shm_size) { VM_OBJECT_WUNLOCK(object); return (0); } nobjsize = OFF_TO_IDX(length + PAGE_MASK); /* Are we shrinking? If so, trim the end. */ if (length < shmfd->shm_size) { /* * Disallow any requests to shrink the size if this * object is mapped into the kernel. */ if (shmfd->shm_kmappings > 0) { VM_OBJECT_WUNLOCK(object); return (EBUSY); } /* * Zero the truncated part of the last page.
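
A worked example of the partial-page arithmetic that follows: with PAGE_SIZE 4096, truncating to length 10000 leaves page index OFF_TO_IDX(10000) == 2 as the last page, and base = 10000 & PAGE_MASK == 1808, so bytes [1808, 4096) of that page are zeroed while pages 3 and up are removed outright. In plain C:

#include <stdio.h>

int
main(void)
{
	const unsigned page_size = 4096, page_mask = page_size - 1;
	unsigned length = 10000;
	unsigned base = length & page_mask;	/* 1808: first stale byte */
	unsigned idx = length / page_size;	/* 2: last page kept */

	printf("zero [%u, %u) of page %u\n", base, page_size, idx);
	return (0);
}
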
*/ base = length & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(length); retry: m = vm_page_lookup(object, idx); if (m != NULL) { if (vm_page_sleep_if_busy(m, "shmtrc")) goto retry; } else if (vm_pager_has_page(object, idx, NULL, NULL)) { m = vm_page_alloc(object, idx, VM_ALLOC_NORMAL); if (m == NULL) { VM_OBJECT_WUNLOCK(object); VM_WAIT; VM_OBJECT_WLOCK(object); goto retry; } else if (m->valid != VM_PAGE_BITS_ALL) rv = vm_pager_get_pages(object, &m, 1, NULL, NULL); else /* A cached page was reactivated. */ rv = VM_PAGER_OK; vm_page_lock(m); if (rv == VM_PAGER_OK) { vm_page_deactivate(m); vm_page_unlock(m); vm_page_xunbusy(m); } else { vm_page_free(m); vm_page_unlock(m); VM_OBJECT_WUNLOCK(object); return (EIO); } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); KASSERT(m->valid == VM_PAGE_BITS_ALL, ("shm_dotruncate: page %p is invalid", m)); vm_page_dirty(m); vm_pager_page_unswapped(m); } } delta = ptoa(object->size - nobjsize); /* Toss in memory pages. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, 0); /* Toss pages from swap. */ if (object->type == OBJT_SWAP) swap_pager_freespace(object, nobjsize, delta); /* Free the swap accounted for shm */ swap_release_by_cred(delta, object->cred); object->charge -= delta; } else { /* Attempt to reserve the swap */ delta = ptoa(nobjsize - object->size); if (!swap_reserve_by_cred(delta, object->cred)) { VM_OBJECT_WUNLOCK(object); return (ENOMEM); } object->charge += delta; } shmfd->shm_size = length; mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_ctime); shmfd->shm_mtime = shmfd->shm_ctime; mtx_unlock(&shm_timestamp_lock); object->size = nobjsize; VM_OBJECT_WUNLOCK(object); return (0); } /* * shmfd object management including creation and reference counting * routines. */ -static struct shmfd * +struct shmfd * shm_alloc(struct ucred *ucred, mode_t mode) { struct shmfd *shmfd; int ino; shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO); shmfd->shm_size = 0; shmfd->shm_uid = ucred->cr_uid; shmfd->shm_gid = ucred->cr_gid; shmfd->shm_mode = mode; shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred); KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate")); shmfd->shm_object->pg_color = 0; VM_OBJECT_WLOCK(shmfd->shm_object); vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING); vm_object_set_flag(shmfd->shm_object, OBJ_COLORED | OBJ_NOSPLIT); VM_OBJECT_WUNLOCK(shmfd->shm_object); vfs_timestamp(&shmfd->shm_birthtime); shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime = shmfd->shm_birthtime; ino = alloc_unr(shm_ino_unr); if (ino == -1) shmfd->shm_ino = 0; else shmfd->shm_ino = ino; refcount_init(&shmfd->shm_refs, 1); mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF); rangelock_init(&shmfd->shm_rl); #ifdef MAC mac_posixshm_init(shmfd); mac_posixshm_create(ucred, shmfd); #endif return (shmfd); } -static struct shmfd * +struct shmfd * shm_hold(struct shmfd *shmfd) { refcount_acquire(&shmfd->shm_refs); return (shmfd); } -static void +void shm_drop(struct shmfd *shmfd) { if (refcount_release(&shmfd->shm_refs)) { #ifdef MAC mac_posixshm_destroy(shmfd); #endif rangelock_destroy(&shmfd->shm_rl); mtx_destroy(&shmfd->shm_mtx); vm_object_deallocate(shmfd->shm_object); if (shmfd->shm_ino != 0) free_unr(shm_ino_unr, shmfd->shm_ino); free(shmfd, M_SHMFD); } } /* * Determine if the credentials have sufficient permissions for a * specified combination of FREAD and FWRITE. 
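
One step back before the permission check: shm_alloc(), shm_hold() and shm_drop() above are now exported (their static qualifiers were dropped) so the new UMTX_OP_SHM code can manage shmfd lifetimes directly. The usage contract, as an in-kernel sketch (the example_* names are hypothetical):

static struct shmfd *
example_grab(struct shmfd *shmfd)
{
	/* Every shm_hold() must be paired with exactly one shm_drop(). */
	return (shm_hold(shmfd));
}

static void
example_release(struct shmfd *shmfd)
{
	/*
	 * The final shm_drop() destroys the range lock and mutex and
	 * drops the last reference on the backing VM object.
	 */
	shm_drop(shmfd);
}
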
*/ -static int +int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags) { accmode_t accmode; int error; accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; mtx_lock(&shm_timestamp_lock); error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, accmode, ucred, NULL); mtx_unlock(&shm_timestamp_lock); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to shmfd objects. We use the FNV hash on the path to store * the mappings in a hash table. */ static void shm_init(void *arg) { mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF); sx_init(&shm_dict_lock, "shm dictionary"); shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash); shm_ino_unr = new_unrhdr(1, INT32_MAX, NULL); KASSERT(shm_ino_unr != NULL, ("shm fake inodes not initialized")); shm_dev_ino = devfs_alloc_cdp_inode(); KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized")); } SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL); static struct shmfd * shm_lookup(char *path, Fnv32_t fnv) { struct shm_mapping *map; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) return (map->sm_shmfd); } return (NULL); } static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd) { struct shm_mapping *map; map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK); map->sm_path = path; map->sm_fnv = fnv; map->sm_shmfd = shm_hold(shmfd); shmfd->shm_path = path; LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link); } static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct shm_mapping *map; int error; LIST_FOREACH(map, SHM_HASH(fnv), sm_link) { if (map->sm_fnv != fnv) continue; if (strcmp(map->sm_path, path) == 0) { #ifdef MAC error = mac_posixshm_check_unlink(ucred, map->sm_shmfd); if (error) return (error); #endif error = shm_access(map->sm_shmfd, ucred, FREAD | FWRITE); if (error) return (error); map->sm_shmfd->shm_path = NULL; LIST_REMOVE(map, sm_link); shm_drop(map->sm_shmfd); free(map->sm_path, M_SHMFD); free(map, M_SHMFD); return (0); } } return (ENOENT); } int kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode, struct filecaps *fcaps) { struct filedesc *fdp; struct shmfd *shmfd; struct file *fp; char *path; Fnv32_t fnv; mode_t cmode; int fd, error; #ifdef CAPABILITY_MODE /* * shm_open(2) is only allowed for anonymous objects. */ if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON)) return (ECAPMODE); #endif if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return (EINVAL); if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0) return (EINVAL); fdp = td->td_proc->p_fd; cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps); if (error) return (error); /* A SHM_ANON path pointer creates an anonymous object. */ if (userpath == SHM_ANON) { /* A read-only anonymous object is pointless. */ if ((flags & O_ACCMODE) == O_RDONLY) { fdclose(td, fp, fd); fdrop(fp, td); return (EINVAL); } shmfd = shm_alloc(td->td_ucred, cmode); } else { path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK); error = copyinstr(userpath, path, MAXPATHLEN, NULL); #ifdef KTRACE if (error == 0 && KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif /* Require paths to start with a '/' character. 
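
The dictionary above addresses its buckets with an FNV-1 hash of the path masked by shm_hash, the bucket-count-minus-one mask that hashinit(9) returns. An illustration of the addressing (example_bucket is hypothetical; the hash and mask usage match shm_lookup() and SHM_HASH()):

#include <sys/param.h>
#include <sys/fnv_hash.h>

static u_long
example_bucket(char *path, u_long mask)
{
	Fnv32_t fnv;

	fnv = fnv_32_str(path, FNV1_32_INIT);	/* same hash as shm_lookup() */
	return (fnv & mask);	/* index into shm_dictionary, cf. SHM_HASH() */
}
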
*/ if (error == 0 && path[0] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_SHMFD); return (error); } fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); shmfd = shm_lookup(path, fnv); if (shmfd == NULL) { /* Object does not yet exist, create it if requested. */ if (flags & O_CREAT) { #ifdef MAC error = mac_posixshm_check_create(td->td_ucred, path); if (error == 0) { #endif shmfd = shm_alloc(td->td_ucred, cmode); shm_insert(path, fnv, shmfd); #ifdef MAC } #endif } else { free(path, M_SHMFD); error = ENOENT; } } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ free(path, M_SHMFD); if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, shmfd, FFLAGS(flags & O_ACCMODE)); if (error == 0) #endif error = shm_access(shmfd, td->td_ucred, FFLAGS(flags & O_ACCMODE)); } /* * Truncate the file back to zero length if * O_TRUNC was specified and the object was * opened with read/write. */ if (error == 0 && (flags & (O_ACCMODE | O_TRUNC)) == (O_RDWR | O_TRUNC)) { #ifdef MAC error = mac_posixshm_check_truncate( td->td_ucred, fp->f_cred, shmfd); if (error == 0) #endif shm_dotruncate(shmfd, 0); } if (error == 0) shm_hold(shmfd); } sx_xunlock(&shm_dict_lock); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } } finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* System calls. */ int sys_shm_open(struct thread *td, struct shm_open_args *uap) { return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL)); } int sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap) { char *path; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = copyinstr(uap->path, path, MAXPATHLEN, NULL); if (error) { free(path, M_TEMP); return (error); } #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(path); #endif fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&shm_dict_lock); error = shm_remove(path, fnv, td->td_ucred); sx_xunlock(&shm_dict_lock); free(path, M_TEMP); return (error); } int shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize, vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff, struct thread *td) { struct shmfd *shmfd; vm_prot_t maxprot; int error; shmfd = fp->f_data; maxprot = VM_PROT_NONE; /* FREAD should always be set. */ if ((fp->f_flag & FREAD) != 0) maxprot |= VM_PROT_EXECUTE | VM_PROT_READ; if ((fp->f_flag & FWRITE) != 0) maxprot |= VM_PROT_WRITE; /* Don't permit shared writable mappings on read-only descriptors. */ if ((flags & MAP_SHARED) != 0 && (maxprot & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); maxprot &= cap_maxprot; #ifdef MAC error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags); if (error != 0) return (error); #endif /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. 
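
Stepping back to kern_shm_open() above: the flag handling gives shm_open(2) the familiar open(2) semantics for O_CREAT, O_EXCL and O_TRUNC. A userland sketch:

#include <sys/mman.h>
#include <errno.h>
#include <fcntl.h>

static int
create_fresh(const char *path)	/* must start with '/' */
{
	int fd;

	fd = shm_open(path, O_RDWR | O_CREAT | O_EXCL, 0600);
	if (fd == -1 && errno == EEXIST)
		/* Lost the race: open the existing object and empty it. */
		fd = shm_open(path, O_RDWR | O_TRUNC, 0);
	return (fd);
}
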
*/ if (foff >= shmfd->shm_size || foff + objsize > round_page(shmfd->shm_size)) return (EINVAL); mtx_lock(&shm_timestamp_lock); vfs_timestamp(&shmfd->shm_atime); mtx_unlock(&shm_timestamp_lock); vm_object_reference(shmfd->shm_object); error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags, shmfd->shm_object, foff, FALSE, td); if (error != 0) vm_object_deallocate(shmfd->shm_object); return (error); } static int shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); /* * SUSv4 says that x bits of permission need not be affected. * Be consistent with our shm_open there. */ #ifdef MAC error = mac_posixshm_check_setmode(active_cred, shmfd, mode); if (error != 0) goto out; #endif error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; shmfd->shm_mode = mode & ACCESSPERMS; out: mtx_unlock(&shm_timestamp_lock); return (error); } static int shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct shmfd *shmfd; int error; error = 0; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); #ifdef MAC error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = shmfd->shm_uid; if (gid == (gid_t)-1) gid = shmfd->shm_gid; if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) || (gid != shmfd->shm_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; shmfd->shm_uid = uid; shmfd->shm_gid = gid; out: mtx_unlock(&shm_timestamp_lock); return (error); } /* * Helper routines to allow the backing object of a shared memory file * descriptor to be mapped in the kernel. */ int shm_map(struct file *fp, size_t size, off_t offset, void **memp) { struct shmfd *shmfd; vm_offset_t kva, ofs; vm_object_t obj; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; obj = shmfd->shm_object; VM_OBJECT_WLOCK(obj); /* * XXXRW: This validation is probably insufficient, and subject to * sign errors. It should be fixed. */ if (offset >= shmfd->shm_size || offset + size > round_page(shmfd->shm_size)) { VM_OBJECT_WUNLOCK(obj); return (EINVAL); } shmfd->shm_kmappings++; vm_object_reference_locked(obj); VM_OBJECT_WUNLOCK(obj); /* Map the object into the kernel_map and wire it. */ kva = vm_map_min(kernel_map); ofs = offset & PAGE_MASK; offset = trunc_page(offset); size = round_page(size + ofs); rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0); if (rv == KERN_SUCCESS) { rv = vm_map_wire(kernel_map, kva, kva + size, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); if (rv == KERN_SUCCESS) { *memp = (void *)(kva + ofs); return (0); } vm_map_remove(kernel_map, kva, kva + size); } else vm_object_deallocate(obj); /* On failure, drop our mapping reference. */ VM_OBJECT_WLOCK(obj); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (vm_mmap_to_errno(rv)); } /* * We require the caller to unmap the entire entry. This allows us to * safely decrement shm_kmappings when a mapping is removed.
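
shm_map()/shm_unmap() above are the in-kernel counterpart of mmap() on a shm descriptor. To make the page math concrete: for offset 0x1234 and size 0x100, ofs = 0x234, the map entry covers trunc_page(0x1234) = 0x1000 through round_page(0x334) = one page, and the caller receives kva + 0x234. A hypothetical consumer (the helper name is illustrative; the API is real):

static int
example_kernel_peek(struct file *fp, off_t off, size_t len)
{
	void *mem;
	int error;

	error = shm_map(fp, len, off, &mem);	/* wires the backing pages */
	if (error != 0)
		return (error);
	/* ... access len bytes at mem ... */
	return (shm_unmap(fp, mem, len));	/* must cover the whole entry */
}
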
*/ int shm_unmap(struct file *fp, void *mem, size_t size) { struct shmfd *shmfd; vm_map_entry_t entry; vm_offset_t kva, ofs; vm_object_t obj; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; vm_map_t map; int rv; if (fp->f_type != DTYPE_SHM) return (EINVAL); shmfd = fp->f_data; kva = (vm_offset_t)mem; ofs = kva & PAGE_MASK; kva = trunc_page(kva); size = round_page(size + ofs); map = kernel_map; rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry, &obj, &pindex, &prot, &wired); if (rv != KERN_SUCCESS) return (EINVAL); if (entry->start != kva || entry->end != kva + size) { vm_map_lookup_done(map, entry); return (EINVAL); } vm_map_lookup_done(map, entry); if (obj != shmfd->shm_object) return (EINVAL); vm_map_remove(map, kva, kva + size); VM_OBJECT_WLOCK(obj); KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped")); shmfd->shm_kmappings--; VM_OBJECT_WUNLOCK(obj); return (0); } static int shm_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct shmfd *shmfd; kif->kf_type = KF_TYPE_SHM; shmfd = fp->f_data; mtx_lock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode; /* XXX */ mtx_unlock(&shm_timestamp_lock); kif->kf_un.kf_file.kf_file_size = shmfd->shm_size; if (shmfd->shm_path != NULL) { sx_slock(&shm_dict_lock); if (shmfd->shm_path != NULL) strlcpy(kif->kf_path, shmfd->shm_path, sizeof(kif->kf_path)); sx_sunlock(&shm_dict_lock); } return (0); } Index: head/sys/sys/mman.h =================================================================== --- head/sys/sys/mman.h (revision 296161) +++ head/sys/sys/mman.h (revision 296162) @@ -1,271 +1,278 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)mman.h 8.2 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_MMAN_H_ #define _SYS_MMAN_H_ #include #include #if __BSD_VISIBLE /* * Inheritance for minherit() */ #define INHERIT_SHARE 0 #define INHERIT_COPY 1 #define INHERIT_NONE 2 #endif /* * Protections are chosen from these bits, or-ed together */ #define PROT_NONE 0x00 /* no permissions */ #define PROT_READ 0x01 /* pages can be read */ #define PROT_WRITE 0x02 /* pages can be written */ #define PROT_EXEC 0x04 /* pages can be executed */ /* * Flags contain sharing type and options. * Sharing types; choose one. */ #define MAP_SHARED 0x0001 /* share changes */ #define MAP_PRIVATE 0x0002 /* changes are private */ #if __BSD_VISIBLE #define MAP_COPY MAP_PRIVATE /* Obsolete */ #endif /* * Other flags */ #define MAP_FIXED 0x0010 /* map addr must be exactly as requested */ #if __BSD_VISIBLE #define MAP_RESERVED0020 0x0020 /* previously unimplemented MAP_RENAME */ #define MAP_RESERVED0040 0x0040 /* previously unimplemented MAP_NORESERVE */ #define MAP_RESERVED0080 0x0080 /* previously misimplemented MAP_INHERIT */ #define MAP_RESERVED0100 0x0100 /* previously unimplemented MAP_NOEXTEND */ #define MAP_HASSEMAPHORE 0x0200 /* region may contain semaphores */ #define MAP_STACK 0x0400 /* region grows down, like a stack */ #define MAP_NOSYNC 0x0800 /* page to but do not sync underlying file */ /* * Mapping type */ #define MAP_FILE 0x0000 /* map from file (default) */ #define MAP_ANON 0x1000 /* allocated from memory, swap space */ #ifndef _KERNEL #define MAP_ANONYMOUS MAP_ANON /* For compatibility. */ #endif /* !_KERNEL */ /* * Extended flags */ #define MAP_EXCL 0x00004000 /* for MAP_FIXED, fail if address is used */ #define MAP_NOCORE 0x00020000 /* dont include these pages in a coredump */ #define MAP_PREFAULT_READ 0x00040000 /* prefault mapping for reading */ #ifdef __LP64__ #define MAP_32BIT 0x00080000 /* map in the low 2GB of address space */ #endif /* * Request specific alignment (n == log2 of the desired alignment). * * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does * not enforce a specific alignment. 
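
Usage of the alignment-request macros defined just below: n is the log2 of the desired alignment, so a 2 MB-aligned anonymous mapping is requested with n = 21.

#include <sys/mman.h>

static void *
alloc_2m_aligned(size_t len)
{
	return (mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(21), -1, 0));
}
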
*/ #define MAP_ALIGNED(n) ((n) << MAP_ALIGNMENT_SHIFT) #define MAP_ALIGNMENT_SHIFT 24 #define MAP_ALIGNMENT_MASK MAP_ALIGNED(0xff) #define MAP_ALIGNED_SUPER MAP_ALIGNED(1) /* align on a superpage */ #endif /* __BSD_VISIBLE */ #if __POSIX_VISIBLE >= 199309 /* * Process memory locking */ #define MCL_CURRENT 0x0001 /* Lock only current memory */ #define MCL_FUTURE 0x0002 /* Lock all future memory as well */ #endif /* * Error return from mmap() */ #define MAP_FAILED ((void *)-1) /* * msync() flags */ #define MS_SYNC 0x0000 /* msync synchronously */ #define MS_ASYNC 0x0001 /* return immediately */ #define MS_INVALIDATE 0x0002 /* invalidate all cached data */ /* * Advice to madvise */ #define _MADV_NORMAL 0 /* no further special treatment */ #define _MADV_RANDOM 1 /* expect random page references */ #define _MADV_SEQUENTIAL 2 /* expect sequential page references */ #define _MADV_WILLNEED 3 /* will need these pages */ #define _MADV_DONTNEED 4 /* dont need these pages */ #if __BSD_VISIBLE #define MADV_NORMAL _MADV_NORMAL #define MADV_RANDOM _MADV_RANDOM #define MADV_SEQUENTIAL _MADV_SEQUENTIAL #define MADV_WILLNEED _MADV_WILLNEED #define MADV_DONTNEED _MADV_DONTNEED #define MADV_FREE 5 /* dont need these pages, and junk contents */ #define MADV_NOSYNC 6 /* try to avoid flushes to physical media */ #define MADV_AUTOSYNC 7 /* revert to default flushing strategy */ #define MADV_NOCORE 8 /* do not include these pages in a core file */ #define MADV_CORE 9 /* revert to including pages in a core file */ #define MADV_PROTECT 10 /* protect process from pageout kill */ /* * Return bits from mincore */ #define MINCORE_INCORE 0x1 /* Page is incore */ #define MINCORE_REFERENCED 0x2 /* Page has been referenced by us */ #define MINCORE_MODIFIED 0x4 /* Page has been modified by us */ #define MINCORE_REFERENCED_OTHER 0x8 /* Page has been referenced */ #define MINCORE_MODIFIED_OTHER 0x10 /* Page has been modified */ #define MINCORE_SUPER 0x20 /* Page is a "super" page */ /* * Anonymous object constant for shm_open(). */ #define SHM_ANON ((char *)1) #endif /* __BSD_VISIBLE */ /* * XXX missing POSIX_TYPED_MEM_* macros and * posix_typed_mem_info structure. */ #if __POSIX_VISIBLE >= 200112 #define POSIX_MADV_NORMAL _MADV_NORMAL #define POSIX_MADV_RANDOM _MADV_RANDOM #define POSIX_MADV_SEQUENTIAL _MADV_SEQUENTIAL #define POSIX_MADV_WILLNEED _MADV_WILLNEED #define POSIX_MADV_DONTNEED _MADV_DONTNEED #endif #ifndef _MODE_T_DECLARED typedef __mode_t mode_t; #define _MODE_T_DECLARED #endif #ifndef _OFF_T_DECLARED typedef __off_t off_t; #define _OFF_T_DECLARED #endif #ifndef _SIZE_T_DECLARED typedef __size_t size_t; #define _SIZE_T_DECLARED #endif #if defined(_KERNEL) || defined(_WANT_FILE) #include #include #include #include #include struct file; struct shmfd { size_t shm_size; vm_object_t shm_object; int shm_refs; uid_t shm_uid; gid_t shm_gid; mode_t shm_mode; int shm_kmappings; /* * Values maintained solely to make this a better-behaved file * descriptor for fstat() to run on. 
*/ struct timespec shm_atime; struct timespec shm_mtime; struct timespec shm_ctime; struct timespec shm_birthtime; ino_t shm_ino; struct label *shm_label; /* MAC label */ const char *shm_path; struct rangelock shm_rl; struct mtx shm_mtx; }; #endif #ifdef _KERNEL int shm_map(struct file *fp, size_t size, off_t offset, void **memp); int shm_unmap(struct file *fp, void *mem, size_t size); +int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags); +struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode); +struct shmfd *shm_hold(struct shmfd *shmfd); +void shm_drop(struct shmfd *shmfd); +int shm_dotruncate(struct shmfd *shmfd, off_t length); + +extern struct fileops shm_ops; #else /* !_KERNEL */ __BEGIN_DECLS /* * XXX not yet implemented: posix_mem_offset(), posix_typed_mem_get_info(), * posix_typed_mem_open(). */ #if __BSD_VISIBLE int getpagesizes(size_t *, int); int madvise(void *, size_t, int); int mincore(const void *, size_t, char *); int minherit(void *, size_t, int); #endif int mlock(const void *, size_t); #ifndef _MMAP_DECLARED #define _MMAP_DECLARED void * mmap(void *, size_t, int, int, int, off_t); #endif int mprotect(const void *, size_t, int); int msync(void *, size_t, int); int munlock(const void *, size_t); int munmap(void *, size_t); #if __POSIX_VISIBLE >= 200112 int posix_madvise(void *, size_t, int); #endif #if __POSIX_VISIBLE >= 199309 int mlockall(int); int munlockall(void); int shm_open(const char *, int, mode_t); int shm_unlink(const char *); #endif __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_MMAN_H_ */ Index: head/sys/sys/resource.h =================================================================== --- head/sys/sys/resource.h (revision 296161) +++ head/sys/sys/resource.h (revision 296162) @@ -1,185 +1,187 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)resource.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_RESOURCE_H_ #define _SYS_RESOURCE_H_ #include #include #include /* * Process priority specifications to get/setpriority. 
*/ #define PRIO_MIN -20 #define PRIO_MAX 20 #define PRIO_PROCESS 0 #define PRIO_PGRP 1 #define PRIO_USER 2 /* * Resource utilization information. * * All fields are only modified by curthread and * no locks are required to read. */ #define RUSAGE_SELF 0 #define RUSAGE_CHILDREN -1 #define RUSAGE_THREAD 1 struct rusage { struct timeval ru_utime; /* user time used */ struct timeval ru_stime; /* system time used */ long ru_maxrss; /* max resident set size */ #define ru_first ru_ixrss long ru_ixrss; /* integral shared memory size */ long ru_idrss; /* integral unshared data " */ long ru_isrss; /* integral unshared stack " */ long ru_minflt; /* page reclaims */ long ru_majflt; /* page faults */ long ru_nswap; /* swaps */ long ru_inblock; /* block input operations */ long ru_oublock; /* block output operations */ long ru_msgsnd; /* messages sent */ long ru_msgrcv; /* messages received */ long ru_nsignals; /* signals received */ long ru_nvcsw; /* voluntary context switches */ long ru_nivcsw; /* involuntary " */ #define ru_last ru_nivcsw }; #if __BSD_VISIBLE struct __wrusage { struct rusage wru_self; struct rusage wru_children; }; #endif /* * Resource limits */ #define RLIMIT_CPU 0 /* maximum cpu time in seconds */ #define RLIMIT_FSIZE 1 /* maximum file size */ #define RLIMIT_DATA 2 /* data size */ #define RLIMIT_STACK 3 /* stack size */ #define RLIMIT_CORE 4 /* core file size */ #define RLIMIT_RSS 5 /* resident set size */ #define RLIMIT_MEMLOCK 6 /* locked-in-memory address space */ #define RLIMIT_NPROC 7 /* number of processes */ #define RLIMIT_NOFILE 8 /* number of open files */ #define RLIMIT_SBSIZE 9 /* maximum size of all socket buffers */ #define RLIMIT_VMEM 10 /* virtual process size (incl. mmap) */ #define RLIMIT_AS RLIMIT_VMEM /* standard name for RLIMIT_VMEM */ #define RLIMIT_NPTS 11 /* pseudo-terminals */ #define RLIMIT_SWAP 12 /* swap used */ #define RLIMIT_KQUEUES 13 /* kqueues allocated */ +#define RLIMIT_UMTXP 14 /* process-shared umtx */ -#define RLIM_NLIMITS 14 /* number of resource limits */ +#define RLIM_NLIMITS 15 /* number of resource limits */ #define RLIM_INFINITY ((rlim_t)(((uint64_t)1 << 63) - 1)) /* XXX Missing: RLIM_SAVED_MAX, RLIM_SAVED_CUR */ /* * Resource limit string identifiers */ #ifdef _RLIMIT_IDENT static const char *rlimit_ident[RLIM_NLIMITS] = { "cpu", "fsize", "data", "stack", "core", "rss", "memlock", "nproc", "nofile", "sbsize", "vmem", "npts", "swap", "kqueues", + "umtx", }; #endif #ifndef _RLIM_T_DECLARED typedef __rlim_t rlim_t; #define _RLIM_T_DECLARED #endif struct rlimit { rlim_t rlim_cur; /* current (soft) limit */ rlim_t rlim_max; /* maximum value for rlim_cur */ }; #if __BSD_VISIBLE struct orlimit { __int32_t rlim_cur; /* current (soft) limit */ __int32_t rlim_max; /* maximum value for rlim_cur */ }; struct loadavg { __fixpt_t ldavg[3]; long fscale; }; #define CP_USER 0 #define CP_NICE 1 #define CP_SYS 2 #define CP_INTR 3 #define CP_IDLE 4 #define CPUSTATES 5 #endif /* __BSD_VISIBLE */ #ifdef _KERNEL extern struct loadavg averunnable; void read_cpu_time(long *cp_time); /* Writes array of CPUSTATES */ #else __BEGIN_DECLS /* XXX 2nd arg to [gs]etpriority() should be an id_t */ int getpriority(int, int); int getrlimit(int, struct rlimit *); int getrusage(int, struct rusage *); int setpriority(int, int, int); int setrlimit(int, const struct rlimit *); __END_DECLS #endif /* _KERNEL */ #endif /* !_SYS_RESOURCE_H_ */ Index: head/sys/sys/resourcevar.h =================================================================== --- head/sys/sys/resourcevar.h 
(revision 296161) +++ head/sys/sys/resourcevar.h (revision 296162) @@ -1,164 +1,166 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)resourcevar.h 8.4 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_RESOURCEVAR_H_ #define _SYS_RESOURCEVAR_H_ #include #include #ifdef _KERNEL #include #include #endif /* * Kernel per-process accounting / statistics * (not necessarily resident except when running). * * Locking key: * b - created at fork, never changes * c - locked by proc mtx * k - only accessed by curthread * w - locked by proc itim lock * w2 - locked by proc prof lock */ struct pstats { #define pstat_startzero p_cru struct rusage p_cru; /* Stats for reaped children. */ struct itimerval p_timer[3]; /* (w) Virtual-time timers. */ #define pstat_endzero pstat_startcopy #define pstat_startcopy p_prof struct uprof { /* Profile arguments. */ caddr_t pr_base; /* (c + w2) Buffer base. */ u_long pr_size; /* (c + w2) Buffer size. */ u_long pr_off; /* (c + w2) PC offset. */ u_long pr_scale; /* (c + w2) PC scaling. */ } p_prof; #define pstat_endcopy p_start struct timeval p_start; /* (b) Starting time. */ }; #ifdef _KERNEL /* * Kernel shareable process resource limits. Because this structure * is moderately large but changes infrequently, it is normally * shared copy-on-write after forks. */ struct plimit { struct rlimit pl_rlimit[RLIM_NLIMITS]; int pl_refcnt; /* number of references */ }; struct racct; /*- * Per uid resource consumption. This structure is used to track * the total resource consumption (process count, socket buffer size, * etc) for the uid and impose limits. 
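
The RLIMIT_UMTXP limit added above is consumed like any other resource limit; the per-uid counter it caps (ui_umtxcnt) and its chgumtxcnt() helper appear below. A userland sketch:

#include <sys/resource.h>

static int
raise_umtx_limit(rlim_t n)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_UMTXP, &rl) == -1)
		return (-1);
	rl.rlim_cur = n < rl.rlim_max ? n : rl.rlim_max;
	return (setrlimit(RLIMIT_UMTXP, &rl));
}
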
* * Locking guide: * (a) Constant from inception * (b) Lockless, updated using atomics * (c) Locked by global uihashtbl_lock * (d) Locked by the ui_vmsize_mtx */ struct uidinfo { LIST_ENTRY(uidinfo) ui_hash; /* (c) hash chain of uidinfos */ struct mtx ui_vmsize_mtx; vm_ooffset_t ui_vmsize; /* (d) swap reservation by uid */ long ui_sbsize; /* (b) socket buffer space consumed */ long ui_proccnt; /* (b) number of processes */ long ui_ptscnt; /* (b) number of pseudo-terminals */ long ui_kqcnt; /* (b) number of kqueues */ + long ui_umtxcnt; /* (b) number of shared umtxs */ uid_t ui_uid; /* (a) uid */ u_int ui_ref; /* (b) reference count */ #ifdef RACCT struct racct *ui_racct; /* (a) resource accounting */ #endif }; #define UIDINFO_VMSIZE_LOCK(ui) mtx_lock(&((ui)->ui_vmsize_mtx)) #define UIDINFO_VMSIZE_UNLOCK(ui) mtx_unlock(&((ui)->ui_vmsize_mtx)) struct proc; struct rusage_ext; struct thread; void addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks); void addupc_task(struct thread *td, uintfptr_t pc, u_int ticks); void calccru(struct proc *p, struct timeval *up, struct timeval *sp); void calcru(struct proc *p, struct timeval *up, struct timeval *sp); int chgkqcnt(struct uidinfo *uip, int diff, rlim_t max); int chgproccnt(struct uidinfo *uip, int diff, rlim_t maxval); int chgsbsize(struct uidinfo *uip, u_int *hiwat, u_int to, rlim_t maxval); int chgptscnt(struct uidinfo *uip, int diff, rlim_t maxval); +int chgumtxcnt(struct uidinfo *uip, int diff, rlim_t maxval); int fuswintr(void *base); int kern_proc_setrlimit(struct thread *td, struct proc *p, u_int which, struct rlimit *limp); struct plimit *lim_alloc(void); void lim_copy(struct plimit *dst, struct plimit *src); rlim_t lim_cur(struct thread *td, int which); rlim_t lim_cur_proc(struct proc *p, int which); void lim_fork(struct proc *p1, struct proc *p2); void lim_free(struct plimit *limp); struct plimit *lim_hold(struct plimit *limp); rlim_t lim_max(struct thread *td, int which); rlim_t lim_max_proc(struct proc *p, int which); void lim_rlimit(struct thread *td, int which, struct rlimit *rlp); void lim_rlimit_proc(struct proc *p, int which, struct rlimit *rlp); void ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2, struct rusage_ext *rux2); void rucollect(struct rusage *ru, struct rusage *ru2); void rufetch(struct proc *p, struct rusage *ru); void rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up, struct timeval *sp); void rufetchtd(struct thread *td, struct rusage *ru); void ruxagg(struct proc *p, struct thread *td); int suswintr(void *base, int word); struct uidinfo *uifind(uid_t uid); void uifree(struct uidinfo *uip); void uihashinit(void); void uihold(struct uidinfo *uip); #ifdef RACCT void ui_racct_foreach(void (*callback)(struct racct *racct, void *arg2, void *arg3), void (*pre)(void), void (*post)(void), void *arg2, void *arg3); #endif #endif /* _KERNEL */ #endif /* !_SYS_RESOURCEVAR_H_ */ Index: head/sys/sys/umtx.h =================================================================== --- head/sys/sys/umtx.h (revision 296161) +++ head/sys/sys/umtx.h (revision 296162) @@ -1,166 +1,174 @@ /*- * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ * */ #ifndef _SYS_UMTX_H_ #define _SYS_UMTX_H_ #include #define USYNC_PROCESS_SHARED 0x0001 /* Process shared sync objs */ #define UMUTEX_UNOWNED 0x0 #define UMUTEX_CONTESTED 0x80000000U #define UMUTEX_PRIO_INHERIT 0x0004 /* Priority inherited mutex */ #define UMUTEX_PRIO_PROTECT 0x0008 /* Priority protect mutex */ /* urwlock flags */ #define URWLOCK_PREFER_READER 0x0002 #define URWLOCK_WRITE_OWNER 0x80000000U #define URWLOCK_WRITE_WAITERS 0x40000000U #define URWLOCK_READ_WAITERS 0x20000000U #define URWLOCK_MAX_READERS 0x1fffffffU #define URWLOCK_READER_COUNT(c) ((c) & URWLOCK_MAX_READERS) /* _usem flags */ #define SEM_NAMED 0x0002 /* _usem2 count field */ #define USEM_HAS_WAITERS 0x80000000U #define USEM_MAX_COUNT 0x7fffffffU #define USEM_COUNT(c) ((c) & USEM_MAX_COUNT) /* op code for _umtx_op */ #define UMTX_OP_RESERVED0 0 #define UMTX_OP_RESERVED1 1 #define UMTX_OP_WAIT 2 #define UMTX_OP_WAKE 3 #define UMTX_OP_MUTEX_TRYLOCK 4 #define UMTX_OP_MUTEX_LOCK 5 #define UMTX_OP_MUTEX_UNLOCK 6 #define UMTX_OP_SET_CEILING 7 #define UMTX_OP_CV_WAIT 8 #define UMTX_OP_CV_SIGNAL 9 #define UMTX_OP_CV_BROADCAST 10 #define UMTX_OP_WAIT_UINT 11 #define UMTX_OP_RW_RDLOCK 12 #define UMTX_OP_RW_WRLOCK 13 #define UMTX_OP_RW_UNLOCK 14 #define UMTX_OP_WAIT_UINT_PRIVATE 15 #define UMTX_OP_WAKE_PRIVATE 16 #define UMTX_OP_MUTEX_WAIT 17 #define UMTX_OP_MUTEX_WAKE 18 /* deprecated */ #define UMTX_OP_SEM_WAIT 19 /* deprecated */ #define UMTX_OP_SEM_WAKE 20 /* deprecated */ #define UMTX_OP_NWAKE_PRIVATE 21 #define UMTX_OP_MUTEX_WAKE2 22 #define UMTX_OP_SEM2_WAIT 23 #define UMTX_OP_SEM2_WAKE 24 +#define UMTX_OP_SHM 25 /* Flags for UMTX_OP_CV_WAIT */ #define CVWAIT_CHECK_UNPARKING 0x01 #define CVWAIT_ABSTIME 0x02 #define CVWAIT_CLOCKID 0x04 #define UMTX_ABSTIME 0x01 #define UMTX_CHECK_UNPARKING CVWAIT_CHECK_UNPARKING +/* Flags for UMTX_OP_SHM */ +#define UMTX_SHM_CREAT 0x0001 +#define UMTX_SHM_LOOKUP 0x0002 +#define UMTX_SHM_DESTROY 0x0004 +#define UMTX_SHM_ALIVE 0x0008 + #ifndef _KERNEL int _umtx_op(void *obj, int op, u_long val, void *uaddr, void *uaddr2); #else /* * The umtx_key structure is used by both the Linux futex code and the * umtx implementation to map userland addresses to unique keys. 
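
A plausible userland sketch of the new UMTX_OP_SHM operation (the flag names above are real, but the argument conventions live in the umtx_shm() implementation in kern_umtx.c, which is outside this excerpt; treat the uaddr/val assignment below as an assumption): val carries one UMTX_SHM_* flag and uaddr identifies the shared word backing the process-shared object.

#include <sys/types.h>
#include <sys/umtx.h>

static int
pshared_create(void *key_addr)
{
	/* On success the kernel is expected to return a shm descriptor. */
	return (_umtx_op(NULL, UMTX_OP_SHM, UMTX_SHM_CREAT, key_addr, NULL));
}
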
*/ enum { TYPE_SIMPLE_WAIT, TYPE_CV, TYPE_SEM, TYPE_SIMPLE_LOCK, TYPE_NORMAL_UMUTEX, TYPE_PI_UMUTEX, TYPE_PP_UMUTEX, TYPE_RWLOCK, - TYPE_FUTEX + TYPE_FUTEX, + TYPE_SHM, }; /* Key to represent a unique userland synchronous object */ struct umtx_key { int hash; int type; int shared; union { struct { struct vm_object *object; uintptr_t offset; } shared; struct { struct vmspace *vs; uintptr_t addr; } private; struct { void *a; uintptr_t b; } both; } info; }; #define THREAD_SHARE 0 #define PROCESS_SHARE 1 #define AUTO_SHARE 2 struct thread; static inline int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2) { return (k1->type == k2->type && k1->info.both.a == k2->info.both.a && k1->info.both.b == k2->info.both.b); } int umtx_copyin_timeout(const void *, struct timespec *); int umtx_key_get(const void *, int, int, struct umtx_key *); void umtx_key_release(struct umtx_key *); struct umtx_q *umtxq_alloc(void); void umtxq_free(struct umtx_q *); int kern_umtx_wake(struct thread *, void *, int, int); void umtx_pi_adjust(struct thread *, u_char); void umtx_thread_init(struct thread *); void umtx_thread_fini(struct thread *); void umtx_thread_alloc(struct thread *); void umtx_thread_exit(struct thread *); #endif /* !_KERNEL */ #endif /* !_SYS_UMTX_H_ */ Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 296161) +++ head/sys/vm/vm_object.c (revision 296162) @@ -1,2627 +1,2632 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio); static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags); static void vm_object_qcollapse(vm_object_t object); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. 
* * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static long object_collapses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, 0, "VM object collapses"); static long object_bypasses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, 0, "VM object bypasses"); static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(object->ref_count == 0, ("object %p ref_count = %d", object, object->ref_count)); KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages in its memq", object)); KASSERT(vm_radix_is_empty(&object->rtree), ("object %p has resident pages in its trie", object)); #if VM_NRESERVLEVEL > 0 KASSERT(LIST_EMPTY(&object->rvq), ("object %p has reservations", object)); #endif KASSERT(vm_object_cache_is_empty(object), ("object %p has cached pages", object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); KASSERT(object->type == OBJT_DEAD, ("object %p has non-dead type %d", object, object->type)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; rw_init_flags(&object->lock, "vm object", RW_DUPOK | RW_NEW); /* These are true for any object that has been freed */ object->type = OBJT_DEAD; object->ref_count = 0; object->rtree.rt_root = 0; object->rtree.rt_flags = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; object->cache.rt_root = 0; object->cache.rt_flags = 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); return (0); } static void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->type = type; switch (type) { case OBJT_DEAD: panic("_vm_object_allocate: can't create OBJT_DEAD"); case OBJT_DEFAULT: case OBJT_SWAP: object->flags = OBJ_ONEMAPPING; break; case OBJT_DEVICE: case OBJT_SG: object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED; break; case OBJT_MGTDEVICE: object->flags = OBJ_FICTITIOUS; break; case OBJT_PHYS: object->flags = OBJ_UNMANAGED; break; case OBJT_VNODE: object->flags = 0; break; default: panic("_vm_object_allocate: type %d is undefined", type); } object->size = size; object->generation = 1; object->ref_count = 1; object->memattr = VM_MEMATTR_DEFAULT; object->cred = NULL; object->charge = 0; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; #if VM_NRESERVLEVEL > 0 LIST_INIT(&object->rvq); #endif + umtx_shm_object_init(object); } /* * vm_object_init: * * Initialize the VM objects module. 
*/ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); rw_init(&kernel_object->lock, "kernel vm object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); #if VM_NRESERVLEVEL > 0 kernel_object->flags |= OBJ_COLORED; kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif rw_init(&kmem_object->lock, "kmem vm object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); #if VM_NRESERVLEVEL > 0 kmem_object->flags |= OBJ_COLORED; kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS); #endif /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. */ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vm_radix_init(); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_ASSERT_WLOCKED(object); object->flags &= ~bits; } /* * Sets the default memory attribute for the specified object. Pages * that are allocated to this object are by default assigned this memory * attribute. * * Presently, this function must be called before any pages are allocated * to the object. In the future, this requirement may be relaxed for * "default" and "swap" objects. */ int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr) { VM_OBJECT_ASSERT_WLOCKED(object); switch (object->type) { case OBJT_DEFAULT: case OBJT_DEVICE: case OBJT_MGTDEVICE: case OBJT_PHYS: case OBJT_SG: case OBJT_SWAP: case OBJT_VNODE: if (!TAILQ_EMPTY(&object->memq)) return (KERN_FAILURE); break; case OBJT_DEAD: return (KERN_INVALID_ARGUMENT); default: panic("vm_object_set_memattr: object %p is of undefined type", object); } object->memattr = memattr; return (KERN_SUCCESS); } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_ASSERT_WLOCKED(object); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_ASSERT_WLOCKED(object); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PVM, waitid, 0); } } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. 
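 *
 *	A typical pairing, as a hedged sketch (caller code; names are
 *	illustrative only):
 *
 *		vm_object_reference(obj);
 *		... use obj without holding its lock ...
 *		vm_object_deallocate(obj);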
*/ void vm_object_reference(vm_object_t object) { if (object == NULL) return; VM_OBJECT_WLOCK(object); vm_object_reference_locked(object); VM_OBJECT_WUNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_ASSERT_WLOCKED(object); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vprint("vm_object_vndeallocate", vp); panic("vm_object_vndeallocate: bad object reference count"); } #endif + if (object->ref_count == 1) + umtx_shm_object_terminated(object); + /* * The test for text of vp vnode does not need a bypass to * reach right VV_TEXT there, since it is obtained from * object->handle. */ if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) { object->ref_count--; VM_OBJECT_WUNLOCK(object); /* vrele may need the vnode lock. */ vrele(vp); } else { vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vdrop(vp); VM_OBJECT_WLOCK(object); object->ref_count--; if (object->type == OBJT_DEAD) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); } else { if (object->ref_count == 0) VOP_UNSET_TEXT(vp); VM_OBJECT_WUNLOCK(object); vput(vp); } } } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. */ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; struct vnode *vp; while (object != NULL) { VM_OBJECT_WLOCK(object); if (object->type == OBJT_VNODE) { vm_object_vndeallocate(object); return; } KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. 
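		 *
		 * Concretely (a summary of the cases handled below): with
		 * one remaining reference and no shadows, OBJ_ONEMAPPING
		 * may be set again; with exactly one shadow, this object
		 * may instead be collapsed into that shadow.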
*/ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_WUNLOCK(object); return; } else if (object->ref_count == 1) { if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) { vp = object->un_pager.swp.swp_tmpfs; vhold(vp); VM_OBJECT_WUNLOCK(object); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VM_OBJECT_WLOCK(object); if (object->type == OBJT_DEAD || object->ref_count != 1) { VM_OBJECT_WUNLOCK(object); VOP_UNLOCK(vp, 0); vdrop(vp); return; } if ((object->flags & OBJ_TMPFS) != 0) VOP_UNSET_TEXT(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } if (object->shadow_count == 0 && object->handle == NULL && (object->type == OBJT_DEFAULT || (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS_NODE) == 0))) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object %p", object)); if (!VM_OBJECT_TRYWLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_WUNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_WUNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else if (object->paging_in_progress) { VM_OBJECT_WUNLOCK(robject); object->flags |= OBJ_PIPWNT; VM_OBJECT_SLEEP(object, object, PDROP | PVM, "objde2", 0); VM_OBJECT_WLOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_WLOCK(object); goto retry; } } else VM_OBJECT_WUNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_WUNLOCK(object); continue; } VM_OBJECT_WUNLOCK(robject); } VM_OBJECT_WUNLOCK(object); return; } doterm: + umtx_shm_object_terminated(object); temp = object->backing_object; if (temp != NULL) { KASSERT((object->flags & OBJ_TMPFS_NODE) == 0, ("shadowed tmpfs v_object 2 %p", object)); VM_OBJECT_WLOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; VM_OBJECT_WUNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_WUNLOCK(object); object = temp; } } /* * vm_object_destroy removes the object from the global object list * and frees the space for the object. */ void vm_object_destroy(vm_object_t object) { /* * Release the allocation charge. */ if (object->cred != NULL) { swap_release_by_cred(object->charge, object->cred); object->charge = 0; crfree(object->cred); object->cred = NULL; } /* * Free the space for the object. 
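	 * (The zone is created with UMA_ZONE_NOFREE, so the storage is
	 * type-stable: it is recycled, never unmapped.)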
*/ uma_zfree(obj_zone, object); } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p, p_next; VM_OBJECT_ASSERT_WLOCKED(object); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(object); vinvalbuf(vp, V_SAVE, 0, 0); VM_OBJECT_WLOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Free any remaining pageable pages. This also removes them from the * paging queues. However, don't free wired pages, just remove them * from the object. Rather than incrementally removing each page from * the object, the page and object are reset to any empty state. */ TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) { vm_page_assert_unbusied(p); vm_page_lock(p); /* * Optimize the page's removal from the object by resetting * its "object" field. Specifically, if the page is not * wired, then the effect of this assignment is that * vm_page_free()'s call to vm_page_remove() will return * immediately without modifying the page or the object. */ p->object = NULL; if (p->wire_count == 0) { vm_page_free(p); PCPU_INC(cnt.v_pfree); } vm_page_unlock(p); } /* * If the object contained any pages, then reset it to an empty state. * None of the object's fields, including "resident_page_count", were * modified by the preceding loop. */ if (object->resident_page_count != 0) { vm_radix_reclaim_allnodes(&object->rtree); TAILQ_INIT(&object->memq); object->resident_page_count = 0; if (object->type == OBJT_VNODE) vdrop(object->handle); } #if VM_NRESERVLEVEL > 0 if (__predict_false(!LIST_EMPTY(&object->rvq))) vm_reserv_break_all(object); #endif if (__predict_false(!vm_object_cache_is_empty(object))) vm_page_cache_free(object, 0, 0); KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT || object->type == OBJT_SWAP, ("%s: non-swap obj %p has cred", __func__, object)); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_WUNLOCK(object); vm_object_destroy(object); } /* * Make the page read-only so that we can clear the object flags. However, if * this is a nosync mmap then the object is likely to stay dirty so do not * mess with the page and do not clear the object flags. Returns TRUE if the * page should be flushed, and FALSE otherwise. */ static boolean_t vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags) { /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were not * cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) { *clearobjflags = FALSE; return (FALSE); } else { pmap_remove_write(p); return (p->dirty != 0); } } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. 
If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. * * Returns FALSE if some page from the range was not written, as * reported by the pager, and TRUE otherwise. */ boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags) { vm_page_t np, p; vm_pindex_t pi, tend, tstart; int curgeneration, n, pagerflags; boolean_t clearobjflags, eio, res; VM_OBJECT_ASSERT_WLOCKED(object); /* * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE * objects. The check below prevents the function from * operating on non-vnode objects. */ if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 || object->resident_page_count == 0) return (TRUE); pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0; tstart = OFF_TO_IDX(start); tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK); clearobjflags = tstart == 0 && tend >= object->size; res = TRUE; rescan: curgeneration = object->generation; for (p = vm_page_find_least(object, tstart); p != NULL; p = np) { pi = p->pindex; if (pi >= tend) break; np = TAILQ_NEXT(p, listq); if (p->valid == 0) continue; if (vm_page_sleep_if_busy(p, "vpcwai")) { if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } np = vm_page_find_least(object, pi); continue; } if (!vm_object_page_remove_write(p, flags, &clearobjflags)) continue; n = vm_object_page_collect_flush(object, p, pagerflags, flags, &clearobjflags, &eio); if (eio) { res = FALSE; clearobjflags = FALSE; } if (object->generation != curgeneration) { if ((flags & OBJPC_SYNC) != 0) goto rescan; else clearobjflags = FALSE; } /* * If the VOP_PUTPAGES() did a truncated write, so * that even the first page of the run is not fully * written, vm_pageout_flush() returns 0 as the run * length. Since the condition that caused truncated * write may be permanent, e.g. exhausted free space, * accepting n == 0 would cause an infinite loop. * * Forwarding the iterator leaves the unwritten page * behind, but there is not much we can do there if * filesystem refuses to write it. */ if (n == 0) { n = 1; clearobjflags = FALSE; } np = vm_page_find_least(object, pi + n); } #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? 
MNT_WAIT : 0); #endif if (clearobjflags) vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); return (res); } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags, int flags, boolean_t *clearobjflags, boolean_t *eio) { vm_page_t ma[vm_pageout_page_count], p_first, tp; int count, i, mreq, runlen; vm_page_lock_assert(p, MA_NOTOWNED); VM_OBJECT_ASSERT_WLOCKED(object); count = 1; mreq = 0; for (tp = p; count < vm_pageout_page_count; count++) { tp = vm_page_next(tp); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; } for (p_first = p; count < vm_pageout_page_count; count++) { tp = vm_page_prev(p_first); if (tp == NULL || vm_page_busied(tp)) break; if (!vm_object_page_remove_write(tp, flags, clearobjflags)) break; p_first = tp; mreq++; } for (tp = p_first, i = 0; i < count; tp = TAILQ_NEXT(tp, listq), i++) ma[i] = tp; vm_pageout_flush(ma, count, pagerflags, mreq, &runlen, eio); return (runlen); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * If the backing object is a device object with unmanaged pages, then any * mappings to the specified range of pages must be removed before this * function is called. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ boolean_t vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int error, flags, fsync_after; boolean_t res; if (object == NULL) return (TRUE); res = TRUE; error = 0; VM_OBJECT_WLOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_WLOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_WUNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { vp = object->handle; VM_OBJECT_WUNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (syncio && !invalidate && offset == 0 && OFF_TO_IDX(size) == object->size) { /* * If syncing the whole mapping of the file, * it is faster to schedule all the writes in * async mode, also allowing the clustering, * and then wait for i/o to complete. */ flags = 0; fsync_after = TRUE; } else { flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? 
(OBJPC_SYNC | OBJPC_INVAL) : 0; fsync_after = FALSE; } VM_OBJECT_WLOCK(object); res = vm_object_page_clean(object, offset, offset + size, flags); VM_OBJECT_WUNLOCK(object); if (fsync_after) error = VOP_FSYNC(vp, MNT_WAIT, curthread); VOP_UNLOCK(vp, 0); vn_finished_write(mp); if (error != 0) res = FALSE; VM_OBJECT_WLOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { if (object->type == OBJT_DEVICE) /* * The option OBJPR_NOTMAPPED must be passed here * because vm_object_page_remove() cannot remove * unmanaged mappings. */ flags = OBJPR_NOTMAPPED; else if (old_msync) flags = 0; else flags = OBJPR_CLEANONLY; vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); } VM_OBJECT_WUNLOCK(object); return (res); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end, int advise) { vm_pindex_t tpindex; vm_object_t backing_object, tobject; vm_page_t m; if (object == NULL) return; VM_OBJECT_WLOCK(object); /* * Locate and adjust resident pages */ for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; shadowlookup: /* * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages * and those pages must be OBJ_ONEMAPPING. */ if (advise == MADV_FREE) { if ((tobject->type != OBJT_DEFAULT && tobject->type != OBJT_SWAP) || (tobject->flags & OBJ_ONEMAPPING) == 0) { goto unlock_tobject; } } else if ((tobject->flags & OBJ_UNMANAGED) != 0) goto unlock_tobject; m = vm_page_lookup(tobject, tpindex); if (m == NULL && advise == MADV_WILLNEED) { /* * If the page is cached, reactivate it. */ m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED | VM_ALLOC_NOBUSY); } if (m == NULL) { /* * There may be swap even if there is no backing page */ if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); /* * next object */ backing_object = tobject->backing_object; if (backing_object == NULL) goto unlock_tobject; VM_OBJECT_WLOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_WUNLOCK(tobject); tobject = backing_object; goto shadowlookup; } else if (m->valid != VM_PAGE_BITS_ALL) goto unlock_tobject; /* * If the page is not in a normal state, skip it. */ vm_page_lock(m); if (m->hold_count != 0 || m->wire_count != 0) { vm_page_unlock(m); goto unlock_tobject; } KASSERT((m->flags & PG_FICTITIOUS) == 0, ("vm_object_madvise: page %p is fictitious", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("vm_object_madvise: page %p is not managed", m)); if (vm_page_busied(m)) { if (advise == MADV_WILLNEED) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. 
*/ vm_page_aflag_set(m, PGA_REFERENCED); } if (object != tobject) VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(tobject); vm_page_busy_sleep(m, "madvpo"); VM_OBJECT_WLOCK(object); goto relookup; } if (advise == MADV_WILLNEED) { vm_page_activate(m); } else { vm_page_advise(m, advise); } vm_page_unlock(m); if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); unlock_tobject: if (tobject != object) VM_OBJECT_WUNLOCK(tobject); } VM_OBJECT_WUNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_WLOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_WUNLOCK(source); return; } VM_OBJECT_WUNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, atop(length)); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_WLOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; #if VM_NRESERVLEVEL > 0 result->flags |= source->flags & OBJ_COLORED; result->pg_color = (source->pg_color + OFF_TO_IDX(*offset)) & ((1 << (VM_NFREEORDER - 1)) - 1); #endif VM_OBJECT_WUNLOCK(source); } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. */ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_WUNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. 
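	 * (No other thread can hold new_object's lock yet, so no lock
	 * order reversal is possible.)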
*/ VM_OBJECT_WLOCK(new_object); VM_OBJECT_WLOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_WLOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_WUNLOCK(source); VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_WLOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_WUNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } if (orig_object->cred != NULL) { new_object->cred = orig_object->cred; crhold(orig_object->cred); new_object->charge = ptoa(size); KASSERT(orig_object->charge >= ptoa(size), ("orig_object->charge < 0")); orig_object->charge -= ptoa(size); } retry: m = vm_page_find_least(orig_object, offidxstart); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_busied(m)) { VM_OBJECT_WUNLOCK(new_object); vm_page_lock(m); VM_OBJECT_WUNLOCK(orig_object); vm_page_busy_sleep(m, "spltwt"); VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } /* vm_page_rename() will handle dirty and cache. */ if (vm_page_rename(m, new_object, idx)) { VM_OBJECT_WUNLOCK(new_object); VM_OBJECT_WUNLOCK(orig_object); VM_WAIT; VM_OBJECT_WLOCK(orig_object); VM_OBJECT_WLOCK(new_object); goto retry; } #if VM_NRESERVLEVEL > 0 /* * If some of the reservation's allocated pages remain with * the original object, then transferring the reservation to * the new object is neither particularly beneficial nor * particularly harmful as compared to leaving the reservation * with the original object. If, however, all of the * reservation's allocated pages are transferred to the new * object, then transferring the reservation is typically * beneficial. Determining which of these two cases applies * would be more costly than unconditionally renaming the * reservation. */ vm_reserv_rename(m, new_object, orig_object, offidxstart); #endif if (orig_object->type == OBJT_SWAP) vm_page_xbusy(m); } if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_xunbusy(m); /* * Transfer any cached pages from orig_object to new_object. * If swap_pager_copy() found swapped out pages within the * specified range of orig_object, then it changed * new_object's type to OBJT_SWAP when it transferred those * pages to new_object. Otherwise, new_object's type * should still be OBJT_DEFAULT and orig_object should not * contain any cached pages within the specified range. 
*/ if (__predict_false(!vm_object_cache_is_empty(orig_object))) vm_page_cache_transfer(orig_object, offidxstart, new_object); } VM_OBJECT_WUNLOCK(orig_object); VM_OBJECT_WUNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_WLOCK(new_object); } #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static vm_page_t vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, int op) { vm_object_t backing_object; VM_OBJECT_ASSERT_WLOCKED(object); backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(backing_object); KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p)); KASSERT(p == NULL || p->object == object || p->object == backing_object, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); if (p != NULL) vm_page_lock(p); VM_OBJECT_WUNLOCK(object); VM_OBJECT_WUNLOCK(backing_object); if (p == NULL) VM_WAIT; else vm_page_busy_sleep(p, "vmocol"); VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); } static bool vm_object_scan_all_shadowed(vm_object_t object) { vm_object_t backing_object; vm_page_t p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; /* * Initial conditions: * * We do not want to have to test for the existence of cache or swap * pages in the backing object. XXX but with the new swapper this * would be pretty easy to do. */ if (backing_object->type != OBJT_DEFAULT) return (false); backing_offset_index = OFF_TO_IDX(object->backing_object_offset); for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = TAILQ_NEXT(p, listq)) { new_pindex = p->pindex - backing_offset_index; /* * Ignore pages outside the parent object's range and outside * the parent object's mapping of the backing object. */ if (p->pindex < backing_offset_index || new_pindex >= object->size) continue; /* * See if the parent has the page or if the parent's object * pager has the page. If the parent has the page but the page * is not valid, the parent's object pager must have the page. * * If this fails, the parent does not completely shadow the * object and we might as well give up now. 
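		 *
		 * Worked example (illustrative numbers): with a
		 * backing_object_offset of 4 pages, backing-object index 9
		 * maps to parent index 9 - 4 = 5; the parent shadows that
		 * page iff it has a valid resident page or pager-backed
		 * data at index 5.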
*/ pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL)) return (false); } return (true); } static bool vm_object_collapse_scan(vm_object_t object, int op) { vm_object_t backing_object; vm_page_t next, p, pp; vm_pindex_t backing_offset_index, new_pindex; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(object->backing_object); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if ((op & OBSC_COLLAPSE_WAIT) != 0) vm_object_set_flag(backing_object, OBJ_DEAD); /* * Our scan */ for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) { next = TAILQ_NEXT(p, listq); new_pindex = p->pindex - backing_offset_index; /* * Check for busy page */ if (vm_page_busied(p)) { next = vm_object_collapse_scan_wait(object, p, next, op); continue; } KASSERT(p->object == backing_object, ("vm_object_collapse_scan: object mismatch")); if (p->pindex < backing_offset_index || new_pindex >= object->size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); /* * Page is out of the parent object's range, we can * simply destroy it. */ vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } pp = vm_page_lookup(object, new_pindex); if (pp != NULL && vm_page_busied(pp)) { /* * The page in the parent is busy and possibly not * (yet) valid. Until its state is finalized by the * busy bit owner, we can't tell whether it shadows the * original page. Therefore, we must either skip it * and the original (backing_object) page or wait for * its state to be finalized. * * This is due to a race with vm_fault() where we must * unbusy the original (backing_obj) page before we can * (re)lock the parent. Hence we can get here. */ next = vm_object_collapse_scan_wait(object, pp, next, op); continue; } KASSERT(pp == NULL || pp->valid != 0, ("unbusy invalid page %p", pp)); if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL)) { /* * The page already exists in the parent OR swap exists * for this location in the parent. Leave the parent's * page alone. Destroy the original page from the * backing object. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, p->pindex, 1); vm_page_lock(p); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock(p); continue; } /* * Page does not exist in parent, rename the page from the * backing object to the main object. * * If the page was mapped to a process, it can remain mapped * through the rename. vm_page_rename() will handle dirty and * cache. */ if (vm_page_rename(p, object, new_pindex)) { next = vm_object_collapse_scan_wait(object, NULL, next, op); continue; } /* Use the old pindex to free the right page. */ if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, new_pindex + backing_offset_index, 1); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(p, object, backing_object, backing_offset_index); #endif } return (true); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. 
*/ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_ASSERT_WLOCKED(object); VM_OBJECT_ASSERT_WLOCKED(backing_object); if (backing_object->ref_count != 1) return; vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); while (TRUE) { vm_object_t backing_object; /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_WLOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_WUNLOCK(backing_object); break; } if ( object->paging_in_progress != 0 || backing_object->paging_in_progress != 0 ) { vm_object_qcollapse(object); VM_OBJECT_WUNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_collapse_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT); #if VM_NRESERVLEVEL > 0 /* * Break any reservations from backing_object. */ if (__predict_false(!LIST_EMPTY(&backing_object->rvq))) vm_reserv_break_all(backing_object); #endif /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. * Since swap_pager_copy() is being asked to * destroy the source, it will change the * backing_object's type to OBJT_DEFAULT. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); /* * Free any cached pages from backing_object. */ if (__predict_false( !vm_object_cache_is_empty(backing_object))) vm_page_cache_free(backing_object, 0, 0); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; if (backing_object->backing_object) { VM_OBJECT_WLOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ VM_OBJECT_WUNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. 
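			 *
			 * Chain shape, as a sketch:
			 *
			 *	before:	object -> backing_object -> X
			 *	after:	object -> X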
*/ KASSERT(backing_object->ref_count == 1, ( "backing_object %p was somehow re-referenced during collapse!", backing_object)); backing_object->type = OBJT_DEAD; backing_object->ref_count = 0; VM_OBJECT_WUNLOCK(backing_object); vm_object_destroy(backing_object); object_collapses++; } else { vm_object_t new_backing_object; /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && !vm_object_scan_all_shadowed(object)) { VM_OBJECT_WUNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_WLOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; vm_object_reference_locked(new_backing_object); VM_OBJECT_WUNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_WUNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * For the given object, either frees or invalidates each of the * specified pages. In general, a page is freed. However, if a page is * wired for any reason other than the existence of a managed, wired * mapping, then it may be invalidated but not removed from the object. * Pages are specified by the given range ["start", "end") and the option * OBJPR_CLEANONLY. As a special case, if "end" is zero, then the range * extends from "start" to the end of the object. If the option * OBJPR_CLEANONLY is specified, then only the non-dirty pages within the * specified range are affected. If the option OBJPR_NOTMAPPED is * specified, then the pages within the specified range must have no * mappings. Otherwise, if this option is not specified, any mappings to * the specified pages are removed before the pages are freed or * invalidated. * * In general, this operation should only be performed on objects that * contain managed pages. There are, however, two exceptions. First, it * is performed on the kernel and kmem objects by vm_map_entry_delete(). * Second, it is used by msync(..., MS_INVALIDATE) to invalidate device- * backed pages. In both of these cases, the option OBJPR_CLEANONLY must * not be specified and the option OBJPR_NOTMAPPED must be specified. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED, ("vm_object_page_remove: illegal options for object %p", object)); if (object->resident_page_count == 0) goto skipmemq; vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. 
*/ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * If the page is wired for any reason besides the existence * of managed, wired mappings, then it cannot be freed. For * example, fictitious pages, which represent device memory, * are inherently wired and cannot be freed. They can, * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ vm_page_lock(p); if (vm_page_xbusied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopax"); VM_OBJECT_WLOCK(object); goto again; } if (p->wire_count != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); if ((options & OBJPR_CLEANONLY) == 0) { p->valid = 0; vm_page_undirty(p); } goto next; } if (vm_page_busied(p)) { VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmopar"); VM_OBJECT_WLOCK(object); goto again; } KASSERT((p->flags & PG_FICTITIOUS) == 0, ("vm_object_page_remove: page %p is fictitious", p)); if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) { if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_write(p); if (p->dirty) goto next; } if ((options & OBJPR_NOTMAPPED) == 0) pmap_remove_all(p); vm_page_free(p); next: vm_page_unlock(p); } vm_object_pip_wakeup(object); skipmemq: if (__predict_false(!vm_object_cache_is_empty(object))) vm_page_cache_free(object, start, end); } /* * vm_object_page_noreuse: * * For the given object, attempt to move the specified pages to * the head of the inactive queue. This bypasses regular LRU * operation and allows the pages to be reused quickly under memory * pressure. If a page is wired for any reason, then it will not * be queued. Pages are specified by the range ["start", "end"). * As a special case, if "end" is zero, then the range extends from * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. * * The object must be locked. */ void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx, *new_mtx; vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); /* * Here, the variable "p" is either (1) the page with the least pindex * greater than or equal to the parameter "start" or (2) NULL. */ mtx = NULL; for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); /* * Avoid releasing and reacquiring the same page lock. */ new_mtx = vm_page_lockptr(p); if (mtx != new_mtx) { if (mtx != NULL) mtx_unlock(mtx); mtx = new_mtx; mtx_lock(mtx); } vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); } /* * Populate the specified range of the object with valid pages. Returns * TRUE if the range is successfully populated and FALSE otherwise. * * Note: This function should be optimized to pass a larger array of * pages to vm_pager_get_pages() before it is applied to a non- * OBJT_DEVICE object. * * The object must be locked. 
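 *
 * Usage sketch (illustrative; error handling elided):
 *
 *	VM_OBJECT_WLOCK(obj);
 *	if (!vm_object_populate(obj, 0, obj->size))
 *		... fail the request ...
 *	VM_OBJECT_WUNLOCK(obj);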
 */
boolean_t
vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
{
	vm_page_t m;
	vm_pindex_t pindex;
	int rv;

	VM_OBJECT_ASSERT_WLOCKED(object);
	for (pindex = start; pindex < end; pindex++) {
		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
		if (m->valid != VM_PAGE_BITS_ALL) {
			rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
			if (rv != VM_PAGER_OK) {
				vm_page_lock(m);
				vm_page_free(m);
				vm_page_unlock(m);
				break;
			}
		}
		/*
		 * Keep "m" busy because a subsequent iteration may unlock
		 * the object.
		 */
	}
	if (pindex > start) {
		m = vm_page_lookup(object, start);
		while (m != NULL && m->pindex < pindex) {
			vm_page_xunbusy(m);
			m = TAILQ_NEXT(m, listq);
		}
	}
	return (pindex == end);
}

/*
 *	Routine:	vm_object_coalesce
 *	Function:	Coalesces two objects backing up adjoining
 *			regions of memory into a single object.
 *
 *	returns TRUE if objects were combined.
 *
 *	NOTE:	Only works at the moment if the second object is NULL -
 *		if it's not, which object do we lock first?
 *
 *	Parameters:
 *		prev_object	First object to coalesce
 *		prev_offset	Offset into prev_object
 *		prev_size	Size of reference to prev_object
 *		next_size	Size of reference to the second object
 *		reserved	Indicator that extension region has
 *				swap accounted for
 *
 *	Conditions:
 *	The object must *not* be locked.
 */
boolean_t
vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
    vm_size_t prev_size, vm_size_t next_size, boolean_t reserved)
{
	vm_pindex_t next_pindex;

	if (prev_object == NULL)
		return (TRUE);
	VM_OBJECT_WLOCK(prev_object);
	if ((prev_object->type != OBJT_DEFAULT &&
	    prev_object->type != OBJT_SWAP) ||
	    (prev_object->flags & OBJ_TMPFS_NODE) != 0) {
		VM_OBJECT_WUNLOCK(prev_object);
		return (FALSE);
	}

	/*
	 * Try to collapse the object first
	 */
	vm_object_collapse(prev_object);

	/*
	 * Can't coalesce if: . more than one reference . paged out . shadows
	 * another object . has a copy elsewhere (any of which mean that the
	 * pages not mapped to prev_entry may be in use anyway)
	 */
	if (prev_object->backing_object != NULL) {
		VM_OBJECT_WUNLOCK(prev_object);
		return (FALSE);
	}

	prev_size >>= PAGE_SHIFT;
	next_size >>= PAGE_SHIFT;
	next_pindex = OFF_TO_IDX(prev_offset) + prev_size;

	if ((prev_object->ref_count > 1) &&
	    (prev_object->size != next_pindex)) {
		VM_OBJECT_WUNLOCK(prev_object);
		return (FALSE);
	}

	/*
	 * Account for the charge.
	 */
	if (prev_object->cred != NULL) {

		/*
		 * If prev_object was charged, then this mapping,
		 * although not charged now, may become writable
		 * later.  A non-NULL cred in the object would prevent
		 * swap reservation during enabling of the write
		 * access, so reserve swap now.  A failed reservation
		 * causes allocation of a separate object for the map
		 * entry, and swap reservation for that entry is
		 * managed at the appropriate time.
		 */
		if (!reserved && !swap_reserve_by_cred(ptoa(next_size),
		    prev_object->cred)) {
			/* Unlock on failure, as for the other error returns. */
			VM_OBJECT_WUNLOCK(prev_object);
			return (FALSE);
		}
		prev_object->charge += ptoa(next_size);
	}

	/*
	 * Remove any pages that may still be in the object from a previous
	 * deallocation.
	 */
	if (next_pindex < prev_object->size) {
		vm_object_page_remove(prev_object, next_pindex, next_pindex +
		    next_size, 0);
		if (prev_object->type == OBJT_SWAP)
			swap_pager_freespace(prev_object,
					     next_pindex, next_size);
#if 0
		if (prev_object->cred != NULL) {
			KASSERT(prev_object->charge >=
			    ptoa(prev_object->size - next_pindex),
			    ("object %p overcharged 1 %jx %jx", prev_object,
				(uintmax_t)next_pindex, (uintmax_t)next_size));
			prev_object->charge -= ptoa(prev_object->size -
			    next_pindex);
		}
#endif
	}

	/*
	 * Extend the object if necessary.
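	 * E.g. (illustrative numbers): an 8-page object coalescing 4 new
	 * pages at next_pindex 8 grows to 12 pages; a range that already
	 * lies inside the object leaves the size unchanged.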
*/ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_WUNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { VM_OBJECT_ASSERT_WLOCKED(object); if (object->type != OBJT_VNODE) { if ((object->flags & OBJ_TMPFS_NODE) != 0) { KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs")); vm_object_set_flag(object, OBJ_TMPFS_DIRTY); } return; } object->generation++; if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); } /* * vm_object_unwire: * * For each page offset within the specified range of the given object, * find the highest-level page in the shadow chain and unwire it. A page * must exist at every page offset, and the highest-level page must be * wired. */ void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue) { vm_object_t tobject; vm_page_t m, tm; vm_pindex_t end_pindex, pindex, tpindex; int depth, locked_depth; KASSERT((offset & PAGE_MASK) == 0, ("vm_object_unwire: offset is not page aligned")); KASSERT((length & PAGE_MASK) == 0, ("vm_object_unwire: length is not a multiple of PAGE_SIZE")); /* The wired count of a fictitious page never changes. */ if ((object->flags & OBJ_FICTITIOUS) != 0) return; pindex = OFF_TO_IDX(offset); end_pindex = pindex + atop(length); locked_depth = 1; VM_OBJECT_RLOCK(object); m = vm_page_find_least(object, pindex); while (pindex < end_pindex) { if (m == NULL || pindex < m->pindex) { /* * The first object in the shadow chain doesn't * contain a page at the current index. Therefore, * the page must exist in a backing object. */ tobject = object; tpindex = pindex; depth = 0; do { tpindex += OFF_TO_IDX(tobject->backing_object_offset); tobject = tobject->backing_object; KASSERT(tobject != NULL, ("vm_object_unwire: missing page")); if ((tobject->flags & OBJ_FICTITIOUS) != 0) goto next_page; depth++; if (depth == locked_depth) { locked_depth++; VM_OBJECT_RLOCK(tobject); } } while ((tm = vm_page_lookup(tobject, tpindex)) == NULL); } else { tm = m; m = TAILQ_NEXT(m, listq); } vm_page_lock(tm); vm_page_unwire(tm, queue); vm_page_unlock(tm); next_page: pindex++; } /* Release the accumulated object locks. */ for (depth = 0; depth < locked_depth; depth++) { tobject = object->backing_object; VM_OBJECT_RUNLOCK(object); object = tobject; } } struct vnode * vm_object_vnode(vm_object_t object) { VM_OBJECT_ASSERT_LOCKED(object); if (object->type == OBJT_VNODE) return (object->handle); if (object->type == OBJT_SWAP && (object->flags & OBJ_TMPFS) != 0) return (object->un_pager.swp.swp_tmpfs); return (NULL); } static int sysctl_vm_object_list(SYSCTL_HANDLER_ARGS) { struct kinfo_vmobject kvo; char *fullpath, *freepath; struct vnode *vp; struct vattr va; vm_object_t obj; vm_page_t m; int count, error; if (req->oldptr == NULL) { /* * If an old buffer has not been provided, generate an * estimate of the space needed for a subsequent call. */ mtx_lock(&vm_object_list_mtx); count = 0; TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; count++; } mtx_unlock(&vm_object_list_mtx); return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) * count * 11 / 10)); } error = 0; /* * VM objects are type stable and are never removed from the * list once added. This allows us to safely read obj->object_list * after reacquiring the VM object lock. 
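	 * (The sizing pass above pads its estimate by 10%,
	 * count * 11 / 10, to absorb objects created between the
	 * sizing call and this output pass.)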
*/ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(obj, &vm_object_list, object_list) { if (obj->type == OBJT_DEAD) continue; VM_OBJECT_RLOCK(obj); if (obj->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(obj); continue; } mtx_unlock(&vm_object_list_mtx); kvo.kvo_size = ptoa(obj->size); kvo.kvo_resident = obj->resident_page_count; kvo.kvo_ref_count = obj->ref_count; kvo.kvo_shadow_count = obj->shadow_count; kvo.kvo_memattr = obj->memattr; kvo.kvo_active = 0; kvo.kvo_inactive = 0; TAILQ_FOREACH(m, &obj->memq, listq) { /* * A page may belong to the object but be * dequeued and set to PQ_NONE while the * object lock is not held. This makes the * reads of m->queue below racy, and we do not * count pages set to PQ_NONE. However, this * sysctl is only meant to give an * approximation of the system anyway. */ if (m->queue == PQ_ACTIVE) kvo.kvo_active++; else if (m->queue == PQ_INACTIVE) kvo.kvo_inactive++; } kvo.kvo_vn_fileid = 0; kvo.kvo_vn_fsid = 0; freepath = NULL; fullpath = ""; vp = NULL; switch (obj->type) { case OBJT_DEFAULT: kvo.kvo_type = KVME_TYPE_DEFAULT; break; case OBJT_VNODE: kvo.kvo_type = KVME_TYPE_VNODE; vp = obj->handle; vref(vp); break; case OBJT_SWAP: kvo.kvo_type = KVME_TYPE_SWAP; break; case OBJT_DEVICE: kvo.kvo_type = KVME_TYPE_DEVICE; break; case OBJT_PHYS: kvo.kvo_type = KVME_TYPE_PHYS; break; case OBJT_DEAD: kvo.kvo_type = KVME_TYPE_DEAD; break; case OBJT_SG: kvo.kvo_type = KVME_TYPE_SG; break; case OBJT_MGTDEVICE: kvo.kvo_type = KVME_TYPE_MGTDEVICE; break; default: kvo.kvo_type = KVME_TYPE_UNKNOWN; break; } VM_OBJECT_RUNLOCK(obj); if (vp != NULL) { vn_fullpath(curthread, vp, &fullpath, &freepath); vn_lock(vp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) { kvo.kvo_vn_fileid = va.va_fileid; kvo.kvo_vn_fsid = va.va_fsid; } vput(vp); } strlcpy(kvo.kvo_path, fullpath, sizeof(kvo.kvo_path)); if (freepath != NULL) free(freepath, M_TEMP); /* Pack record size down */ kvo.kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) + strlen(kvo.kvo_path) + 1; kvo.kvo_structsize = roundup(kvo.kvo_structsize, sizeof(uint64_t)); error = SYSCTL_OUT(req, &kvo, kvo.kvo_structsize); mtx_lock(&vm_object_list_mtx); if (error) break; } mtx_unlock(&vm_object_list_mtx); return (error); } SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject", "List of VM objects"); #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* 
sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags, object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. 
*/ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; vm_pindex_t fidx; vm_paddr_t pa; vm_page_t m, prev_m; int rcount, nl, c; nl = 0; TAILQ_FOREACH(object, &vm_object_list, object_list) { db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; pa = -1; TAILQ_FOREACH(m, &object->memq, listq) { if (m->pindex > 128) break; if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL && prev_m->pindex + 1 != m->pindex) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = m->pindex; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vm_object.h =================================================================== --- head/sys/vm/vm_object.h (revision 296161) +++ head/sys/vm/vm_object.h (revision 296162) @@ -1,329 +1,334 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.h 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory object module definitions. */ #ifndef _VM_OBJECT_ #define _VM_OBJECT_ #include #include #include #include #include /* * Types defined: * * vm_object_t Virtual memory object. * * The root of cached pages pool is protected by both the per-object lock * and the free pages queue mutex. * On insert in the cache radix trie, the per-object lock is expected * to be already held and the free pages queue mutex will be * acquired during the operation too. * On remove and lookup from the cache radix trie, only the free * pages queue mutex is expected to be locked. * These rules allow for reliably checking for the presence of cached * pages with only the per-object lock held, thereby reducing contention * for the free pages queue mutex. * * List of locks * (c) const until freed * (o) per-object lock * (f) free pages queue mutex * */ struct vm_object { struct rwlock lock; TAILQ_ENTRY(vm_object) object_list; /* list of all objects */ LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */ LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */ TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */ struct vm_radix rtree; /* root of the resident page radix trie*/ vm_pindex_t size; /* Object size */ int generation; /* generation ID */ int ref_count; /* How many refs?? */ int shadow_count; /* how many objects that this is a shadow for */ vm_memattr_t memattr; /* default memory attribute for pages */ objtype_t type; /* type of pager */ u_short flags; /* see below */ u_short pg_color; /* (c) color of first page in obj */ u_int paging_in_progress; /* Paging (in or out) so don't collapse or destroy */ int resident_page_count; /* number of resident pages */ struct vm_object *backing_object; /* object that I'm a shadow of */ vm_ooffset_t backing_object_offset;/* Offset in backing object */ TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */ LIST_HEAD(, vm_reserv) rvq; /* list of reservations */ struct vm_radix cache; /* (o + f) root of the cache page radix trie */ void *handle; union { /* * VNode pager * * vnp_size - current size of file */ struct { off_t vnp_size; vm_ooffset_t writemappings; } vnp; /* * Device pager * * devp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) devp_pglist; struct cdev_pager_ops *ops; struct cdev *dev; } devp; /* * SG pager * * sgp_pglist - list of allocated pages */ struct { TAILQ_HEAD(, vm_page) sgp_pglist; } sgp; /* * Swap pager * * swp_tmpfs - back-pointer to the tmpfs vnode, * if any, which uses the vm object * as backing store. 
The handle * cannot be reused for linking, * because the vnode can be * reclaimed and recreated, making * the handle changed and hash-chain * invalid. * * swp_bcount - number of swap 'swblock' metablocks, each * contains up to 16 swapblk assignments. * see vm/swap_pager.h */ struct { void *swp_tmpfs; int swp_bcount; } swp; } un_pager; struct ucred *cred; vm_ooffset_t charge; + void *umtx_data; }; /* * Flags */ #define OBJ_FICTITIOUS 0x0001 /* (c) contains fictitious pages */ #define OBJ_UNMANAGED 0x0002 /* (c) contains unmanaged pages */ #define OBJ_ACTIVE 0x0004 /* active objects */ #define OBJ_DEAD 0x0008 /* dead objects (during rundown) */ #define OBJ_NOSPLIT 0x0010 /* dont split this object */ +#define OBJ_UMTXDEAD 0x0020 /* umtx pshared was terminated */ #define OBJ_PIPWNT 0x0040 /* paging in progress wanted */ #define OBJ_MIGHTBEDIRTY 0x0100 /* object might be dirty, only for vnode */ #define OBJ_TMPFS_NODE 0x0200 /* object belongs to tmpfs VREG node */ #define OBJ_TMPFS_DIRTY 0x0400 /* dirty tmpfs obj */ #define OBJ_COLORED 0x1000 /* pg_color is defined */ #define OBJ_ONEMAPPING 0x2000 /* One USE (a single, non-forked) mapping flag */ #define OBJ_DISCONNECTWNT 0x4000 /* disconnect from vnode wanted */ #define OBJ_TMPFS 0x8000 /* has tmpfs vnode allocated */ #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT)) #ifdef _KERNEL #define OBJPC_SYNC 0x1 /* sync I/O */ #define OBJPC_INVAL 0x2 /* invalidate */ #define OBJPC_NOSYNC 0x4 /* skip if VPO_NOSYNC */ /* * The following options are supported by vm_object_page_remove(). */ #define OBJPR_CLEANONLY 0x1 /* Don't remove dirty pages. */ #define OBJPR_NOTMAPPED 0x2 /* Don't unmap pages. */ TAILQ_HEAD(object_q, vm_object); extern struct object_q vm_object_list; /* list of allocated objects */ extern struct mtx vm_object_list_mtx; /* lock for object list and count */ extern struct vm_object kernel_object_store; extern struct vm_object kmem_object_store; #define kernel_object (&kernel_object_store) #define kmem_object (&kmem_object_store) #define VM_OBJECT_ASSERT_LOCKED(object) \ rw_assert(&(object)->lock, RA_LOCKED) #define VM_OBJECT_ASSERT_RLOCKED(object) \ rw_assert(&(object)->lock, RA_RLOCKED) #define VM_OBJECT_ASSERT_WLOCKED(object) \ rw_assert(&(object)->lock, RA_WLOCKED) #define VM_OBJECT_ASSERT_UNLOCKED(object) \ rw_assert(&(object)->lock, RA_UNLOCKED) #define VM_OBJECT_LOCK_DOWNGRADE(object) \ rw_downgrade(&(object)->lock) #define VM_OBJECT_RLOCK(object) \ rw_rlock(&(object)->lock) #define VM_OBJECT_RUNLOCK(object) \ rw_runlock(&(object)->lock) #define VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo) \ rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo)) #define VM_OBJECT_TRYRLOCK(object) \ rw_try_rlock(&(object)->lock) #define VM_OBJECT_TRYWLOCK(object) \ rw_try_wlock(&(object)->lock) #define VM_OBJECT_TRYUPGRADE(object) \ rw_try_upgrade(&(object)->lock) #define VM_OBJECT_WLOCK(object) \ rw_wlock(&(object)->lock) #define VM_OBJECT_WOWNED(object) \ rw_wowned(&(object)->lock) #define VM_OBJECT_WUNLOCK(object) \ rw_wunlock(&(object)->lock) /* * The object must be locked or thread private. 
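The two additions in this hunk, the umtx_data pointer and the OBJ_UMTXDEAD flag, are the VM-side hooks for the new process-shared umtx support; their consumers are the umtx_shm_object_init()/umtx_shm_object_terminated() entry points declared a little further down. The implementation lives in kern_umtx.c, which is not part of this diff, so the following is only a sketch of the apparent locking protocol against the header above; the registry teardown step is a placeholder assumption:

        /*
         * Sketch only: a consumer of umtx_data/OBJ_UMTXDEAD.  The real
         * function in kern_umtx.c may differ; the cleanup is hypothetical.
         */
        void
        umtx_shm_object_terminated(vm_object_t object)
        {

                VM_OBJECT_ASSERT_WLOCKED(object);
                if ((object->flags & OBJ_UMTXDEAD) != 0)
                        return;         /* pshared state already torn down */
                vm_object_set_flag(object, OBJ_UMTXDEAD);
                if (object->umtx_data != NULL) {
                        /* ... wake waiters, drop registrations ... */
                        object->umtx_data = NULL;
                }
        }

The flag is what makes the teardown idempotent: the object write lock serializes the test-and-set, so the pshared state is released at most once no matter which path reaches object termination first.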
*/ static __inline void vm_object_set_flag(vm_object_t object, u_short bits) { object->flags |= bits; } /* * Conditionally set the object's color, which (1) enables the allocation * of physical memory reservations for anonymous objects and larger-than- * superpage-sized named objects and (2) determines the first page offset * within the object at which a reservation may be allocated. In other * words, the color determines the alignment of the object with respect * to the largest superpage boundary. When mapping named objects, like * files or POSIX shared memory objects, the color should be set to zero * before a virtual address is selected for the mapping. In contrast, * for anonymous objects, the color may be set after the virtual address * is selected. * * The object must be locked. */ static __inline void vm_object_color(vm_object_t object, u_short color) { if ((object->flags & OBJ_COLORED) == 0) { object->pg_color = color; object->flags |= OBJ_COLORED; } } void vm_object_clear_flag(vm_object_t object, u_short bits); void vm_object_pip_add(vm_object_t object, short i); void vm_object_pip_subtract(vm_object_t object, short i); void vm_object_pip_wakeup(vm_object_t object); void vm_object_pip_wakeupn(vm_object_t object, short i); void vm_object_pip_wait(vm_object_t object, char *waitid); static __inline boolean_t vm_object_cache_is_empty(vm_object_t object) { return (vm_radix_is_empty(&object->cache)); } + +void umtx_shm_object_init(vm_object_t object); +void umtx_shm_object_terminated(vm_object_t object); vm_object_t vm_object_allocate (objtype_t, vm_pindex_t); boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t, boolean_t); void vm_object_collapse (vm_object_t); void vm_object_deallocate (vm_object_t); void vm_object_destroy (vm_object_t); void vm_object_terminate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int); boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end, int flags); void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options); boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t); void vm_object_print(long addr, boolean_t have_addr, long count, char *modif); void vm_object_reference (vm_object_t); void vm_object_reference_locked(vm_object_t); int vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr); void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t); void vm_object_split(vm_map_entry_t); boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t, boolean_t); void vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length, uint8_t queue); struct vnode *vm_object_vnode(vm_object_t object); #endif /* _KERNEL */ #endif /* _VM_OBJECT_ */ Index: head/usr.bin/limits/limits.c =================================================================== --- head/usr.bin/limits/limits.c (revision 296161) +++ head/usr.bin/limits/limits.c (revision 296162) @@ -1,773 +1,778 @@ /*- * Copyright (c) 1997 by * David L. Nugent * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, is permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. This work was done expressly for inclusion into FreeBSD. Other use * is permitted provided this notation is included. * 4. Absolutely no warranty of function or purpose is made by the authors. * 5. Modifications may be freely made to this file providing the above * conditions are met. * * Display/change(+runprogram)/eval resource limits. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include enum { SH_NONE, SH_SH, /* sh */ SH_CSH, /* csh */ SH_BASH, /* gnu bash */ SH_TCSH, /* tcsh */ SH_KSH, /* (pd)ksh */ SH_ZSH, /* zsh */ SH_RC, /* rc or es */ SH_NUMBER }; /* eval emitter for popular shells. * Why aren't there any standards here? Most shells support either * the csh 'limit' or sh 'ulimit' command, but each varies just * enough that they aren't very compatible from one to the other. */ static struct { const char * name; /* Name of shell */ const char * inf; /* Name used for 'unlimited' resource */ const char * cmd; /* Intro text */ const char * hard; /* Hard limit text */ const char * soft; /* Soft limit text */ const char * both; /* Hard+Soft limit text */ struct { const char * pfx; const char * sfx; int divisor; } lprm[RLIM_NLIMITS]; } shellparm[] = { { "", "infinity", "Resource limits%s%s:\n", "-max", "-cur", "", { { " cputime%-4s %8s", " secs\n", 1 }, { " filesize%-4s %8s", " kB\n", 1024 }, { " datasize%-4s %8s", " kB\n", 1024 }, { " stacksize%-4s %8s", " kB\n", 1024 }, { " coredumpsize%-4s %8s", " kB\n", 1024 }, { " memoryuse%-4s %8s", " kB\n", 1024 }, { " memorylocked%-4s %8s", " kB\n", 1024 }, { " maxprocesses%-4s %8s", "\n", 1 }, { " openfiles%-4s %8s", "\n", 1 }, { " sbsize%-4s %8s", " bytes\n", 1 }, { " vmemoryuse%-4s %8s", " kB\n", 1024 }, { " pseudo-terminals%-4s %8s", "\n", 1 }, { " swapuse%-4s %8s", " kB\n", 1024 }, { " kqueues%-4s %8s", "\n", 1 }, + { " umtxp%-4s %8s", "\n", 1 }, } }, { "sh", "unlimited", "", " -H", " -S", "", { { "ulimit%s -t %s", ";\n", 1 }, { "ulimit%s -f %s", ";\n", 512 }, { "ulimit%s -d %s", ";\n", 1024 }, { "ulimit%s -s %s", ";\n", 1024 }, { "ulimit%s -c %s", ";\n", 512 }, { "ulimit%s -m %s", ";\n", 1024 }, { "ulimit%s -l %s", ";\n", 1024 }, { "ulimit%s -u %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, { "ulimit%s -v %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -w %s", ";\n", 1024 }, { "ulimit%s -k %s", ";\n", 1 }, + { "ulimit%s -o %s", ";\n", 1 }, } }, { "csh", "unlimited", "", " -h", "", NULL, { { "limit%s cputime %s", ";\n", 1 }, { "limit%s filesize %s", ";\n", 1024 }, { "limit%s datasize %s", ";\n", 1024 }, { "limit%s stacksize %s", ";\n", 1024 }, { "limit%s coredumpsize %s", ";\n", 1024 }, { "limit%s memoryuse %s", ";\n", 1024 }, { "limit%s memorylocked %s", ";\n", 1024 }, { "limit%s maxproc %s", ";\n", 1 }, { "limit%s openfiles %s", ";\n", 1 }, { "limit%s sbsize %s", ";\n", 1 }, { "limit%s vmemoryuse %s", ";\n", 1024 }, { "limit%s pseudoterminals %s", ";\n", 1 }, { "limit%s swapsize %s", ";\n", 1024 }, { "limit%s kqueues %s", ";\n", 1 }, + { "limit%s 
umtxp %s", ";\n", 1 }, } }, { "bash|bash2", "unlimited", "", " -H", " -S", "", { { "ulimit%s -t %s", ";\n", 1 }, { "ulimit%s -f %s", ";\n", 1024 }, { "ulimit%s -d %s", ";\n", 1024 }, { "ulimit%s -s %s", ";\n", 1024 }, { "ulimit%s -c %s", ";\n", 1024 }, { "ulimit%s -m %s", ";\n", 1024 }, { "ulimit%s -l %s", ";\n", 1024 }, { "ulimit%s -u %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, { "ulimit%s -v %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -w %s", ";\n", 1024 } } }, { "tcsh", "unlimited", "", " -h", "", NULL, { { "limit%s cputime %s", ";\n", 1 }, { "limit%s filesize %s", ";\n", 1024 }, { "limit%s datasize %s", ";\n", 1024 }, { "limit%s stacksize %s", ";\n", 1024 }, { "limit%s coredumpsize %s", ";\n", 1024 }, { "limit%s memoryuse %s", ";\n", 1024 }, { "limit%s memorylocked %s", ";\n", 1024 }, { "limit%s maxproc %s", ";\n", 1 }, { "limit%s descriptors %s", ";\n", 1 }, { "limit%s sbsize %s", ";\n", 1 }, { "limit%s vmemoryuse %s", ";\n", 1024 }, { "limit%s pseudoterminals %s", ";\n", 1 }, { "limit%s swapsize %s", ";\n", 1024 }, { "limit%s kqueues %s", ";\n", 1 }, + { "limit%s umtxp %s", ";\n", 1 }, } }, { "ksh|pdksh", "unlimited", "", " -H", " -S", "", { { "ulimit%s -t %s", ";\n", 1 }, { "ulimit%s -f %s", ";\n", 512 }, { "ulimit%s -d %s", ";\n", 1024 }, { "ulimit%s -s %s", ";\n", 1024 }, { "ulimit%s -c %s", ";\n", 512 }, { "ulimit%s -m %s", ";\n", 1024 }, { "ulimit%s -l %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, { "ulimit%s -v %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -w %s", ";\n", 1024 } } }, { "zsh", "unlimited", "", " -H", " -S", "", { { "ulimit%s -t %s", ";\n", 1 }, { "ulimit%s -f %s", ";\n", 512 }, { "ulimit%s -d %s", ";\n", 1024 }, { "ulimit%s -s %s", ";\n", 1024 }, { "ulimit%s -c %s", ";\n", 512 }, { "ulimit%s -m %s", ";\n", 1024 }, { "ulimit%s -l %s", ";\n", 1024 }, { "ulimit%s -u %s", ";\n", 1 }, { "ulimit%s -n %s", ";\n", 1 }, { "ulimit%s -b %s", ";\n", 1 }, { "ulimit%s -v %s", ";\n", 1024 }, { "ulimit%s -p %s", ";\n", 1 }, { "ulimit%s -w %s", ";\n", 1024 } } }, { "rc|es", "unlimited", "", " -h", "", NULL, { { "limit%s cputime %s", ";\n", 1 }, { "limit%s filesize %s", ";\n", 1024 }, { "limit%s datasize %s", ";\n", 1024 }, { "limit%s stacksize %s", ";\n", 1024 }, { "limit%s coredumpsize %s", ";\n", 1024 }, { "limit%s memoryuse %s", ";\n", 1024 }, { "limit%s lockedmemory %s", ";\n", 1024 }, { "limit%s processes %s", ";\n", 1 }, { "limit%s descriptors %s", ";\n", 1 }, { "limit%s sbsize %s", ";\n", 1 }, { "limit%s vmemoryuse %s", ";\n", 1024 }, { "limit%s pseudoterminals %s", ";\n", 1 }, { "limit%s swapuse %s", ";\n", 1024 } } }, { NULL, NULL, NULL, NULL, NULL, NULL, { } } }; static struct { const char * cap; rlim_t (*func)(login_cap_t *, const char *, rlim_t, rlim_t); } resources[RLIM_NLIMITS] = { { "cputime", login_getcaptime }, { "filesize", login_getcapsize }, { "datasize", login_getcapsize }, { "stacksize", login_getcapsize }, { "coredumpsize", login_getcapsize }, { "memoryuse", login_getcapsize }, { "memorylocked", login_getcapsize }, { "maxproc", login_getcapnum }, { "openfiles", login_getcapnum }, { "sbsize", login_getcapsize }, { "vmemoryuse", login_getcapsize }, { "pseudoterminals",login_getcapnum }, { "swapuse", login_getcapsize }, { "kqueues", login_getcapnum }, + { "umtxp", login_getcapnum }, }; /* * One letter for each resource levels. 
* NOTE: There is a dependency on the corresponding * letter index being equal to the resource number. * If sys/resource.h defines are changed, this needs * to be modified accordingly! */ #define RCS_STRING "tfdscmlunbvpwk" static rlim_t resource_num(int which, int ch, const char *str); static void usage(void); static int getshelltype(void); static void print_limit(rlim_t limit, unsigned divisor, const char *inf, const char *pfx, const char *sfx, const char *which); static void getrlimit_proc(pid_t pid, int resource, struct rlimit *rlp); static void setrlimit_proc(pid_t pid, int resource, const struct rlimit *rlp); extern char **environ; static const char rcs_string[] = RCS_STRING; int main(int argc, char *argv[]) { char *p, *cls = NULL; char *cleanenv[1]; struct passwd * pwd = NULL; int rcswhich, shelltype; int i, num_limits = 0; int ch, doeval = 0, doall = 0; int rtrn, setproc; login_cap_t * lc = NULL; enum { ANY=0, SOFT=1, HARD=2, BOTH=3, DISPLAYONLY=4 } type = ANY; enum { RCSUNKNOWN=0, RCSSET=1, RCSSEL=2 } todo = RCSUNKNOWN; int which_limits[RLIM_NLIMITS]; rlim_t set_limits[RLIM_NLIMITS]; struct rlimit limits[RLIM_NLIMITS]; pid_t pid; /* init resource tables */ for (i = 0; i < RLIM_NLIMITS; i++) { which_limits[i] = 0; /* Don't set/display any */ set_limits[i] = RLIM_INFINITY; } pid = -1; optarg = NULL; while ((ch = getopt(argc, argv, - ":EeC:U:BSHP:ab:c:d:f:l:m:n:s:t:u:v:p:w:k:")) != -1) { + ":EeC:U:BSHP:ab:c:d:f:l:m:n:s:t:u:v:p:w:k:o:")) != -1) { switch(ch) { case 'a': doall = 1; break; case 'E': environ = cleanenv; cleanenv[0] = NULL; break; case 'e': doeval = 1; break; case 'C': cls = optarg; break; case 'U': if ((pwd = getpwnam(optarg)) == NULL) { if (!isdigit(*optarg) || (pwd = getpwuid(atoi(optarg))) == NULL) { warnx("invalid user `%s'", optarg); usage(); } } break; case 'H': type = HARD; break; case 'S': type = SOFT; break; case 'B': type = SOFT|HARD; break; case 'P': if (!isdigit(*optarg) || (pid = atoi(optarg)) < 0) { warnx("invalid pid `%s'", optarg); usage(); } break; default: case ':': /* Without arg */ if ((p = strchr(rcs_string, optopt)) != NULL) { int rcswhich1 = p - rcs_string; if (optarg && *optarg == '-') { /* 'arg' is actually a switch */ --optind; /* back one arg, and make arg NULL */ optarg = NULL; } todo = optarg == NULL ? RCSSEL : RCSSET; if (type == ANY) type = BOTH; which_limits[rcswhich1] = optarg ? 
type : DISPLAYONLY; set_limits[rcswhich1] = resource_num(rcswhich1, optopt, optarg); num_limits++; break; } /* FALLTHRU */ case '?': usage(); } optarg = NULL; } if (pid != -1) { if (cls != NULL) { warnx("-C cannot be used with -P option"); usage(); } if (pwd != NULL) { warnx("-U cannot be used with -P option"); usage(); } } /* Get current resource values */ setproc = 0; for (i = 0; i < RLIM_NLIMITS; i++) { if (pid == -1) { getrlimit(i, &limits[i]); } else if (doall || num_limits == 0) { getrlimit_proc(pid, i, &limits[i]); } else if (which_limits[i] != 0) { getrlimit_proc(pid, i, &limits[i]); setproc = 1; } } /* If user was specified, get class from that */ if (pwd != NULL) lc = login_getpwclass(pwd); else if (cls != NULL && *cls != '\0') { lc = login_getclassbyname(cls, NULL); if (lc == NULL || strcmp(cls, lc->lc_class) != 0) fprintf(stderr, "login class '%s' non-existent, using %s\n", cls, lc?lc->lc_class:"current settings"); } /* If we have a login class, update resource table from that */ if (lc != NULL) { for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { char str[40]; rlim_t val; /* current value overridden by resourcename or resourcename-cur */ sprintf(str, "%s-cur", resources[rcswhich].cap); val = resources[rcswhich].func(lc, resources[rcswhich].cap, limits[rcswhich].rlim_cur, limits[rcswhich].rlim_cur); limits[rcswhich].rlim_cur = resources[rcswhich].func(lc, str, val, val); /* maximum value overridden by resourcename or resourcename-max */ sprintf(str, "%s-max", resources[rcswhich].cap); val = resources[rcswhich].func(lc, resources[rcswhich].cap, limits[rcswhich].rlim_max, limits[rcswhich].rlim_max); limits[rcswhich].rlim_max = resources[rcswhich].func(lc, str, val, val); } } /* now, let's determine what we wish to do with all this */ argv += optind; /* If we're setting limits or doing an eval (ie. we're not just * displaying), then check that hard limits are not lower than * soft limits, and force rasing the hard limit if we need to if * we are raising the soft limit, or lower the soft limit if we * are lowering the hard limit. */ if ((*argv || doeval) && getuid() == 0) { for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (limits[rcswhich].rlim_max != RLIM_INFINITY) { if (limits[rcswhich].rlim_cur == RLIM_INFINITY) { limits[rcswhich].rlim_max = RLIM_INFINITY; which_limits[rcswhich] |= HARD; } else if (limits[rcswhich].rlim_cur > limits[rcswhich].rlim_max) { if (which_limits[rcswhich] == SOFT) { limits[rcswhich].rlim_max = limits[rcswhich].rlim_cur; which_limits[rcswhich] |= HARD; } else if (which_limits[rcswhich] == HARD) { limits[rcswhich].rlim_cur = limits[rcswhich].rlim_max; which_limits[rcswhich] |= SOFT; } else { /* else.. if we're specifically setting both to * silly values, then let it error out. 
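The reconciliation pass above keeps the invariant rlim_cur <= rlim_max when root specifies only half of a pair: raising just the soft limit drags the hard limit up, lowering just the hard limit drags the soft limit down, and setting both inconsistently is left for setrlimit() to reject. A standalone sketch of that rule, simplified to the over/under case (the enum mirrors the one in main(); the values are illustrative):

        #include <stdint.h>
        #include <stdio.h>
        #include <sys/resource.h>

        enum { SOFT = 1, HARD = 2 };

        /* Drag the unspecified half of the pair along, as main() does. */
        static void
        reconcile(struct rlimit *r, int *which)
        {
                if (r->rlim_cur > r->rlim_max) {
                        if (*which == SOFT) {           /* only -S: raise hard */
                                r->rlim_max = r->rlim_cur;
                                *which |= HARD;
                        } else if (*which == HARD) {    /* only -H: lower soft */
                                r->rlim_cur = r->rlim_max;
                                *which |= SOFT;
                        }
                }
        }

        int
        main(void)
        {
                struct rlimit r = { .rlim_cur = 2048, .rlim_max = 1024 };
                int which = SOFT;

                reconcile(&r, &which);
                printf("cur=%jd max=%jd\n", (intmax_t)r.rlim_cur,
                    (intmax_t)r.rlim_max);      /* cur=2048 max=2048 */
                return (0);
        }

Relatedly, for the new -o flag to be accepted at all, RCS_STRING presumably grows a trailing 'o' in this revision to match the "o:" added to the getopt() string above: the option lookup is just strchr(rcs_string, optopt), with the letter's index doubling as the resource number, so 'o' lands on the new umtxp slot. The -bcdflmnstuvpwk list in usage() would grow the same letter.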
*/ } } } } } /* See if we've overridden anything specific on the command line */ if (num_limits && todo == RCSSET) { for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (which_limits[rcswhich] & HARD) limits[rcswhich].rlim_max = set_limits[rcswhich]; if (which_limits[rcswhich] & SOFT) limits[rcswhich].rlim_cur = set_limits[rcswhich]; } } /* If *argv is not NULL, then we are being asked to * (perhaps) set environment variables and run a program */ if (*argv) { if (doeval) { warnx("-e cannot be used with `cmd' option"); usage(); } if (pid != -1) { warnx("-P cannot be used with `cmd' option"); usage(); } login_close(lc); /* set leading environment variables, like eval(1) */ while (*argv && (p = strchr(*argv, '='))) { *p = '\0'; rtrn = setenv(*argv++, p + 1, 1); *p = '='; if (rtrn == -1) err(EXIT_FAILURE, "setenv %s", *argv); } /* Set limits */ for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (doall || num_limits == 0 || which_limits[rcswhich] != 0) if (setrlimit(rcswhich, &limits[rcswhich]) == -1) err(1, "setrlimit %s", resources[rcswhich].cap); } if (*argv == NULL) usage(); execvp(*argv, argv); err(1, "%s", *argv); } if (setproc) { for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (which_limits[rcswhich] != 0) setrlimit_proc(pid, rcswhich, &limits[rcswhich]); } exit(EXIT_SUCCESS); } shelltype = doeval ? getshelltype() : SH_NONE; if (type == ANY) /* Default to soft limits */ type = SOFT; /* Display limits */ printf(shellparm[shelltype].cmd, lc ? " for class " : " (current)", lc ? lc->lc_class : ""); for (rcswhich = 0; rcswhich < RLIM_NLIMITS; rcswhich++) { if (doall || num_limits == 0 || which_limits[rcswhich] != 0) { if (which_limits[rcswhich] == ANY || which_limits[rcswhich]) which_limits[rcswhich] = type; if (shellparm[shelltype].lprm[rcswhich].pfx) { if (shellparm[shelltype].both && limits[rcswhich].rlim_cur == limits[rcswhich].rlim_max) { print_limit(limits[rcswhich].rlim_max, shellparm[shelltype].lprm[rcswhich].divisor, shellparm[shelltype].inf, shellparm[shelltype].lprm[rcswhich].pfx, shellparm[shelltype].lprm[rcswhich].sfx, shellparm[shelltype].both); } else { if (which_limits[rcswhich] & HARD) { print_limit(limits[rcswhich].rlim_max, shellparm[shelltype].lprm[rcswhich].divisor, shellparm[shelltype].inf, shellparm[shelltype].lprm[rcswhich].pfx, shellparm[shelltype].lprm[rcswhich].sfx, shellparm[shelltype].hard); } if (which_limits[rcswhich] & SOFT) { print_limit(limits[rcswhich].rlim_cur, shellparm[shelltype].lprm[rcswhich].divisor, shellparm[shelltype].inf, shellparm[shelltype].lprm[rcswhich].pfx, shellparm[shelltype].lprm[rcswhich].sfx, shellparm[shelltype].soft); } } } } } login_close(lc); exit(EXIT_SUCCESS); } static void usage(void) { (void)fprintf(stderr, "usage: limits [-C class|-P pid|-U user] [-eaSHBE] " "[-bcdflmnstuvpwk [val]] [[name=val ...] 
cmd]\n"); exit(EXIT_FAILURE); } static void print_limit(rlim_t limit, unsigned divisor, const char * inf, const char * pfx, const char * sfx, const char * which) { char numbr[64]; if (limit == RLIM_INFINITY) strcpy(numbr, inf); else sprintf(numbr, "%jd", (intmax_t)((limit + divisor/2) / divisor)); printf(pfx, which, numbr); printf(sfx, which); } static rlim_t resource_num(int which, int ch, const char *str) { rlim_t res = RLIM_INFINITY; if (str != NULL && !(strcasecmp(str, "inf") == 0 || strcasecmp(str, "infinity") == 0 || strcasecmp(str, "unlimit") == 0 || strcasecmp(str, "unlimited") == 0)) { const char * s = str; char *e; switch (which) { case RLIMIT_CPU: /* time values */ errno = 0; res = 0; while (*s) { rlim_t tim = strtoq(s, &e, 0); if (e == NULL || e == s || errno) break; switch (*e++) { case 0: /* end of string */ e--; default: case 's': case 'S': /* seconds */ break; case 'm': case 'M': /* minutes */ tim *= 60L; break; case 'h': case 'H': /* hours */ tim *= (60L * 60L); break; case 'd': case 'D': /* days */ tim *= (60L * 60L * 24L); break; case 'w': case 'W': /* weeks */ tim *= (60L * 60L * 24L * 7L); case 'y': case 'Y': /* Years */ tim *= (60L * 60L * 24L * 365L); } s = e; res += tim; } break; case RLIMIT_FSIZE: /* Size values */ case RLIMIT_DATA: case RLIMIT_STACK: case RLIMIT_CORE: case RLIMIT_RSS: case RLIMIT_MEMLOCK: case RLIMIT_SBSIZE: case RLIMIT_VMEM: case RLIMIT_SWAP: errno = 0; res = 0; while (*s) { rlim_t mult, tim = strtoq(s, &e, 0); if (e == NULL || e == s || errno) break; switch (*e++) { case 0: /* end of string */ e--; default: mult = 1; break; case 'b': case 'B': /* 512-byte blocks */ mult = 512; break; case 'k': case 'K': /* 1024-byte Kilobytes */ mult = 1024; break; case 'm': case 'M': /* 1024-k kbytes */ mult = 1024 * 1024; break; case 'g': case 'G': /* 1Gbyte */ mult = 1024 * 1024 * 1024; break; case 't': case 'T': /* 1TBte */ mult = 1024LL * 1024LL * 1024LL * 1024LL; break; } s = e; res += (tim * mult); } break; case RLIMIT_NPROC: case RLIMIT_NOFILE: case RLIMIT_NPTS: case RLIMIT_KQUEUES: res = strtoq(s, &e, 0); s = e; break; } if (*s) { warnx("invalid value -%c `%s'", ch, str); usage(); } } return res; } static int getshellbyname(const char * shell) { int i; const char * q; const char * p = strrchr(shell, '/'); p = p ? p+1 : shell; for (i = 0; (q = shellparm[i].name) != NULL; i++) { while (*q) { int j = strcspn(q, "|"); if (j == 0) break; if (strncmp(p, q, j) == 0) return i; if (*(q += j)) ++q; } } return SH_SH; } /* * Determine the type of shell our parent process is * This is quite tricky, not 100% reliable and probably * not nearly as thorough as it should be. Basically, this * is a "best guess" only, but hopefully will work in * most cases. */ static int getshelltype(void) { pid_t ppid = getppid(); if (ppid != 1) { struct kinfo_proc kp; struct stat st; char path[MAXPATHLEN]; char * shell = getenv("SHELL"); int mib[4]; size_t len; mib[0] = CTL_KERN; mib[1] = KERN_PROC; mib[3] = ppid; if (shell != NULL && stat(shell, &st) != -1) { struct stat st1; mib[2] = KERN_PROC_PATHNAME; len = sizeof(path); if (sysctl(mib, 4, path, &len, NULL, 0) != -1) { /* $SHELL is actual shell? 
*/ if (stat(path, &st1) != -1 && memcmp(&st, &st1, sizeof st) == 0) return getshellbyname(shell); } } mib[2] = KERN_PROC_PID; len = sizeof(kp); if (sysctl(mib, 4, &kp, &len, NULL, 0) != -1) return getshellbyname(kp.ki_comm); } return SH_SH; } static void getrlimit_proc(pid_t pid, int resource, struct rlimit *rlp) { int error; int name[5]; size_t len; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_RLIMIT; name[3] = pid; name[4] = resource; len = sizeof(*rlp); error = sysctl(name, 5, rlp, &len, NULL, 0); if (error == -1) err(EXIT_FAILURE, "sysctl: kern.proc.rlimit: %d", pid); if (len != sizeof(*rlp)) errx(EXIT_FAILURE, "sysctl() returns wrong size"); } static void setrlimit_proc(pid_t pid, int resource, const struct rlimit *rlp) { int error; int name[5]; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_RLIMIT; name[3] = pid; name[4] = resource; error = sysctl(name, 5, NULL, 0, rlp, sizeof(*rlp)); if (error == -1) err(EXIT_FAILURE, "sysctl: kern.proc.rlimit: %d", pid); } Index: head/usr.bin/procstat/procstat_rlimit.c =================================================================== --- head/usr.bin/procstat/procstat_rlimit.c (revision 296161) +++ head/usr.bin/procstat/procstat_rlimit.c (revision 296162) @@ -1,126 +1,127 @@ /*- * Copyright (c) 2011 Mikolaj Golub * Copyright (c) 2015 Allan Jude * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "procstat.h" static struct { const char *name; const char *suffix; -} rlimit_param[14] = { +} rlimit_param[15] = { {"cputime", "sec"}, {"filesize", "B "}, {"datasize", "B "}, {"stacksize", "B "}, {"coredumpsize", "B "}, {"memoryuse", "B "}, {"memorylocked", "B "}, {"maxprocesses", " "}, {"openfiles", " "}, {"sbsize", "B "}, {"vmemoryuse", "B "}, {"pseudo-terminals", " "}, {"swapuse", "B "}, {"kqueues", " "}, + {"umtxp", " "}, }; -#if RLIM_NLIMITS > 14 +#if RLIM_NLIMITS > 15 #error "Resource limits have grown. Add new entries to rlimit_param[]." 
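getrlimit_proc() and setrlimit_proc() above are thin wrappers around the kern.proc.rlimit sysctl, which is what lets limits -P (and procstat below) read and write another process's limits without attaching to it. A minimal userland sketch of the read path (the pid and resource are illustrative; error handling is trimmed to the essentials):

        #include <sys/param.h>
        #include <sys/types.h>
        #include <sys/sysctl.h>
        #include <sys/resource.h>
        #include <err.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(void)
        {
                struct rlimit rl;
                int name[5];
                size_t len;

                name[0] = CTL_KERN;
                name[1] = KERN_PROC;
                name[2] = KERN_PROC_RLIMIT;
                name[3] = getpid();             /* any pid you may inspect */
                name[4] = RLIMIT_NOFILE;
                len = sizeof(rl);
                if (sysctl(name, 5, &rl, &len, NULL, 0) == -1)
                        err(1, "kern.proc.rlimit");
                printf("openfiles: cur=%jd max=%jd\n",
                    (intmax_t)rl.rlim_cur, (intmax_t)rl.rlim_max);
                return (0);
        }

Writing goes through the same MIB with the new limit passed as the sysctl newp argument, exactly as setrlimit_proc() does.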
#endif static const char * humanize_rlimit(int indx, rlim_t limit) { static char buf[14]; int scale; if (limit == RLIM_INFINITY) return ("infinity "); scale = humanize_number(buf, sizeof(buf) - 1, (int64_t)limit, rlimit_param[indx].suffix, HN_AUTOSCALE | HN_GETSCALE, HN_DECIMAL); (void)humanize_number(buf, sizeof(buf) - 1, (int64_t)limit, rlimit_param[indx].suffix, HN_AUTOSCALE, HN_DECIMAL); /* Pad with one space if there is no suffix prefix. */ if (scale == 0) sprintf(buf + strlen(buf), " "); return (buf); } void procstat_rlimit(struct procstat *prstat, struct kinfo_proc *kipp) { struct rlimit rlimit; int i; if (!hflag) { xo_emit("{T:/%5s %-16s %-16s %16s %16s}\n", "PID", "COMM", "RLIMIT", "SOFT ", "HARD "); } xo_emit("{ek:process_id/%5d}{e:command/%-16s/%s}", kipp->ki_pid, kipp->ki_comm); for (i = 0; i < RLIM_NLIMITS; i++) { if (procstat_getrlimit(prstat, kipp, i, &rlimit) == -1) return; xo_emit("{dk:process_id/%5d} {d:command/%-16s} " "{d:rlimit_param/%-16s} ", kipp->ki_pid, kipp->ki_comm, rlimit_param[i].name); xo_open_container(rlimit_param[i].name); if (rlimit.rlim_cur == RLIM_INFINITY) xo_emit("{e:soft_limit/infinity}"); else xo_emit("{e:soft_limit/%U}", rlimit.rlim_cur); if (rlimit.rlim_max == RLIM_INFINITY) xo_emit("{e:hard_limit/infinity}"); else xo_emit("{e:hard_limit/%U}", rlimit.rlim_max); xo_close_container(rlimit_param[i].name); xo_emit("{d:rlim_cur/%16s} ", humanize_rlimit(i, rlimit.rlim_cur)); xo_emit("{d:rlim_max/%16s}\n", humanize_rlimit(i, rlimit.rlim_max)); } }
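humanize_rlimit() above calls humanize_number(3) twice: a first call with HN_GETSCALE only reports which SI prefix would be chosen, and the second call actually formats, after which a single space is appended when no prefix was used so the columns stay aligned. A small standalone illustration of that double-call pattern (buffer size and value are arbitrary; link with -lutil on FreeBSD):

        #include <sys/types.h>
        #include <libutil.h>
        #include <stdio.h>
        #include <string.h>

        int
        main(void)
        {
                char buf[14];
                int scale;
                int64_t limit = 8589934592;     /* 8 GiB */

                scale = humanize_number(buf, sizeof(buf) - 1, limit, "B ",
                    HN_AUTOSCALE | HN_GETSCALE, HN_DECIMAL);
                (void)humanize_number(buf, sizeof(buf) - 1, limit, "B ",
                    HN_AUTOSCALE, HN_DECIMAL);
                if (scale == 0)                 /* no prefix: pad for alignment */
                        strlcat(buf, " ", sizeof(buf));
                printf("[%s] scale=%d\n", buf, scale);
                return (0);
        }

With HN_DECIMAL and an 8 GiB input this prints something like "8.0GB " with a nonzero scale; a small value such as 512 keeps scale 0 and picks up the padding space instead.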