Index: head/include/pthread.h
===================================================================
--- head/include/pthread.h	(revision 300042)
+++ head/include/pthread.h	(revision 300043)
@@ -1,339 +1,349 @@
 /*
  * Copyright (c) 1993, 1994 by Chris Provenzano, proven@mit.edu
  * Copyright (c) 1995-1998 by John Birrell <jb@cimlogic.com.au>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *  This product includes software developed by Chris Provenzano.
  * 4. The name of Chris Provenzano may not be used to endorse or promote 
  *	  products derived from this software without specific prior written
  *	  permission.
  *
  * THIS SOFTWARE IS PROVIDED BY CHRIS PROVENZANO ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL CHRIS PROVENZANO BE LIABLE FOR ANY 
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef _PTHREAD_H_
 #define	_PTHREAD_H_
 
 /*
  * Header files.
  */
 #include <sys/cdefs.h>
 #include <sys/_pthreadtypes.h>
 #include <machine/_limits.h>
 #include <machine/_types.h>
 #include <sys/_sigset.h>
 #include <sched.h>
 #include <time.h>
 
 /*
  * Run-time invariant values:
  */
 #define	PTHREAD_DESTRUCTOR_ITERATIONS		4
 #define	PTHREAD_KEYS_MAX			256
 #define	PTHREAD_STACK_MIN			__MINSIGSTKSZ
 #define	PTHREAD_THREADS_MAX			__ULONG_MAX
 #define	PTHREAD_BARRIER_SERIAL_THREAD		-1
 
 /*
  * Flags for threads and thread attributes.
  */
 #define	PTHREAD_DETACHED		0x1
 #define	PTHREAD_SCOPE_SYSTEM		0x2
 #define	PTHREAD_INHERIT_SCHED		0x4
 #define	PTHREAD_NOFLOAT			0x8
 
 #define	PTHREAD_CREATE_DETACHED		PTHREAD_DETACHED
 #define	PTHREAD_CREATE_JOINABLE		0
 #define	PTHREAD_SCOPE_PROCESS		0
 #define	PTHREAD_EXPLICIT_SCHED		0
 
 /*
  * Values for process shared/private attributes.
  */
 #define	PTHREAD_PROCESS_PRIVATE		0
 #define	PTHREAD_PROCESS_SHARED		1
 
 /*
  * Flags for cancelling threads
  */
 #define	PTHREAD_CANCEL_ENABLE		0
 #define	PTHREAD_CANCEL_DISABLE		1
 #define	PTHREAD_CANCEL_DEFERRED		0
 #define	PTHREAD_CANCEL_ASYNCHRONOUS	2
 #define	PTHREAD_CANCELED		((void *) 1)
 
 /*
  * Flags for once initialization.
  */
 #define	PTHREAD_NEEDS_INIT	0
 #define	PTHREAD_DONE_INIT	1
 
 /*
  * Static once initialization values.
  */
 #define	PTHREAD_ONCE_INIT	{ PTHREAD_NEEDS_INIT, NULL }
 
 /*
  * Static initialization values.
  */
 #define	PTHREAD_MUTEX_INITIALIZER	NULL
 #define	PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP	((pthread_mutex_t)1)
 #define	PTHREAD_COND_INITIALIZER	NULL
 #define	PTHREAD_RWLOCK_INITIALIZER	NULL
 
 /*
  * Default attribute arguments (draft 4, deprecated).
  */
 #ifndef PTHREAD_KERNEL
 #define	pthread_condattr_default	NULL
 #define	pthread_mutexattr_default	NULL
 #define	pthread_attr_default		NULL
 #endif
 
 #define	PTHREAD_PRIO_NONE	0
 #define	PTHREAD_PRIO_INHERIT	1
 #define	PTHREAD_PRIO_PROTECT	2
 
 /*
  * Mutex types (Single UNIX Specification, Version 2, 1997).
  *
  * Note that a mutex attribute with one of the following types:
  *
  *	PTHREAD_MUTEX_NORMAL
  *	PTHREAD_MUTEX_RECURSIVE
  *
  * will deviate from POSIX specified semantics.
  */
 enum pthread_mutextype {
 	PTHREAD_MUTEX_ERRORCHECK	= 1,	/* Default POSIX mutex */
 	PTHREAD_MUTEX_RECURSIVE		= 2,	/* Recursive mutex */
 	PTHREAD_MUTEX_NORMAL		= 3,	/* No error checking */
 	PTHREAD_MUTEX_ADAPTIVE_NP	= 4,	/* Adaptive mutex, spins briefly before blocking on lock */
 	PTHREAD_MUTEX_TYPE_MAX
 };
 
 #define	PTHREAD_MUTEX_DEFAULT		PTHREAD_MUTEX_ERRORCHECK
 
+#define	PTHREAD_MUTEX_STALLED		0	/* robustness attr value: not robust (per POSIX; confirm) */
+#define	PTHREAD_MUTEX_ROBUST		1	/* robustness attr value: robust (per POSIX; confirm) */
+
 struct _pthread_cleanup_info {
 	__uintptr_t	pthread_cleanup_pad[8];
 };
 
 /*
  * Thread function prototype definitions:
  */
 __BEGIN_DECLS
 int		pthread_atfork(void (*)(void), void (*)(void), void (*)(void));
 int		pthread_attr_destroy(pthread_attr_t *) __nonnull(1);
 int		pthread_attr_getstack(const pthread_attr_t * __restrict, 
 			void ** __restrict, size_t * __restrict)
 			__nonnull_all;
 int		pthread_attr_getstacksize(const pthread_attr_t *, size_t *)
 			__nonnull_all;
 int		pthread_attr_getguardsize(const pthread_attr_t *, size_t *);
 int		pthread_attr_getstackaddr(const pthread_attr_t *, void **);
 int		pthread_attr_getdetachstate(const pthread_attr_t *, int *)
 			__nonnull_all;
 int		pthread_attr_init(pthread_attr_t *) __nonnull(1);
 int		pthread_attr_setstacksize(pthread_attr_t *, size_t)
 			__nonnull(1);
 int		pthread_attr_setguardsize(pthread_attr_t *, size_t)
 			__nonnull(1);
 int		pthread_attr_setstack(pthread_attr_t *, void *, size_t)
 			__nonnull(1);
 int		pthread_attr_setstackaddr(pthread_attr_t *, void *);
 int		pthread_attr_setdetachstate(pthread_attr_t *, int) __nonnull(1);
 int		pthread_barrier_destroy(pthread_barrier_t *);
 int		pthread_barrier_init(pthread_barrier_t *,
 			const pthread_barrierattr_t *, unsigned);
 int		pthread_barrier_wait(pthread_barrier_t *);
 int		pthread_barrierattr_destroy(pthread_barrierattr_t *);
 int		pthread_barrierattr_getpshared(const pthread_barrierattr_t *,
 			int *);
 int		pthread_barrierattr_init(pthread_barrierattr_t *) __nonnull(1);
 int		pthread_barrierattr_setpshared(pthread_barrierattr_t *, int);
 
 #define		pthread_cleanup_push(cleanup_routine, cleanup_arg)		\
 		{								\
 			struct _pthread_cleanup_info __cleanup_info__;		\
 			__pthread_cleanup_push_imp(cleanup_routine, cleanup_arg,\
 				&__cleanup_info__);				\
 			{
 
 #define		pthread_cleanup_pop(execute)					\
 				(void)0;					\
 			}							\
 			__pthread_cleanup_pop_imp(execute);			\
 		}
 
 int		pthread_condattr_destroy(pthread_condattr_t *) __nonnull(1);
 int		pthread_condattr_getclock(const pthread_condattr_t *,
 			clockid_t *) __nonnull_all;
 int		pthread_condattr_getpshared(const pthread_condattr_t *, int *)
 			__nonnull_all;
 int		pthread_condattr_init(pthread_condattr_t *) __nonnull(1);
 int		pthread_condattr_setclock(pthread_condattr_t *, clockid_t)
 			__nonnull(1);
 int		pthread_condattr_setpshared(pthread_condattr_t *, int)
 			__nonnull(1);
 int		pthread_cond_broadcast(pthread_cond_t *)
 			__nonnull(1);
 int		pthread_cond_destroy(pthread_cond_t *)
 			__nonnull(1);
 int		pthread_cond_init(pthread_cond_t *,
 			const pthread_condattr_t *) __nonnull(1);
 int		pthread_cond_signal(pthread_cond_t *) __nonnull(1);
 int		pthread_cond_timedwait(pthread_cond_t *,
 			pthread_mutex_t *__mutex, const struct timespec *)
 			__nonnull_all __requires_exclusive(*__mutex);
 int		pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *__mutex)
 			__nonnull_all __requires_exclusive(*__mutex);
 int		pthread_create(pthread_t *, const pthread_attr_t *,
 			void *(*) (void *), void *) __nonnull(1) __nonnull(3);
 int		pthread_detach(pthread_t);
 int		pthread_equal(pthread_t, pthread_t);
 void		pthread_exit(void *) __dead2;
 void		*pthread_getspecific(pthread_key_t);
 int		pthread_getcpuclockid(pthread_t, clockid_t *) __nonnull(2);
 int		pthread_join(pthread_t, void **);
 int		pthread_key_create(pthread_key_t *,
 			void (*) (void *)) __nonnull(1);
 int		pthread_key_delete(pthread_key_t);
 int		pthread_mutexattr_init(pthread_mutexattr_t *) __nonnull(1);
 int		pthread_mutexattr_destroy(pthread_mutexattr_t *) __nonnull(1);
 int		pthread_mutexattr_getpshared(const pthread_mutexattr_t *,
 			int *) __nonnull_all;
 int		pthread_mutexattr_gettype(pthread_mutexattr_t *, int *)
 			__nonnull_all;
 int		pthread_mutexattr_settype(pthread_mutexattr_t *, int)
 			__nonnull(1);
 int		pthread_mutexattr_setpshared(pthread_mutexattr_t *, int)
 			__nonnull(1);
+int		pthread_mutex_consistent(pthread_mutex_t *__mutex)
+			__nonnull(1) __requires_exclusive(*__mutex);
 int		pthread_mutex_destroy(pthread_mutex_t *__mutex)
 			__nonnull(1) __requires_unlocked(*__mutex);
 int		pthread_mutex_init(pthread_mutex_t *__mutex,
 			const pthread_mutexattr_t *)
 			__nonnull(1) __requires_unlocked(*__mutex);
 int		pthread_mutex_lock(pthread_mutex_t *__mutex)
 			__nonnull(1) __locks_exclusive(*__mutex);
 int		pthread_mutex_trylock(pthread_mutex_t *__mutex)
 			__nonnull(1) __trylocks_exclusive(0, *__mutex);
 int		pthread_mutex_timedlock(pthread_mutex_t *__mutex,
 			const struct timespec *)
 			__nonnull_all __trylocks_exclusive(0, *__mutex);
 int		pthread_mutex_unlock(pthread_mutex_t *__mutex)
 			__nonnull(1) __unlocks(*__mutex);
 int		pthread_once(pthread_once_t *, void (*) (void)) __nonnull_all;
 int		pthread_rwlock_destroy(pthread_rwlock_t *__rwlock)
 			__nonnull(1) __requires_unlocked(*__rwlock);
 int		pthread_rwlock_init(pthread_rwlock_t *__rwlock,
 			const pthread_rwlockattr_t *)
 			__nonnull(1) __requires_unlocked(*__rwlock);
 int		pthread_rwlock_rdlock(pthread_rwlock_t *__rwlock)
 			__nonnull(1) __locks_shared(*__rwlock);
 int		pthread_rwlock_timedrdlock(pthread_rwlock_t *__rwlock,
 			const struct timespec *)
 			__nonnull_all __trylocks_shared(0, *__rwlock);
 int		pthread_rwlock_timedwrlock(pthread_rwlock_t *__rwlock,
 			const struct timespec *)
 			__nonnull_all __trylocks_exclusive(0, *__rwlock);
 int		pthread_rwlock_tryrdlock(pthread_rwlock_t *__rwlock)
 			__nonnull(1) __trylocks_shared(0, *__rwlock);
 int		pthread_rwlock_trywrlock(pthread_rwlock_t *__rwlock)
 			__nonnull(1) __trylocks_exclusive(0, *__rwlock);
 int		pthread_rwlock_unlock(pthread_rwlock_t *__rwlock)
 			__nonnull(1) __unlocks(*__rwlock);
 int		pthread_rwlock_wrlock(pthread_rwlock_t *__rwlock)
 			__nonnull(1) __locks_exclusive(*__rwlock);
 int		pthread_rwlockattr_destroy(pthread_rwlockattr_t *)
 			__nonnull(1);
 int		pthread_rwlockattr_getkind_np(const pthread_rwlockattr_t *,
 			int *);
 int		pthread_rwlockattr_getpshared(const pthread_rwlockattr_t *,
 			int *) __nonnull_all;
 int		pthread_rwlockattr_init(pthread_rwlockattr_t *)
 			__nonnull(1);
 int		pthread_rwlockattr_setkind_np(pthread_rwlockattr_t *, int);
 int		pthread_rwlockattr_setpshared(pthread_rwlockattr_t *, int)
 			__nonnull(1);
 pthread_t	pthread_self(void);
 int		pthread_setspecific(pthread_key_t, const void *);
 
 int		pthread_spin_init(pthread_spinlock_t *__spin, int)
 			__requires_unlocked(*__spin);
 int		pthread_spin_destroy(pthread_spinlock_t *__spin)
 			__requires_unlocked(*__spin);
 int		pthread_spin_lock(pthread_spinlock_t *__spin)
 			__locks_exclusive(*__spin);
 int		pthread_spin_trylock(pthread_spinlock_t *__spin)
 			__trylocks_exclusive(0, *__spin);
 int		pthread_spin_unlock(pthread_spinlock_t *__spin)
 			__unlocks(*__spin);
 int		pthread_cancel(pthread_t);
 int		pthread_setcancelstate(int, int *);
 int		pthread_setcanceltype(int, int *);
 void		pthread_testcancel(void);
 
 #if __BSD_VISIBLE
 int		pthread_getprio(pthread_t);
 int		pthread_setprio(pthread_t, int);
 void		pthread_yield(void);
 #endif
 
 int		pthread_mutexattr_getprioceiling(pthread_mutexattr_t *,
 			int *);
 int		pthread_mutexattr_setprioceiling(pthread_mutexattr_t *,
 			int);
 int		pthread_mutex_getprioceiling(pthread_mutex_t *, int *);
 int		pthread_mutex_setprioceiling(pthread_mutex_t *, int, int *);
 
 int		pthread_mutexattr_getprotocol(pthread_mutexattr_t *, int *);
 int		pthread_mutexattr_setprotocol(pthread_mutexattr_t *, int);
+
+int		pthread_mutexattr_getrobust(pthread_mutexattr_t *__restrict,
+			int *__restrict) __nonnull_all;
+int		pthread_mutexattr_setrobust(pthread_mutexattr_t *, int)
+			__nonnull(1);
 
 int		pthread_attr_getinheritsched(const pthread_attr_t *, int *);
 int		pthread_attr_getschedparam(const pthread_attr_t *,
 			struct sched_param *) __nonnull_all;
 int		pthread_attr_getschedpolicy(const pthread_attr_t *, int *)
 			__nonnull_all;
 int		pthread_attr_getscope(const pthread_attr_t *, int *)
 			__nonnull_all;
 int		pthread_attr_setinheritsched(pthread_attr_t *, int);
 int		pthread_attr_setschedparam(pthread_attr_t *,
 			const struct sched_param *) __nonnull(1) __nonnull(2);
 int		pthread_attr_setschedpolicy(pthread_attr_t *, int) __nonnull(1);
 int		pthread_attr_setscope(pthread_attr_t *, int) __nonnull(1);
 int		pthread_getschedparam(pthread_t pthread, int *,
 			struct sched_param *) __nonnull(2) __nonnull(3);
 int		pthread_setschedparam(pthread_t, int,
 			const struct sched_param *) __nonnull(3);
 #if __XSI_VISIBLE
 int		pthread_getconcurrency(void);
 int		pthread_setconcurrency(int);
 #endif
 
 void		__pthread_cleanup_push_imp(void (*)(void *), void *,
 			struct _pthread_cleanup_info *);
 void		__pthread_cleanup_pop_imp(int);
 __END_DECLS
 
 #endif
Index: head/lib/libc/gen/Symbol.map
===================================================================
--- head/lib/libc/gen/Symbol.map	(revision 300042)
+++ head/lib/libc/gen/Symbol.map	(revision 300043)
@@ -1,543 +1,546 @@
 /*
  * $FreeBSD$
  */
 
 FBSD_1.0 {
 	__xuname;
 	pthread_atfork;
 	pthread_attr_destroy;
 	pthread_attr_getdetachstate;
 	pthread_attr_getguardsize;
 	pthread_attr_getinheritsched;
 	pthread_attr_getschedparam;
 	pthread_attr_getschedpolicy;
 	pthread_attr_getscope;
 	pthread_attr_getstackaddr;
 	pthread_attr_getstacksize;
 	pthread_attr_init;
 	pthread_attr_setdetachstate;
 	pthread_attr_setguardsize;
 	pthread_attr_setinheritsched;
 	pthread_attr_setschedparam;
 	pthread_attr_setschedpolicy;
 	pthread_attr_setscope;
 	pthread_attr_setstackaddr;
 	pthread_attr_setstacksize;
 	pthread_cancel;
 	pthread_cleanup_pop;
 	pthread_cleanup_push;
 	pthread_cond_broadcast;
 	pthread_cond_destroy;
 	pthread_cond_init;
 	pthread_cond_signal;
 	pthread_cond_timedwait;
 	pthread_cond_wait;
 	pthread_detach;
 	pthread_equal;
 	pthread_exit;
 	pthread_getspecific;
 	pthread_join;
 	pthread_key_create;
 	pthread_key_delete;
 	pthread_kill;
 	pthread_main_np;
 	pthread_mutex_destroy;
 	pthread_mutex_init;
 	pthread_mutex_lock;
 	pthread_mutex_trylock;
 	pthread_mutex_unlock;
 	pthread_mutexattr_destroy;
 	pthread_mutexattr_init;
 	pthread_mutexattr_settype;
 	pthread_once;
 	pthread_rwlock_destroy;
 	pthread_rwlock_init;
 	pthread_rwlock_rdlock;
 	pthread_rwlock_tryrdlock;
 	pthread_rwlock_trywrlock;
 	pthread_rwlock_unlock;
 	pthread_rwlock_wrlock;
 	pthread_self;
 	pthread_setcancelstate;
 	pthread_setcanceltype;
 	pthread_setspecific;
 	pthread_sigmask;
 	pthread_testcancel;
 	alarm;
 	arc4random;
 	arc4random_addrandom;
 	arc4random_stir;
 	__assert;
 	basename;
 	check_utility_compat;
 	clock;
 	closedir;
 	confstr;
 	encrypt;
 	des_setkey;
 	des_cipher;
 	setkey;
 	ctermid;
 	ctermid_r;
 	daemon;
 	devname;
 	devname_r;
 	dirname;
 	getdiskbyname;
 	dladdr;
 	dlclose;
 	dlerror;
 	dlfunc;
 	dllockinit;
 	dlopen;
 	dlsym;
 	dlvsym;
 	dlinfo;
 	dl_iterate_phdr;
 	drand48;
 	erand48;
 	err_set_file;
 	err_set_exit;
 	err;
 	verr;
 	errc;
 	verrc;
 	errx;
 	verrx;
 	warn;
 	vwarn;
 	warnc;
 	vwarnc;
 	warnx;
 	vwarnx;
 	sys_errlist;
 	sys_nerr;
 	errno;
 	execl;
 	execle;
 	execlp;
 	execv;
 	execvp;
 	execvP;
 	fmtcheck;
 	fmtmsg;
 	fnmatch;
 	__fpclassifyf;
 	__fpclassifyd;
 	__fpclassifyl;
 	frexp;
 	setfstab;
 	getfstab;
 	getfsent;
 	getfsspec;
 	getfsfile;
 	setfsent;
 	endfsent;
 	ftok;
 	ftw;
 	glob;
 	globfree;
 	getbootfile;
 	getbsize;
 	cgetset;
 	cgetcap;
 	cgetent;
 	cgetmatch;
 	cgetfirst;
 	cgetclose;
 	cgetnext;
 	cgetstr;
 	cgetustr;
 	cgetnum;
 	getcwd;
 	getdomainname;
 	setgrent;
 	setgroupent;
 	endgrent;
 	getgrent_r;
 	getgrnam_r;
 	getgrgid_r;
 	getgrnam;
 	getgrgid;
 	getgrent;
 	/*
 	 * Why are __gr_parse_entry() and __gr_match_entry() not static in
 	 * gen/getgrent.c?
 	 */
 	getgrouplist;
 	gethostname;
 	getloadavg;
 	getlogin;
 	getlogin_r;
 	getmntinfo;
 	setnetgrent;
 	getnetgrent;
 	endnetgrent;
 	innetgr;
 	getosreldate;
 	getpagesize;
 	getpeereid;
 	_getprogname;
 	getprogname;
 	setpwent;
 	setpassent;
 	endpwent;
 	getpwent_r;
 	getpwnam_r;
 	getpwuid_r;
 	getpwnam;
 	getpwuid;
 	getpwent;
 	getttynam;
 	getttyent;
 	setttyent;
 	endttyent;
 	isdialuptty;
 	isnettty;
 	getusershell;
 	endusershell;
 	setusershell;
 	getvfsbyname;
 	__isnan;
 	isnan;
 	__isnanf;
 	isnanf;
 	__isinf;
 	isinf;
 	__isinff;
 	__isinfl;
 	isatty;
 	initgroups;
 	jrand48;
 	lcong48;
 	ldexp;
 	lockf;
 	lrand48;
 	modf;
 	mrand48;
 	nftw;
 	nice;
 	nlist;
 	nrand48;
 	opendir;
 	pause;
 	posix_madvise;
 	popen;
 	pclose;
 	psignal;
 	raise;
 	readdir;
 	readdir_r;
 	readpassphrase;
 	getpass;
 	rewinddir;
 	scandir;
 	alphasort;
 	seed48;
 	seekdir;
 	user_from_uid;
 	group_from_gid;
 	setdomainname;
 	sethostname;
 	longjmperror;
 	getmode;
 	setmode;
 	setproctitle;
 	setprogname;
 	siginterrupt;
 	sys_signame;
 	sys_siglist;
 	sys_nsig;
 	signal;
 	sigaddset;
 	sigdelset;
 	sigemptyset;
 	sigfillset;
 	sigismember;
 	sleep;
 	srand48;
 	fstatvfs;
 	statvfs;
 	sl_init;
 	sl_add;
 	sl_free;
 	sl_find;
 	fflagstostr;
 	strtofflags;
 	sysconf;
 	sysctl;
 	sysctlbyname;
 	sysctlnametomib;
 	syslog;
 	vsyslog;
 	openlog;
 	closelog;
 	setlogmask;
 	ttyname_r;
 	ttyname;
 	timezone;
 	times;
 	time;
 	telldir;
 	tcgetattr;
 	tcsetattr;
 	tcsetpgrp;
 	tcgetpgrp;
 	cfgetospeed;
 	cfgetispeed;
 	cfsetospeed;
 	cfsetispeed;
 	cfsetspeed;
 	cfmakeraw;
 	tcsendbreak;
 	_init_tls;
 	__tls_get_addr;
 	tcdrain;
 	tcflush;
 	tcflow;
 	ualarm;
 	ulimit;
 	uname;
 	strunvis;
 	strunvisx;
 	usleep;
 	utime;
 	valloc;
 	vis;
 	strvis;
 	strvisx;
 	wait;
 	wait3;
 	waitpid;
 	wordexp;
 	wordfree;
 };
 
 FBSD_1.1 {
 	arc4random_buf;
 	arc4random_uniform;
 	fdevname;
 	fdevname_r;
 	fdopendir;
 	feature_present;
 	fts_children;
 	fts_close;
 	fts_get_clientptr;
 	fts_get_stream;
 	fts_open;
 	fts_read;
 	fts_set;
 	fts_set_clientptr;
 	posix_spawn;
 	posix_spawn_file_actions_addclose;
 	posix_spawn_file_actions_adddup2;
 	posix_spawn_file_actions_addopen;
 	posix_spawn_file_actions_destroy;
 	posix_spawn_file_actions_init;
 	posix_spawnattr_destroy;
 	posix_spawnattr_getflags;
 	posix_spawnattr_getpgroup;
 	posix_spawnattr_getschedparam;
 	posix_spawnattr_getschedpolicy;
 	posix_spawnattr_getsigdefault;
 	posix_spawnattr_getsigmask;
 	posix_spawnattr_init;
 	posix_spawnattr_setflags;
 	posix_spawnattr_setpgroup;
 	posix_spawnattr_setschedparam;
 	posix_spawnattr_setschedpolicy;
 	posix_spawnattr_setsigdefault;
 	posix_spawnattr_setsigmask;
 	posix_spawnp;
 	semctl;
 	tcgetsid;
 	tcsetsid;
 	__pthread_cleanup_pop_imp;
 	__pthread_cleanup_push_imp;
 };
 
 FBSD_1.2 {
 	basename_r;
 	cfmakesane;
 	endutxent;
 	getpagesizes;
 	getutxent;
 	getutxid;
 	getutxline;
 	getutxuser;
 	pututxline;
 	sem_close;
 	sem_destroy;
 	sem_getvalue;
 	sem_init;
 	sem_open;
 	sem_post;
 	sem_timedwait;
 	sem_trywait;
 	sem_unlink;
 	sem_wait;
 	setutxdb;
 	setutxent;
 };
 
 FBSD_1.3 {
 	clock_getcpuclockid;
 	dirfd;
 	dup3;
 	fdclosedir;
 	fdlopen;
 	__FreeBSD_libc_enter_restricted_mode;
 	getcontextx;
 	gid_from_group;
 	nvis;
 	pwcache_userdb;
 	pwcache_groupdb;
 	snvis;
 	strenvisx;
 	strnunvis;
 	strnunvisx;
 	strnvis;
 	strnvisx;
 	strsenvisx;
 	strsnvis;
 	strsnvisx;
 	strsvis;
 	strsvisx;
 	svis;
 	uid_from_user;
 	unvis;
 	waitid;
 };
 
 FBSD_1.4 {
+	pthread_mutex_consistent;
+	pthread_mutexattr_getrobust;
+	pthread_mutexattr_setrobust;
 	scandir_b;
 };
 
 FBSDprivate_1.0 {
 	/* needed by thread libraries */
 	__thr_jtable;
 
 	_pthread_atfork;
 	_pthread_attr_destroy;
 	_pthread_attr_getdetachstate;
 	_pthread_attr_getguardsize;
 	_pthread_attr_getinheritsched;
 	_pthread_attr_getschedparam;
 	_pthread_attr_getschedpolicy;
 	_pthread_attr_getscope;
 	_pthread_attr_getstackaddr;
 	_pthread_attr_getstacksize;
 	_pthread_attr_init;
 	_pthread_attr_setdetachstate;
 	_pthread_attr_setguardsize;
 	_pthread_attr_setinheritsched;
 	_pthread_attr_setschedparam;
 	_pthread_attr_setschedpolicy;
 	_pthread_attr_setscope;
 	_pthread_attr_setstackaddr;
 	_pthread_attr_setstacksize;
 	_pthread_cancel;
 	_pthread_cancel_enter;
 	_pthread_cancel_leave;
 	_pthread_cleanup_pop;
 	_pthread_cleanup_push;
 	_pthread_cond_broadcast;
 	_pthread_cond_destroy;
 	_pthread_cond_init;
 	_pthread_cond_signal;
 	_pthread_cond_timedwait;
 	_pthread_cond_wait;
 	_pthread_detach;
 	_pthread_equal;
 	_pthread_exit;
 	_pthread_getspecific;
 	_pthread_join;
 	_pthread_key_create;
 	_pthread_key_delete;
 	_pthread_kill;
 	_pthread_main_np;
 	_pthread_mutex_destroy;
 	_pthread_mutex_init_calloc_cb;
 	_pthread_mutex_init;
 	_pthread_mutex_lock;
 	_pthread_mutex_trylock;
 	_pthread_mutex_unlock;
 	_pthread_mutexattr_destroy;
 	_pthread_mutexattr_init;
 	_pthread_mutexattr_settype;
 	_pthread_once;
 	_pthread_rwlock_destroy;
 	_pthread_rwlock_init;
 	_pthread_rwlock_rdlock;
 	_pthread_rwlock_tryrdlock;
 	_pthread_rwlock_trywrlock;
 	_pthread_rwlock_unlock;
 	_pthread_rwlock_wrlock;
 	_pthread_self;
 	_pthread_setcancelstate;
 	_pthread_setcanceltype;
 	_pthread_setspecific;
 	_pthread_sigmask;
 	_pthread_testcancel;
 	_spinlock;
 	_spinlock_debug;
 	_spinunlock;
 	_rtld_addr_phdr;
 	_rtld_atfork_pre;
 	_rtld_atfork_post;
 	_rtld_error;		/* for private use */
 	_rtld_get_stack_prot;
 	_rtld_is_dlopened;
 	_rtld_thread_init;	/* for private use */
 	__elf_phdr_match_addr;
 	_err;
 	_warn;
 	__fmtcheck;
 	/* __pw_match_entry; */
 	/* __pw_parse_entry; */
 	__fdnlist;	/* used by libkvm */
 	/* __aout_fdnlist; */
 	/* __elf_is_okay__; */
 	/* __elf_fdnlist; */
 	__opendir2;
 	__pause;
 	_pause;
 	__pw_scan;	/* Used by (at least) libutil */
 	__raise;
 	_raise;
 	__sleep;
 	_sleep;
 	_rtld_allocate_tls;
 	_rtld_free_tls;
 #if defined(i386)
 	___libc_tls_get_addr;	/* x86 only */
 #endif
 	__libc_tls_get_addr;
 	__tcdrain;
 	_tcdrain;
 	__usleep;
 	_usleep;
 	__wait;
 	_wait;
 	__waitpid;
 	_waitpid;
 
 	_libc_sem_init_compat;
 	_libc_sem_destroy_compat;
 	_libc_sem_open_compat;
 	_libc_sem_close_compat;
 	_libc_sem_unlink_compat;
 	_libc_sem_wait_compat;
 	_libc_sem_trywait_compat;
 	_libc_sem_timedwait_compat;
 	_libc_sem_post_compat;
 	_libc_sem_getvalue_compat;
 
 	__libc_tcdrain;
 
 	__elf_aux_vector;
 	__pthread_map_stacks_exec;
 	__fillcontextx;
 	__fillcontextx2;
 	__getcontextx_size;
 };
Index: head/lib/libc/gen/_pthread_stubs.c
===================================================================
--- head/lib/libc/gen/_pthread_stubs.c	(revision 300042)
+++ head/lib/libc/gen/_pthread_stubs.c	(revision 300043)
@@ -1,319 +1,327 @@
 /*
  * Copyright (c) 2001 Daniel Eischen <deischen@FreeBSD.org>.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY DANIEL EISCHEN AND CONTRIBUTORS ``AS IS''
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <signal.h>
 #include <pthread.h>
 #include <stdlib.h>
 #include <errno.h>
 
 #include "libc_private.h"
 
 /*
  * Weak symbols: All libc internal usage of these functions should
  * use the weak symbol versions (_pthread_XXX).  If libpthread is
  * linked, it will override these functions with (non-weak) routines.
  * The _pthread_XXX functions are provided solely for internal libc
  * usage to avoid unwanted cancellation points and to differentiate
  * between application locks and libc locks (threads holding the
  * latter can't be allowed to exit/terminate).
  */
 
 /* Define a null pthread structure just to satisfy _pthread_self. */
 struct pthread {
 };
 
 static struct pthread	main_thread;
 
 static int		stub_main(void);
 static void 		*stub_null(void);
 static struct pthread	*stub_self(void);
 static int		stub_zero(void);
 static int		stub_fail(void);
 static int		stub_true(void);
 static void		stub_exit(void);
 
 #define	PJT_DUAL_ENTRY(entry)	\
 	(pthread_func_t)entry, (pthread_func_t)entry
 
 pthread_func_entry_t __thr_jtable[PJT_MAX] = {
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATFORK */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_DESTROY */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_GETDETACHSTATE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_GETGUARDSIZE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_GETINHERITSCHED */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_GETSCHEDPARAM */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_GETSCHEDPOLICY */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_GETSCOPE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_GETSTACKADDR */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_GETSTACKSIZE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_INIT */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_SETDETACHSTATE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_SETGUARDSIZE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_SETINHERITSCHED */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_SETSCHEDPARAM */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_SETSCHEDPOLICY */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_SETSCOPE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_SETSTACKADDR */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_ATTR_SETSTACKSIZE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_CANCEL */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_CLEANUP_POP */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_CLEANUP_PUSH */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_COND_BROADCAST */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_COND_DESTROY */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_COND_INIT */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_COND_SIGNAL */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_COND_TIMEDWAIT */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_COND_WAIT */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_DETACH */
 	{PJT_DUAL_ENTRY(stub_true)},    /* PJT_EQUAL */
 	{PJT_DUAL_ENTRY(stub_exit)},    /* PJT_EXIT */
 	{PJT_DUAL_ENTRY(stub_null)},    /* PJT_GETSPECIFIC */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_JOIN */
 	{PJT_DUAL_ENTRY(stub_fail)},    /* PJT_KEY_CREATE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_KEY_DELETE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_KILL */
 	{PJT_DUAL_ENTRY(stub_main)},    /* PJT_MAIN_NP */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_MUTEXATTR_DESTROY */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_MUTEXATTR_INIT */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_MUTEXATTR_SETTYPE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_MUTEX_DESTROY */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_MUTEX_INIT */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_MUTEX_LOCK */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_MUTEX_TRYLOCK */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_MUTEX_UNLOCK */
 	{PJT_DUAL_ENTRY(stub_fail)},    /* PJT_ONCE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_RWLOCK_DESTROY */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_RWLOCK_INIT */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_RWLOCK_RDLOCK */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_RWLOCK_TRYRDLOCK */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_RWLOCK_TRYWRLOCK */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_RWLOCK_UNLOCK */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_RWLOCK_WRLOCK */
 	{PJT_DUAL_ENTRY(stub_self)},    /* PJT_SELF */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_SETCANCELSTATE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_SETCANCELTYPE */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_SETSPECIFIC */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_SIGMASK */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_TESTCANCEL */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_CLEANUP_POP_IMP */
 	{PJT_DUAL_ENTRY(stub_zero)},    /* PJT_CLEANUP_PUSH_IMP */
 	{PJT_DUAL_ENTRY(stub_zero)},	/* PJT_CANCEL_ENTER */
 	{PJT_DUAL_ENTRY(stub_zero)},	/* PJT_CANCEL_LEAVE */
+	{PJT_DUAL_ENTRY(stub_zero)},	/* PJT_MUTEX_CONSISTENT */
+	{PJT_DUAL_ENTRY(stub_zero)},	/* PJT_MUTEXATTR_GETROBUST */
+	{PJT_DUAL_ENTRY(stub_zero)},	/* PJT_MUTEXATTR_SETROBUST */
 };
 
 /*
  * Weak aliases for exported (pthread_*) and internal (_pthread_*) routines.
  */
 #define	WEAK_REF(sym, alias)	__weak_reference(sym, alias)
 
 #define	FUNC_TYPE(name)		__CONCAT(name, _func_t)
 #define	FUNC_INT(name)		__CONCAT(name, _int)
 #define	FUNC_EXP(name)		__CONCAT(name, _exp)
 
 #define	STUB_FUNC(name, idx, ret)				\
 	static ret FUNC_EXP(name)(void) __used;			\
 	static ret FUNC_INT(name)(void) __used;			\
 	WEAK_REF(FUNC_EXP(name), name);				\
 	WEAK_REF(FUNC_INT(name), __CONCAT(_, name));		\
 	typedef ret (*FUNC_TYPE(name))(void);			\
 	static ret FUNC_EXP(name)(void)				\
 	{							\
 		FUNC_TYPE(name) func;				\
 		func = (FUNC_TYPE(name))__thr_jtable[idx][0];	\
 		return (func());				\
 	}							\
 	static ret FUNC_INT(name)(void)				\
 	{							\
 		FUNC_TYPE(name) func;				\
 		func = (FUNC_TYPE(name))__thr_jtable[idx][1];	\
 		return (func());				\
 	}
 
 #define	STUB_FUNC1(name, idx, ret, p0_type)			\
 	static ret FUNC_EXP(name)(p0_type) __used;		\
 	static ret FUNC_INT(name)(p0_type) __used;		\
 	WEAK_REF(FUNC_EXP(name), name);				\
 	WEAK_REF(FUNC_INT(name), __CONCAT(_, name));		\
 	typedef ret (*FUNC_TYPE(name))(p0_type);		\
 	static ret FUNC_EXP(name)(p0_type p0)			\
 	{							\
 		FUNC_TYPE(name) func;				\
 		func = (FUNC_TYPE(name))__thr_jtable[idx][0];	\
 		return (func(p0));				\
 	}							\
 	static ret FUNC_INT(name)(p0_type p0)			\
 	{							\
 		FUNC_TYPE(name) func;				\
 		func = (FUNC_TYPE(name))__thr_jtable[idx][1];	\
 		return (func(p0));				\
 	}
 
 #define	STUB_FUNC2(name, idx, ret, p0_type, p1_type)		\
 	static ret FUNC_EXP(name)(p0_type, p1_type) __used;	\
 	static ret FUNC_INT(name)(p0_type, p1_type) __used;	\
 	WEAK_REF(FUNC_EXP(name), name);				\
 	WEAK_REF(FUNC_INT(name), __CONCAT(_, name));		\
 	typedef ret (*FUNC_TYPE(name))(p0_type, p1_type);	\
 	static ret FUNC_EXP(name)(p0_type p0, p1_type p1)	\
 	{							\
 		FUNC_TYPE(name) func;				\
 		func = (FUNC_TYPE(name))__thr_jtable[idx][0];	\
 		return (func(p0, p1));				\
 	}							\
 	static ret FUNC_INT(name)(p0_type p0, p1_type p1)	\
 	{							\
 		FUNC_TYPE(name) func;				\
 		func = (FUNC_TYPE(name))__thr_jtable[idx][1];	\
 		return (func(p0, p1));				\
 	}
 
 #define	STUB_FUNC3(name, idx, ret, p0_type, p1_type, p2_type)	\
 	static ret FUNC_EXP(name)(p0_type, p1_type, p2_type) __used; \
 	static ret FUNC_INT(name)(p0_type, p1_type, p2_type) __used; \
 	WEAK_REF(FUNC_EXP(name), name);				\
 	WEAK_REF(FUNC_INT(name), __CONCAT(_, name));		\
 	typedef ret (*FUNC_TYPE(name))(p0_type, p1_type, p2_type); \
 	static ret FUNC_EXP(name)(p0_type p0, p1_type p1, p2_type p2) \
 	{							\
 		FUNC_TYPE(name) func;				\
 		func = (FUNC_TYPE(name))__thr_jtable[idx][0];	\
 		return (func(p0, p1, p2));			\
 	}							\
 	static ret FUNC_INT(name)(p0_type p0, p1_type p1, p2_type p2) \
 	{							\
 		FUNC_TYPE(name) func;				\
 		func = (FUNC_TYPE(name))__thr_jtable[idx][1];	\
 		return (func(p0, p1, p2));			\
 	}
 
 STUB_FUNC1(pthread_cond_broadcast, PJT_COND_BROADCAST, int, void *)
 STUB_FUNC1(pthread_cond_destroy, PJT_COND_DESTROY, int, void *)
 STUB_FUNC2(pthread_cond_init,	PJT_COND_INIT, int, void *, void *)
 STUB_FUNC1(pthread_cond_signal,	PJT_COND_SIGNAL, int, void *)
 STUB_FUNC2(pthread_cond_wait,	PJT_COND_WAIT, int, void *, void *)
 STUB_FUNC1(pthread_getspecific,	PJT_GETSPECIFIC, void *, pthread_key_t)
 STUB_FUNC2(pthread_key_create,	PJT_KEY_CREATE, int, void *, void *)
 STUB_FUNC1(pthread_key_delete,	PJT_KEY_DELETE, int, pthread_key_t)
 STUB_FUNC(pthread_main_np,	PJT_MAIN_NP, int)
 STUB_FUNC1(pthread_mutex_destroy, PJT_MUTEX_DESTROY, int, void *)
 STUB_FUNC2(pthread_mutex_init,	PJT_MUTEX_INIT, int, void *, void *)
 STUB_FUNC1(pthread_mutex_lock,	PJT_MUTEX_LOCK, int, void *)
 STUB_FUNC1(pthread_mutex_trylock, PJT_MUTEX_TRYLOCK, int, void *)
 STUB_FUNC1(pthread_mutex_unlock, PJT_MUTEX_UNLOCK, int, void *)
+STUB_FUNC1(pthread_mutex_consistent, PJT_MUTEX_CONSISTENT, int, void *)
 STUB_FUNC1(pthread_mutexattr_destroy, PJT_MUTEXATTR_DESTROY, int, void *)
 STUB_FUNC1(pthread_mutexattr_init, PJT_MUTEXATTR_INIT, int, void *)
 STUB_FUNC2(pthread_mutexattr_settype, PJT_MUTEXATTR_SETTYPE, int, void *, int)
+STUB_FUNC2(pthread_mutexattr_getrobust, PJT_MUTEXATTR_GETROBUST, int, void *,
+    int *)
+STUB_FUNC2(pthread_mutexattr_setrobust, PJT_MUTEXATTR_SETROBUST, int, void *,
+    int)
 STUB_FUNC2(pthread_once, 	PJT_ONCE, int, void *, void *)
 STUB_FUNC1(pthread_rwlock_destroy, PJT_RWLOCK_DESTROY, int, void *)
 STUB_FUNC2(pthread_rwlock_init,	PJT_RWLOCK_INIT, int, void *, void *)
 STUB_FUNC1(pthread_rwlock_rdlock, PJT_RWLOCK_RDLOCK, int, void *)
 STUB_FUNC1(pthread_rwlock_tryrdlock, PJT_RWLOCK_TRYRDLOCK, int, void *)
 STUB_FUNC1(pthread_rwlock_trywrlock, PJT_RWLOCK_TRYWRLOCK, int, void *)
 STUB_FUNC1(pthread_rwlock_unlock, PJT_RWLOCK_UNLOCK, int, void *)
 STUB_FUNC1(pthread_rwlock_wrlock, PJT_RWLOCK_WRLOCK, int, void *)
 STUB_FUNC(pthread_self,		PJT_SELF, pthread_t)
 STUB_FUNC2(pthread_setspecific, PJT_SETSPECIFIC, int, pthread_key_t, void *)
 STUB_FUNC3(pthread_sigmask, PJT_SIGMASK, int, int, void *, void *)
 STUB_FUNC3(pthread_atfork, PJT_ATFORK, int, void *, void *, void*)
 STUB_FUNC1(pthread_attr_destroy, PJT_ATTR_DESTROY, int, void *);
 STUB_FUNC2(pthread_attr_getdetachstate, PJT_ATTR_GETDETACHSTATE, int, void *, void *)
 STUB_FUNC2(pthread_attr_getguardsize, PJT_ATTR_GETGUARDSIZE, int, void *, void *)
 STUB_FUNC2(pthread_attr_getstackaddr, PJT_ATTR_GETSTACKADDR, int, void *, void *)
 STUB_FUNC2(pthread_attr_getstacksize, PJT_ATTR_GETSTACKSIZE, int, void *, void *)
 STUB_FUNC2(pthread_attr_getinheritsched, PJT_ATTR_GETINHERITSCHED, int, void *, void *)
 STUB_FUNC2(pthread_attr_getschedparam, PJT_ATTR_GETSCHEDPARAM, int, void *, void *)
 STUB_FUNC2(pthread_attr_getschedpolicy, PJT_ATTR_GETSCHEDPOLICY, int, void *, void *)
 STUB_FUNC2(pthread_attr_getscope, PJT_ATTR_GETSCOPE, int, void *, void *)
 STUB_FUNC1(pthread_attr_init, PJT_ATTR_INIT, int, void *)
 STUB_FUNC2(pthread_attr_setdetachstate, PJT_ATTR_SETDETACHSTATE, int, void *, int)
 STUB_FUNC2(pthread_attr_setguardsize, PJT_ATTR_SETGUARDSIZE, int, void *, size_t)
 STUB_FUNC2(pthread_attr_setstackaddr, PJT_ATTR_SETSTACKADDR, int, void *, void *)
 STUB_FUNC2(pthread_attr_setstacksize, PJT_ATTR_SETSTACKSIZE, int, void *, size_t)
 STUB_FUNC2(pthread_attr_setinheritsched, PJT_ATTR_SETINHERITSCHED, int, void *, int)
 STUB_FUNC2(pthread_attr_setschedparam, PJT_ATTR_SETSCHEDPARAM, int, void *, void *)
 STUB_FUNC2(pthread_attr_setschedpolicy, PJT_ATTR_SETSCHEDPOLICY, int, void *, int)
 STUB_FUNC2(pthread_attr_setscope, PJT_ATTR_SETSCOPE, int, void *, int)
 STUB_FUNC1(pthread_cancel, PJT_CANCEL, int, void *)
 STUB_FUNC1(pthread_cleanup_pop, PJT_CLEANUP_POP, int, int)
 STUB_FUNC2(pthread_cleanup_push, PJT_CLEANUP_PUSH, void, void *, void *)
 STUB_FUNC3(pthread_cond_timedwait, PJT_COND_TIMEDWAIT, int, void *, void *, void *)
 STUB_FUNC1(pthread_detach, PJT_DETACH, int, void *)
 STUB_FUNC2(pthread_equal, PJT_EQUAL, int, void *, void *)
 STUB_FUNC1(pthread_exit, PJT_EXIT, void, void *)
 STUB_FUNC2(pthread_join, PJT_JOIN, int, void *, void *)
 STUB_FUNC2(pthread_kill, PJT_KILL, int, void *, int)
 STUB_FUNC2(pthread_setcancelstate, PJT_SETCANCELSTATE, int, int, void *)
 STUB_FUNC2(pthread_setcanceltype, PJT_SETCANCELTYPE, int, int, void *)
 STUB_FUNC(pthread_testcancel, PJT_TESTCANCEL, void)
 STUB_FUNC1(__pthread_cleanup_pop_imp, PJT_CLEANUP_POP_IMP, int, int)
 STUB_FUNC2(__pthread_cleanup_push_imp, PJT_CLEANUP_PUSH_IMP, void, void*, void *);
 STUB_FUNC1(_pthread_cancel_enter, PJT_CANCEL_ENTER, int, int)
 STUB_FUNC1(_pthread_cancel_leave, PJT_CANCEL_LEAVE, int, int)
 
 static int
 stub_zero(void)
 {
 	return (0);
 }
 
 static void *
 stub_null(void)
 {
 	return (NULL);
 }
 
 static struct pthread *
 stub_self(void)
 {
 	return (&main_thread);
 }
 
 static int
 stub_fail(void)
 {
 	return (ENOSYS);
 }
 
 static int
 stub_main(void)
 {
 	return (-1);
 }
 
 static int
 stub_true(void)
 {
 	return (1);
 }
 
 static void
 stub_exit(void)
 {
 	exit(0);
 }
Index: head/lib/libc/include/libc_private.h
===================================================================
--- head/lib/libc/include/libc_private.h	(revision 300042)
+++ head/lib/libc/include/libc_private.h	(revision 300043)
@@ -1,390 +1,393 @@
 /*
  * Copyright (c) 1998 John Birrell <jb@cimlogic.com.au>.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  * Private definitions for libc, libc_r and libpthread.
  *
  */
 
 #ifndef _LIBC_PRIVATE_H_
 #define _LIBC_PRIVATE_H_
 #include <sys/_types.h>
 #include <sys/_pthreadtypes.h>
 
 /*
  * This global flag is non-zero when a process has created one
  * or more threads. It is used to avoid calling locking functions
  * when they are not required.
  */
 extern int	__isthreaded;
 
 /*
  * Elf_Auxinfo *__elf_aux_vector, the pointer to the ELF aux vector
  * provided by kernel. Either set for us by rtld, or found at runtime
  * on stack for static binaries.
  *
  * Type is void to avoid polluting whole libc with ELF types.
  */
 extern void	*__elf_aux_vector;
 
 /*
  * libc should use libc_dlopen internally, which respects a global
  * flag where loading of new shared objects can be restricted.
  */
 void *libc_dlopen(const char *, int);
 
 /*
  * For dynamic linker.
  */
 void _rtld_error(const char *fmt, ...);
 
 /*
  * File lock contention is difficult to diagnose without knowing
  * where locks were set. Allow a debug library to be built which
  * records the source file and line number of each lock call.
  */
 #ifdef	_FLOCK_DEBUG
 #define _FLOCKFILE(x)	_flockfile_debug(x, __FILE__, __LINE__)
 #else
 #define _FLOCKFILE(x)	_flockfile(x)
 #endif
 
 /*
  * Macros for locking and unlocking FILEs. These test if the
  * process is threaded to avoid locking when not required.
  */
 #define	FLOCKFILE(fp)		if (__isthreaded) _FLOCKFILE(fp)
 #define	FUNLOCKFILE(fp)		if (__isthreaded) _funlockfile(fp)
 
 struct _spinlock;
 extern struct _spinlock __stdio_thread_lock __hidden;
 #define STDIO_THREAD_LOCK()				\
 do {							\
 	if (__isthreaded)				\
 		_SPINLOCK(&__stdio_thread_lock);	\
 } while (0)
 #define STDIO_THREAD_UNLOCK()				\
 do {							\
 	if (__isthreaded)				\
 		_SPINUNLOCK(&__stdio_thread_lock);	\
 } while (0)
 
 void		__libc_spinlock_stub(struct _spinlock *);
 void		__libc_spinunlock_stub(struct _spinlock *);
 
 /*
  * Indexes into the pthread jump table.
  *
  * Warning! If you change this type, you must also change the threads
  * libraries that reference it (libc_r, libpthread).
  */
 typedef enum {
 	PJT_ATFORK,
 	PJT_ATTR_DESTROY,
 	PJT_ATTR_GETDETACHSTATE,
 	PJT_ATTR_GETGUARDSIZE,
 	PJT_ATTR_GETINHERITSCHED,
 	PJT_ATTR_GETSCHEDPARAM,
 	PJT_ATTR_GETSCHEDPOLICY,
 	PJT_ATTR_GETSCOPE,
 	PJT_ATTR_GETSTACKADDR,
 	PJT_ATTR_GETSTACKSIZE,
 	PJT_ATTR_INIT,
 	PJT_ATTR_SETDETACHSTATE,
 	PJT_ATTR_SETGUARDSIZE,
 	PJT_ATTR_SETINHERITSCHED,
 	PJT_ATTR_SETSCHEDPARAM,
 	PJT_ATTR_SETSCHEDPOLICY,
 	PJT_ATTR_SETSCOPE,
 	PJT_ATTR_SETSTACKADDR,
 	PJT_ATTR_SETSTACKSIZE,
 	PJT_CANCEL,
 	PJT_CLEANUP_POP,
 	PJT_CLEANUP_PUSH,
 	PJT_COND_BROADCAST,
 	PJT_COND_DESTROY,
 	PJT_COND_INIT,
 	PJT_COND_SIGNAL,
 	PJT_COND_TIMEDWAIT,
 	PJT_COND_WAIT,
 	PJT_DETACH,
 	PJT_EQUAL,
 	PJT_EXIT,
 	PJT_GETSPECIFIC,
 	PJT_JOIN,
 	PJT_KEY_CREATE,
 	PJT_KEY_DELETE,
 	PJT_KILL,
 	PJT_MAIN_NP,
 	PJT_MUTEXATTR_DESTROY,
 	PJT_MUTEXATTR_INIT,
 	PJT_MUTEXATTR_SETTYPE,
 	PJT_MUTEX_DESTROY,
 	PJT_MUTEX_INIT,
 	PJT_MUTEX_LOCK,
 	PJT_MUTEX_TRYLOCK,
 	PJT_MUTEX_UNLOCK,
 	PJT_ONCE,
 	PJT_RWLOCK_DESTROY,
 	PJT_RWLOCK_INIT,
 	PJT_RWLOCK_RDLOCK,
 	PJT_RWLOCK_TRYRDLOCK,
 	PJT_RWLOCK_TRYWRLOCK,
 	PJT_RWLOCK_UNLOCK,
 	PJT_RWLOCK_WRLOCK,
 	PJT_SELF,
 	PJT_SETCANCELSTATE,
 	PJT_SETCANCELTYPE,
 	PJT_SETSPECIFIC,
 	PJT_SIGMASK,
 	PJT_TESTCANCEL,
 	PJT_CLEANUP_POP_IMP,
 	PJT_CLEANUP_PUSH_IMP,
 	PJT_CANCEL_ENTER,
 	PJT_CANCEL_LEAVE,
+	PJT_MUTEX_CONSISTENT,	/* Added at the end, before PJT_MAX; */
+	PJT_MUTEXATTR_GETROBUST, /* the values of the earlier */
+	PJT_MUTEXATTR_SETROBUST, /* enumerators are unchanged. */
 	PJT_MAX
 } pjt_index_t;
 
 typedef int (*pthread_func_t)(void);
 typedef pthread_func_t pthread_func_entry_t[2];
 
 extern pthread_func_entry_t __thr_jtable[];
 
 void	__set_error_selector(int *(*arg)(void));
 int	_pthread_mutex_init_calloc_cb_stub(pthread_mutex_t *mutex,
 	    void *(calloc_cb)(__size_t, __size_t));
 
 typedef int (*interpos_func_t)(void);
 interpos_func_t *__libc_interposing_slot(int interposno);
 extern interpos_func_t __libc_interposing[] __hidden;
 
 enum {
 	INTERPOS_accept,
 	INTERPOS_accept4,
 	INTERPOS_aio_suspend,
 	INTERPOS_close,
 	INTERPOS_connect,
 	INTERPOS_fcntl,
 	INTERPOS_fsync,
 	INTERPOS_fork,
 	INTERPOS_msync,
 	INTERPOS_nanosleep,
 	INTERPOS_openat,
 	INTERPOS_poll,
 	INTERPOS_pselect,
 	INTERPOS_recvfrom,
 	INTERPOS_recvmsg,
 	INTERPOS_select,
 	INTERPOS_sendmsg,
 	INTERPOS_sendto,
 	INTERPOS_setcontext,
 	INTERPOS_sigaction,
 	INTERPOS_sigprocmask,
 	INTERPOS_sigsuspend,
 	INTERPOS_sigwait,
 	INTERPOS_sigtimedwait,
 	INTERPOS_sigwaitinfo,
 	INTERPOS_swapcontext,
 	INTERPOS_system,
 	INTERPOS_tcdrain,
 	INTERPOS_read,
 	INTERPOS_readv,
 	INTERPOS_wait4,
 	INTERPOS_write,
 	INTERPOS_writev,
 	INTERPOS__pthread_mutex_init_calloc_cb,
 	INTERPOS_spinlock,
 	INTERPOS_spinunlock,
 	INTERPOS_kevent,
 	INTERPOS_wait6,
 	INTERPOS_ppoll,
 	INTERPOS_map_stacks_exec,
 	INTERPOS_MAX
 };
 
 /*
  * yplib internal interfaces
  */
 #ifdef YP
 int _yp_check(char **);
 #endif
 
 /*
  * Initialise TLS for static programs
  */
 void _init_tls(void);
 
 /*
  * Provides pthread_once()-like functionality for both single-threaded
  * and multi-threaded applications.
  */
 int _once(pthread_once_t *, void (*)(void));
 
 /*
  * Set the TLS thread pointer
  */
 void _set_tp(void *tp);
 
 /*
  * This is a pointer in the C run-time startup code. It is used
  * by getprogname() and setprogname().
  */
 extern const char *__progname;
 
 /*
  * This function is used by the threading libraries to notify malloc that a
  * thread is exiting.
  */
 void _malloc_thread_cleanup(void);
 
 /*
  * These functions are used by the threading libraries in order to protect
  * malloc across fork().
  */
 void _malloc_prefork(void);
 void _malloc_postfork(void);
 
 void _malloc_first_thread(void);
 
 /*
  * Function to clean up streams, called from abort() and exit().
  */
 extern void (*__cleanup)(void) __hidden;
 
 /*
  * Get kern.osreldate to detect ABI revisions.  Explicitly
  * ignores value of $OSVERSION and caches result.
  */
 int __getosreldate(void);
 #include <sys/_types.h>
 #include <sys/_sigset.h>
 
 struct aiocb;
 struct fd_set;
 struct iovec;
 struct kevent;
 struct msghdr;
 struct pollfd;
 struct rusage;
 struct sigaction;
 struct sockaddr;
 struct timespec;
 struct timeval;
 struct timezone;
 struct __siginfo;
 struct __ucontext;
 struct __wrusage;
 enum idtype;
 int		__sys_aio_suspend(const struct aiocb * const[], int,
 		    const struct timespec *);
 int		__sys_accept(int, struct sockaddr *, __socklen_t *);
 int		__sys_accept4(int, struct sockaddr *, __socklen_t *, int);
 int		__sys_clock_gettime(__clockid_t, struct timespec *ts);
 int		__sys_close(int);
 int		__sys_connect(int, const struct sockaddr *, __socklen_t);
 int		__sys_fcntl(int, int, ...);
 int		__sys_fsync(int);
 __pid_t		__sys_fork(void);
 int		__sys_ftruncate(int, __off_t);
 int		__sys_gettimeofday(struct timeval *, struct timezone *);
 int		__sys_kevent(int, const struct kevent *, int, struct kevent *,
 		    int, const struct timespec *);
 __off_t		__sys_lseek(int, __off_t, int);
 void	       *__sys_mmap(void *, __size_t, int, int, int, __off_t);
 int		__sys_msync(void *, __size_t, int);
 int		__sys_nanosleep(const struct timespec *, struct timespec *);
 int		__sys_open(const char *, int, ...);
 int		__sys_openat(int, const char *, int, ...);
 int		__sys_pselect(int, struct fd_set *, struct fd_set *,
 		    struct fd_set *, const struct timespec *,
 		    const __sigset_t *);
 int		__sys_poll(struct pollfd *, unsigned, int);
 int		__sys_ppoll(struct pollfd *, unsigned, const struct timespec *,
 		    const __sigset_t *);
 __ssize_t	__sys_pread(int, void *, __size_t, __off_t);
 __ssize_t	__sys_pwrite(int, const void *, __size_t, __off_t);
 __ssize_t	__sys_read(int, void *, __size_t);
 __ssize_t	__sys_readv(int, const struct iovec *, int);
 __ssize_t	__sys_recv(int, void *, __size_t, int);
 __ssize_t	__sys_recvfrom(int, void *, __size_t, int, struct sockaddr *,
 		    __socklen_t *);
 __ssize_t	__sys_recvmsg(int, struct msghdr *, int);
 int		__sys_select(int, struct fd_set *, struct fd_set *,
 		    struct fd_set *, struct timeval *);
 __ssize_t	__sys_sendmsg(int, const struct msghdr *, int);
 __ssize_t	__sys_sendto(int, const void *, __size_t, int,
 		    const struct sockaddr *, __socklen_t);
 int		__sys_setcontext(const struct __ucontext *);
 int		__sys_sigaction(int, const struct sigaction *,
 		    struct sigaction *);
 int		__sys_sigprocmask(int, const __sigset_t *, __sigset_t *);
 int		__sys_sigsuspend(const __sigset_t *);
 int		__sys_sigtimedwait(const __sigset_t *, struct __siginfo *,
 		    const struct timespec *);
 int		__sys_sigwait(const __sigset_t *, int *);
 int		__sys_sigwaitinfo(const __sigset_t *, struct __siginfo *);
 int		__sys_swapcontext(struct __ucontext *,
 		    const struct __ucontext *);
 int		__sys_thr_kill(long, int);
 int		__sys_thr_self(long *);
 int		__sys_truncate(const char *, __off_t);
 __pid_t		__sys_wait4(__pid_t, int *, int, struct rusage *);
 __pid_t		__sys_wait6(enum idtype, __id_t, int *, int,
 		    struct __wrusage *, struct __siginfo *);
 __ssize_t	__sys_write(int, const void *, __size_t);
 __ssize_t	__sys_writev(int, const struct iovec *, int);
 
 int		__libc_sigaction(int, const struct sigaction *,
 		    struct sigaction *) __hidden;
 int		__libc_sigprocmask(int, const __sigset_t *, __sigset_t *)
 		    __hidden;
 int		__libc_sigsuspend(const __sigset_t *) __hidden;
 int		__libc_sigwait(const __sigset_t * __restrict,
 		    int * restrict sig);
 int		__libc_system(const char *);
 int		__libc_tcdrain(int);
 int		__fcntl_compat(int fd, int cmd, ...);
 
 int		__sys_futimens(int fd, const struct timespec *times) __hidden;
 int		__sys_utimensat(int fd, const char *path,
 		    const struct timespec *times, int flag) __hidden;
 
 /* execve() with PATH processing to implement posix_spawnp() */
 int _execvpe(const char *, char * const *, char * const *);
 
 int _elf_aux_info(int aux, void *buf, int buflen);
 struct dl_phdr_info;
 int __elf_phdr_match_addr(struct dl_phdr_info *, void *);
 void __init_elf_aux_vector(void);
 void __libc_map_stacks_exec(void);
 
 void	_pthread_cancel_enter(int);
 void	_pthread_cancel_leave(int);
 
 #endif /* _LIBC_PRIVATE_H_ */
Index: head/lib/libthr/pthread.map
===================================================================
--- head/lib/libthr/pthread.map	(revision 300042)
+++ head/lib/libthr/pthread.map	(revision 300043)
@@ -1,317 +1,323 @@
 /*
  * $FreeBSD$
  */
 
 /*
  * Use the same naming scheme as libc.
  */
 FBSD_1.0 {
 	pthread_atfork;
 	pthread_barrier_destroy;
 	pthread_barrier_init;
 	pthread_barrier_wait;
 	pthread_barrierattr_destroy;
 	pthread_barrierattr_getpshared;
 	pthread_barrierattr_init;
 	pthread_barrierattr_setpshared;
 	pthread_attr_destroy;
 	pthread_attr_get_np;
 	pthread_attr_getdetachstate;
 	pthread_attr_getguardsize;
 	pthread_attr_getinheritsched;
 	pthread_attr_getschedparam;
 	pthread_attr_getschedpolicy;
 	pthread_attr_getscope;
 	pthread_attr_getstack;
 	pthread_attr_getstackaddr;
 	pthread_attr_getstacksize;
 	pthread_attr_init;
 	pthread_attr_setcreatesuspend_np;
 	pthread_attr_setdetachstate;
 	pthread_attr_setguardsize;
 	pthread_attr_setinheritsched;
 	pthread_attr_setschedparam;
 	pthread_attr_setschedpolicy;
 	pthread_attr_setscope;
 	pthread_attr_setstack;
 	pthread_attr_setstackaddr;
 	pthread_attr_setstacksize;
 	pthread_cancel;
 	pthread_cleanup_pop;
 	pthread_cleanup_push;
 	pthread_cond_broadcast;
 	pthread_cond_destroy;
 	pthread_cond_init;
 	pthread_cond_signal;
 	pthread_cond_timedwait;
 	pthread_cond_wait;
 	pthread_condattr_destroy;
 	pthread_condattr_getclock;
 	pthread_condattr_getpshared;
 	pthread_condattr_init;
 	pthread_condattr_setclock;
 	pthread_condattr_setpshared;
 	pthread_create;
 	pthread_detach;
 	pthread_equal;
 	pthread_exit;
 	pthread_getconcurrency;
 	pthread_getprio;
 	pthread_getschedparam;
 	pthread_getspecific;
 	pthread_join;
 	pthread_key_create;
 	pthread_key_delete;
 	pthread_kill;
 	pthread_main_np;
 	pthread_multi_np;
 	pthread_mutex_destroy;
 	pthread_mutex_getprioceiling;
 	pthread_mutex_init;
 	pthread_mutex_lock;
 	pthread_mutex_setprioceiling;
 	pthread_mutex_timedlock;
 	pthread_mutex_trylock;
 	pthread_mutex_unlock;
 	pthread_mutexattr_destroy;
 	pthread_mutexattr_getkind_np;
 	pthread_mutexattr_getprioceiling;
 	pthread_mutexattr_getpshared;
 	pthread_mutexattr_getprotocol;
 	pthread_mutexattr_gettype;
 	pthread_mutexattr_init;
 	pthread_mutexattr_setkind_np;
 	pthread_mutexattr_setprioceiling;
 	pthread_mutexattr_setprotocol;
 	pthread_mutexattr_setpshared;
 	pthread_mutexattr_settype;
 	pthread_once;
 	pthread_resume_all_np;
 	pthread_resume_np;
 	pthread_rwlock_destroy;
 	pthread_rwlock_init;
 	pthread_rwlock_rdlock;
 	pthread_rwlock_timedrdlock;
 	pthread_rwlock_timedwrlock;
 	pthread_rwlock_tryrdlock;
 	pthread_rwlock_trywrlock;
 	pthread_rwlock_unlock;
 	pthread_rwlock_wrlock;
 	pthread_rwlockattr_destroy;
 	pthread_rwlockattr_getpshared;
 	pthread_rwlockattr_init;
 	pthread_rwlockattr_setpshared;
 	pthread_set_name_np;
 	pthread_self;
 	pthread_setcancelstate;
 	pthread_setcanceltype;
 	pthread_setconcurrency;
 	pthread_setprio;
 	pthread_setschedparam;
 	pthread_setspecific;
 	pthread_sigmask;
 	pthread_single_np;
 	pthread_spin_destroy;
 	pthread_spin_init;
 	pthread_spin_lock;
 	pthread_spin_trylock;
 	pthread_spin_unlock;
 	pthread_suspend_all_np;
 	pthread_suspend_np;
 	pthread_switch_add_np;
 	pthread_switch_delete_np;
 	pthread_testcancel;
 	pthread_timedjoin_np;
 	pthread_yield;
 };
 
 /*
  * List the private interfaces reserved for use in FreeBSD libraries.
  * These are not part of our application ABI.
  */
 FBSDprivate_1.0 {
 	__pthread_cond_timedwait;
 	__pthread_cond_wait;
 	__pthread_cxa_finalize;
 	__pthread_mutex_init;
 	__pthread_mutex_lock;
 	__pthread_mutex_timedlock;
 	__pthread_mutex_trylock;
 	_pthread_atfork;
 	_pthread_barrier_destroy;
 	_pthread_barrier_init;
 	_pthread_barrier_wait;
 	_pthread_barrierattr_destroy;
 	_pthread_barrierattr_getpshared;
 	_pthread_barrierattr_init;
 	_pthread_barrierattr_setpshared;
 	_pthread_attr_destroy;
 	_pthread_attr_get_np;
 	_pthread_attr_getaffinity_np;
 	_pthread_attr_getdetachstate;
 	_pthread_attr_getguardsize;
 	_pthread_attr_getinheritsched;
 	_pthread_attr_getschedparam;
 	_pthread_attr_getschedpolicy;
 	_pthread_attr_getscope;
 	_pthread_attr_getstack;
 	_pthread_attr_getstackaddr;
 	_pthread_attr_getstacksize;
 	_pthread_attr_init;
 	_pthread_attr_setaffinity_np;
 	_pthread_attr_setcreatesuspend_np;
 	_pthread_attr_setdetachstate;
 	_pthread_attr_setguardsize;
 	_pthread_attr_setinheritsched;
 	_pthread_attr_setschedparam;
 	_pthread_attr_setschedpolicy;
 	_pthread_attr_setscope;
 	_pthread_attr_setstack;
 	_pthread_attr_setstackaddr;
 	_pthread_attr_setstacksize;
 	_pthread_cancel;
 	_pthread_cancel_enter;
 	_pthread_cancel_leave;
 	_pthread_cleanup_pop;
 	_pthread_cleanup_push;
 	_pthread_cond_broadcast;
 	_pthread_cond_destroy;
 	_pthread_cond_init;
 	_pthread_cond_signal;
 	_pthread_cond_timedwait;
 	_pthread_cond_wait;
 	_pthread_condattr_destroy;
 	_pthread_condattr_getclock;
 	_pthread_condattr_getpshared;
 	_pthread_condattr_init;
 	_pthread_condattr_setclock;
 	_pthread_condattr_setpshared;
 	_pthread_create;
 	_pthread_detach;
 	_pthread_equal;
 	_pthread_exit;
 	_pthread_getaffinity_np;
 	_pthread_getconcurrency;
 	_pthread_getcpuclockid;
 	_pthread_getprio;
 	_pthread_getschedparam;
 	_pthread_getspecific;
 	_pthread_getthreadid_np;
 	_pthread_join;
 	_pthread_key_create;
 	_pthread_key_delete;
 	_pthread_kill;
 	_pthread_main_np;
 	_pthread_multi_np;
 	_pthread_mutex_destroy;
 	_pthread_mutex_getprioceiling;
 	_pthread_mutex_getspinloops_np;
 	_pthread_mutex_getyieldloops_np;
 	_pthread_mutex_init;
 	_pthread_mutex_init_calloc_cb;
 	_pthread_mutex_isowned_np;
 	_pthread_mutex_lock;
 	_pthread_mutex_setprioceiling;
 	_pthread_mutex_setspinloops_np;
 	_pthread_mutex_setyieldloops_np;
 	_pthread_mutex_timedlock;
 	_pthread_mutex_trylock;
 	_pthread_mutex_unlock;
 	_pthread_mutexattr_destroy;
 	_pthread_mutexattr_getkind_np;
 	_pthread_mutexattr_getprioceiling;
 	_pthread_mutexattr_getprotocol;
 	_pthread_mutexattr_getpshared;
 	_pthread_mutexattr_gettype;
 	_pthread_mutexattr_init;
 	_pthread_mutexattr_setkind_np;
 	_pthread_mutexattr_setprioceiling;
 	_pthread_mutexattr_setprotocol;
 	_pthread_mutexattr_setpshared;
 	_pthread_mutexattr_settype;
 	_pthread_once;
 	_pthread_resume_all_np;
 	_pthread_resume_np;
 	_pthread_rwlock_destroy;
 	_pthread_rwlock_init;
 	_pthread_rwlock_rdlock;
 	_pthread_rwlock_timedrdlock;
 	_pthread_rwlock_timedwrlock;
 	_pthread_rwlock_tryrdlock;
 	_pthread_rwlock_trywrlock;
 	_pthread_rwlock_unlock;
 	_pthread_rwlock_wrlock;
 	_pthread_rwlockattr_destroy;
 	_pthread_rwlockattr_getpshared;
 	_pthread_rwlockattr_init;
 	_pthread_rwlockattr_setpshared;
 	_pthread_self;
 	_pthread_set_name_np;
 	_pthread_setaffinity_np;
 	_pthread_setcancelstate;
 	_pthread_setcanceltype;
 	_pthread_setconcurrency;
 	_pthread_setprio;
 	_pthread_setschedparam;
 	_pthread_setspecific;
 	_pthread_sigmask;
 	_pthread_single_np;
 	_pthread_spin_destroy;
 	_pthread_spin_init;
 	_pthread_spin_lock;
 	_pthread_spin_trylock;
 	_pthread_spin_unlock;
 	_pthread_suspend_all_np;
 	_pthread_suspend_np;
 	_pthread_switch_add_np;
 	_pthread_switch_delete_np;
 	_pthread_testcancel;
 	_pthread_timedjoin_np;
 	_pthread_yield;
 
 	/* Debugger needs these. */
 	_libthr_debug;
 	_thread_active_threads;
 	_thread_bp_create;
 	_thread_bp_death;
 	_thread_event_mask;
 	_thread_keytable;
 	_thread_last_event;
 	_thread_list;
 	_thread_max_keys;
 	_thread_off_attr_flags;
 	_thread_off_dtv;
 	_thread_off_event_buf;
 	_thread_off_event_mask;
 	_thread_off_key_allocated;
 	_thread_off_key_destructor;
 	_thread_off_linkmap;
 	_thread_off_next;
 	_thread_off_report_events;
 	_thread_off_state;
 	_thread_off_tcb;
 	_thread_off_tid;
 	_thread_off_tlsindex;
 	_thread_size_key;
 	_thread_state_running;
 	_thread_state_zoombie;
 };
 
 FBSD_1.1 {
 	__pthread_cleanup_pop_imp;
 	__pthread_cleanup_push_imp;
 	pthread_attr_getaffinity_np;
 	pthread_attr_setaffinity_np;
 	pthread_getaffinity_np;
 	pthread_getcpuclockid;
 	pthread_setaffinity_np;
 	pthread_mutex_getspinloops_np;
 	pthread_mutex_getyieldloops_np;
 	pthread_mutex_isowned_np;
 	pthread_mutex_setspinloops_np;
 	pthread_mutex_setyieldloops_np;
 };
 
 FBSD_1.2 {
 	pthread_getthreadid_np;
 };
+
+FBSD_1.4 {
+	pthread_mutex_consistent;
+	pthread_mutexattr_getrobust;
+	pthread_mutexattr_setrobust;
+};
Index: head/lib/libthr/thread/thr_cond.c
===================================================================
--- head/lib/libthr/thread/thr_cond.c	(revision 300042)
+++ head/lib/libthr/thread/thr_cond.c	(revision 300043)
@@ -1,523 +1,533 @@
 /*
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * Copyright (c) 2015 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "namespace.h"
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
 #include <pthread.h>
 #include <limits.h>
 #include "un-namespace.h"
 
 #include "thr_private.h"
 
 _Static_assert(sizeof(struct pthread_cond) <= PAGE_SIZE,
     "pthread_cond too large");
 
 /*
  * Prototypes
  */
 int	__pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex);
 int	__pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
 		       const struct timespec * abstime);
 static int cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr);
 static int cond_wait_common(pthread_cond_t *cond, pthread_mutex_t *mutex,
 		    const struct timespec *abstime, int cancel);
 static int cond_signal_common(pthread_cond_t *cond);
 static int cond_broadcast_common(pthread_cond_t *cond);
 
 /*
  * Double underscore versions are cancellation points.  Single underscore
  * versions are not and are provided for libc internal usage (which
  * shouldn't introduce cancellation points).
  */
 __weak_reference(__pthread_cond_wait, pthread_cond_wait);
 __weak_reference(__pthread_cond_timedwait, pthread_cond_timedwait);
 
 __weak_reference(_pthread_cond_init, pthread_cond_init);
 __weak_reference(_pthread_cond_destroy, pthread_cond_destroy);
 __weak_reference(_pthread_cond_signal, pthread_cond_signal);
 __weak_reference(_pthread_cond_broadcast, pthread_cond_broadcast);
 
 #define CV_PSHARED(cvp)	(((cvp)->__flags & USYNC_PROCESS_SHARED) != 0)
 
 static void
 cond_init_body(struct pthread_cond *cvp, const struct pthread_cond_attr *cattr)
 {
 
 	if (cattr == NULL) {
 		cvp->__clock_id = CLOCK_REALTIME;
 	} else {
 		if (cattr->c_pshared)
 			cvp->__flags |= USYNC_PROCESS_SHARED;
 		cvp->__clock_id = cattr->c_clockid;
 	}
 }
 
 static int
 cond_init(pthread_cond_t *cond, const pthread_condattr_t *cond_attr)
 {
 	struct pthread_cond *cvp;
 	const struct pthread_cond_attr *cattr;
 	int pshared;
 
 	cattr = cond_attr != NULL ? *cond_attr : NULL;
 	if (cattr == NULL || cattr->c_pshared == PTHREAD_PROCESS_PRIVATE) {
 		pshared = 0;
 		cvp = calloc(1, sizeof(struct pthread_cond));
 		if (cvp == NULL)
 			return (ENOMEM);
 	} else {
 		pshared = 1;
 		cvp = __thr_pshared_offpage(cond, 1);
 		if (cvp == NULL)
 			return (EFAULT);
 	}
 
 	/*
 	 * Initialise the condition variable structure:
 	 */
 	cond_init_body(cvp, cattr);
 	*cond = pshared ? THR_PSHARED_PTR : cvp;
 	return (0);
 }
 
 static int
 init_static(struct pthread *thread, pthread_cond_t *cond)
 {
 	int ret;
 
 	THR_LOCK_ACQUIRE(thread, &_cond_static_lock);
 
 	if (*cond == NULL)
 		ret = cond_init(cond, NULL);
 	else
 		ret = 0;
 
 	THR_LOCK_RELEASE(thread, &_cond_static_lock);
 
 	return (ret);
 }
 
 #define CHECK_AND_INIT_COND							\
 	if (*cond == THR_PSHARED_PTR) {						\
 		cvp = __thr_pshared_offpage(cond, 0);				\
 		if (cvp == NULL)						\
 			return (EINVAL);					\
 	} else if (__predict_false((cvp = (*cond)) <= THR_COND_DESTROYED)) {	\
 		if (cvp == THR_COND_INITIALIZER) {				\
 			int ret;						\
 			ret = init_static(_get_curthread(), cond);		\
 			if (ret)						\
 				return (ret);					\
 		} else if (cvp == THR_COND_DESTROYED) {				\
 			return (EINVAL);					\
 		}								\
 		cvp = *cond;							\
 	}
 
 int
 _pthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *cond_attr)
 {
 
 	*cond = NULL;
 	return (cond_init(cond, cond_attr));
 }
 
 int
 _pthread_cond_destroy(pthread_cond_t *cond)
 {
 	struct pthread_cond *cvp;
 	int error;
 
 	error = 0;
 	if (*cond == THR_PSHARED_PTR) {
 		cvp = __thr_pshared_offpage(cond, 0);
 		if (cvp != NULL)
 			__thr_pshared_destroy(cond);
 		*cond = THR_COND_DESTROYED;
 	} else if ((cvp = *cond) == THR_COND_INITIALIZER) {
 		/* nothing */
 	} else if (cvp == THR_COND_DESTROYED) {
 		error = EINVAL;
 	} else {
 		cvp = *cond;
 		*cond = THR_COND_DESTROYED;
 		free(cvp);
 	}
 	return (error);
 }
 
 /*
  * Cancellation behavior:
  *   Thread may be canceled at start, if thread is canceled, it means it
  *   did not get a wakeup from pthread_cond_signal(), otherwise, it is
  *   not canceled.
  *   Thread cancellation never cause wakeup from pthread_cond_signal()
  *   to be lost.
  */
 static int
 cond_wait_kernel(struct pthread_cond *cvp, struct pthread_mutex *mp,
-	const struct timespec *abstime, int cancel)
+    const struct timespec *abstime, int cancel)
 {
-	struct pthread	*curthread = _get_curthread();
-	int		recurse;
-	int		error, error2 = 0;
+	struct pthread *curthread;
+	int error, error2, recurse, robust;
 
+	curthread = _get_curthread();
+	robust = _mutex_enter_robust(curthread, mp);
+
 	error = _mutex_cv_detach(mp, &recurse);
-	if (error != 0)
+	if (error != 0) {
+		if (robust)
+			_mutex_leave_robust(curthread, mp);
 		return (error);
+	}
 
-	if (cancel) {
+	if (cancel)
 		_thr_cancel_enter2(curthread, 0);
-		error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters,
-			(struct umutex *)&mp->m_lock, abstime,
-			CVWAIT_ABSTIME|CVWAIT_CLOCKID);
+	error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters,
+	    (struct umutex *)&mp->m_lock, abstime, CVWAIT_ABSTIME |
+	    CVWAIT_CLOCKID);
+	if (cancel)
 		_thr_cancel_leave(curthread, 0);
-	} else {
-		error = _thr_ucond_wait((struct ucond *)&cvp->__has_kern_waiters,
-			(struct umutex *)&mp->m_lock, abstime,
-			CVWAIT_ABSTIME|CVWAIT_CLOCKID);
-	}
 
 	/*
 	 * Note that PP mutex and ROBUST mutex may return
 	 * interesting error codes.
 	 */
 	if (error == 0) {
-		error2 = _mutex_cv_lock(mp, recurse);
+		error2 = _mutex_cv_lock(mp, recurse, true);
 	} else if (error == EINTR || error == ETIMEDOUT) {
-		error2 = _mutex_cv_lock(mp, recurse);
+		error2 = _mutex_cv_lock(mp, recurse, true);
+		/*
+		 * Do not act on cancellation after EOWNERDEAD here.  The
+		 * cancellation cleanup handler would use the protected
+		 * state and unlock the mutex without making the state
+		 * consistent, leaving the state unrecoverable.
+		 */
 		if (error2 == 0 && cancel)
 			_thr_testcancel(curthread);
+
 		if (error == EINTR)
 			error = 0;
 	} else {
 		/* We know that it didn't unlock the mutex. */
-		error2 = _mutex_cv_attach(mp, recurse);
-		if (error2 == 0 && cancel)
+		_mutex_cv_attach(mp, recurse);
+		if (cancel)
 			_thr_testcancel(curthread);
+		error2 = 0;
 	}
+	if (robust)
+		_mutex_leave_robust(curthread, mp);
 	return (error2 != 0 ? error2 : error);
 }
 
 /*
  * Thread waits in userland queue whenever possible, when thread
  * is signaled or broadcasted, it is removed from the queue, and
  * is saved in curthread's defer_waiters[] buffer, but won't be
  * woken up until mutex is unlocked.
  */
 
 static int
 cond_wait_user(struct pthread_cond *cvp, struct pthread_mutex *mp,
-	const struct timespec *abstime, int cancel)
+    const struct timespec *abstime, int cancel)
 {
-	struct pthread	*curthread = _get_curthread();
+	struct pthread *curthread;
 	struct sleepqueue *sq;
-	int	recurse;
-	int	error;
-	int	defered;
+	int deferred, error, error2, recurse;
 
+	curthread = _get_curthread();
 	if (curthread->wchan != NULL)
 		PANIC("thread was already on queue.");
 
 	if (cancel)
 		_thr_testcancel(curthread);
 
 	_sleepq_lock(cvp);
 	/*
 	 * set __has_user_waiters before unlocking mutex, this allows
 	 * us to check it without locking in pthread_cond_signal().
 	 */
 	cvp->__has_user_waiters = 1; 
-	defered = 0;
-	(void)_mutex_cv_unlock(mp, &recurse, &defered);
+	deferred = 0;
+	(void)_mutex_cv_unlock(mp, &recurse, &deferred);
 	curthread->mutex_obj = mp;
 	_sleepq_add(cvp, curthread);
 	for(;;) {
 		_thr_clear_wake(curthread);
 		_sleepq_unlock(cvp);
-		if (defered) {
-			defered = 0;
+		if (deferred) {
+			deferred = 0;
 			if ((mp->m_lock.m_owner & UMUTEX_CONTESTED) == 0)
-				(void)_umtx_op_err(&mp->m_lock, UMTX_OP_MUTEX_WAKE2,
-					 mp->m_lock.m_flags, 0, 0);
+				(void)_umtx_op_err(&mp->m_lock,
+				    UMTX_OP_MUTEX_WAKE2, mp->m_lock.m_flags,
+				    0, 0);
 		}
 		if (curthread->nwaiter_defer > 0) {
 			_thr_wake_all(curthread->defer_waiters,
-				curthread->nwaiter_defer);
+			    curthread->nwaiter_defer);
 			curthread->nwaiter_defer = 0;
 		}
 
-		if (cancel) {
+		if (cancel)
 			_thr_cancel_enter2(curthread, 0);
-			error = _thr_sleep(curthread, cvp->__clock_id, abstime);
+		error = _thr_sleep(curthread, cvp->__clock_id, abstime);
+		if (cancel)
 			_thr_cancel_leave(curthread, 0);
-		} else {
-			error = _thr_sleep(curthread, cvp->__clock_id, abstime);
-		}
 
 		_sleepq_lock(cvp);
 		if (curthread->wchan == NULL) {
 			error = 0;
 			break;
 		} else if (cancel && SHOULD_CANCEL(curthread)) {
 			sq = _sleepq_lookup(cvp);
-			cvp->__has_user_waiters = 
-				_sleepq_remove(sq, curthread);
+			cvp->__has_user_waiters = _sleepq_remove(sq, curthread);
 			_sleepq_unlock(cvp);
 			curthread->mutex_obj = NULL;
-			_mutex_cv_lock(mp, recurse);
+			error2 = _mutex_cv_lock(mp, recurse, false);
 			if (!THR_IN_CRITICAL(curthread))
 				_pthread_exit(PTHREAD_CANCELED);
 			else /* this should not happen */
-				return (0);
+				return (error2);
 		} else if (error == ETIMEDOUT) {
 			sq = _sleepq_lookup(cvp);
 			cvp->__has_user_waiters =
-				_sleepq_remove(sq, curthread);
+			    _sleepq_remove(sq, curthread);
 			break;
 		}
 	}
 	_sleepq_unlock(cvp);
 	curthread->mutex_obj = NULL;
-	_mutex_cv_lock(mp, recurse);
+	error2 = _mutex_cv_lock(mp, recurse, false);
+	if (error == 0)
+		error = error2;
 	return (error);
 }
 
 static int
 cond_wait_common(pthread_cond_t *cond, pthread_mutex_t *mutex,
 	const struct timespec *abstime, int cancel)
 {
 	struct pthread	*curthread = _get_curthread();
 	struct pthread_cond *cvp;
 	struct pthread_mutex *mp;
 	int	error;
 
 	CHECK_AND_INIT_COND
 
 	if (*mutex == THR_PSHARED_PTR) {
 		mp = __thr_pshared_offpage(mutex, 0);
 		if (mp == NULL)
 			return (EINVAL);
 	} else {
 		mp = *mutex;
 	}
 
 	if ((error = _mutex_owned(curthread, mp)) != 0)
 		return (error);
 
 	if (curthread->attr.sched_policy != SCHED_OTHER ||
-	    (mp->m_lock.m_flags & (UMUTEX_PRIO_PROTECT|UMUTEX_PRIO_INHERIT|
-		USYNC_PROCESS_SHARED)) != 0 ||
+	    (mp->m_lock.m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT |
+	    USYNC_PROCESS_SHARED)) != 0 ||
 	    (cvp->__flags & USYNC_PROCESS_SHARED) != 0)
-		return cond_wait_kernel(cvp, mp, abstime, cancel);
+		return (cond_wait_kernel(cvp, mp, abstime, cancel));
 	else
-		return cond_wait_user(cvp, mp, abstime, cancel);
+		return (cond_wait_user(cvp, mp, abstime, cancel));
 }
 
 int
 _pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
 {
 
 	return (cond_wait_common(cond, mutex, NULL, 0));
 }
 
 int
 __pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
 {
 
 	return (cond_wait_common(cond, mutex, NULL, 1));
 }
 
 int
 _pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
 		       const struct timespec * abstime)
 {
 
 	if (abstime == NULL || abstime->tv_sec < 0 || abstime->tv_nsec < 0 ||
 	    abstime->tv_nsec >= 1000000000)
 		return (EINVAL);
 
 	return (cond_wait_common(cond, mutex, abstime, 0));
 }
 
 int
 __pthread_cond_timedwait(pthread_cond_t *cond, pthread_mutex_t *mutex,
 		       const struct timespec *abstime)
 {
 
 	if (abstime == NULL || abstime->tv_sec < 0 || abstime->tv_nsec < 0 ||
 	    abstime->tv_nsec >= 1000000000)
 		return (EINVAL);
 
 	return (cond_wait_common(cond, mutex, abstime, 1));
 }
 
 static int
 cond_signal_common(pthread_cond_t *cond)
 {
 	struct pthread	*curthread = _get_curthread();
 	struct pthread *td;
 	struct pthread_cond *cvp;
 	struct pthread_mutex *mp;
 	struct sleepqueue *sq;
 	int	*waddr;
 	int	pshared;
 
 	/*
 	 * If the condition variable is statically initialized, perform dynamic
 	 * initialization.
 	 */
 	CHECK_AND_INIT_COND
 
 	pshared = CV_PSHARED(cvp);
 
 	_thr_ucond_signal((struct ucond *)&cvp->__has_kern_waiters);
 
 	if (pshared || cvp->__has_user_waiters == 0)
 		return (0);
 
 	curthread = _get_curthread();
 	waddr = NULL;
 	_sleepq_lock(cvp);
 	sq = _sleepq_lookup(cvp);
 	if (sq == NULL) {
 		_sleepq_unlock(cvp);
 		return (0);
 	}
 
 	td = _sleepq_first(sq);
 	mp = td->mutex_obj;
 	cvp->__has_user_waiters = _sleepq_remove(sq, td);
-	if (mp->m_owner == TID(curthread)) {
+	if (PMUTEX_OWNER_ID(mp) == TID(curthread)) {
 		if (curthread->nwaiter_defer >= MAX_DEFER_WAITERS) {
 			_thr_wake_all(curthread->defer_waiters,
-					curthread->nwaiter_defer);
+			    curthread->nwaiter_defer);
 			curthread->nwaiter_defer = 0;
 		}
 		curthread->defer_waiters[curthread->nwaiter_defer++] =
-			&td->wake_addr->value;
-		mp->m_flags |= PMUTEX_FLAG_DEFERED;
+		    &td->wake_addr->value;
+		mp->m_flags |= PMUTEX_FLAG_DEFERRED;
 	} else {
 		waddr = &td->wake_addr->value;
 	}
 	_sleepq_unlock(cvp);
 	if (waddr != NULL)
 		_thr_set_wake(waddr);
 	return (0);
 }
 
 struct broadcast_arg {
 	struct pthread *curthread;
 	unsigned int *waddrs[MAX_DEFER_WAITERS];
 	int count;
 };
 
 static void
 drop_cb(struct pthread *td, void *arg)
 {
 	struct broadcast_arg *ba = arg;
 	struct pthread_mutex *mp;
 	struct pthread *curthread = ba->curthread;
 
 	mp = td->mutex_obj;
-	if (mp->m_owner == TID(curthread)) {
+	if (PMUTEX_OWNER_ID(mp) == TID(curthread)) {
 		if (curthread->nwaiter_defer >= MAX_DEFER_WAITERS) {
 			_thr_wake_all(curthread->defer_waiters,
-				curthread->nwaiter_defer);
+			    curthread->nwaiter_defer);
 			curthread->nwaiter_defer = 0;
 		}
 		curthread->defer_waiters[curthread->nwaiter_defer++] =
-			&td->wake_addr->value;
-		mp->m_flags |= PMUTEX_FLAG_DEFERED;
+		    &td->wake_addr->value;
+		mp->m_flags |= PMUTEX_FLAG_DEFERRED;
 	} else {
 		if (ba->count >= MAX_DEFER_WAITERS) {
 			_thr_wake_all(ba->waddrs, ba->count);
 			ba->count = 0;
 		}
 		ba->waddrs[ba->count++] = &td->wake_addr->value;
 	}
 }
 
 static int
 cond_broadcast_common(pthread_cond_t *cond)
 {
 	int    pshared;
 	struct pthread_cond *cvp;
 	struct sleepqueue *sq;
 	struct broadcast_arg ba;
 
 	/*
 	 * If the condition variable is statically initialized, perform dynamic
 	 * initialization.
 	 */
 	CHECK_AND_INIT_COND
 
 	pshared = CV_PSHARED(cvp);
 
 	_thr_ucond_broadcast((struct ucond *)&cvp->__has_kern_waiters);
 
 	if (pshared || cvp->__has_user_waiters == 0)
 		return (0);
 
 	ba.curthread = _get_curthread();
 	ba.count = 0;
 	
 	_sleepq_lock(cvp);
 	sq = _sleepq_lookup(cvp);
 	if (sq == NULL) {
 		_sleepq_unlock(cvp);
 		return (0);
 	}
 	_sleepq_drop(sq, drop_cb, &ba);
 	cvp->__has_user_waiters = 0;
 	_sleepq_unlock(cvp);
 	if (ba.count > 0)
 		_thr_wake_all(ba.waddrs, ba.count);
 	return (0);
 }
 
 int
 _pthread_cond_signal(pthread_cond_t * cond)
 {
 
 	return (cond_signal_common(cond));
 }
 
 int
 _pthread_cond_broadcast(pthread_cond_t * cond)
 {
 
 	return (cond_broadcast_common(cond));
 }
Index: head/lib/libthr/thread/thr_init.c
===================================================================
--- head/lib/libthr/thread/thr_init.c	(revision 300042)
+++ head/lib/libthr/thread/thr_init.c	(revision 300043)
@@ -1,491 +1,496 @@
 /*
  * Copyright (c) 2003 Daniel M. Eischen <deischen@freebsd.org>
  * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by John Birrell.
  * 4. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "namespace.h"
 #include <sys/types.h>
 #include <sys/signalvar.h>
 #include <sys/ioctl.h>
 #include <sys/link_elf.h>
 #include <sys/resource.h>
 #include <sys/sysctl.h>
 #include <sys/ttycom.h>
 #include <sys/mman.h>
 #include <sys/rtprio.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <paths.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include <signal.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include <unistd.h>
 #include "un-namespace.h"
 
 #include "libc_private.h"
 #include "thr_private.h"
 
 char		*_usrstack;
 struct pthread	*_thr_initial;
 int		_libthr_debug;
 int		_thread_event_mask;
 struct pthread	*_thread_last_event;
 pthreadlist	_thread_list = TAILQ_HEAD_INITIALIZER(_thread_list);
 pthreadlist 	_thread_gc_list = TAILQ_HEAD_INITIALIZER(_thread_gc_list);
 int		_thread_active_threads = 1;
 atfork_head	_thr_atfork_list = TAILQ_HEAD_INITIALIZER(_thr_atfork_list);
 struct urwlock	_thr_atfork_lock = DEFAULT_URWLOCK;
 
 struct pthread_prio	_thr_priorities[3] = {
 	{RTP_PRIO_MIN,  RTP_PRIO_MAX, 0}, /* FIFO */
 	{0, 0, 63}, /* OTHER */
 	{RTP_PRIO_MIN, RTP_PRIO_MAX, 0}  /* RR */
 };
 
 struct pthread_attr _pthread_attr_default = {
 	.sched_policy = SCHED_OTHER,
 	.sched_inherit = PTHREAD_INHERIT_SCHED,
 	.prio = 0,
 	.suspend = THR_CREATE_RUNNING,
 	.flags = PTHREAD_SCOPE_SYSTEM,
 	.stackaddr_attr = NULL,
 	.stacksize_attr = THR_STACK_DEFAULT,
 	.guardsize_attr = 0,
 	.cpusetsize = 0,
 	.cpuset = NULL
 };
 
 struct pthread_mutex_attr _pthread_mutexattr_default = {
 	.m_type = PTHREAD_MUTEX_DEFAULT,
 	.m_protocol = PTHREAD_PRIO_NONE,
 	.m_ceiling = 0,
 	.m_pshared = PTHREAD_PROCESS_PRIVATE,
+	.m_robust = PTHREAD_MUTEX_STALLED,
 };
 
 struct pthread_mutex_attr _pthread_mutexattr_adaptive_default = {
 	.m_type = PTHREAD_MUTEX_ADAPTIVE_NP,
 	.m_protocol = PTHREAD_PRIO_NONE,
 	.m_ceiling = 0,
 	.m_pshared = PTHREAD_PROCESS_PRIVATE,
+	.m_robust = PTHREAD_MUTEX_STALLED,
 };
 
 /* Default condition variable attributes: */
 struct pthread_cond_attr _pthread_condattr_default = {
 	.c_pshared = PTHREAD_PROCESS_PRIVATE,
 	.c_clockid = CLOCK_REALTIME
 };
 
 int		_thr_is_smp = 0;
 size_t		_thr_guard_default;
 size_t		_thr_stack_default = THR_STACK_DEFAULT;
 size_t		_thr_stack_initial = THR_STACK_INITIAL;
 int		_thr_page_size;
 int		_thr_spinloops;
 int		_thr_yieldloops;
 int		_thr_queuefifo = 4;
 int		_gc_count;
 struct umutex	_mutex_static_lock = DEFAULT_UMUTEX;
 struct umutex	_cond_static_lock = DEFAULT_UMUTEX;
 struct umutex	_rwlock_static_lock = DEFAULT_UMUTEX;
 struct umutex	_keytable_lock = DEFAULT_UMUTEX;
 struct urwlock	_thr_list_lock = DEFAULT_URWLOCK;
 struct umutex	_thr_event_lock = DEFAULT_UMUTEX;
 struct umutex	_suspend_all_lock = DEFAULT_UMUTEX;
 struct pthread	*_single_thread;
 int		_suspend_all_cycle;
 int		_suspend_all_waiters;
 
 int	__pthread_cond_wait(pthread_cond_t *, pthread_mutex_t *);
 int	__pthread_mutex_lock(pthread_mutex_t *);
 int	__pthread_mutex_trylock(pthread_mutex_t *);
 void	_thread_init_hack(void) __attribute__ ((constructor));
 
 static void init_private(void);
 static void init_main_thread(struct pthread *thread);
 
 /*
  * All weak references used within libc should be in this table.
  * This is so that static libraries will work.
  */
 
 STATIC_LIB_REQUIRE(_fork);
 STATIC_LIB_REQUIRE(_pthread_getspecific);
 STATIC_LIB_REQUIRE(_pthread_key_create);
 STATIC_LIB_REQUIRE(_pthread_key_delete);
 STATIC_LIB_REQUIRE(_pthread_mutex_destroy);
 STATIC_LIB_REQUIRE(_pthread_mutex_init);
 STATIC_LIB_REQUIRE(_pthread_mutex_lock);
 STATIC_LIB_REQUIRE(_pthread_mutex_trylock);
 STATIC_LIB_REQUIRE(_pthread_mutex_unlock);
 STATIC_LIB_REQUIRE(_pthread_mutexattr_init);
 STATIC_LIB_REQUIRE(_pthread_mutexattr_destroy);
 STATIC_LIB_REQUIRE(_pthread_mutexattr_settype);
 STATIC_LIB_REQUIRE(_pthread_once);
 STATIC_LIB_REQUIRE(_pthread_setspecific);
 STATIC_LIB_REQUIRE(_raise);
 STATIC_LIB_REQUIRE(_sem_destroy);
 STATIC_LIB_REQUIRE(_sem_getvalue);
 STATIC_LIB_REQUIRE(_sem_init);
 STATIC_LIB_REQUIRE(_sem_post);
 STATIC_LIB_REQUIRE(_sem_timedwait);
 STATIC_LIB_REQUIRE(_sem_trywait);
 STATIC_LIB_REQUIRE(_sem_wait);
 STATIC_LIB_REQUIRE(_sigaction);
 STATIC_LIB_REQUIRE(_sigprocmask);
 STATIC_LIB_REQUIRE(_sigsuspend);
 STATIC_LIB_REQUIRE(_sigtimedwait);
 STATIC_LIB_REQUIRE(_sigwait);
 STATIC_LIB_REQUIRE(_sigwaitinfo);
 STATIC_LIB_REQUIRE(_spinlock);
 STATIC_LIB_REQUIRE(_spinlock_debug);
 STATIC_LIB_REQUIRE(_spinunlock);
 STATIC_LIB_REQUIRE(_thread_init_hack);
 
 /*
  * These are needed when linking statically.  All references within
  * libgcc (and in the future libc) to these routines are weak, but
  * if they are not (strongly) referenced by the application or other
  * libraries, then the actual functions will not be loaded.
  */
 STATIC_LIB_REQUIRE(_pthread_once);
 STATIC_LIB_REQUIRE(_pthread_key_create);
 STATIC_LIB_REQUIRE(_pthread_key_delete);
 STATIC_LIB_REQUIRE(_pthread_getspecific);
 STATIC_LIB_REQUIRE(_pthread_setspecific);
 STATIC_LIB_REQUIRE(_pthread_mutex_init);
 STATIC_LIB_REQUIRE(_pthread_mutex_destroy);
 STATIC_LIB_REQUIRE(_pthread_mutex_lock);
 STATIC_LIB_REQUIRE(_pthread_mutex_trylock);
 STATIC_LIB_REQUIRE(_pthread_mutex_unlock);
 STATIC_LIB_REQUIRE(_pthread_create);
 
 /* Pull in all symbols required by libthread_db */
 STATIC_LIB_REQUIRE(_thread_state_running);
 
 #define	DUAL_ENTRY(entry)	\
 	(pthread_func_t)entry, (pthread_func_t)entry
 
 static pthread_func_t jmp_table[][2] = {
 	{DUAL_ENTRY(_pthread_atfork)},	/* PJT_ATFORK */
 	{DUAL_ENTRY(_pthread_attr_destroy)},	/* PJT_ATTR_DESTROY */
 	{DUAL_ENTRY(_pthread_attr_getdetachstate)},	/* PJT_ATTR_GETDETACHSTATE */
 	{DUAL_ENTRY(_pthread_attr_getguardsize)},	/* PJT_ATTR_GETGUARDSIZE */
 	{DUAL_ENTRY(_pthread_attr_getinheritsched)},	/* PJT_ATTR_GETINHERITSCHED */
 	{DUAL_ENTRY(_pthread_attr_getschedparam)},	/* PJT_ATTR_GETSCHEDPARAM */
 	{DUAL_ENTRY(_pthread_attr_getschedpolicy)},	/* PJT_ATTR_GETSCHEDPOLICY */
 	{DUAL_ENTRY(_pthread_attr_getscope)},	/* PJT_ATTR_GETSCOPE */
 	{DUAL_ENTRY(_pthread_attr_getstackaddr)},	/* PJT_ATTR_GETSTACKADDR */
 	{DUAL_ENTRY(_pthread_attr_getstacksize)},	/* PJT_ATTR_GETSTACKSIZE */
 	{DUAL_ENTRY(_pthread_attr_init)},	/* PJT_ATTR_INIT */
 	{DUAL_ENTRY(_pthread_attr_setdetachstate)},	/* PJT_ATTR_SETDETACHSTATE */
 	{DUAL_ENTRY(_pthread_attr_setguardsize)},	/* PJT_ATTR_SETGUARDSIZE */
 	{DUAL_ENTRY(_pthread_attr_setinheritsched)},	/* PJT_ATTR_SETINHERITSCHED */
 	{DUAL_ENTRY(_pthread_attr_setschedparam)},	/* PJT_ATTR_SETSCHEDPARAM */
 	{DUAL_ENTRY(_pthread_attr_setschedpolicy)},	/* PJT_ATTR_SETSCHEDPOLICY */
 	{DUAL_ENTRY(_pthread_attr_setscope)},	/* PJT_ATTR_SETSCOPE */
 	{DUAL_ENTRY(_pthread_attr_setstackaddr)},	/* PJT_ATTR_SETSTACKADDR */
 	{DUAL_ENTRY(_pthread_attr_setstacksize)},	/* PJT_ATTR_SETSTACKSIZE */
 	{DUAL_ENTRY(_pthread_cancel)},	/* PJT_CANCEL */
 	{DUAL_ENTRY(_pthread_cleanup_pop)},	/* PJT_CLEANUP_POP */
 	{DUAL_ENTRY(_pthread_cleanup_push)},	/* PJT_CLEANUP_PUSH */
 	{DUAL_ENTRY(_pthread_cond_broadcast)},	/* PJT_COND_BROADCAST */
 	{DUAL_ENTRY(_pthread_cond_destroy)},	/* PJT_COND_DESTROY */
 	{DUAL_ENTRY(_pthread_cond_init)},	/* PJT_COND_INIT */
 	{DUAL_ENTRY(_pthread_cond_signal)},	/* PJT_COND_SIGNAL */
 	{DUAL_ENTRY(_pthread_cond_timedwait)},	/* PJT_COND_TIMEDWAIT */
 	{(pthread_func_t)__pthread_cond_wait,
 	 (pthread_func_t)_pthread_cond_wait},	/* PJT_COND_WAIT */
 	{DUAL_ENTRY(_pthread_detach)},	/* PJT_DETACH */
 	{DUAL_ENTRY(_pthread_equal)},	/* PJT_EQUAL */
 	{DUAL_ENTRY(_pthread_exit)},	/* PJT_EXIT */
 	{DUAL_ENTRY(_pthread_getspecific)},	/* PJT_GETSPECIFIC */
 	{DUAL_ENTRY(_pthread_join)},	/* PJT_JOIN */
 	{DUAL_ENTRY(_pthread_key_create)},	/* PJT_KEY_CREATE */
 	{DUAL_ENTRY(_pthread_key_delete)},	/* PJT_KEY_DELETE*/
 	{DUAL_ENTRY(_pthread_kill)},	/* PJT_KILL */
 	{DUAL_ENTRY(_pthread_main_np)},		/* PJT_MAIN_NP */
 	{DUAL_ENTRY(_pthread_mutexattr_destroy)}, /* PJT_MUTEXATTR_DESTROY */
 	{DUAL_ENTRY(_pthread_mutexattr_init)},	/* PJT_MUTEXATTR_INIT */
 	{DUAL_ENTRY(_pthread_mutexattr_settype)}, /* PJT_MUTEXATTR_SETTYPE */
 	{DUAL_ENTRY(_pthread_mutex_destroy)},	/* PJT_MUTEX_DESTROY */
 	{DUAL_ENTRY(_pthread_mutex_init)},	/* PJT_MUTEX_INIT */
 	{(pthread_func_t)__pthread_mutex_lock,
 	 (pthread_func_t)_pthread_mutex_lock},	/* PJT_MUTEX_LOCK */
 	{(pthread_func_t)__pthread_mutex_trylock,
 	 (pthread_func_t)_pthread_mutex_trylock},/* PJT_MUTEX_TRYLOCK */
 	{DUAL_ENTRY(_pthread_mutex_unlock)},	/* PJT_MUTEX_UNLOCK */
 	{DUAL_ENTRY(_pthread_once)},		/* PJT_ONCE */
 	{DUAL_ENTRY(_pthread_rwlock_destroy)},	/* PJT_RWLOCK_DESTROY */
 	{DUAL_ENTRY(_pthread_rwlock_init)},	/* PJT_RWLOCK_INIT */
 	{DUAL_ENTRY(_pthread_rwlock_rdlock)},	/* PJT_RWLOCK_RDLOCK */
 	{DUAL_ENTRY(_pthread_rwlock_tryrdlock)},/* PJT_RWLOCK_TRYRDLOCK */
 	{DUAL_ENTRY(_pthread_rwlock_trywrlock)},/* PJT_RWLOCK_TRYWRLOCK */
 	{DUAL_ENTRY(_pthread_rwlock_unlock)},	/* PJT_RWLOCK_UNLOCK */
 	{DUAL_ENTRY(_pthread_rwlock_wrlock)},	/* PJT_RWLOCK_WRLOCK */
 	{DUAL_ENTRY(_pthread_self)},		/* PJT_SELF */
 	{DUAL_ENTRY(_pthread_setcancelstate)},	/* PJT_SETCANCELSTATE */
 	{DUAL_ENTRY(_pthread_setcanceltype)},	/* PJT_SETCANCELTYPE */
 	{DUAL_ENTRY(_pthread_setspecific)},	/* PJT_SETSPECIFIC */
 	{DUAL_ENTRY(_pthread_sigmask)},		/* PJT_SIGMASK */
 	{DUAL_ENTRY(_pthread_testcancel)},	/* PJT_TESTCANCEL */
 	{DUAL_ENTRY(__pthread_cleanup_pop_imp)},/* PJT_CLEANUP_POP_IMP */
 	{DUAL_ENTRY(__pthread_cleanup_push_imp)},/* PJT_CLEANUP_PUSH_IMP */
 	{DUAL_ENTRY(_pthread_cancel_enter)},	/* PJT_CANCEL_ENTER */
-	{DUAL_ENTRY(_pthread_cancel_leave)}		/* PJT_CANCEL_LEAVE */
+	{DUAL_ENTRY(_pthread_cancel_leave)},	/* PJT_CANCEL_LEAVE */
+	{DUAL_ENTRY(_pthread_mutex_consistent)},/* PJT_MUTEX_CONSISTENT */
+	{DUAL_ENTRY(_pthread_mutexattr_getrobust)},/* PJT_MUTEXATTR_GETROBUST */
+	{DUAL_ENTRY(_pthread_mutexattr_setrobust)},/* PJT_MUTEXATTR_SETROBUST */
 };
 
 static int init_once = 0;
 
 /*
  * For the shared version of the threads library, the above is sufficient.
  * But for the archive version of the library, we need a little bit more.
  * Namely, we must arrange for this particular module to be pulled in from
  * the archive library at link time.  To accomplish that, we define and
  * initialize a variable, "_thread_autoinit_dummy_decl".  This variable is
  * referenced (as an extern) from libc/stdlib/exit.c. This will always
  * create a need for this module, ensuring that it is present in the
  * executable.
  */
 extern int _thread_autoinit_dummy_decl;
 int _thread_autoinit_dummy_decl = 0;
 
 void
 _thread_init_hack(void)
 {
 
 	_libpthread_init(NULL);
 }
 
 
 /*
  * Threaded process initialization.
  *
  * This is only called under two conditions:
  *
  *   1) Some thread routines have detected that the library hasn't yet
  *      been initialized (_thr_initial == NULL && curthread == NULL), or
  *
  *   2) An explicit call to reinitialize after a fork (indicated
  *      by curthread != NULL)
  */
 void
 _libpthread_init(struct pthread *curthread)
 {
 	int first, dlopened;
 
 	/* Check if this function has already been called: */
-	if ((_thr_initial != NULL) && (curthread == NULL))
+	if (_thr_initial != NULL && curthread == NULL)
 		/* Only initialize the threaded application once. */
 		return;
 
 	/*
 	 * Check the size of the jump table to make sure it is preset
 	 * with the correct number of entries.
 	 */
-	if (sizeof(jmp_table) != (sizeof(pthread_func_t) * PJT_MAX * 2))
+	if (sizeof(jmp_table) != sizeof(pthread_func_t) * PJT_MAX * 2)
 		PANIC("Thread jump table not properly initialized");
 	memcpy(__thr_jtable, jmp_table, sizeof(jmp_table));
 	__thr_interpose_libc();
 
 	/* Initialize pthread private data. */
 	init_private();
 
 	/* Set the initial thread. */
 	if (curthread == NULL) {
 		first = 1;
 		/* Create and initialize the initial thread. */
 		curthread = _thr_alloc(NULL);
 		if (curthread == NULL)
 			PANIC("Can't allocate initial thread");
 		init_main_thread(curthread);
 	} else {
 		first = 0;
 	}
 		
 	/*
 	 * Add the thread to the thread list queue.
 	 */
 	THR_LIST_ADD(curthread);
 	_thread_active_threads = 1;
 
 	/* Setup the thread specific data */
 	_tcb_set(curthread->tcb);
 
 	if (first) {
 		_thr_initial = curthread;
 		dlopened = _rtld_is_dlopened(&_thread_autoinit_dummy_decl) != 0;
 		_thr_signal_init(dlopened);
 		if (_thread_event_mask & TD_CREATE)
 			_thr_report_creation(curthread, curthread);
 		/*
 		 * Always use our rtld lock implementation.
 		 * It is faster because it postpones signal handlers
 		 * instead of calling sigprocmask(2).
 		 */
 		_thr_rtld_init();
 	}
 }
 
 /*
  * This function and pthread_create() do a lot of the same things.
  * It'd be nice to consolidate the common stuff in one place.
  */
 static void
 init_main_thread(struct pthread *thread)
 {
 	struct sched_param sched_param;
 	int i;
 
 	/* Setup the thread attributes. */
 	thr_self(&thread->tid);
 	thread->attr = _pthread_attr_default;
 	/*
 	 * Set up the thread stack.
 	 *
 	 * Create a red zone below the main stack.  All other stacks
 	 * are constrained to a maximum size by the parameters
 	 * passed to mmap(), but this stack is only limited by
 	 * resource limits, so this stack needs an explicitly mapped
 	 * red zone to protect the thread stack that is just beyond.
 	 */
 	if (mmap(_usrstack - _thr_stack_initial -
 	    _thr_guard_default, _thr_guard_default, 0, MAP_ANON,
 	    -1, 0) == MAP_FAILED)
 		PANIC("Cannot allocate red zone for initial thread");
 
 	/*
 	 * Mark the stack as an application supplied stack so that it
 	 * isn't deallocated.
 	 *
 	 * XXX - I'm not sure it would hurt anything to deallocate
 	 *       the main thread stack because deallocation doesn't
 	 *       actually free() it; it just puts it in the free
 	 *       stack queue for later reuse.
 	 */
 	thread->attr.stackaddr_attr = _usrstack - _thr_stack_initial;
 	thread->attr.stacksize_attr = _thr_stack_initial;
 	thread->attr.guardsize_attr = _thr_guard_default;
 	thread->attr.flags |= THR_STACK_USER;
 
 	/*
 	 * Write a magic value to the thread structure
 	 * to help identify valid ones:
 	 */
 	thread->magic = THR_MAGIC;
 
 	thread->cancel_enable = 1;
 	thread->cancel_async = 0;
 
 	/* Initialize the mutex queues */
 	for (i = 0; i < TMQ_NITEMS; i++)
 		TAILQ_INIT(&thread->mq[i]);
 
 	thread->state = PS_RUNNING;
 
 	_thr_getscheduler(thread->tid, &thread->attr.sched_policy,
 		 &sched_param);
 	thread->attr.prio = sched_param.sched_priority;
 
 #ifdef _PTHREAD_FORCED_UNWIND
 	thread->unwind_stackend = _usrstack;
 #endif
 
 	/* Others cleared to zero by thr_alloc() */
 }
 
 static void
 init_private(void)
 {
 	struct rlimit rlim;
 	size_t len;
 	int mib[2];
 	char *env, *env_bigstack, *env_splitstack;
 
 	_thr_umutex_init(&_mutex_static_lock);
 	_thr_umutex_init(&_cond_static_lock);
 	_thr_umutex_init(&_rwlock_static_lock);
 	_thr_umutex_init(&_keytable_lock);
 	_thr_urwlock_init(&_thr_atfork_lock);
 	_thr_umutex_init(&_thr_event_lock);
 	_thr_umutex_init(&_suspend_all_lock);
 	_thr_once_init();
 	_thr_spinlock_init();
 	_thr_list_init();
 	_thr_wake_addr_init();
 	_sleepq_init();
 	_single_thread = NULL;
 	_suspend_all_waiters = 0;
 
 	/*
 	 * Avoid reinitializing some things if they don't need to be,
 	 * e.g. after a fork().
 	 */
 	if (init_once == 0) {
 		__thr_pshared_init();
 		/* Find the stack top */
 		mib[0] = CTL_KERN;
 		mib[1] = KERN_USRSTACK;
 		len = sizeof (_usrstack);
 		if (sysctl(mib, 2, &_usrstack, &len, NULL, 0) == -1)
 			PANIC("Cannot get kern.usrstack from sysctl");
 		env_bigstack = getenv("LIBPTHREAD_BIGSTACK_MAIN");
 		env_splitstack = getenv("LIBPTHREAD_SPLITSTACK_MAIN");
 		if (env_bigstack != NULL || env_splitstack == NULL) {
 			if (getrlimit(RLIMIT_STACK, &rlim) == -1)
 				PANIC("Cannot get stack rlimit");
 			_thr_stack_initial = rlim.rlim_cur;
 		}
 		len = sizeof(_thr_is_smp);
 		sysctlbyname("kern.smp.cpus", &_thr_is_smp, &len, NULL, 0);
 		_thr_is_smp = (_thr_is_smp > 1);
 		_thr_page_size = getpagesize();
 		_thr_guard_default = _thr_page_size;
 		_pthread_attr_default.guardsize_attr = _thr_guard_default;
 		_pthread_attr_default.stacksize_attr = _thr_stack_default;
 		env = getenv("LIBPTHREAD_SPINLOOPS");
 		if (env)
 			_thr_spinloops = atoi(env);
 		env = getenv("LIBPTHREAD_YIELDLOOPS");
 		if (env)
 			_thr_yieldloops = atoi(env);
 		env = getenv("LIBPTHREAD_QUEUE_FIFO");
 		if (env)
 			_thr_queuefifo = atoi(env);
 		TAILQ_INIT(&_thr_atfork_list);
 	}
 	init_once = 1;
 }
Index: head/lib/libthr/thread/thr_mutex.c
===================================================================
--- head/lib/libthr/thread/thr_mutex.c	(revision 300042)
+++ head/lib/libthr/thread/thr_mutex.c	(revision 300043)
@@ -1,1026 +1,1200 @@
 /*
  * Copyright (c) 1995 John Birrell <jb@cimlogic.com.au>.
  * Copyright (c) 2006 David Xu <davidxu@freebsd.org>.
- * Copyright (c) 2015 The FreeBSD Foundation
+ * Copyright (c) 2015, 2016 The FreeBSD Foundation
  *
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by John Birrell.
  * 4. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <stdbool.h>
 #include "namespace.h"
 #include <stdlib.h>
 #include <errno.h>
 #include <string.h>
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include "un-namespace.h"
 
 #include "thr_private.h"
 
 _Static_assert(sizeof(struct pthread_mutex) <= PAGE_SIZE,
     "pthread_mutex is too large for off-page");
 
 /*
  * For adaptive mutexes, how many times to spin doing trylock2
  * before entering the kernel to block
  */
 #define MUTEX_ADAPTIVE_SPINS	2000
 
 /*
  * Prototypes
  */
+int	__pthread_mutex_consistent(pthread_mutex_t *mutex);
 int	__pthread_mutex_init(pthread_mutex_t *mutex,
 		const pthread_mutexattr_t *mutex_attr);
 int	__pthread_mutex_trylock(pthread_mutex_t *mutex);
 int	__pthread_mutex_lock(pthread_mutex_t *mutex);
 int	__pthread_mutex_timedlock(pthread_mutex_t *mutex,
 		const struct timespec *abstime);
 int	_pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
     		void *(calloc_cb)(size_t, size_t));
 int	_pthread_mutex_getspinloops_np(pthread_mutex_t *mutex, int *count);
 int	_pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count);
 int	__pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count);
 int	_pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count);
 int	_pthread_mutex_getyieldloops_np(pthread_mutex_t *mutex, int *count);
 int	__pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count);
 
 static int	mutex_self_trylock(pthread_mutex_t);
 static int	mutex_self_lock(pthread_mutex_t,
 				const struct timespec *abstime);
-static int	mutex_unlock_common(struct pthread_mutex *, int, int *);
+static int	mutex_unlock_common(struct pthread_mutex *, bool, int *);
 static int	mutex_lock_sleep(struct pthread *, pthread_mutex_t,
 				const struct timespec *);
+static void	mutex_init_robust(struct pthread *curthread);
+static int	mutex_qidx(struct pthread_mutex *m);
+static bool	is_robust_mutex(struct pthread_mutex *m);
+static bool	is_pshared_mutex(struct pthread_mutex *m);
 
 __weak_reference(__pthread_mutex_init, pthread_mutex_init);
 __strong_reference(__pthread_mutex_init, _pthread_mutex_init);
 __weak_reference(__pthread_mutex_lock, pthread_mutex_lock);
 __strong_reference(__pthread_mutex_lock, _pthread_mutex_lock);
 __weak_reference(__pthread_mutex_timedlock, pthread_mutex_timedlock);
 __strong_reference(__pthread_mutex_timedlock, _pthread_mutex_timedlock);
 __weak_reference(__pthread_mutex_trylock, pthread_mutex_trylock);
 __strong_reference(__pthread_mutex_trylock, _pthread_mutex_trylock);
+__weak_reference(_pthread_mutex_consistent, pthread_mutex_consistent);
+__strong_reference(_pthread_mutex_consistent, __pthread_mutex_consistent);
 
 /* Single underscore versions provided for libc internal usage: */
 /* No difference between libc and application usage of these: */
 __weak_reference(_pthread_mutex_destroy, pthread_mutex_destroy);
 __weak_reference(_pthread_mutex_unlock, pthread_mutex_unlock);
 
 __weak_reference(_pthread_mutex_getprioceiling, pthread_mutex_getprioceiling);
 __weak_reference(_pthread_mutex_setprioceiling, pthread_mutex_setprioceiling);
 
 __weak_reference(__pthread_mutex_setspinloops_np, pthread_mutex_setspinloops_np);
 __strong_reference(__pthread_mutex_setspinloops_np, _pthread_mutex_setspinloops_np);
 __weak_reference(_pthread_mutex_getspinloops_np, pthread_mutex_getspinloops_np);
 
 __weak_reference(__pthread_mutex_setyieldloops_np, pthread_mutex_setyieldloops_np);
 __strong_reference(__pthread_mutex_setyieldloops_np, _pthread_mutex_setyieldloops_np);
 __weak_reference(_pthread_mutex_getyieldloops_np, pthread_mutex_getyieldloops_np);
 __weak_reference(_pthread_mutex_isowned_np, pthread_mutex_isowned_np);
 
 static void
 mutex_init_link(struct pthread_mutex *m)
 {
 
 #if defined(_PTHREADS_INVARIANTS)
 	m->m_qe.tqe_prev = NULL;
 	m->m_qe.tqe_next = NULL;
 	m->m_pqe.tqe_prev = NULL;
 	m->m_pqe.tqe_next = NULL;
 #endif
 }
 
 static void
-mutex_assert_is_owned(struct pthread_mutex *m)
+mutex_assert_is_owned(struct pthread_mutex *m __unused)
 {
 
 #if defined(_PTHREADS_INVARIANTS)
 	if (__predict_false(m->m_qe.tqe_prev == NULL)) {
 		char msg[128];
 		snprintf(msg, sizeof(msg),
-		    "mutex %p own %#x %#x is not on list %p %p",
-		    m, m->m_lock.m_owner, m->m_owner, m->m_qe.tqe_prev,
-		    m->m_qe.tqe_next);
+		    "mutex %p own %#x is not on list %p %p",
+		    m, m->m_lock.m_owner, m->m_qe.tqe_prev, m->m_qe.tqe_next);
 		PANIC(msg);
 	}
 #endif
 }
 
 static void
-mutex_assert_not_owned(struct pthread_mutex *m)
+mutex_assert_not_owned(struct pthread *curthread __unused,
+    struct pthread_mutex *m __unused)
 {
 
 #if defined(_PTHREADS_INVARIANTS)
 	if (__predict_false(m->m_qe.tqe_prev != NULL ||
 	    m->m_qe.tqe_next != NULL)) {
 		char msg[128];
 		snprintf(msg, sizeof(msg),
-		    "mutex %p own %#x %#x is on list %p %p",
-		    m, m->m_lock.m_owner, m->m_owner, m->m_qe.tqe_prev,
-		    m->m_qe.tqe_next);
+		    "mutex %p own %#x is on list %p %p",
+		    m, m->m_lock.m_owner, m->m_qe.tqe_prev, m->m_qe.tqe_next);
 		PANIC(msg);
 	}
+	if (__predict_false(is_robust_mutex(m) &&
+	    (m->m_lock.m_rb_lnk != 0 || m->m_rb_prev != NULL ||
+	    (is_pshared_mutex(m) && curthread->robust_list ==
+	    (uintptr_t)&m->m_lock) ||
+	    (!is_pshared_mutex(m) && curthread->priv_robust_list ==
+	    (uintptr_t)&m->m_lock)))) {
+		char msg[128];
+		snprintf(msg, sizeof(msg),
+    "mutex %p own %#x is on robust linkage %p %p head %p phead %p",
+		    m, m->m_lock.m_owner, (void *)m->m_lock.m_rb_lnk,
+		    m->m_rb_prev, (void *)curthread->robust_list,
+		    (void *)curthread->priv_robust_list);
+		PANIC(msg);
+	}
 #endif
 }
 
-static int
+static bool
 is_pshared_mutex(struct pthread_mutex *m)
 {
 
 	return ((m->m_lock.m_flags & USYNC_PROCESS_SHARED) != 0);
 }
 
+static bool
+is_robust_mutex(struct pthread_mutex *m)
+{
+
+	return ((m->m_lock.m_flags & UMUTEX_ROBUST) != 0);
+}
+
+int
+_mutex_enter_robust(struct pthread *curthread, struct pthread_mutex *m)
+{
+
+#if defined(_PTHREADS_INVARIANTS)
+	if (__predict_false(curthread->inact_mtx != 0))
+		PANIC("inact_mtx enter");
+#endif
+	if (!is_robust_mutex(m))
+		return (0);
+
+	mutex_init_robust(curthread);
+	curthread->inact_mtx = (uintptr_t)&m->m_lock;
+	return (1);
+}
+
+void
+_mutex_leave_robust(struct pthread *curthread, struct pthread_mutex *m __unused)
+{
+
+#if defined(_PTHREADS_INVARIANTS)
+	if (__predict_false(curthread->inact_mtx != (uintptr_t)&m->m_lock))
+		PANIC("inact_mtx leave");
+#endif
+	curthread->inact_mtx = 0;
+}
+
 static int
 mutex_check_attr(const struct pthread_mutex_attr *attr)
 {
 
 	if (attr->m_type < PTHREAD_MUTEX_ERRORCHECK ||
 	    attr->m_type >= PTHREAD_MUTEX_TYPE_MAX)
 		return (EINVAL);
 	if (attr->m_protocol < PTHREAD_PRIO_NONE ||
 	    attr->m_protocol > PTHREAD_PRIO_PROTECT)
 		return (EINVAL);
 	return (0);
 }
 
 static void
+mutex_init_robust(struct pthread *curthread)
+{
+	struct umtx_robust_lists_params rb;
+
+	if (curthread == NULL)
+		curthread = _get_curthread();
+	if (curthread->robust_inited)
+		return;
+	rb.robust_list_offset = (uintptr_t)&curthread->robust_list;
+	rb.robust_priv_list_offset = (uintptr_t)&curthread->priv_robust_list;
+	rb.robust_inact_offset = (uintptr_t)&curthread->inact_mtx;
+	_umtx_op(NULL, UMTX_OP_ROBUST_LISTS, sizeof(rb), &rb, NULL);
+	curthread->robust_inited = 1;
+}
+
+static void
 mutex_init_body(struct pthread_mutex *pmutex,
     const struct pthread_mutex_attr *attr)
 {
 
 	pmutex->m_flags = attr->m_type;
-	pmutex->m_owner = 0;
 	pmutex->m_count = 0;
 	pmutex->m_spinloops = 0;
 	pmutex->m_yieldloops = 0;
 	mutex_init_link(pmutex);
 	switch (attr->m_protocol) {
 	case PTHREAD_PRIO_NONE:
 		pmutex->m_lock.m_owner = UMUTEX_UNOWNED;
 		pmutex->m_lock.m_flags = 0;
 		break;
 	case PTHREAD_PRIO_INHERIT:
 		pmutex->m_lock.m_owner = UMUTEX_UNOWNED;
 		pmutex->m_lock.m_flags = UMUTEX_PRIO_INHERIT;
 		break;
 	case PTHREAD_PRIO_PROTECT:
 		pmutex->m_lock.m_owner = UMUTEX_CONTESTED;
 		pmutex->m_lock.m_flags = UMUTEX_PRIO_PROTECT;
 		pmutex->m_lock.m_ceilings[0] = attr->m_ceiling;
 		break;
 	}
 	if (attr->m_pshared == PTHREAD_PROCESS_SHARED)
 		pmutex->m_lock.m_flags |= USYNC_PROCESS_SHARED;
-
+	if (attr->m_robust == PTHREAD_MUTEX_ROBUST) {
+		mutex_init_robust(NULL);
+		pmutex->m_lock.m_flags |= UMUTEX_ROBUST;
+	}
 	if (PMUTEX_TYPE(pmutex->m_flags) == PTHREAD_MUTEX_ADAPTIVE_NP) {
 		pmutex->m_spinloops =
 		    _thr_spinloops ? _thr_spinloops: MUTEX_ADAPTIVE_SPINS;
 		pmutex->m_yieldloops = _thr_yieldloops;
 	}
 }
 
 static int
 mutex_init(pthread_mutex_t *mutex,
     const struct pthread_mutex_attr *mutex_attr,
     void *(calloc_cb)(size_t, size_t))
 {
 	const struct pthread_mutex_attr *attr;
 	struct pthread_mutex *pmutex;
 	int error;
 
 	if (mutex_attr == NULL) {
 		attr = &_pthread_mutexattr_default;
 	} else {
 		attr = mutex_attr;
 		error = mutex_check_attr(attr);
 		if (error != 0)
 			return (error);
 	}
 	if ((pmutex = (pthread_mutex_t)
 		calloc_cb(1, sizeof(struct pthread_mutex))) == NULL)
 		return (ENOMEM);
 	mutex_init_body(pmutex, attr);
 	*mutex = pmutex;
 	return (0);
 }
 
 static int
 init_static(struct pthread *thread, pthread_mutex_t *mutex)
 {
 	int ret;
 
 	THR_LOCK_ACQUIRE(thread, &_mutex_static_lock);
 
 	if (*mutex == THR_MUTEX_INITIALIZER)
 		ret = mutex_init(mutex, &_pthread_mutexattr_default, calloc);
 	else if (*mutex == THR_ADAPTIVE_MUTEX_INITIALIZER)
 		ret = mutex_init(mutex, &_pthread_mutexattr_adaptive_default,
 		    calloc);
 	else
 		ret = 0;
 	THR_LOCK_RELEASE(thread, &_mutex_static_lock);
 
 	return (ret);
 }
 
 static void
 set_inherited_priority(struct pthread *curthread, struct pthread_mutex *m)
 {
 	struct pthread_mutex *m2;
 
-	m2 = TAILQ_LAST(&curthread->mq[TMQ_NORM_PP], mutex_queue);
+	m2 = TAILQ_LAST(&curthread->mq[mutex_qidx(m)], mutex_queue);
 	if (m2 != NULL)
 		m->m_lock.m_ceilings[1] = m2->m_lock.m_ceilings[0];
 	else
 		m->m_lock.m_ceilings[1] = -1;
 }
 
 static void
 shared_mutex_init(struct pthread_mutex *pmtx, const struct
     pthread_mutex_attr *mutex_attr)
 {
 	static const struct pthread_mutex_attr foobar_mutex_attr = {
 		.m_type = PTHREAD_MUTEX_DEFAULT,
 		.m_protocol = PTHREAD_PRIO_NONE,
 		.m_ceiling = 0,
-		.m_pshared = PTHREAD_PROCESS_SHARED
+		.m_pshared = PTHREAD_PROCESS_SHARED,
+		.m_robust = PTHREAD_MUTEX_STALLED,
 	};
 	bool done;
 
 	/*
 	 * Hack to allow multiple pthread_mutex_init() calls on the
 	 * same process-shared mutex.  We rely on kernel allocating
 	 * zeroed offpage for the mutex, i.e. the
 	 * PMUTEX_INITSTAGE_ALLOC value must be zero.
 	 */
 	for (done = false; !done;) {
 		switch (pmtx->m_ps) {
 		case PMUTEX_INITSTAGE_DONE:
 			atomic_thread_fence_acq();
 			done = true;
 			break;
 		case PMUTEX_INITSTAGE_ALLOC:
 			if (atomic_cmpset_int(&pmtx->m_ps,
 			    PMUTEX_INITSTAGE_ALLOC, PMUTEX_INITSTAGE_BUSY)) {
 				if (mutex_attr == NULL)
 					mutex_attr = &foobar_mutex_attr;
 				mutex_init_body(pmtx, mutex_attr);
 				atomic_store_rel_int(&pmtx->m_ps,
 				    PMUTEX_INITSTAGE_DONE);
 				done = true;
 			}
 			break;
 		case PMUTEX_INITSTAGE_BUSY:
 			_pthread_yield();
 			break;
 		default:
 			PANIC("corrupted offpage");
 			break;
 		}
 	}
 }
 
 int
 __pthread_mutex_init(pthread_mutex_t *mutex,
     const pthread_mutexattr_t *mutex_attr)
 {
 	struct pthread_mutex *pmtx;
 	int ret;
 
 	if (mutex_attr != NULL) {
 		ret = mutex_check_attr(*mutex_attr);
 		if (ret != 0)
 			return (ret);
 	}
 	if (mutex_attr == NULL ||
 	    (*mutex_attr)->m_pshared == PTHREAD_PROCESS_PRIVATE) {
 		return (mutex_init(mutex, mutex_attr ? *mutex_attr : NULL,
-		   calloc));
+		    calloc));
 	}
 	pmtx = __thr_pshared_offpage(mutex, 1);
 	if (pmtx == NULL)
 		return (EFAULT);
 	*mutex = THR_PSHARED_PTR;
 	shared_mutex_init(pmtx, *mutex_attr);
 	return (0);
 }
 
 /* This function is used internally by malloc. */
 int
 _pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
     void *(calloc_cb)(size_t, size_t))
 {
 	static const struct pthread_mutex_attr attr = {
 		.m_type = PTHREAD_MUTEX_NORMAL,
 		.m_protocol = PTHREAD_PRIO_NONE,
 		.m_ceiling = 0,
 		.m_pshared = PTHREAD_PROCESS_PRIVATE,
+		.m_robust = PTHREAD_MUTEX_STALLED,
 	};
 	int ret;
 
 	ret = mutex_init(mutex, &attr, calloc_cb);
 	if (ret == 0)
 		(*mutex)->m_flags |= PMUTEX_FLAG_PRIVATE;
 	return (ret);
 }
 
 /*
  * Fix mutex ownership for child process.
  *
  * Process private mutex ownership is transmitted from the forking
  * thread to the child process.
  *
  * Process shared mutex should not be inherited because owner is
  * forking thread which is in parent process, they are removed from
  * the owned mutex list.
  */
 static void
 queue_fork(struct pthread *curthread, struct mutex_queue *q,
     struct mutex_queue *qp, uint bit)
 {
 	struct pthread_mutex *m;
 
 	TAILQ_INIT(q);
 	TAILQ_FOREACH(m, qp, m_pqe) {
 		TAILQ_INSERT_TAIL(q, m, m_qe);
 		m->m_lock.m_owner = TID(curthread) | bit;
-		m->m_owner = TID(curthread);
 	}
 }
 
 void
 _mutex_fork(struct pthread *curthread)
 {
 
 	queue_fork(curthread, &curthread->mq[TMQ_NORM],
 	    &curthread->mq[TMQ_NORM_PRIV], 0);
 	queue_fork(curthread, &curthread->mq[TMQ_NORM_PP],
 	    &curthread->mq[TMQ_NORM_PP_PRIV], UMUTEX_CONTESTED);
+	queue_fork(curthread, &curthread->mq[TMQ_ROBUST_PP],
+	    &curthread->mq[TMQ_ROBUST_PP_PRIV], UMUTEX_CONTESTED);
+	curthread->robust_list = 0;
 }
 
 int
 _pthread_mutex_destroy(pthread_mutex_t *mutex)
 {
 	pthread_mutex_t m, m1;
 	int ret;
 
 	m = *mutex;
 	if (m < THR_MUTEX_DESTROYED) {
 		ret = 0;
 	} else if (m == THR_MUTEX_DESTROYED) {
 		ret = EINVAL;
 	} else {
 		if (m == THR_PSHARED_PTR) {
 			m1 = __thr_pshared_offpage(mutex, 0);
 			if (m1 != NULL) {
-				mutex_assert_not_owned(m1);
+				mutex_assert_not_owned(_get_curthread(), m1);
 				__thr_pshared_destroy(mutex);
 			}
 			*mutex = THR_MUTEX_DESTROYED;
 			return (0);
 		}
-		if (m->m_owner != 0) {
+		if (PMUTEX_OWNER_ID(m) != 0 &&
+		    (uint32_t)m->m_lock.m_owner != UMUTEX_RB_NOTRECOV) {
 			ret = EBUSY;
 		} else {
 			*mutex = THR_MUTEX_DESTROYED;
-			mutex_assert_not_owned(m);
+			mutex_assert_not_owned(_get_curthread(), m);
 			free(m);
 			ret = 0;
 		}
 	}
 
 	return (ret);
 }
 
 static int
 mutex_qidx(struct pthread_mutex *m)
 {
 
 	if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0)
 		return (TMQ_NORM);
-	return (TMQ_NORM_PP);
+	return (is_robust_mutex(m) ? TMQ_ROBUST_PP : TMQ_NORM_PP);
 }
 
+/*
+ * Both enqueue_mutex() and dequeue_mutex() operate on the
+ * thread-private linkage of the locked mutexes and on the robust
+ * linkage.
+ *
+ * The robust list, as seen by the kernel, must be consistent even if
+ * the thread terminates at an arbitrary moment.  Since either enqueue
+ * or dequeue on the list walked by the kernel consists of rewriting a
+ * single forward pointer, it is safe.  On the other hand, the rewrite
+ * of the back pointer is not atomic WRT the forward one, but the
+ * kernel does not care.
+ */
 static void
-enqueue_mutex(struct pthread *curthread, struct pthread_mutex *m)
+enqueue_mutex(struct pthread *curthread, struct pthread_mutex *m,
+    int error)
 {
+	struct pthread_mutex *m1;
+	uintptr_t *rl;
 	int qidx;
 
-	m->m_owner = TID(curthread);
 	/* Add to the list of owned mutexes: */
-	mutex_assert_not_owned(m);
+	if (error != EOWNERDEAD)
+		mutex_assert_not_owned(curthread, m);
 	qidx = mutex_qidx(m);
 	TAILQ_INSERT_TAIL(&curthread->mq[qidx], m, m_qe);
 	if (!is_pshared_mutex(m))
 		TAILQ_INSERT_TAIL(&curthread->mq[qidx + 1], m, m_pqe);
+	if (is_robust_mutex(m)) {
+		rl = is_pshared_mutex(m) ? &curthread->robust_list :
+		    &curthread->priv_robust_list;
+		m->m_rb_prev = NULL;
+		if (*rl != 0) {
+			m1 = __containerof((void *)*rl,
+			    struct pthread_mutex, m_lock);
+			m->m_lock.m_rb_lnk = (uintptr_t)&m1->m_lock;
+			m1->m_rb_prev = m;
+		} else {
+			m1 = NULL;
+			m->m_lock.m_rb_lnk = 0;
+		}
+		*rl = (uintptr_t)&m->m_lock;
+	}
 }
 
 static void
 dequeue_mutex(struct pthread *curthread, struct pthread_mutex *m)
 {
+	struct pthread_mutex *mp, *mn;
 	int qidx;
 
-	m->m_owner = 0;
 	mutex_assert_is_owned(m);
 	qidx = mutex_qidx(m);
+	if (is_robust_mutex(m)) {
+		mp = m->m_rb_prev;
+		if (mp == NULL) {
+			if (is_pshared_mutex(m)) {
+				curthread->robust_list = m->m_lock.m_rb_lnk;
+			} else {
+				curthread->priv_robust_list =
+				    m->m_lock.m_rb_lnk;
+			}
+		} else {
+			mp->m_lock.m_rb_lnk = m->m_lock.m_rb_lnk;
+		}
+		if (m->m_lock.m_rb_lnk != 0) {
+			mn = __containerof((void *)m->m_lock.m_rb_lnk,
+			    struct pthread_mutex, m_lock);
+			mn->m_rb_prev = m->m_rb_prev;
+		}
+		m->m_lock.m_rb_lnk = 0;
+		m->m_rb_prev = NULL;
+	}
 	TAILQ_REMOVE(&curthread->mq[qidx], m, m_qe);
 	if (!is_pshared_mutex(m))
 		TAILQ_REMOVE(&curthread->mq[qidx + 1], m, m_pqe);
 	if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) != 0)
 		set_inherited_priority(curthread, m);
 	mutex_init_link(m);
 }
 
 static int
 check_and_init_mutex(pthread_mutex_t *mutex, struct pthread_mutex **m)
 {
 	int ret;
 
 	*m = *mutex;
 	ret = 0;
 	if (*m == THR_PSHARED_PTR) {
 		*m = __thr_pshared_offpage(mutex, 0);
 		if (*m == NULL)
 			ret = EINVAL;
 		else
 			shared_mutex_init(*m, NULL);
 	} else if (__predict_false(*m <= THR_MUTEX_DESTROYED)) {
 		if (*m == THR_MUTEX_DESTROYED) {
 			ret = EINVAL;
 		} else {
 			ret = init_static(_get_curthread(), mutex);
 			if (ret == 0)
 				*m = *mutex;
 		}
 	}
 	return (ret);
 }
 
 int
 __pthread_mutex_trylock(pthread_mutex_t *mutex)
 {
 	struct pthread *curthread;
 	struct pthread_mutex *m;
 	uint32_t id;
-	int ret;
+	int ret, robust;
 
 	ret = check_and_init_mutex(mutex, &m);
 	if (ret != 0)
 		return (ret);
 	curthread = _get_curthread();
 	id = TID(curthread);
 	if (m->m_flags & PMUTEX_FLAG_PRIVATE)
 		THR_CRITICAL_ENTER(curthread);
+	robust = _mutex_enter_robust(curthread, m);
 	ret = _thr_umutex_trylock(&m->m_lock, id);
-	if (__predict_true(ret == 0)) {
-		enqueue_mutex(curthread, m);
-	} else if (m->m_owner == id) {
+	if (__predict_true(ret == 0) || ret == EOWNERDEAD) {
+		enqueue_mutex(curthread, m, ret);
+		if (ret == EOWNERDEAD)
+			m->m_lock.m_flags |= UMUTEX_NONCONSISTENT;
+	} else if (PMUTEX_OWNER_ID(m) == id) {
 		ret = mutex_self_trylock(m);
 	} /* else {} */
-	if (ret && (m->m_flags & PMUTEX_FLAG_PRIVATE))
+	if (robust)
+		_mutex_leave_robust(curthread, m);
+	if (ret != 0 && ret != EOWNERDEAD &&	/* lock was not acquired */
+	    (m->m_flags & PMUTEX_FLAG_PRIVATE) != 0)
 		THR_CRITICAL_LEAVE(curthread);
 	return (ret);
 }
 
 static int
 mutex_lock_sleep(struct pthread *curthread, struct pthread_mutex *m,
-	const struct timespec *abstime)
+    const struct timespec *abstime)
 {
-	uint32_t	id, owner;
-	int	count;
-	int	ret;
+	uint32_t id, owner;
+	int count, ret;
 
 	id = TID(curthread);
-	if (m->m_owner == id)
+	if (PMUTEX_OWNER_ID(m) == id)
 		return (mutex_self_lock(m, abstime));
 
 	/*
 	 * For adaptive mutexes, spin for a bit in the expectation
 	 * that if the application requests this mutex type then
 	 * the lock is likely to be released quickly and it is
 	 * faster than entering the kernel
 	 */
-	if (__predict_false(
-		(m->m_lock.m_flags & 
-		 (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0))
-			goto sleep_in_kernel;
+	if (__predict_false((m->m_lock.m_flags & (UMUTEX_PRIO_PROTECT |
+	    UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST | UMUTEX_NONCONSISTENT)) != 0))
+		goto sleep_in_kernel;
 
 	if (!_thr_is_smp)
 		goto yield_loop;
 
 	count = m->m_spinloops;
 	while (count--) {
 		owner = m->m_lock.m_owner;
 		if ((owner & ~UMUTEX_CONTESTED) == 0) {
-			if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, id|owner)) {
+			if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner,
+			    id | owner)) {
 				ret = 0;
 				goto done;
 			}
 		}
 		CPU_SPINWAIT;
 	}
 
 yield_loop:
 	count = m->m_yieldloops;
 	while (count--) {
 		_sched_yield();
 		owner = m->m_lock.m_owner;
 		if ((owner & ~UMUTEX_CONTESTED) == 0) {
-			if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner, id|owner)) {
+			if (atomic_cmpset_acq_32(&m->m_lock.m_owner, owner,
+			    id | owner)) {
 				ret = 0;
 				goto done;
 			}
 		}
 	}
 
 sleep_in_kernel:
-	if (abstime == NULL) {
+	if (abstime == NULL)
 		ret = __thr_umutex_lock(&m->m_lock, id);
-	} else if (__predict_false(
-		   abstime->tv_nsec < 0 ||
-		   abstime->tv_nsec >= 1000000000)) {
+	else if (__predict_false(abstime->tv_nsec < 0 ||
+	    abstime->tv_nsec >= 1000000000))
 		ret = EINVAL;
-	} else {
+	else
 		ret = __thr_umutex_timedlock(&m->m_lock, id, abstime);
-	}
 done:
-	if (ret == 0)
-		enqueue_mutex(curthread, m);
-
+	if (ret == 0 || ret == EOWNERDEAD) {
+		enqueue_mutex(curthread, m, ret);
+		if (ret == EOWNERDEAD)
+			m->m_lock.m_flags |= UMUTEX_NONCONSISTENT;
+	}
 	return (ret);
 }
 
 static inline int
-mutex_lock_common(struct pthread_mutex *m,
-	const struct timespec *abstime, int cvattach)
+mutex_lock_common(struct pthread_mutex *m, const struct timespec *abstime,
+    bool cvattach, bool rb_onlist)
 {
-	struct pthread *curthread  = _get_curthread();
-	int ret;
+	struct pthread *curthread;
+	int ret, robust;
 
+	curthread  = _get_curthread();
 	if (!cvattach && m->m_flags & PMUTEX_FLAG_PRIVATE)
 		THR_CRITICAL_ENTER(curthread);
-	if (_thr_umutex_trylock2(&m->m_lock, TID(curthread)) == 0) {
-		enqueue_mutex(curthread, m);
-		ret = 0;
+	/* Always assign 'robust' so it is never read uninitialized. */
+	robust = rb_onlist ? 0 : _mutex_enter_robust(curthread, m);
+	ret = _thr_umutex_trylock2(&m->m_lock, TID(curthread));
+	if (ret == 0 || ret == EOWNERDEAD) {
+		enqueue_mutex(curthread, m, ret);
+		if (ret == EOWNERDEAD)
+			m->m_lock.m_flags |= UMUTEX_NONCONSISTENT;
 	} else {
 		ret = mutex_lock_sleep(curthread, m, abstime);
 	}
-	if (ret && (m->m_flags & PMUTEX_FLAG_PRIVATE) && !cvattach)
+	if (!rb_onlist && robust)
+		_mutex_leave_robust(curthread, m);
+	if (ret != 0 && ret != EOWNERDEAD &&
+	    (m->m_flags & PMUTEX_FLAG_PRIVATE) != 0 && !cvattach)
 		THR_CRITICAL_LEAVE(curthread);
 	return (ret);
 }
 
 int
 __pthread_mutex_lock(pthread_mutex_t *mutex)
 {
 	struct pthread_mutex *m;
 	int ret;
 
 	_thr_check_init();
 	ret = check_and_init_mutex(mutex, &m);
 	if (ret == 0)
-		ret = mutex_lock_common(m, NULL, 0);
+		ret = mutex_lock_common(m, NULL, false, false);
 	return (ret);
 }
 
 int
 __pthread_mutex_timedlock(pthread_mutex_t *mutex,
     const struct timespec *abstime)
 {
 	struct pthread_mutex *m;
 	int ret;
 
 	_thr_check_init();
 	ret = check_and_init_mutex(mutex, &m);
 	if (ret == 0)
-		ret = mutex_lock_common(m, abstime, 0);
+		ret = mutex_lock_common(m, abstime, false, false);
 	return (ret);
 }
 
 int
 _pthread_mutex_unlock(pthread_mutex_t *mutex)
 {
 	struct pthread_mutex *mp;
 
 	if (*mutex == THR_PSHARED_PTR) {
 		mp = __thr_pshared_offpage(mutex, 0);
 		if (mp == NULL)
 			return (EINVAL);
 		shared_mutex_init(mp, NULL);
 	} else {
 		mp = *mutex;
 	}
-	return (mutex_unlock_common(mp, 0, NULL));
+	return (mutex_unlock_common(mp, false, NULL));
 }
 
 int
-_mutex_cv_lock(struct pthread_mutex *m, int count)
+_mutex_cv_lock(struct pthread_mutex *m, int count, bool rb_onlist)
 {
-	int	error;
+	int error;
 
-	error = mutex_lock_common(m, NULL, 1);
-	if (error == 0)
+	error = mutex_lock_common(m, NULL, true, rb_onlist);
+	if (error == 0 || error == EOWNERDEAD)
 		m->m_count = count;
 	return (error);
 }
 
 int
 _mutex_cv_unlock(struct pthread_mutex *m, int *count, int *defer)
 {
 
 	/*
 	 * Clear the count in case this is a recursive mutex.
 	 */
 	*count = m->m_count;
 	m->m_count = 0;
-	(void)mutex_unlock_common(m, 1, defer);
+	(void)mutex_unlock_common(m, true, defer);
         return (0);
 }
 
 int
 _mutex_cv_attach(struct pthread_mutex *m, int count)
 {
-	struct pthread *curthread = _get_curthread();
+	struct pthread *curthread;
 
-	enqueue_mutex(curthread, m);
+	curthread = _get_curthread();
+	enqueue_mutex(curthread, m, 0);
 	m->m_count = count;
 	return (0);
 }
 
 int
 _mutex_cv_detach(struct pthread_mutex *mp, int *recurse)
 {
-	struct pthread *curthread = _get_curthread();
-	int     defered;
-	int     error;
+	struct pthread *curthread;
+	int deferred, error;
 
+	curthread = _get_curthread();
 	if ((error = _mutex_owned(curthread, mp)) != 0)
-                return (error);
+		return (error);
 
 	/*
 	 * Clear the count in case this is a recursive mutex.
 	 */
 	*recurse = mp->m_count;
 	mp->m_count = 0;
 	dequeue_mutex(curthread, mp);
 
 	/* Will this happen in real-world ? */
-        if ((mp->m_flags & PMUTEX_FLAG_DEFERED) != 0) {
-		defered = 1;
-		mp->m_flags &= ~PMUTEX_FLAG_DEFERED;
+        if ((mp->m_flags & PMUTEX_FLAG_DEFERRED) != 0) {
+		deferred = 1;
+		mp->m_flags &= ~PMUTEX_FLAG_DEFERRED;
 	} else
-		defered = 0;
+		deferred = 0;
 
-	if (defered)  {
+	if (deferred)  {
 		_thr_wake_all(curthread->defer_waiters,
-				curthread->nwaiter_defer);
+		    curthread->nwaiter_defer);
 		curthread->nwaiter_defer = 0;
 	}
 	return (0);
 }
 
 static int
 mutex_self_trylock(struct pthread_mutex *m)
 {
-	int	ret;
+	int ret;
 
 	switch (PMUTEX_TYPE(m->m_flags)) {
 	case PTHREAD_MUTEX_ERRORCHECK:
 	case PTHREAD_MUTEX_NORMAL:
 	case PTHREAD_MUTEX_ADAPTIVE_NP:
 		ret = EBUSY; 
 		break;
 
 	case PTHREAD_MUTEX_RECURSIVE:
 		/* Increment the lock count: */
 		if (m->m_count + 1 > 0) {
 			m->m_count++;
 			ret = 0;
 		} else
 			ret = EAGAIN;
 		break;
 
 	default:
 		/* Trap invalid mutex types; */
 		ret = EINVAL;
 	}
 
 	return (ret);
 }
 
 static int
 mutex_self_lock(struct pthread_mutex *m, const struct timespec *abstime)
 {
 	struct timespec	ts1, ts2;
-	int	ret;
+	int ret;
 
 	switch (PMUTEX_TYPE(m->m_flags)) {
 	case PTHREAD_MUTEX_ERRORCHECK:
 	case PTHREAD_MUTEX_ADAPTIVE_NP:
 		if (abstime) {
 			if (abstime->tv_sec < 0 || abstime->tv_nsec < 0 ||
 			    abstime->tv_nsec >= 1000000000) {
 				ret = EINVAL;
 			} else {
 				clock_gettime(CLOCK_REALTIME, &ts1);
 				TIMESPEC_SUB(&ts2, abstime, &ts1);
 				__sys_nanosleep(&ts2, NULL);
 				ret = ETIMEDOUT;
 			}
 		} else {
 			/*
 			 * POSIX specifies that mutexes should return
 			 * EDEADLK if a recursive lock is detected.
 			 */
 			ret = EDEADLK; 
 		}
 		break;
 
 	case PTHREAD_MUTEX_NORMAL:
 		/*
 		 * What SS2 define as a 'normal' mutex.  Intentionally
 		 * deadlock on attempts to get a lock you already own.
 		 */
 		ret = 0;
 		if (abstime) {
 			if (abstime->tv_sec < 0 || abstime->tv_nsec < 0 ||
 			    abstime->tv_nsec >= 1000000000) {
 				ret = EINVAL;
 			} else {
 				clock_gettime(CLOCK_REALTIME, &ts1);
 				TIMESPEC_SUB(&ts2, abstime, &ts1);
 				__sys_nanosleep(&ts2, NULL);
 				ret = ETIMEDOUT;
 			}
 		} else {
 			ts1.tv_sec = 30;
 			ts1.tv_nsec = 0;
 			for (;;)
 				__sys_nanosleep(&ts1, NULL);
 		}
 		break;
 
 	case PTHREAD_MUTEX_RECURSIVE:
 		/* Increment the lock count: */
 		if (m->m_count + 1 > 0) {
 			m->m_count++;
 			ret = 0;
 		} else
 			ret = EAGAIN;
 		break;
 
 	default:
 		/* Trap invalid mutex types; */
 		ret = EINVAL;
 	}
 
 	return (ret);
 }
 
 static int
-mutex_unlock_common(struct pthread_mutex *m, int cv, int *mtx_defer)
+mutex_unlock_common(struct pthread_mutex *m, bool cv, int *mtx_defer)
 {
-	struct pthread *curthread = _get_curthread();
+	struct pthread *curthread;
 	uint32_t id;
-	int defered, error;
+	int deferred, error, robust;
 
 	if (__predict_false(m <= THR_MUTEX_DESTROYED)) {
 		if (m == THR_MUTEX_DESTROYED)
 			return (EINVAL);
 		return (EPERM);
 	}
 
+	curthread = _get_curthread();
 	id = TID(curthread);
 
 	/*
 	 * Check if the running thread is not the owner of the mutex.
 	 */
-	if (__predict_false(m->m_owner != id))
+	if (__predict_false(PMUTEX_OWNER_ID(m) != id))
 		return (EPERM);
 
 	error = 0;
-	if (__predict_false(
-		PMUTEX_TYPE(m->m_flags) == PTHREAD_MUTEX_RECURSIVE &&
-		m->m_count > 0)) {
+	if (__predict_false(PMUTEX_TYPE(m->m_flags) ==
+	    PTHREAD_MUTEX_RECURSIVE && m->m_count > 0)) {
 		m->m_count--;
 	} else {
-		if ((m->m_flags & PMUTEX_FLAG_DEFERED) != 0) {
-			defered = 1;
-			m->m_flags &= ~PMUTEX_FLAG_DEFERED;
+		if ((m->m_flags & PMUTEX_FLAG_DEFERRED) != 0) {
+			deferred = 1;
+			m->m_flags &= ~PMUTEX_FLAG_DEFERRED;
         	} else
-			defered = 0;
+			deferred = 0;
 
+		robust = _mutex_enter_robust(curthread, m);
 		dequeue_mutex(curthread, m);
 		error = _thr_umutex_unlock2(&m->m_lock, id, mtx_defer);
-
-		if (mtx_defer == NULL && defered)  {
-			_thr_wake_all(curthread->defer_waiters,
-				curthread->nwaiter_defer);
-			curthread->nwaiter_defer = 0;
+		if (deferred)  {
+			if (mtx_defer == NULL) {
+				_thr_wake_all(curthread->defer_waiters,
+				    curthread->nwaiter_defer);
+				curthread->nwaiter_defer = 0;
+			} else
+				*mtx_defer = 1;
 		}
+		if (robust)
+			_mutex_leave_robust(curthread, m);
 	}
 	if (!cv && m->m_flags & PMUTEX_FLAG_PRIVATE)
 		THR_CRITICAL_LEAVE(curthread);
 	return (error);
 }
 
 int
 _pthread_mutex_getprioceiling(pthread_mutex_t *mutex,
     int *prioceiling)
 {
 	struct pthread_mutex *m;
 
 	if (*mutex == THR_PSHARED_PTR) {
 		m = __thr_pshared_offpage(mutex, 0);
 		if (m == NULL)
 			return (EINVAL);
 		shared_mutex_init(m, NULL);
 	} else {
 		m = *mutex;
 		if (m <= THR_MUTEX_DESTROYED)
 			return (EINVAL);
 	}
 	if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0)
 		return (EINVAL);
 	*prioceiling = m->m_lock.m_ceilings[0];
 	return (0);
 }
 
 int
 _pthread_mutex_setprioceiling(pthread_mutex_t *mutex,
     int ceiling, int *old_ceiling)
 {
 	struct pthread *curthread;
 	struct pthread_mutex *m, *m1, *m2;
 	struct mutex_queue *q, *qp;
-	int ret;
+	int qidx, ret;
 
 	if (*mutex == THR_PSHARED_PTR) {
 		m = __thr_pshared_offpage(mutex, 0);
 		if (m == NULL)
 			return (EINVAL);
 		shared_mutex_init(m, NULL);
 	} else {
 		m = *mutex;
 		if (m <= THR_MUTEX_DESTROYED)
 			return (EINVAL);
 	}
 	if ((m->m_lock.m_flags & UMUTEX_PRIO_PROTECT) == 0)
 		return (EINVAL);
 
 	ret = __thr_umutex_set_ceiling(&m->m_lock, ceiling, old_ceiling);
 	if (ret != 0)
 		return (ret);
 
 	curthread = _get_curthread();
-	if (m->m_owner == TID(curthread)) {
+	if (PMUTEX_OWNER_ID(m) == TID(curthread)) {
 		mutex_assert_is_owned(m);
 		m1 = TAILQ_PREV(m, mutex_queue, m_qe);
 		m2 = TAILQ_NEXT(m, m_qe);
 		if ((m1 != NULL && m1->m_lock.m_ceilings[0] > (u_int)ceiling) ||
 		    (m2 != NULL && m2->m_lock.m_ceilings[0] < (u_int)ceiling)) {
-			q = &curthread->mq[TMQ_NORM_PP];
-			qp = &curthread->mq[TMQ_NORM_PP_PRIV];
+			qidx = mutex_qidx(m);
+			q = &curthread->mq[qidx];
+			qp = &curthread->mq[qidx + 1];
 			TAILQ_REMOVE(q, m, m_qe);
 			if (!is_pshared_mutex(m))
 				TAILQ_REMOVE(qp, m, m_pqe);
 			TAILQ_FOREACH(m2, q, m_qe) {
 				if (m2->m_lock.m_ceilings[0] > (u_int)ceiling) {
 					TAILQ_INSERT_BEFORE(m2, m, m_qe);
 					if (!is_pshared_mutex(m)) {
 						while (m2 != NULL &&
 						    is_pshared_mutex(m2)) {
 							m2 = TAILQ_PREV(m2,
 							    mutex_queue, m_qe);
 						}
 						if (m2 == NULL) {
 							TAILQ_INSERT_HEAD(qp,
 							    m, m_pqe);
 						} else {
 							TAILQ_INSERT_BEFORE(m2,
 							    m, m_pqe);
 						}
 					}
 					return (0);
 				}
 			}
 			TAILQ_INSERT_TAIL(q, m, m_qe);
 			if (!is_pshared_mutex(m))
 				TAILQ_INSERT_TAIL(qp, m, m_pqe);
 		}
 	}
 	return (0);
 }
 
 int
 _pthread_mutex_getspinloops_np(pthread_mutex_t *mutex, int *count)
 {
 	struct pthread_mutex *m;
 	int ret;
 
 	ret = check_and_init_mutex(mutex, &m);
 	if (ret == 0)
 		*count = m->m_spinloops;
 	return (ret);
 }
 
 int
 __pthread_mutex_setspinloops_np(pthread_mutex_t *mutex, int count)
 {
 	struct pthread_mutex *m;
 	int ret;
 
 	ret = check_and_init_mutex(mutex, &m);
 	if (ret == 0)
 		m->m_spinloops = count;
 	return (ret);
 }
 
 int
 _pthread_mutex_getyieldloops_np(pthread_mutex_t *mutex, int *count)
 {
 	struct pthread_mutex *m;
 	int ret;
 
 	ret = check_and_init_mutex(mutex, &m);
 	if (ret == 0)
 		*count = m->m_yieldloops;
 	return (ret);
 }
 
 int
 __pthread_mutex_setyieldloops_np(pthread_mutex_t *mutex, int count)
 {
 	struct pthread_mutex *m;
 	int ret;
 
 	ret = check_and_init_mutex(mutex, &m);
 	if (ret == 0)
 		m->m_yieldloops = count;
 	return (0);
 }
 
 int
 _pthread_mutex_isowned_np(pthread_mutex_t *mutex)
 {
 	struct pthread_mutex *m;
 
 	if (*mutex == THR_PSHARED_PTR) {
 		m = __thr_pshared_offpage(mutex, 0);
 		if (m == NULL)
 			return (0);
 		shared_mutex_init(m, NULL);
 	} else {
 		m = *mutex;
 		if (m <= THR_MUTEX_DESTROYED)
 			return (0);
 	}
-	return (m->m_owner == TID(_get_curthread()));
+	return (PMUTEX_OWNER_ID(m) == TID(_get_curthread()));
 }
 
 int
 _mutex_owned(struct pthread *curthread, const struct pthread_mutex *mp)
 {
+
 	if (__predict_false(mp <= THR_MUTEX_DESTROYED)) {
 		if (mp == THR_MUTEX_DESTROYED)
 			return (EINVAL);
 		return (EPERM);
 	}
-	if (mp->m_owner != TID(curthread))
+	if (PMUTEX_OWNER_ID(mp) != TID(curthread))
 		return (EPERM);
 	return (0);                  
+}
+
+int
+_pthread_mutex_consistent(pthread_mutex_t *mutex)
+{
+	struct pthread_mutex *m;
+	struct pthread *curthread;
+
+	if (*mutex == THR_PSHARED_PTR) {
+		m = __thr_pshared_offpage(mutex, 0);
+		if (m == NULL)
+			return (EINVAL);
+		shared_mutex_init(m, NULL);
+	} else {
+		m = *mutex;
+		if (m <= THR_MUTEX_DESTROYED)
+			return (EINVAL);
+	}
+	curthread = _get_curthread();
+	if ((m->m_lock.m_flags & (UMUTEX_ROBUST | UMUTEX_NONCONSISTENT)) !=
+	    (UMUTEX_ROBUST | UMUTEX_NONCONSISTENT))
+		return (EINVAL);
+	if (PMUTEX_OWNER_ID(m) != TID(curthread))
+		return (EPERM);
+	m->m_lock.m_flags &= ~UMUTEX_NONCONSISTENT;
+	return (0);
 }
Index: head/lib/libthr/thread/thr_mutexattr.c
===================================================================
--- head/lib/libthr/thread/thr_mutexattr.c	(revision 300042)
+++ head/lib/libthr/thread/thr_mutexattr.c	(revision 300043)
@@ -1,253 +1,290 @@
 /*
  * Copyright (c) 1996 Jeffrey Hsu <hsu@freebsd.org>.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Copyright (c) 1997 John Birrell <jb@cimlogic.com.au>.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "namespace.h"
 #include <string.h>
 #include <stdlib.h>
 #include <errno.h>
 #include <pthread.h>
 #include <pthread_np.h>
 #include "un-namespace.h"
 
 #include "thr_private.h"
 
 __weak_reference(_pthread_mutexattr_init, pthread_mutexattr_init);
 __weak_reference(_pthread_mutexattr_setkind_np, pthread_mutexattr_setkind_np);
 __weak_reference(_pthread_mutexattr_getkind_np, pthread_mutexattr_getkind_np);
 __weak_reference(_pthread_mutexattr_gettype, pthread_mutexattr_gettype);
 __weak_reference(_pthread_mutexattr_settype, pthread_mutexattr_settype);
 __weak_reference(_pthread_mutexattr_destroy, pthread_mutexattr_destroy);
 __weak_reference(_pthread_mutexattr_getpshared, pthread_mutexattr_getpshared);
 __weak_reference(_pthread_mutexattr_setpshared, pthread_mutexattr_setpshared);
 __weak_reference(_pthread_mutexattr_getprotocol, pthread_mutexattr_getprotocol);
 __weak_reference(_pthread_mutexattr_setprotocol, pthread_mutexattr_setprotocol);
-__weak_reference(_pthread_mutexattr_getprioceiling, pthread_mutexattr_getprioceiling);
-__weak_reference(_pthread_mutexattr_setprioceiling, pthread_mutexattr_setprioceiling);
+__weak_reference(_pthread_mutexattr_getprioceiling,
+    pthread_mutexattr_getprioceiling);
+__weak_reference(_pthread_mutexattr_setprioceiling,
+    pthread_mutexattr_setprioceiling);
+__weak_reference(_pthread_mutexattr_getrobust, pthread_mutexattr_getrobust);
+__weak_reference(_pthread_mutexattr_setrobust, pthread_mutexattr_setrobust);
 
 int
 _pthread_mutexattr_init(pthread_mutexattr_t *attr)
 {
 	int ret;
 	pthread_mutexattr_t pattr;
 
 	if ((pattr = (pthread_mutexattr_t)
 	    malloc(sizeof(struct pthread_mutex_attr))) == NULL) {
 		ret = ENOMEM;
 	} else {
 		memcpy(pattr, &_pthread_mutexattr_default,
 		    sizeof(struct pthread_mutex_attr));
 		*attr = pattr;
 		ret = 0;
 	}
 	return (ret);
 }
 
 int
 _pthread_mutexattr_setkind_np(pthread_mutexattr_t *attr, int kind)
 {
 	int	ret;
 	if (attr == NULL || *attr == NULL) {
 		errno = EINVAL;
 		ret = -1;
 	} else {
 		(*attr)->m_type = kind;
 		ret = 0;
 	}
 	return(ret);
 }
 
 int
 _pthread_mutexattr_getkind_np(pthread_mutexattr_t attr)
 {
 	int	ret;
+
 	if (attr == NULL) {
 		errno = EINVAL;
 		ret = -1;
 	} else {
 		ret = attr->m_type;
 	}
-	return(ret);
+	return (ret);
 }
 
 int
 _pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type)
 {
 	int	ret;
+
 	if (attr == NULL || *attr == NULL || type >= PTHREAD_MUTEX_TYPE_MAX) {
 		ret = EINVAL;
 	} else {
 		(*attr)->m_type = type;
 		ret = 0;
 	}
-	return(ret);
+	return (ret);
 }
 
 int
 _pthread_mutexattr_gettype(pthread_mutexattr_t *attr, int *type)
 {
 	int	ret;
 
 	if (attr == NULL || *attr == NULL || (*attr)->m_type >=
 	    PTHREAD_MUTEX_TYPE_MAX) {
 		ret = EINVAL;
 	} else {
 		*type = (*attr)->m_type;
 		ret = 0;
 	}
-	return ret;
+	return (ret);
 }
 
 int
 _pthread_mutexattr_destroy(pthread_mutexattr_t *attr)
 {
 	int	ret;
 	if (attr == NULL || *attr == NULL) {
 		ret = EINVAL;
 	} else {
 		free(*attr);
 		*attr = NULL;
 		ret = 0;
 	}
-	return(ret);
+	return (ret);
 }
 
 int
 _pthread_mutexattr_getpshared(const pthread_mutexattr_t *attr,
 	int *pshared)
 {
 
 	if (attr == NULL || *attr == NULL)
 		return (EINVAL);
 	*pshared = (*attr)->m_pshared;
 	return (0);
 }
 
 int
 _pthread_mutexattr_setpshared(pthread_mutexattr_t *attr, int pshared)
 {
 
 	if (attr == NULL || *attr == NULL ||
 	    (pshared != PTHREAD_PROCESS_PRIVATE &&
 	    pshared != PTHREAD_PROCESS_SHARED))
 		return (EINVAL);
 	(*attr)->m_pshared = pshared;
 	return (0);
 }
 
 int
 _pthread_mutexattr_getprotocol(pthread_mutexattr_t *mattr, int *protocol)
 {
 	int ret = 0;
 
-	if ((mattr == NULL) || (*mattr == NULL))
+	if (mattr == NULL || *mattr == NULL)
 		ret = EINVAL;
 	else
 		*protocol = (*mattr)->m_protocol;
 
-	return(ret);
+	return (ret);
 }
 
 int
 _pthread_mutexattr_setprotocol(pthread_mutexattr_t *mattr, int protocol)
 {
 	int ret = 0;
 
-	if ((mattr == NULL) || (*mattr == NULL) ||
-	    (protocol < PTHREAD_PRIO_NONE) || (protocol > PTHREAD_PRIO_PROTECT))
+	if (mattr == NULL || *mattr == NULL ||
+	    protocol < PTHREAD_PRIO_NONE || protocol > PTHREAD_PRIO_PROTECT)
 		ret = EINVAL;
 	else {
 		(*mattr)->m_protocol = protocol;
 		(*mattr)->m_ceiling = THR_MAX_RR_PRIORITY;
 	}
-	return(ret);
+	return (ret);
 }
 
 int
 _pthread_mutexattr_getprioceiling(pthread_mutexattr_t *mattr, int *prioceiling)
 {
 	int ret = 0;
 
-	if ((mattr == NULL) || (*mattr == NULL))
+	if (mattr == NULL || *mattr == NULL)
 		ret = EINVAL;
 	else if ((*mattr)->m_protocol != PTHREAD_PRIO_PROTECT)
 		ret = EINVAL;
 	else
 		*prioceiling = (*mattr)->m_ceiling;
 
-	return(ret);
+	return (ret);
 }
 
 int
 _pthread_mutexattr_setprioceiling(pthread_mutexattr_t *mattr, int prioceiling)
 {
 	int ret = 0;
 
-	if ((mattr == NULL) || (*mattr == NULL))
+	if (mattr == NULL || *mattr == NULL)
 		ret = EINVAL;
 	else if ((*mattr)->m_protocol != PTHREAD_PRIO_PROTECT)
 		ret = EINVAL;
 	else
 		(*mattr)->m_ceiling = prioceiling;
 
-	return(ret);
+	return (ret);
+}
+
+int
+_pthread_mutexattr_getrobust(pthread_mutexattr_t *mattr, int *robust)
+{
+	int ret;
+
+	if (mattr == NULL || *mattr == NULL) {
+		ret = EINVAL;
+	} else {
+		ret = 0;
+		*robust = (*mattr)->m_robust;
+	}
+	return (ret);
+}
+
+int
+_pthread_mutexattr_setrobust(pthread_mutexattr_t *mattr, int robust)
+{
+	int ret;
+
+	if (mattr == NULL || *mattr == NULL) {
+		ret = EINVAL;
+	} else if (robust != PTHREAD_MUTEX_STALLED &&
+	    robust != PTHREAD_MUTEX_ROBUST) {
+		ret = EINVAL;
+	} else {
+		ret = 0;
+		(*mattr)->m_robust = robust;
+	}
+	return (ret);
 }
 
Index: head/lib/libthr/thread/thr_private.h
===================================================================
--- head/lib/libthr/thread/thr_private.h	(revision 300042)
+++ head/lib/libthr/thread/thr_private.h	(revision 300043)
@@ -1,967 +1,988 @@
 /*
  * Copyright (C) 2005 Daniel M. Eischen <deischen@freebsd.org>
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * Copyright (c) 1995-1998 John Birrell <jb@cimlogic.com.au>.
  *
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _THR_PRIVATE_H
 #define _THR_PRIVATE_H
 
 /*
  * Include files.
  */
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/cdefs.h>
 #include <sys/queue.h>
 #include <sys/param.h>
 #include <sys/cpuset.h>
 #include <machine/atomic.h>
 #include <errno.h>
 #include <limits.h>
 #include <signal.h>
+#include <stdbool.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <ucontext.h>
 #include <sys/thr.h>
 #include <pthread.h>
 
 #define	SYM_FB10(sym)			__CONCAT(sym, _fb10)
 #define	SYM_FBP10(sym)			__CONCAT(sym, _fbp10)
 #define	WEAK_REF(sym, alias)		__weak_reference(sym, alias)
 #define	SYM_COMPAT(sym, impl, ver)	__sym_compat(sym, impl, ver)
 #define	SYM_DEFAULT(sym, impl, ver)	__sym_default(sym, impl, ver)
 
 #define	FB10_COMPAT(func, sym)				\
 	WEAK_REF(func, SYM_FB10(sym));			\
 	SYM_COMPAT(sym, SYM_FB10(sym), FBSD_1.0)
 
 #define	FB10_COMPAT_PRIVATE(func, sym)			\
 	WEAK_REF(func, SYM_FBP10(sym));			\
 	SYM_DEFAULT(sym, SYM_FBP10(sym), FBSDprivate_1.0)
 
 #include "pthread_md.h"
 #include "thr_umtx.h"
 #include "thread_db.h"
 
 #ifdef _PTHREAD_FORCED_UNWIND
 #define _BSD_SOURCE
 #include <unwind.h>
 #endif
 
 typedef TAILQ_HEAD(pthreadlist, pthread) pthreadlist;
 typedef TAILQ_HEAD(atfork_head, pthread_atfork) atfork_head;
 TAILQ_HEAD(mutex_queue, pthread_mutex);
 
 /* Signal to do cancellation */
 #define	SIGCANCEL		SIGTHR
 
 /*
  * Kernel fatal error handler macro.
  */
 #define PANIC(string)		_thread_exit(__FILE__,__LINE__,string)
 
 /* Output debug messages like this: */
 #define stdout_debug(args...)	_thread_printf(STDOUT_FILENO, ##args)
 #define stderr_debug(args...)	_thread_printf(STDERR_FILENO, ##args)
 
 #ifdef _PTHREADS_INVARIANTS
 #define THR_ASSERT(cond, msg) do {	\
 	if (__predict_false(!(cond)))	\
 		PANIC(msg);		\
 } while (0)
 #else
 #define THR_ASSERT(cond, msg)
 #endif
 
 #ifdef PIC
 # define STATIC_LIB_REQUIRE(name)
 #else
 # define STATIC_LIB_REQUIRE(name) __asm (".globl " #name)
 #endif
 
 #define	TIMESPEC_ADD(dst, src, val)				\
 	do { 							\
 		(dst)->tv_sec = (src)->tv_sec + (val)->tv_sec;	\
 		(dst)->tv_nsec = (src)->tv_nsec + (val)->tv_nsec; \
 		if ((dst)->tv_nsec >= 1000000000) {		\
 			(dst)->tv_sec++;			\
 			(dst)->tv_nsec -= 1000000000;		\
 		}						\
 	} while (0)
 
 #define	TIMESPEC_SUB(dst, src, val)				\
 	do { 							\
 		(dst)->tv_sec = (src)->tv_sec - (val)->tv_sec;	\
 		(dst)->tv_nsec = (src)->tv_nsec - (val)->tv_nsec; \
 		if ((dst)->tv_nsec < 0) {			\
 			(dst)->tv_sec--;			\
 			(dst)->tv_nsec += 1000000000;		\
 		}						\
 	} while (0)
 
 /* Magic cookie set for shared pthread locks and cv's pointers */
 #define	THR_PSHARED_PTR						\
     ((void *)(uintptr_t)((1ULL << (NBBY * sizeof(long) - 1)) | 1))
 
 /* XXX These values should be same as those defined in pthread.h */
 #define	THR_MUTEX_INITIALIZER		((struct pthread_mutex *)NULL)
 #define	THR_ADAPTIVE_MUTEX_INITIALIZER	((struct pthread_mutex *)1)
 #define	THR_MUTEX_DESTROYED		((struct pthread_mutex *)2)
 #define	THR_COND_INITIALIZER		((struct pthread_cond *)NULL)
 #define	THR_COND_DESTROYED		((struct pthread_cond *)1)
 #define	THR_RWLOCK_INITIALIZER		((struct pthread_rwlock *)NULL)
 #define	THR_RWLOCK_DESTROYED		((struct pthread_rwlock *)1)
 
 #define PMUTEX_FLAG_TYPE_MASK	0x0ff
 #define PMUTEX_FLAG_PRIVATE	0x100
-#define PMUTEX_FLAG_DEFERED	0x200
+#define PMUTEX_FLAG_DEFERRED	0x200
 #define PMUTEX_TYPE(mtxflags)	((mtxflags) & PMUTEX_FLAG_TYPE_MASK)
 
+#define	PMUTEX_OWNER_ID(m)	((m)->m_lock.m_owner & ~UMUTEX_CONTESTED)
+
 #define MAX_DEFER_WAITERS       50
 
 /*
  * Values for pthread_mutex m_ps indicator.
  */
 #define	PMUTEX_INITSTAGE_ALLOC	0
 #define	PMUTEX_INITSTAGE_BUSY	1
 #define	PMUTEX_INITSTAGE_DONE	2
 
 struct pthread_mutex {
 	/*
 	 * Lock for accesses to this structure.
 	 */
 	struct umutex			m_lock;
 	int				m_flags;
-	uint32_t			m_owner;
 	int				m_count;
 	int				m_spinloops;
 	int				m_yieldloops;
 	int				m_ps;	/* pshared init stage */
 	/*
 	 * Link for all mutexes a thread currently owns, of the same
 	 * prio type.
 	 */
 	TAILQ_ENTRY(pthread_mutex)	m_qe;
 	/* Link for all private mutexes a thread currently owns. */
 	TAILQ_ENTRY(pthread_mutex)	m_pqe;
+	struct pthread_mutex		*m_rb_prev;
 };
 
 struct pthread_mutex_attr {
 	enum pthread_mutextype	m_type;
 	int			m_protocol;
 	int			m_ceiling;
 	int			m_pshared;
+	int			m_robust;
 };
 
 #define PTHREAD_MUTEXATTR_STATIC_INITIALIZER \
-	{ PTHREAD_MUTEX_DEFAULT, PTHREAD_PRIO_NONE, 0, MUTEX_FLAGS_PRIVATE }
+	{ PTHREAD_MUTEX_DEFAULT, PTHREAD_PRIO_NONE, 0, MUTEX_FLAGS_PRIVATE, \
+	    PTHREAD_MUTEX_STALLED }
 
 struct pthread_cond {
 	__uint32_t	__has_user_waiters;
 	__uint32_t	__has_kern_waiters;
 	__uint32_t	__flags;
 	__uint32_t	__clock_id;
 };
 
 struct pthread_cond_attr {
 	int		c_pshared;
 	int		c_clockid;
 };
 
 struct pthread_barrier {
 	struct umutex		b_lock;
 	struct ucond		b_cv;
 	int64_t			b_cycle;
 	int			b_count;
 	int			b_waiters;
 	int			b_refcount;
 	int			b_destroying;
 };
 
 struct pthread_barrierattr {
 	int		pshared;
 };
 
 struct pthread_spinlock {
 	struct umutex	s_lock;
 };
 
 /*
  * Flags for condition variables.
  */
 #define COND_FLAGS_PRIVATE	0x01
 #define COND_FLAGS_INITED	0x02
 #define COND_FLAGS_BUSY		0x04
 
 /*
  * Cleanup definitions.
  */
 struct pthread_cleanup {
 	struct pthread_cleanup	*prev;
 	void			(*routine)(void *);
 	void			*routine_arg;
 	int			onheap;
 };
 
 #define	THR_CLEANUP_PUSH(td, func, arg) {		\
 	struct pthread_cleanup __cup;			\
 							\
 	__cup.routine = func;				\
 	__cup.routine_arg = arg;			\
 	__cup.onheap = 0;				\
 	__cup.prev = (td)->cleanup;			\
 	(td)->cleanup = &__cup;
 
 #define	THR_CLEANUP_POP(td, exec)			\
 	(td)->cleanup = __cup.prev;			\
 	if ((exec) != 0)				\
 		__cup.routine(__cup.routine_arg);	\
 }
 
 struct pthread_atfork {
 	TAILQ_ENTRY(pthread_atfork) qe;
 	void (*prepare)(void);
 	void (*parent)(void);
 	void (*child)(void);
 };
 
 struct pthread_attr {
 #define pthread_attr_start_copy	sched_policy
 	int	sched_policy;
 	int	sched_inherit;
 	int	prio;
 	int	suspend;
 #define	THR_STACK_USER		0x100	/* 0xFF reserved for <pthread.h> */
 	int	flags;
 	void	*stackaddr_attr;
 	size_t	stacksize_attr;
 	size_t	guardsize_attr;
 #define pthread_attr_end_copy	cpuset
 	cpuset_t	*cpuset;
 	size_t	cpusetsize;
 };
 
 struct wake_addr {
 	struct wake_addr *link;
 	unsigned int	value;
 	char		pad[12];
 };
 
 struct sleepqueue {
 	TAILQ_HEAD(, pthread)    sq_blocked;
 	SLIST_HEAD(, sleepqueue) sq_freeq;
 	LIST_ENTRY(sleepqueue)   sq_hash;
 	SLIST_ENTRY(sleepqueue)  sq_flink;
 	void			 *sq_wchan;
 	int			 sq_type;
 };
 
 /*
  * Thread creation state attributes.
  */
 #define THR_CREATE_RUNNING		0
 #define THR_CREATE_SUSPENDED		1
 
 /*
  * Miscellaneous definitions.
  */
 #define THR_STACK_DEFAULT		(sizeof(void *) / 4 * 1024 * 1024)
 
 /*
  * Maximum size of initial thread's stack.  This perhaps deserves to be larger
  * than the stacks of other threads, since many applications are likely to run
  * almost entirely on this stack.
  */
 #define THR_STACK_INITIAL		(THR_STACK_DEFAULT * 2)
 
 /*
  * Define priorities returned by kernel.
  */
 #define THR_MIN_PRIORITY		(_thr_priorities[SCHED_OTHER-1].pri_min)
 #define THR_MAX_PRIORITY		(_thr_priorities[SCHED_OTHER-1].pri_max)
 #define THR_DEF_PRIORITY		(_thr_priorities[SCHED_OTHER-1].pri_default)
 
 #define THR_MIN_RR_PRIORITY		(_thr_priorities[SCHED_RR-1].pri_min)
 #define THR_MAX_RR_PRIORITY		(_thr_priorities[SCHED_RR-1].pri_max)
 #define THR_DEF_RR_PRIORITY		(_thr_priorities[SCHED_RR-1].pri_default)
 
 /* XXX The SCHED_FIFO should have same priority range as SCHED_RR */
-#define THR_MIN_FIFO_PRIORITY		(_thr_priorities[SCHED_FIFO_1].pri_min)
+#define THR_MIN_FIFO_PRIORITY		(_thr_priorities[SCHED_FIFO-1].pri_min)
 #define THR_MAX_FIFO_PRIORITY		(_thr_priorities[SCHED_FIFO-1].pri_max)
 #define THR_DEF_FIFO_PRIORITY		(_thr_priorities[SCHED_FIFO-1].pri_default)
 
 struct pthread_prio {
 	int	pri_min;
 	int	pri_max;
 	int	pri_default;
 };
 
 struct pthread_rwlockattr {
 	int		pshared;
 };
 
 struct pthread_rwlock {
 	struct urwlock 	lock;
 	uint32_t	owner;
 };
 
 /*
  * Thread states.
  */
 enum pthread_state {
 	PS_RUNNING,
 	PS_DEAD
 };
 
 struct pthread_specific_elem {
 	const void	*data;
 	int		seqno;
 };
 
 struct pthread_key {
 	volatile int	allocated;
 	int		seqno;
 	void            (*destructor)(void *);
 };
 
 /*
  * lwpid_t is 32bit but kernel thr API exports tid as long type
  * to preserve the ABI for M:N model in very early date (r131431).
  */
 #define TID(thread)	((uint32_t) ((thread)->tid))
 
 /*
  * Thread structure.
  */
 struct pthread {
 #define _pthread_startzero	tid
 	/* Kernel thread id. */
 	long			tid;
 #define	TID_TERMINATED		1
 
 	/*
 	 * Lock for accesses to this thread structure.
 	 */
 	struct umutex		lock;
 
 	/* Internal condition variable cycle number. */
 	uint32_t		cycle;
 
 	/* How many low level locks the thread held. */
 	int			locklevel;
 
 	/*
 	 * Set to non-zero when this thread has entered a critical
 	 * region.  We allow for recursive entries into critical regions.
 	 */
 	int			critical_count;
 
 	/* Signal blocked counter. */
 	int			sigblock;
 
 	/* Queue entry for list of all threads. */
 	TAILQ_ENTRY(pthread)	tle;	/* link for all threads in process */
 
 	/* Queue entry for GC lists. */
 	TAILQ_ENTRY(pthread)	gcle;
 
 	/* Hash queue entry. */
 	LIST_ENTRY(pthread)	hle;
 
 	/* Sleep queue entry */
 	TAILQ_ENTRY(pthread)    wle;
 
 	/* Threads reference count. */
 	int			refcount;
 
 	/*
 	 * Thread start routine, argument, stack pointer and thread
 	 * attributes.
 	 */
 	void			*(*start_routine)(void *);
 	void			*arg;
 	struct pthread_attr	attr;
 
 #define	SHOULD_CANCEL(thr)					\
 	((thr)->cancel_pending && (thr)->cancel_enable &&	\
 	 (thr)->no_cancel == 0)
 
 	/* Cancellation is enabled */
 	int			cancel_enable;
 
 	/* Cancellation request is pending */
 	int			cancel_pending;
 
 	/* Thread is at cancellation point */
 	int			cancel_point;
 
 	/* Cancellation is temporarily disabled */
 	int			no_cancel;
 
-	/* Asynchronouse cancellation is enabled */
+	/* Asynchronous cancellation is enabled */
 	int			cancel_async;
 
 	/* Cancellation is in progress */
 	int			cancelling;
 
 	/* Thread temporary signal mask. */
 	sigset_t		sigmask;
 
 	/* Thread should unblock SIGCANCEL. */
 	int			unblock_sigcancel;
 
 	/* In sigsuspend state */
 	int			in_sigsuspend;
 
 	/* deferred signal info	*/
 	siginfo_t		deferred_siginfo;
 
 	/* signal mask to restore. */
 	sigset_t		deferred_sigmask;
 
 	/* the sigaction should be used for deferred signal. */
 	struct sigaction	deferred_sigact;
 
 	/* deferred signal delivery is performed, do not reenter. */
 	int			deferred_run;
 
 	/* Force new thread to exit. */
 	int			force_exit;
 
 	/* Thread state: */
 	enum pthread_state 	state;
 
 	/*
 	 * Error variable used instead of errno. The function __error()
 	 * returns a pointer to this. 
 	 */
 	int			error;
 
 	/*
 	 * The joiner is the thread that is joining to this thread.  The
 	 * join status keeps track of a join operation to another thread.
 	 */
 	struct pthread		*joiner;
 
 	/* Miscellaneous flags; only set with scheduling lock held. */
 	int			flags;
 #define THR_FLAGS_PRIVATE	0x0001
 #define	THR_FLAGS_NEED_SUSPEND	0x0002	/* thread should be suspended */
 #define	THR_FLAGS_SUSPENDED	0x0004	/* thread is suspended */
 #define	THR_FLAGS_DETACHED	0x0008	/* thread is detached */
 
 	/* Thread list flags; only set with thread list lock held. */
 	int			tlflags;
 #define	TLFLAGS_GC_SAFE		0x0001	/* thread safe for cleaning */
 #define	TLFLAGS_IN_TDLIST	0x0002	/* thread in all thread list */
 #define	TLFLAGS_IN_GCLIST	0x0004	/* thread in gc list */
 
 	/*
 	 * Queues of the owned mutexes.  Private queue must have index
 	 * + 1 of the corresponding full queue.
 	 */
 #define	TMQ_NORM		0	/* NORMAL or PRIO_INHERIT normal */
 #define	TMQ_NORM_PRIV		1	/* NORMAL or PRIO_INHERIT normal priv */
 #define	TMQ_NORM_PP		2	/* PRIO_PROTECT normal mutexes */
 #define	TMQ_NORM_PP_PRIV	3	/* PRIO_PROTECT normal priv */
-#define	TMQ_NITEMS		4
+#define	TMQ_ROBUST_PP		4	/* PRIO_PROTECT robust mutexes */
+#define	TMQ_ROBUST_PP_PRIV	5	/* PRIO_PROTECT robust priv */	
+#define	TMQ_NITEMS		6
 	struct mutex_queue	mq[TMQ_NITEMS];
 
 	void				*ret;
 	struct pthread_specific_elem	*specific;
 	int				specific_data_count;
 
-	/* Number rwlocks rdlocks held. */
+	/* Number of rwlock read locks held. */
 	int			rdlock_count;
 
 	/*
 	 * Current locks bitmap for rtld. */
 	int			rtld_bits;
 
 	/* Thread control block */
 	struct tcb		*tcb;
 
 	/* Cleanup handlers Link List */
 	struct pthread_cleanup	*cleanup;
 
 #ifdef _PTHREAD_FORCED_UNWIND
 	struct _Unwind_Exception	ex;
 	void			*unwind_stackend;
 	int			unwind_disabled;
 #endif
 
 	/*
 	 * Magic value to help recognize a valid thread structure
 	 * from an invalid one:
 	 */
 #define	THR_MAGIC		((u_int32_t) 0xd09ba115)
 	u_int32_t		magic;
 
 	/* Enable event reporting */
 	int			report_events;
 
 	/* Event mask */
 	int			event_mask;
 
 	/* Event */
 	td_event_msg_t		event_buf;
 
 	/* Wait channel */
 	void			*wchan;
 
 	/* Referenced mutex. */
 	struct pthread_mutex	*mutex_obj;
 
 	/* Thread will sleep. */
 	int			will_sleep;
 
 	/* Number of threads deferred. */
 	int			nwaiter_defer;
 
+	int			robust_inited;
+	uintptr_t		robust_list;
+	uintptr_t		priv_robust_list;
+	uintptr_t		inact_mtx;
+
 	/* Deferred threads from pthread_cond_signal. */
 	unsigned int 		*defer_waiters[MAX_DEFER_WAITERS];
 #define _pthread_endzero	wake_addr
 
 	struct wake_addr	*wake_addr;
 #define WAKE_ADDR(td)           ((td)->wake_addr)
 
 	/* Sleep queue */
 	struct	sleepqueue	*sleepqueue;
 
 };
 
 #define THR_SHOULD_GC(thrd) 						\
 	((thrd)->refcount == 0 && (thrd)->state == PS_DEAD &&		\
 	 ((thrd)->flags & THR_FLAGS_DETACHED) != 0)
 
 #define	THR_IN_CRITICAL(thrd)				\
 	(((thrd)->locklevel > 0) ||			\
 	((thrd)->critical_count > 0))
 
 #define	THR_CRITICAL_ENTER(thrd)			\
 	(thrd)->critical_count++
 
 #define	THR_CRITICAL_LEAVE(thrd)			\
 	do {						\
 		(thrd)->critical_count--;		\
 		_thr_ast(thrd);				\
 	} while (0)
 
 #define THR_UMUTEX_TRYLOCK(thrd, lck)			\
 	_thr_umutex_trylock((lck), TID(thrd))
 
 #define	THR_UMUTEX_LOCK(thrd, lck)			\
 	_thr_umutex_lock((lck), TID(thrd))
 
 #define	THR_UMUTEX_TIMEDLOCK(thrd, lck, timo)		\
 	_thr_umutex_timedlock((lck), TID(thrd), (timo))
 
 #define	THR_UMUTEX_UNLOCK(thrd, lck)			\
 	_thr_umutex_unlock((lck), TID(thrd))
 
 #define	THR_LOCK_ACQUIRE(thrd, lck)			\
 do {							\
 	(thrd)->locklevel++;				\
 	_thr_umutex_lock(lck, TID(thrd));		\
 } while (0)
 
 #define	THR_LOCK_ACQUIRE_SPIN(thrd, lck)		\
 do {							\
 	(thrd)->locklevel++;				\
 	_thr_umutex_lock_spin(lck, TID(thrd));		\
 } while (0)
 
 #ifdef	_PTHREADS_INVARIANTS
 #define	THR_ASSERT_LOCKLEVEL(thrd)			\
 do {							\
 	if (__predict_false((thrd)->locklevel <= 0))	\
 		_thr_assert_lock_level();		\
 } while (0)
 #else
 #define THR_ASSERT_LOCKLEVEL(thrd)
 #endif
 
 #define	THR_LOCK_RELEASE(thrd, lck)			\
 do {							\
 	THR_ASSERT_LOCKLEVEL(thrd);			\
 	_thr_umutex_unlock((lck), TID(thrd));		\
 	(thrd)->locklevel--;				\
 	_thr_ast(thrd);					\
 } while (0)
 
 #define	THR_LOCK(curthrd)		THR_LOCK_ACQUIRE(curthrd, &(curthrd)->lock)
 #define	THR_UNLOCK(curthrd)		THR_LOCK_RELEASE(curthrd, &(curthrd)->lock)
 #define	THR_THREAD_LOCK(curthrd, thr)	THR_LOCK_ACQUIRE(curthrd, &(thr)->lock)
 #define	THR_THREAD_UNLOCK(curthrd, thr)	THR_LOCK_RELEASE(curthrd, &(thr)->lock)
 
 #define	THREAD_LIST_RDLOCK(curthrd)				\
 do {								\
 	(curthrd)->locklevel++;					\
 	_thr_rwl_rdlock(&_thr_list_lock);			\
 } while (0)
 
 #define	THREAD_LIST_WRLOCK(curthrd)				\
 do {								\
 	(curthrd)->locklevel++;					\
 	_thr_rwl_wrlock(&_thr_list_lock);			\
 } while (0)
 
 #define	THREAD_LIST_UNLOCK(curthrd)				\
 do {								\
 	_thr_rwl_unlock(&_thr_list_lock);			\
 	(curthrd)->locklevel--;					\
 	_thr_ast(curthrd);					\
 } while (0)
 
 /*
  * Macros to insert/remove threads to the all thread list and
  * the gc list.
  */
 #define	THR_LIST_ADD(thrd) do {					\
 	if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) == 0) {	\
 		TAILQ_INSERT_HEAD(&_thread_list, thrd, tle);	\
 		_thr_hash_add(thrd);				\
 		(thrd)->tlflags |= TLFLAGS_IN_TDLIST;		\
 	}							\
 } while (0)
 #define	THR_LIST_REMOVE(thrd) do {				\
 	if (((thrd)->tlflags & TLFLAGS_IN_TDLIST) != 0) {	\
 		TAILQ_REMOVE(&_thread_list, thrd, tle);		\
 		_thr_hash_remove(thrd);				\
 		(thrd)->tlflags &= ~TLFLAGS_IN_TDLIST;		\
 	}							\
 } while (0)
 #define	THR_GCLIST_ADD(thrd) do {				\
 	if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) == 0) {	\
 		TAILQ_INSERT_HEAD(&_thread_gc_list, thrd, gcle);\
 		(thrd)->tlflags |= TLFLAGS_IN_GCLIST;		\
 		_gc_count++;					\
 	}							\
 } while (0)
 #define	THR_GCLIST_REMOVE(thrd) do {				\
 	if (((thrd)->tlflags & TLFLAGS_IN_GCLIST) != 0) {	\
 		TAILQ_REMOVE(&_thread_gc_list, thrd, gcle);	\
 		(thrd)->tlflags &= ~TLFLAGS_IN_GCLIST;		\
 		_gc_count--;					\
 	}							\
 } while (0)
 
 #define THR_REF_ADD(curthread, pthread) {			\
 	THR_CRITICAL_ENTER(curthread);				\
 	pthread->refcount++;					\
 } while (0)
 
 #define THR_REF_DEL(curthread, pthread) {			\
 	pthread->refcount--;					\
 	THR_CRITICAL_LEAVE(curthread);				\
 } while (0)
 
 #define GC_NEEDED()	(_gc_count >= 5)
 
 #define SHOULD_REPORT_EVENT(curthr, e)			\
-	(curthr->report_events && 			\
-	 (((curthr)->event_mask | _thread_event_mask ) & e) != 0)
+	((curthr)->report_events &&			\
+	 (((curthr)->event_mask | _thread_event_mask) & (e)) != 0)
 
 extern int __isthreaded;
 
 /*
  * Global variables for the pthread kernel.
  */
 
 extern char		*_usrstack __hidden;
 extern struct pthread	*_thr_initial __hidden;
 
 /* For debugger */
 extern int		_libthr_debug;
 extern int		_thread_event_mask;
 extern struct pthread	*_thread_last_event;
 
 /* List of all threads: */
 extern pthreadlist	_thread_list;
 
 /* List of threads needing GC: */
 extern pthreadlist	_thread_gc_list __hidden;
 
 extern int		_thread_active_threads;
 extern atfork_head	_thr_atfork_list __hidden;
 extern struct urwlock	_thr_atfork_lock __hidden;
 
 /* Default thread attributes: */
 extern struct pthread_attr _pthread_attr_default __hidden;
 
 /* Default mutex attributes: */
 extern struct pthread_mutex_attr _pthread_mutexattr_default __hidden;
 extern struct pthread_mutex_attr _pthread_mutexattr_adaptive_default __hidden;
 
 /* Default condition variable attributes: */
 extern struct pthread_cond_attr _pthread_condattr_default __hidden;
 
 extern struct pthread_prio _thr_priorities[] __hidden;
 
 extern int	_thr_is_smp __hidden;
 
 extern size_t	_thr_guard_default __hidden;
 extern size_t	_thr_stack_default __hidden;
 extern size_t	_thr_stack_initial __hidden;
 extern int	_thr_page_size __hidden;
 extern int	_thr_spinloops __hidden;
 extern int	_thr_yieldloops __hidden;
 extern int	_thr_queuefifo __hidden;
 
 /* Garbage thread count. */
 extern int	_gc_count __hidden;
 
 extern struct umutex	_mutex_static_lock __hidden;
 extern struct umutex	_cond_static_lock __hidden;
 extern struct umutex	_rwlock_static_lock __hidden;
 extern struct umutex	_keytable_lock __hidden;
 extern struct urwlock	_thr_list_lock __hidden;
 extern struct umutex	_thr_event_lock __hidden;
 extern struct umutex	_suspend_all_lock __hidden;
 extern int		_suspend_all_waiters __hidden;
 extern int		_suspend_all_cycle __hidden;
 extern struct pthread	*_single_thread __hidden;
 
 /*
  * Function prototype definitions.
  */
 __BEGIN_DECLS
 int	_thr_setthreaded(int) __hidden;
-int	_mutex_cv_lock(struct pthread_mutex *, int) __hidden;
+int	_mutex_cv_lock(struct pthread_mutex *, int, bool) __hidden;
 int	_mutex_cv_unlock(struct pthread_mutex *, int *, int *) __hidden;
 int     _mutex_cv_attach(struct pthread_mutex *, int) __hidden;
 int     _mutex_cv_detach(struct pthread_mutex *, int *) __hidden;
 int     _mutex_owned(struct pthread *, const struct pthread_mutex *) __hidden;
 int	_mutex_reinit(pthread_mutex_t *) __hidden;
 void	_mutex_fork(struct pthread *curthread) __hidden;
+int	_mutex_enter_robust(struct pthread *curthread, struct pthread_mutex *m)
+	    __hidden;
+void	_mutex_leave_robust(struct pthread *curthread, struct pthread_mutex *m)
+	    __hidden;
 void	_libpthread_init(struct pthread *) __hidden;
 struct pthread *_thr_alloc(struct pthread *) __hidden;
 void	_thread_exit(const char *, int, const char *) __hidden __dead2;
 int	_thr_ref_add(struct pthread *, struct pthread *, int) __hidden;
 void	_thr_ref_delete(struct pthread *, struct pthread *) __hidden;
 void	_thr_ref_delete_unlocked(struct pthread *, struct pthread *) __hidden;
 int	_thr_find_thread(struct pthread *, struct pthread *, int) __hidden;
 void	_thr_rtld_init(void) __hidden;
 void	_thr_rtld_postfork_child(void) __hidden;
 int	_thr_stack_alloc(struct pthread_attr *) __hidden;
 void	_thr_stack_free(struct pthread_attr *) __hidden;
 void	_thr_free(struct pthread *, struct pthread *) __hidden;
 void	_thr_gc(struct pthread *) __hidden;
 void    _thread_cleanupspecific(void) __hidden;
 void	_thread_printf(int, const char *, ...) __hidden;
 void	_thr_spinlock_init(void) __hidden;
 void	_thr_cancel_enter(struct pthread *) __hidden;
 void	_thr_cancel_enter2(struct pthread *, int) __hidden;
 void	_thr_cancel_leave(struct pthread *, int) __hidden;
 void	_thr_testcancel(struct pthread *) __hidden;
 void	_thr_signal_block(struct pthread *) __hidden;
 void	_thr_signal_unblock(struct pthread *) __hidden;
 void	_thr_signal_init(int) __hidden;
 void	_thr_signal_deinit(void) __hidden;
 int	_thr_send_sig(struct pthread *, int sig) __hidden;
 void	_thr_list_init(void) __hidden;
 void	_thr_hash_add(struct pthread *) __hidden;
 void	_thr_hash_remove(struct pthread *) __hidden;
 struct pthread *_thr_hash_find(struct pthread *) __hidden;
 void	_thr_link(struct pthread *, struct pthread *) __hidden;
 void	_thr_unlink(struct pthread *, struct pthread *) __hidden;
 void	_thr_assert_lock_level(void) __hidden __dead2;
 void	_thr_ast(struct pthread *) __hidden;
 void	_thr_once_init(void) __hidden;
 void	_thr_report_creation(struct pthread *curthread,
 	    struct pthread *newthread) __hidden;
 void	_thr_report_death(struct pthread *curthread) __hidden;
 int	_thr_getscheduler(lwpid_t, int *, struct sched_param *) __hidden;
 int	_thr_setscheduler(lwpid_t, int, const struct sched_param *) __hidden;
 void	_thr_signal_prefork(void) __hidden;
 void	_thr_signal_postfork(void) __hidden;
 void	_thr_signal_postfork_child(void) __hidden;
 void	_thr_suspend_all_lock(struct pthread *) __hidden;
 void	_thr_suspend_all_unlock(struct pthread *) __hidden;
 void	_thr_try_gc(struct pthread *, struct pthread *) __hidden;
 int	_rtp_to_schedparam(const struct rtprio *rtp, int *policy,
 		struct sched_param *param) __hidden;
 int	_schedparam_to_rtp(int policy, const struct sched_param *param,
 		struct rtprio *rtp) __hidden;
 void	_thread_bp_create(void);
 void	_thread_bp_death(void);
 int	_sched_yield(void);
 
 void	_pthread_cleanup_push(void (*)(void *), void *);
 void	_pthread_cleanup_pop(int);
 void	_pthread_exit_mask(void *status, sigset_t *mask) __dead2 __hidden;
 void	_pthread_cancel_enter(int maycancel);
 void 	_pthread_cancel_leave(int maycancel);
+int	_pthread_mutex_consistent(pthread_mutex_t *) __nonnull(1);
+int	_pthread_mutexattr_getrobust(pthread_mutexattr_t *__restrict,
+	    int *__restrict) __nonnull_all;
+int	_pthread_mutexattr_setrobust(pthread_mutexattr_t *, int)
+	    __nonnull(1);
 
 /* #include <fcntl.h> */
 #ifdef  _SYS_FCNTL_H_
 int     __sys_fcntl(int, int, ...);
 int     __sys_openat(int, const char *, int, ...);
 #endif
 
 /* #include <signal.h> */
 #ifdef _SIGNAL_H_
 int	__sys_kill(pid_t, int);
 int     __sys_sigaction(int, const struct sigaction *, struct sigaction *);
 int     __sys_sigpending(sigset_t *);
 int     __sys_sigprocmask(int, const sigset_t *, sigset_t *);
 int     __sys_sigsuspend(const sigset_t *);
 int     __sys_sigreturn(const ucontext_t *);
 int     __sys_sigaltstack(const struct sigaltstack *, struct sigaltstack *);
 int	__sys_sigwait(const sigset_t *, int *);
 int	__sys_sigtimedwait(const sigset_t *, siginfo_t *,
 		const struct timespec *);
 int	__sys_sigwaitinfo(const sigset_t *set, siginfo_t *info);
 #endif
 
 /* #include <time.h> */
 #ifdef	_TIME_H_
 int	__sys_nanosleep(const struct timespec *, struct timespec *);
 #endif
 
 /* #include <sys/ucontext.h> */
 #ifdef _SYS_UCONTEXT_H_
 int	__sys_setcontext(const ucontext_t *ucp);
 int	__sys_swapcontext(ucontext_t *oucp, const ucontext_t *ucp);
 #endif
 
 /* #include <unistd.h> */
 #ifdef  _UNISTD_H_
 int     __sys_close(int);
 int	__sys_fork(void);
 pid_t	__sys_getpid(void);
 ssize_t __sys_read(int, void *, size_t);
 void	__sys_exit(int);
 #endif
 
 static inline int
 _thr_isthreaded(void)
 {
 	return (__isthreaded != 0);
 }
 
 static inline int
 _thr_is_inited(void)
 {
 	return (_thr_initial != NULL);
 }
 
 static inline void
 _thr_check_init(void)
 {
 	if (_thr_initial == NULL)
 		_libpthread_init(NULL);
 }
 
 struct wake_addr *_thr_alloc_wake_addr(void);
 void	_thr_release_wake_addr(struct wake_addr *);
 int	_thr_sleep(struct pthread *, int, const struct timespec *);
 
 void _thr_wake_addr_init(void) __hidden;
 
 static inline void
 _thr_clear_wake(struct pthread *td)
 {
 	td->wake_addr->value = 0;
 }
 
 static inline int
 _thr_is_woken(struct pthread *td)
 {
 	return td->wake_addr->value != 0;
 }
 
 static inline void
 _thr_set_wake(unsigned int *waddr)
 {
 	*waddr = 1;
 	_thr_umtx_wake(waddr, INT_MAX, 0);
 }
 
 void _thr_wake_all(unsigned int *waddrs[], int) __hidden;
 
 static inline struct pthread *
 _sleepq_first(struct sleepqueue *sq)
 {
 	return TAILQ_FIRST(&sq->sq_blocked);
 }
 
 void	_sleepq_init(void) __hidden;
 struct sleepqueue *_sleepq_alloc(void) __hidden;
 void	_sleepq_free(struct sleepqueue *) __hidden;
 void	_sleepq_lock(void *) __hidden;
 void	_sleepq_unlock(void *) __hidden;
 struct sleepqueue *_sleepq_lookup(void *) __hidden;
 void	_sleepq_add(void *, struct pthread *) __hidden;
 int	_sleepq_remove(struct sleepqueue *, struct pthread *) __hidden;
 void	_sleepq_drop(struct sleepqueue *,
 		void (*cb)(struct pthread *, void *arg), void *) __hidden;
 
 int	_pthread_mutex_init_calloc_cb(pthread_mutex_t *mutex,
 	    void *(calloc_cb)(size_t, size_t));
 
 struct dl_phdr_info;
 void __pthread_cxa_finalize(struct dl_phdr_info *phdr_info);
 void _thr_tsd_unload(struct dl_phdr_info *phdr_info) __hidden;
 void _thr_sigact_unload(struct dl_phdr_info *phdr_info) __hidden;
 void _thr_stack_fix_protection(struct pthread *thrd);
 
 int *__error_threaded(void) __hidden;
 void __thr_interpose_libc(void) __hidden;
 pid_t __thr_fork(void);
 int __thr_setcontext(const ucontext_t *ucp);
 int __thr_sigaction(int sig, const struct sigaction *act,
     struct sigaction *oact) __hidden;
 int __thr_sigprocmask(int how, const sigset_t *set, sigset_t *oset);
 int __thr_sigsuspend(const sigset_t * set);
 int __thr_sigtimedwait(const sigset_t *set, siginfo_t *info,
     const struct timespec * timeout);
 int __thr_sigwait(const sigset_t *set, int *sig);
 int __thr_sigwaitinfo(const sigset_t *set, siginfo_t *info);
 int __thr_swapcontext(ucontext_t *oucp, const ucontext_t *ucp);
 
 void __thr_map_stacks_exec(void);
 
 struct _spinlock;
 void __thr_spinunlock(struct _spinlock *lck);
 void __thr_spinlock(struct _spinlock *lck);
 
 struct tcb *_tcb_ctor(struct pthread *, int);
 void	_tcb_dtor(struct tcb *);
 
 void __thr_pshared_init(void) __hidden;
 void *__thr_pshared_offpage(void *key, int doalloc) __hidden;
 void __thr_pshared_destroy(void *key) __hidden;
 void __thr_pshared_atfork_pre(void) __hidden;
 void __thr_pshared_atfork_post(void) __hidden;
 
 __END_DECLS
 
 #endif  /* !_THR_PRIVATE_H */
Index: head/lib/libthr/thread/thr_umtx.c
===================================================================
--- head/lib/libthr/thread/thr_umtx.c	(revision 300042)
+++ head/lib/libthr/thread/thr_umtx.c	(revision 300043)
@@ -1,343 +1,374 @@
 /*
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "thr_private.h"
 #include "thr_umtx.h"
 
 #ifndef HAS__UMTX_OP_ERR
 int _umtx_op_err(void *obj, int op, u_long val, void *uaddr, void *uaddr2)
 {
+
 	if (_umtx_op(obj, op, val, uaddr, uaddr2) == -1)
 		return (errno);
 	return (0);
 }
 #endif
 
 void
 _thr_umutex_init(struct umutex *mtx)
 {
 	static const struct umutex default_mtx = DEFAULT_UMUTEX;
 
 	*mtx = default_mtx;
 }
 
 void
 _thr_urwlock_init(struct urwlock *rwl)
 {
 	static const struct urwlock default_rwl = DEFAULT_URWLOCK;
 
 	*rwl = default_rwl;
 }
 
 int
 __thr_umutex_lock(struct umutex *mtx, uint32_t id)
 {
 	uint32_t owner;
 
-	if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0) {
-		for (;;) {
-			/* wait in kernel */
-			_umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, 0, 0);
+	if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0)
+		return (_umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, 0, 0));
 
-			owner = mtx->m_owner;
-			if ((owner & ~UMUTEX_CONTESTED) == 0 &&
-			     atomic_cmpset_acq_32(&mtx->m_owner, owner, id|owner))
-				return (0);
-		}
-	}
+	for (;;) {
+		owner = mtx->m_owner;
+		if ((owner & ~UMUTEX_CONTESTED) == 0 &&
+		    atomic_cmpset_acq_32(&mtx->m_owner, owner, id | owner))
+			return (0);
+		if (__predict_false(owner == UMUTEX_RB_OWNERDEAD) &&
+		    atomic_cmpset_acq_32(&mtx->m_owner, owner,
+		    id | UMUTEX_CONTESTED))
+			return (EOWNERDEAD);	/* lock is held by id now */
+		if (__predict_false(owner == UMUTEX_RB_NOTRECOV))
+			return (ENOTRECOVERABLE);
 
-	return	_umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, 0, 0);
+		/* wait in kernel */
+		_umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, 0, 0);
+	}
 }
 
 #define SPINLOOPS 1000
 
 int
 __thr_umutex_lock_spin(struct umutex *mtx, uint32_t id)
 {
 	uint32_t owner;
+	int count;
 
 	if (!_thr_is_smp)
-		return __thr_umutex_lock(mtx, id);
+		return (__thr_umutex_lock(mtx, id));
+	if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0)
+		return (_umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, 0, 0));
 
-	if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0) {
-		for (;;) {
-			int count = SPINLOOPS;
-			while (count--) {
-				owner = mtx->m_owner;
-				if ((owner & ~UMUTEX_CONTESTED) == 0) {
-					if (atomic_cmpset_acq_32(
-					    &mtx->m_owner,
-					    owner, id|owner)) {
-						return (0);
-					}
-				}
-				CPU_SPINWAIT;
-			}
-
-			/* wait in kernel */
-			_umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, 0, 0);
+	for (;;) {
+		count = SPINLOOPS;	/* userland retries before sleeping */
+		while (count--) {
+			owner = mtx->m_owner;
+			if ((owner & ~UMUTEX_CONTESTED) == 0 &&
+			    atomic_cmpset_acq_32(&mtx->m_owner, owner,
+			    id | owner))
+				return (0);
+			if (__predict_false(owner == UMUTEX_RB_OWNERDEAD) &&
+			    atomic_cmpset_acq_32(&mtx->m_owner, owner,
+			    id | UMUTEX_CONTESTED))
+				return (EOWNERDEAD);
+			if (__predict_false(owner == UMUTEX_RB_NOTRECOV))
+				return (ENOTRECOVERABLE);
+			CPU_SPINWAIT;
 		}
-	}
 
-	return	_umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, 0, 0);
+		/* wait in kernel */
+		_umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0, 0, 0);
+	}
 }
 
 int
 __thr_umutex_timedlock(struct umutex *mtx, uint32_t id,
 	const struct timespec *abstime)
 {
 	struct _umtx_time *tm_p, timeout;
 	size_t tm_size;
 	uint32_t owner;
 	int ret;
 
 	if (abstime == NULL) {
 		tm_p = NULL;
 		tm_size = 0;
 	} else {
 		timeout._clockid = CLOCK_REALTIME;
 		timeout._flags = UMTX_ABSTIME;
 		timeout._timeout = *abstime;
 		tm_p = &timeout;
 		tm_size = sizeof(timeout);
 	}
 
 	for (;;) {
-		if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0) {
-
-			/* wait in kernel */
-			ret = _umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0,
-				 (void *)tm_size, __DECONST(void *, tm_p));
-
-			/* now try to lock it */
+		if ((mtx->m_flags & (UMUTEX_PRIO_PROTECT |
+		    UMUTEX_PRIO_INHERIT)) == 0) {
+			/* try to lock it */
 			owner = mtx->m_owner;
 			if ((owner & ~UMUTEX_CONTESTED) == 0 &&
-			     atomic_cmpset_acq_32(&mtx->m_owner, owner, id|owner))
+			     atomic_cmpset_acq_32(&mtx->m_owner, owner,
+			     id | owner))
 				return (0);
+			if (__predict_false(owner == UMUTEX_RB_OWNERDEAD) &&
+			     atomic_cmpset_acq_32(&mtx->m_owner, owner,
+			     id | UMUTEX_CONTESTED))
+				return (EOWNERDEAD);
+			if (__predict_false(owner == UMUTEX_RB_NOTRECOV))
+				return (ENOTRECOVERABLE);
+			/* wait in kernel */
+			ret = _umtx_op_err(mtx, UMTX_OP_MUTEX_WAIT, 0,
+			    (void *)tm_size, __DECONST(void *, tm_p));
 		} else {
 			ret = _umtx_op_err(mtx, UMTX_OP_MUTEX_LOCK, 0, 
-				 (void *)tm_size, __DECONST(void *, tm_p));
-			if (ret == 0)
+			    (void *)tm_size, __DECONST(void *, tm_p));
+			if (ret == 0 || ret == EOWNERDEAD ||
+			    ret == ENOTRECOVERABLE)
 				break;
 		}
 		if (ret == ETIMEDOUT)
 			break;
 	}
 	return (ret);
 }
 
 int
 __thr_umutex_unlock(struct umutex *mtx, uint32_t id)
 {
-	return _umtx_op_err(mtx, UMTX_OP_MUTEX_UNLOCK, 0, 0, 0);
+
+	return (_umtx_op_err(mtx, UMTX_OP_MUTEX_UNLOCK, 0, 0, 0));
 }
 
 int
 __thr_umutex_trylock(struct umutex *mtx)
 {
-	return _umtx_op_err(mtx, UMTX_OP_MUTEX_TRYLOCK, 0, 0, 0);
+
+	return (_umtx_op_err(mtx, UMTX_OP_MUTEX_TRYLOCK, 0, 0, 0));
 }
 
 int
 __thr_umutex_set_ceiling(struct umutex *mtx, uint32_t ceiling,
-	uint32_t *oldceiling)
+    uint32_t *oldceiling)
 {
-	return _umtx_op_err(mtx, UMTX_OP_SET_CEILING, ceiling, oldceiling, 0);
+
+	return (_umtx_op_err(mtx, UMTX_OP_SET_CEILING, ceiling, oldceiling, 0));
 }
 
 int
 _thr_umtx_wait(volatile long *mtx, long id, const struct timespec *timeout)
 {
+
 	if (timeout && (timeout->tv_sec < 0 || (timeout->tv_sec == 0 &&
-		timeout->tv_nsec <= 0)))
+	    timeout->tv_nsec <= 0)))
 		return (ETIMEDOUT);
-	return _umtx_op_err(__DEVOLATILE(void *, mtx), UMTX_OP_WAIT, id, 0,
-		__DECONST(void*, timeout));
+	return (_umtx_op_err(__DEVOLATILE(void *, mtx), UMTX_OP_WAIT, id, 0,
+	    __DECONST(void*, timeout)));
 }
 
 int
-_thr_umtx_wait_uint(volatile u_int *mtx, u_int id, const struct timespec *timeout, int shared)
+_thr_umtx_wait_uint(volatile u_int *mtx, u_int id,
+    const struct timespec *timeout, int shared)
 {
+
 	if (timeout && (timeout->tv_sec < 0 || (timeout->tv_sec == 0 &&
-		timeout->tv_nsec <= 0)))
+	    timeout->tv_nsec <= 0)))
 		return (ETIMEDOUT);
-	return _umtx_op_err(__DEVOLATILE(void *, mtx), 
-			shared ? UMTX_OP_WAIT_UINT : UMTX_OP_WAIT_UINT_PRIVATE, id, 0,
-			__DECONST(void*, timeout));
+	return (_umtx_op_err(__DEVOLATILE(void *, mtx), shared ?
+	    UMTX_OP_WAIT_UINT : UMTX_OP_WAIT_UINT_PRIVATE, id, 0,
+	    __DECONST(void*, timeout)));
 }
 
 int
 _thr_umtx_timedwait_uint(volatile u_int *mtx, u_int id, int clockid,
-	const struct timespec *abstime, int shared)
+    const struct timespec *abstime, int shared)
 {
 	struct _umtx_time *tm_p, timeout;
 	size_t tm_size;
 
 	if (abstime == NULL) {
 		tm_p = NULL;
 		tm_size = 0;
 	} else {
 		timeout._clockid = clockid;
 		timeout._flags = UMTX_ABSTIME;
 		timeout._timeout = *abstime;
 		tm_p = &timeout;
 		tm_size = sizeof(timeout);
 	}
 
-	return _umtx_op_err(__DEVOLATILE(void *, mtx), 
-		shared ? UMTX_OP_WAIT_UINT : UMTX_OP_WAIT_UINT_PRIVATE, id, 
-		(void *)tm_size, __DECONST(void *, tm_p));
+	return (_umtx_op_err(__DEVOLATILE(void *, mtx), shared ?
+	    UMTX_OP_WAIT_UINT : UMTX_OP_WAIT_UINT_PRIVATE, id,
+	    (void *)tm_size, __DECONST(void *, tm_p)));
 }
 
 int
 _thr_umtx_wake(volatile void *mtx, int nr_wakeup, int shared)
 {
-	return _umtx_op_err(__DEVOLATILE(void *, mtx), shared ? UMTX_OP_WAKE : UMTX_OP_WAKE_PRIVATE,
-		nr_wakeup, 0, 0);
+
+	return (_umtx_op_err(__DEVOLATILE(void *, mtx), shared ?
+	    UMTX_OP_WAKE : UMTX_OP_WAKE_PRIVATE, nr_wakeup, 0, 0));
 }
 
 void
 _thr_ucond_init(struct ucond *cv)
 {
+
 	bzero(cv, sizeof(struct ucond));
 }
 
 int
 _thr_ucond_wait(struct ucond *cv, struct umutex *m,
 	const struct timespec *timeout, int flags)
 {
+	struct pthread *curthread;
+
 	if (timeout && (timeout->tv_sec < 0 || (timeout->tv_sec == 0 &&
 	    timeout->tv_nsec <= 0))) {
-		struct pthread *curthread = _get_curthread();
+		curthread = _get_curthread();
 		_thr_umutex_unlock(m, TID(curthread));
                 return (ETIMEDOUT);
 	}
-	return _umtx_op_err(cv, UMTX_OP_CV_WAIT, flags,
-		     m, __DECONST(void*, timeout));
+	return (_umtx_op_err(cv, UMTX_OP_CV_WAIT, flags, m,
+	    __DECONST(void*, timeout)));
 }
  
 int
 _thr_ucond_signal(struct ucond *cv)
 {
+
 	if (!cv->c_has_waiters)
 		return (0);
-	return _umtx_op_err(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
+	return (_umtx_op_err(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL));
 }
 
 int
 _thr_ucond_broadcast(struct ucond *cv)
 {
+
 	if (!cv->c_has_waiters)
 		return (0);
-	return _umtx_op_err(cv, UMTX_OP_CV_BROADCAST, 0, NULL, NULL);
+	return (_umtx_op_err(cv, UMTX_OP_CV_BROADCAST, 0, NULL, NULL));
 }
 
 int
 __thr_rwlock_rdlock(struct urwlock *rwlock, int flags,
 	const struct timespec *tsp)
 {
 	struct _umtx_time timeout, *tm_p;
 	size_t tm_size;
 
 	if (tsp == NULL) {
 		tm_p = NULL;
 		tm_size = 0;
 	} else {
 		timeout._timeout = *tsp;
 		timeout._flags = UMTX_ABSTIME;
 		timeout._clockid = CLOCK_REALTIME;
 		tm_p = &timeout;
 		tm_size = sizeof(timeout);
 	}
-	return _umtx_op_err(rwlock, UMTX_OP_RW_RDLOCK, flags, (void *)tm_size, tm_p);
+	return (_umtx_op_err(rwlock, UMTX_OP_RW_RDLOCK, flags,
+	    (void *)tm_size, tm_p));
 }
 
 int
 __thr_rwlock_wrlock(struct urwlock *rwlock, const struct timespec *tsp)
 {
 	struct _umtx_time timeout, *tm_p;
 	size_t tm_size;
 
 	if (tsp == NULL) {
 		tm_p = NULL;
 		tm_size = 0;
 	} else {
 		timeout._timeout = *tsp;
 		timeout._flags = UMTX_ABSTIME;
 		timeout._clockid = CLOCK_REALTIME;
 		tm_p = &timeout;
 		tm_size = sizeof(timeout);
 	}
-	return _umtx_op_err(rwlock, UMTX_OP_RW_WRLOCK, 0, (void *)tm_size, tm_p);
+	return (_umtx_op_err(rwlock, UMTX_OP_RW_WRLOCK, 0, (void *)tm_size,
+	    tm_p));
 }
 
 int
 __thr_rwlock_unlock(struct urwlock *rwlock)
 {
-	return _umtx_op_err(rwlock, UMTX_OP_RW_UNLOCK, 0, NULL, NULL);
+
+	return (_umtx_op_err(rwlock, UMTX_OP_RW_UNLOCK, 0, NULL, NULL));
 }
 
 void
 _thr_rwl_rdlock(struct urwlock *rwlock)
 {
 	int ret;
 
 	for (;;) {
 		if (_thr_rwlock_tryrdlock(rwlock, URWLOCK_PREFER_READER) == 0)
 			return;
 		ret = __thr_rwlock_rdlock(rwlock, URWLOCK_PREFER_READER, NULL);
 		if (ret == 0)
 			return;
 		if (ret != EINTR)
 			PANIC("rdlock error");
 	}
 }
 
 void
 _thr_rwl_wrlock(struct urwlock *rwlock)
 {
 	int ret;
 
 	for (;;) {
 		if (_thr_rwlock_trywrlock(rwlock) == 0)
 			return;
 		ret = __thr_rwlock_wrlock(rwlock, NULL);
 		if (ret == 0)
 			return;
 		if (ret != EINTR)
 			PANIC("wrlock error");
 	}
 }
 
 void
 _thr_rwl_unlock(struct urwlock *rwlock)
 {
+
 	if (_thr_rwlock_unlock(rwlock))
 		PANIC("unlock error");
 }
Index: head/lib/libthr/thread/thr_umtx.h
===================================================================
--- head/lib/libthr/thread/thr_umtx.h	(revision 300042)
+++ head/lib/libthr/thread/thr_umtx.h	(revision 300043)
@@ -1,234 +1,270 @@
 /*-
  * Copyright (c) 2005 David Xu <davidxu@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _THR_FBSD_UMTX_H_
 #define _THR_FBSD_UMTX_H_
 
 #include <strings.h>
 #include <sys/umtx.h>
 
-#define DEFAULT_UMUTEX	{0,0,{0,0},{0,0,0,0}}
+#ifdef __LP64__
+#define DEFAULT_UMUTEX	{0,0,{0,0},0,{0,0}}
+#else
+#define DEFAULT_UMUTEX	{0,0,{0,0},0,0,{0,0}}
+#endif
 #define DEFAULT_URWLOCK {0,0,0,0,{0,0,0,0}}
 
 int _umtx_op_err(void *, int op, u_long, void *, void *) __hidden;
 int __thr_umutex_lock(struct umutex *mtx, uint32_t id) __hidden;
 int __thr_umutex_lock_spin(struct umutex *mtx, uint32_t id) __hidden;
 int __thr_umutex_timedlock(struct umutex *mtx, uint32_t id,
 	const struct timespec *timeout) __hidden;
 int __thr_umutex_unlock(struct umutex *mtx, uint32_t id) __hidden;
 int __thr_umutex_trylock(struct umutex *mtx) __hidden;
 int __thr_umutex_set_ceiling(struct umutex *mtx, uint32_t ceiling,
 	uint32_t *oldceiling) __hidden;
 
 void _thr_umutex_init(struct umutex *mtx) __hidden;
 void _thr_urwlock_init(struct urwlock *rwl) __hidden;
 
 int _thr_umtx_wait(volatile long *mtx, long exp,
 	const struct timespec *timeout) __hidden;
 int _thr_umtx_wait_uint(volatile u_int *mtx, u_int exp,
 	const struct timespec *timeout, int shared) __hidden;
 int _thr_umtx_timedwait_uint(volatile u_int *mtx, u_int exp, int clockid,
 	const struct timespec *timeout, int shared) __hidden;
 int _thr_umtx_wake(volatile void *mtx, int count, int shared) __hidden;
 int _thr_ucond_wait(struct ucond *cv, struct umutex *m,
         const struct timespec *timeout, int flags) __hidden;
 void _thr_ucond_init(struct ucond *cv) __hidden;
 int _thr_ucond_signal(struct ucond *cv) __hidden;
 int _thr_ucond_broadcast(struct ucond *cv) __hidden;
 
 int __thr_rwlock_rdlock(struct urwlock *rwlock, int flags,
 	const struct timespec *tsp) __hidden;
 int __thr_rwlock_wrlock(struct urwlock *rwlock,
 	const struct timespec *tsp) __hidden;
 int __thr_rwlock_unlock(struct urwlock *rwlock) __hidden;
 
 /* Internal used only */
 void _thr_rwl_rdlock(struct urwlock *rwlock) __hidden;
 void _thr_rwl_wrlock(struct urwlock *rwlock) __hidden;
 void _thr_rwl_unlock(struct urwlock *rwlock) __hidden;
 
 static inline int
 _thr_umutex_trylock(struct umutex *mtx, uint32_t id)
 {
-    if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_UNOWNED, id))
-	return (0);
-    if ((mtx->m_flags & UMUTEX_PRIO_PROTECT) == 0)
-    	return (EBUSY);
-    return (__thr_umutex_trylock(mtx));
+
+	if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_UNOWNED, id))
+		return (0);		/* was unowned; id owns it now */
+	if (__predict_false((uint32_t)mtx->m_owner == UMUTEX_RB_OWNERDEAD) &&
+	    atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_RB_OWNERDEAD,
+	    id | UMUTEX_CONTESTED))
+		return (EOWNERDEAD);	/* acquired, m_owner was OWNERDEAD */
+	if (__predict_false((uint32_t)mtx->m_owner == UMUTEX_RB_NOTRECOV))
+		return (ENOTRECOVERABLE);
+	if ((mtx->m_flags & UMUTEX_PRIO_PROTECT) == 0)
+		return (EBUSY);
+	return (__thr_umutex_trylock(mtx));	/* PRIO_PROTECT: use kernel */
 }
 
 static inline int
 _thr_umutex_trylock2(struct umutex *mtx, uint32_t id)
 {
-    if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_UNOWNED, id) != 0)
-	return (0);
-    if ((uint32_t)mtx->m_owner == UMUTEX_CONTESTED &&
-        __predict_true((mtx->m_flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0))
-    	if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_CONTESTED, id | UMUTEX_CONTESTED))
+
+	if (atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_UNOWNED, id) != 0)
 		return (0);
-    return (EBUSY);
+	if ((uint32_t)mtx->m_owner == UMUTEX_CONTESTED &&
+	    __predict_true((mtx->m_flags & (UMUTEX_PRIO_PROTECT |
+	    UMUTEX_PRIO_INHERIT)) == 0) &&
+	    atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_CONTESTED,
+	    id | UMUTEX_CONTESTED))
+		return (0);
+	if (__predict_false((uint32_t)mtx->m_owner == UMUTEX_RB_OWNERDEAD) &&
+	    atomic_cmpset_acq_32(&mtx->m_owner, UMUTEX_RB_OWNERDEAD,
+	    id | UMUTEX_CONTESTED))
+		return (EOWNERDEAD);	/* acquired, m_owner was OWNERDEAD */
+	if (__predict_false((uint32_t)mtx->m_owner == UMUTEX_RB_NOTRECOV))
+		return (ENOTRECOVERABLE);
+	return (EBUSY);
 }
 
 static inline int
 _thr_umutex_lock(struct umutex *mtx, uint32_t id)
 {
-    if (_thr_umutex_trylock2(mtx, id) == 0)
-	return (0);
-    return (__thr_umutex_lock(mtx, id));
+	int ret;
+
+	ret = _thr_umutex_trylock2(mtx, id);	/* EOWNERDEAD: lock is held */
+	return (ret != EBUSY ? ret : __thr_umutex_lock(mtx, id));
 }
 
 static inline int
 _thr_umutex_lock_spin(struct umutex *mtx, uint32_t id)
 {
-    if (_thr_umutex_trylock2(mtx, id) == 0)
-	return (0);
-    return (__thr_umutex_lock_spin(mtx, id));
+	int ret;
+
+	ret = _thr_umutex_trylock2(mtx, id);	/* EOWNERDEAD: lock is held */
+	return (ret != EBUSY ? ret : __thr_umutex_lock_spin(mtx, id));
 }
 
 static inline int
 _thr_umutex_timedlock(struct umutex *mtx, uint32_t id,
-	const struct timespec *timeout)
+    const struct timespec *timeout)
 {
-    if (_thr_umutex_trylock2(mtx, id) == 0)
-	return (0);
-    return (__thr_umutex_timedlock(mtx, id, timeout));
+	int ret;
+
+	ret = _thr_umutex_trylock2(mtx, id);	/* EOWNERDEAD: lock is held */
+	return (ret != EBUSY ? ret : __thr_umutex_timedlock(mtx, id, timeout));
 }
 
 static inline int
 _thr_umutex_unlock2(struct umutex *mtx, uint32_t id, int *defer)
 {
-	uint32_t flags = mtx->m_flags;
+	uint32_t flags, owner;
+	bool noncst;	/* release as UMUTEX_RB_NOTRECOV, not UMUTEX_UNOWNED */
 
-	if ((flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) == 0) {
-		uint32_t owner;
-		do {
-			owner = mtx->m_owner;
-			if (__predict_false((owner & ~UMUTEX_CONTESTED) != id))
-				return (EPERM);
-		} while (__predict_false(!atomic_cmpset_rel_32(&mtx->m_owner,
-					 owner, UMUTEX_UNOWNED)));
-		if ((owner & UMUTEX_CONTESTED)) {
-			if (defer == NULL)
-				(void)_umtx_op_err(mtx, UMTX_OP_MUTEX_WAKE2, flags, 0, 0);
-			else
-				*defer = 1;
-		}
-		return (0);
+	flags = mtx->m_flags;
+	noncst = (flags & UMUTEX_NONCONSISTENT) != 0;
+
+	if ((flags & (UMUTEX_PRIO_PROTECT | UMUTEX_PRIO_INHERIT)) != 0) {
+		if (atomic_cmpset_rel_32(&mtx->m_owner, id, noncst ?
+		    UMUTEX_RB_NOTRECOV : UMUTEX_UNOWNED))
+			return (0);
+		return (__thr_umutex_unlock(mtx, id));	/* cmpset failed */
 	}
-    	if (atomic_cmpset_rel_32(&mtx->m_owner, id, UMUTEX_UNOWNED))
-		return (0);
-	return (__thr_umutex_unlock(mtx, id));
+
+	do {
+		owner = mtx->m_owner;
+		if (__predict_false((owner & ~UMUTEX_CONTESTED) != id))
+			return (EPERM);	/* id is not the owner */
+	} while (__predict_false(!atomic_cmpset_rel_32(&mtx->m_owner, owner,
+	    noncst ? UMUTEX_RB_NOTRECOV : UMUTEX_UNOWNED)));
+	if ((owner & UMUTEX_CONTESTED) != 0) {
+		if (defer == NULL || noncst)	/* noncst: never defer wake */
+			(void)_umtx_op_err(mtx, UMTX_OP_MUTEX_WAKE2,
+			    flags, 0, 0);
+		else
+			*defer = 1;	/* caller issues the wakeup later */
+	}
+	return (0);
 }
 
 static inline int
 _thr_umutex_unlock(struct umutex *mtx, uint32_t id)
 {
-	return _thr_umutex_unlock2(mtx, id, NULL);
+
+	return (_thr_umutex_unlock2(mtx, id, NULL));
 }
 
 static inline int
 _thr_rwlock_tryrdlock(struct urwlock *rwlock, int flags)
 {
-	int32_t state;
-	int32_t wrflags;
+	int32_t state, wrflags;
 
-	if (flags & URWLOCK_PREFER_READER || rwlock->rw_flags & URWLOCK_PREFER_READER)
+	if ((flags & URWLOCK_PREFER_READER) != 0 ||
+	    (rwlock->rw_flags & URWLOCK_PREFER_READER) != 0)
 		wrflags = URWLOCK_WRITE_OWNER;
 	else
 		wrflags = URWLOCK_WRITE_OWNER | URWLOCK_WRITE_WAITERS;
 	state = rwlock->rw_state;
 	while (!(state & wrflags)) {
-		if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS))
+		if (__predict_false(URWLOCK_READER_COUNT(state) ==
+		    URWLOCK_MAX_READERS))
 			return (EAGAIN);
 		if (atomic_cmpset_acq_32(&rwlock->rw_state, state, state + 1))
 			return (0);
 		state = rwlock->rw_state;
 	}
 
 	return (EBUSY);
 }
 
 static inline int
 _thr_rwlock_trywrlock(struct urwlock *rwlock)
 {
 	int32_t state;
 
 	state = rwlock->rw_state;
-	while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
-		if (atomic_cmpset_acq_32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER))
+	while ((state & URWLOCK_WRITE_OWNER) == 0 &&
+	    URWLOCK_READER_COUNT(state) == 0) {
+		if (atomic_cmpset_acq_32(&rwlock->rw_state, state,
+		    state | URWLOCK_WRITE_OWNER))
 			return (0);
 		state = rwlock->rw_state;
 	}
 
 	return (EBUSY);
 }
 
 static inline int
 _thr_rwlock_rdlock(struct urwlock *rwlock, int flags, struct timespec *tsp)
 {
+
 	if (_thr_rwlock_tryrdlock(rwlock, flags) == 0)
 		return (0);
 	return (__thr_rwlock_rdlock(rwlock, flags, tsp));
 }
 
 static inline int
 _thr_rwlock_wrlock(struct urwlock *rwlock, struct timespec *tsp)
 {
+
 	if (_thr_rwlock_trywrlock(rwlock) == 0)
 		return (0);
 	return (__thr_rwlock_wrlock(rwlock, tsp));
 }
 
 static inline int
 _thr_rwlock_unlock(struct urwlock *rwlock)
 {
 	int32_t state;
 
 	state = rwlock->rw_state;
-	if (state & URWLOCK_WRITE_OWNER) {
-		if (atomic_cmpset_rel_32(&rwlock->rw_state, URWLOCK_WRITE_OWNER, 0))
+	if ((state & URWLOCK_WRITE_OWNER) != 0) {
+		if (atomic_cmpset_rel_32(&rwlock->rw_state,
+		    URWLOCK_WRITE_OWNER, 0))
 			return (0);
 	} else {
 		for (;;) {
 			if (__predict_false(URWLOCK_READER_COUNT(state) == 0))
 				return (EPERM);
 			if (!((state & (URWLOCK_WRITE_WAITERS |
-			    URWLOCK_READ_WAITERS)) &&
+			    URWLOCK_READ_WAITERS)) != 0 &&
 			    URWLOCK_READER_COUNT(state) == 1)) {
 				if (atomic_cmpset_rel_32(&rwlock->rw_state,
-				    state, state-1))
+				    state, state - 1))
 					return (0);
 				state = rwlock->rw_state;
 			} else {
 				break;
 			}
 		}
     	}
     	return (__thr_rwlock_unlock(rwlock));
 }
 #endif
Index: head/share/man/man3/Makefile
===================================================================
--- head/share/man/man3/Makefile	(revision 300042)
+++ head/share/man/man3/Makefile	(revision 300043)
@@ -1,334 +1,337 @@
 #	@(#)Makefile	8.2 (Berkeley) 12/13/93
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 PACKAGE=runtime-manuals
 
 MAN=		assert.3 \
 		ATOMIC_VAR_INIT.3 \
 		bitstring.3 \
 		end.3 \
 		fpgetround.3 \
 		intro.3 \
 		makedev.3 \
 		offsetof.3 \
 		${PTHREAD_MAN} \
 		queue.3 \
 		siginfo.3 \
 		stdarg.3 \
 		sysexits.3 \
 		tgmath.3 \
 		timeradd.3 \
 		tree.3
 
 MLINKS=		ATOMIC_VAR_INIT.3 atomic_compare_exchange_strong.3 \
 		ATOMIC_VAR_INIT.3 atomic_compare_exchange_strong_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_compare_exchange_weak.3 \
 		ATOMIC_VAR_INIT.3 atomic_compare_exchange_weak_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_exchange.3 \
 		ATOMIC_VAR_INIT.3 atomic_exchange_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_add.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_add_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_and.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_and_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_or.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_or_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_sub.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_sub_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_xor.3 \
 		ATOMIC_VAR_INIT.3 atomic_fetch_xor_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_init.3 \
 		ATOMIC_VAR_INIT.3 atomic_is_lock_free.3 \
 		ATOMIC_VAR_INIT.3 atomic_load.3 \
 		ATOMIC_VAR_INIT.3 atomic_load_explicit.3 \
 		ATOMIC_VAR_INIT.3 atomic_store.3 \
 		ATOMIC_VAR_INIT.3 atomic_store_explicit.3
 MLINKS+=	bitstring.3 bit_alloc.3 \
 		bitstring.3 bit_clear.3 \
 		bitstring.3 bit_decl.3 \
 		bitstring.3 bit_ffc.3 \
 		bitstring.3 bit_ffs.3 \
 		bitstring.3 bit_nclear.3 \
 		bitstring.3 bit_nset.3 \
 		bitstring.3 bit_set.3 \
 		bitstring.3 bitstr_size.3 \
 		bitstring.3 bit_test.3
 MLINKS+=	end.3 edata.3 \
 		end.3 etext.3
 MLINKS+=	fpgetround.3 fpgetmask.3 \
 		fpgetround.3 fpgetprec.3 \
 		fpgetround.3 fpgetsticky.3 \
 		fpgetround.3 fpresetsticky.3 \
 		fpgetround.3 fpsetmask.3 \
 		fpgetround.3 fpsetprec.3 \
 		fpgetround.3 fpsetround.3
 MLINKS+=	makedev.3 major.3 \
 		makedev.3 minor.3
 MLINKS+=	${PTHREAD_MLINKS}
 MLINKS+=	queue.3 LIST_CLASS_ENTRY.3 \
 		queue.3 LIST_CLASS_HEAD.3 \
 		queue.3 LIST_EMPTY.3 \
 		queue.3 LIST_ENTRY.3 \
 		queue.3 LIST_FIRST.3 \
 		queue.3 LIST_FOREACH.3 \
 		queue.3 LIST_FOREACH_FROM.3 \
 		queue.3 LIST_FOREACH_FROM_SAFE.3 \
 		queue.3 LIST_FOREACH_SAFE.3 \
 		queue.3 LIST_HEAD.3 \
 		queue.3 LIST_HEAD_INITIALIZER.3 \
 		queue.3 LIST_INIT.3 \
 		queue.3 LIST_INSERT_AFTER.3 \
 		queue.3 LIST_INSERT_BEFORE.3 \
 		queue.3 LIST_INSERT_HEAD.3 \
 		queue.3 LIST_NEXT.3 \
 		queue.3 LIST_PREV.3 \
 		queue.3 LIST_REMOVE.3 \
 		queue.3 LIST_SWAP.3 \
 		queue.3 SLIST_CLASS_ENTRY.3 \
 		queue.3 SLIST_CLASS_HEAD.3 \
 		queue.3 SLIST_EMPTY.3 \
 		queue.3 SLIST_ENTRY.3 \
 		queue.3 SLIST_FIRST.3 \
 		queue.3 SLIST_FOREACH.3 \
 		queue.3 SLIST_FOREACH_FROM.3 \
 		queue.3 SLIST_FOREACH_FROM_SAFE.3 \
 		queue.3 SLIST_FOREACH_SAFE.3 \
 		queue.3 SLIST_HEAD.3 \
 		queue.3 SLIST_HEAD_INITIALIZER.3 \
 		queue.3 SLIST_INIT.3 \
 		queue.3 SLIST_INSERT_AFTER.3 \
 		queue.3 SLIST_INSERT_HEAD.3 \
 		queue.3 SLIST_NEXT.3 \
 		queue.3 SLIST_REMOVE.3 \
 		queue.3 SLIST_REMOVE_AFTER.3 \
 		queue.3 SLIST_REMOVE_HEAD.3 \
 		queue.3 SLIST_SWAP.3 \
 		queue.3 STAILQ_CLASS_ENTRY.3 \
 		queue.3 STAILQ_CLASS_HEAD.3 \
 		queue.3 STAILQ_CONCAT.3 \
 		queue.3 STAILQ_EMPTY.3 \
 		queue.3 STAILQ_ENTRY.3 \
 		queue.3 STAILQ_FIRST.3 \
 		queue.3 STAILQ_FOREACH.3 \
 		queue.3 STAILQ_FOREACH_FROM.3 \
 		queue.3 STAILQ_FOREACH_FROM_SAFE.3 \
 		queue.3 STAILQ_FOREACH_SAFE.3 \
 		queue.3 STAILQ_HEAD.3 \
 		queue.3 STAILQ_HEAD_INITIALIZER.3 \
 		queue.3 STAILQ_INIT.3 \
 		queue.3 STAILQ_INSERT_AFTER.3 \
 		queue.3 STAILQ_INSERT_HEAD.3 \
 		queue.3 STAILQ_INSERT_TAIL.3 \
 		queue.3 STAILQ_LAST.3 \
 		queue.3 STAILQ_NEXT.3 \
 		queue.3 STAILQ_REMOVE.3 \
 		queue.3 STAILQ_REMOVE_AFTER.3 \
 		queue.3 STAILQ_REMOVE_HEAD.3 \
 		queue.3 STAILQ_SWAP.3 \
 		queue.3 TAILQ_CLASS_ENTRY.3 \
 		queue.3 TAILQ_CLASS_HEAD.3 \
 		queue.3 TAILQ_CONCAT.3 \
 		queue.3 TAILQ_EMPTY.3 \
 		queue.3 TAILQ_ENTRY.3 \
 		queue.3 TAILQ_FIRST.3 \
 		queue.3 TAILQ_FOREACH.3 \
 		queue.3 TAILQ_FOREACH_FROM.3 \
 		queue.3 TAILQ_FOREACH_FROM_SAFE.3 \
 		queue.3 TAILQ_FOREACH_REVERSE.3 \
 		queue.3 TAILQ_FOREACH_REVERSE_FROM.3 \
 		queue.3 TAILQ_FOREACH_REVERSE_FROM_SAFE.3 \
 		queue.3 TAILQ_FOREACH_REVERSE_SAFE.3 \
 		queue.3 TAILQ_FOREACH_SAFE.3 \
 		queue.3 TAILQ_HEAD.3 \
 		queue.3 TAILQ_HEAD_INITIALIZER.3 \
 		queue.3 TAILQ_INIT.3 \
 		queue.3 TAILQ_INSERT_AFTER.3 \
 		queue.3 TAILQ_INSERT_BEFORE.3 \
 		queue.3 TAILQ_INSERT_HEAD.3 \
 		queue.3 TAILQ_INSERT_TAIL.3 \
 		queue.3 TAILQ_LAST.3 \
 		queue.3 TAILQ_NEXT.3 \
 		queue.3 TAILQ_PREV.3 \
 		queue.3 TAILQ_REMOVE.3 \
 		queue.3 TAILQ_SWAP.3
 MLINKS+=	stdarg.3 va_arg.3 \
 		stdarg.3 va_copy.3 \
 		stdarg.3 va_end.3 \
 		stdarg.3 varargs.3 \
 		stdarg.3 va_start.3
 MLINKS+=	timeradd.3 timerclear.3 \
 		timeradd.3 timercmp.3 \
 		timeradd.3 timerisset.3 \
 		timeradd.3 timersub.3
 MLINKS+=	tree.3 RB_EMPTY.3 \
 		tree.3 RB_ENTRY.3 \
 		tree.3 RB_FIND.3 \
 		tree.3 RB_FOREACH.3 \
 		tree.3 RB_FOREACH_REVERSE.3 \
 		tree.3 RB_GENERATE.3 \
 		tree.3 RB_GENERATE_STATIC.3 \
 		tree.3 RB_HEAD.3 \
 		tree.3 RB_INIT.3 \
 		tree.3 RB_INITIALIZER.3 \
 		tree.3 RB_INSERT.3 \
 		tree.3 RB_LEFT.3 \
 		tree.3 RB_MAX.3 \
 		tree.3 RB_MIN.3 \
 		tree.3 RB_NEXT.3 \
 		tree.3 RB_NFIND.3 \
 		tree.3 RB_PARENT.3 \
 		tree.3 RB_PREV.3 \
 		tree.3 RB_PROTOTYPE.3 \
 		tree.3 RB_PROTOTYPE_STATIC.3 \
 		tree.3 RB_REMOVE.3 \
 		tree.3 RB_RIGHT.3 \
 		tree.3 RB_ROOT.3 \
 		tree.3 SPLAY_EMPTY.3 \
 		tree.3 SPLAY_ENTRY.3 \
 		tree.3 SPLAY_FIND.3 \
 		tree.3 SPLAY_FOREACH.3 \
 		tree.3 SPLAY_GENERATE.3 \
 		tree.3 SPLAY_HEAD.3 \
 		tree.3 SPLAY_INIT.3 \
 		tree.3 SPLAY_INITIALIZER.3 \
 		tree.3 SPLAY_INSERT.3 \
 		tree.3 SPLAY_LEFT.3 \
 		tree.3 SPLAY_MAX.3 \
 		tree.3 SPLAY_MIN.3 \
 		tree.3 SPLAY_NEXT.3 \
 		tree.3 SPLAY_PROTOTYPE.3 \
 		tree.3 SPLAY_REMOVE.3 \
 		tree.3 SPLAY_RIGHT.3 \
 		tree.3 SPLAY_ROOT.3
 
 .if ${MK_LIBTHR} != "no"
 PTHREAD_MAN=	pthread.3 \
 		pthread_affinity_np.3 \
 		pthread_atfork.3 \
 		pthread_attr.3 \
 		pthread_attr_affinity_np.3 \
 		pthread_attr_get_np.3 \
 		pthread_attr_setcreatesuspend_np.3 \
 		pthread_barrierattr.3 \
 		pthread_barrier_destroy.3 \
 		pthread_cancel.3 \
 		pthread_cleanup_pop.3 \
 		pthread_cleanup_push.3 \
 		pthread_condattr.3 \
 		pthread_cond_broadcast.3 \
 		pthread_cond_destroy.3 \
 		pthread_cond_init.3 \
 		pthread_cond_signal.3 \
 		pthread_cond_timedwait.3 \
 		pthread_cond_wait.3 \
 		pthread_create.3 \
 		pthread_detach.3 \
 		pthread_equal.3 \
 		pthread_exit.3 \
 		pthread_getconcurrency.3 \
 		pthread_getcpuclockid.3 \
 		pthread_getspecific.3 \
 		pthread_getthreadid_np.3 \
 		pthread_join.3 \
 		pthread_key_create.3 \
 		pthread_key_delete.3 \
 		pthread_kill.3 \
 		pthread_main_np.3 \
 		pthread_multi_np.3 \
 		pthread_mutexattr.3 \
 		pthread_mutexattr_getkind_np.3 \
+		pthread_mutex_consistent.3 \
 		pthread_mutex_destroy.3 \
 		pthread_mutex_init.3 \
 		pthread_mutex_lock.3 \
 		pthread_mutex_timedlock.3 \
 		pthread_mutex_trylock.3 \
 		pthread_mutex_unlock.3 \
 		pthread_once.3 \
 		pthread_resume_all_np.3 \
 		pthread_resume_np.3 \
 		pthread_rwlockattr_destroy.3 \
 		pthread_rwlockattr_getpshared.3 \
 		pthread_rwlockattr_init.3 \
 		pthread_rwlockattr_setpshared.3 \
 		pthread_rwlock_destroy.3 \
 		pthread_rwlock_init.3 \
 		pthread_rwlock_rdlock.3 \
 		pthread_rwlock_timedrdlock.3 \
 		pthread_rwlock_timedwrlock.3 \
 		pthread_rwlock_unlock.3 \
 		pthread_rwlock_wrlock.3 \
 		pthread_schedparam.3 \
 		pthread_self.3 \
 		pthread_set_name_np.3 \
 		pthread_setspecific.3 \
 		pthread_sigmask.3 \
 		pthread_spin_init.3 \
 		pthread_spin_lock.3 \
 		pthread_suspend_all_np.3 \
 		pthread_suspend_np.3 \
 		pthread_switch_add_np.3 \
 		pthread_testcancel.3 \
 		pthread_yield.3
 
 PTHREAD_MLINKS=	pthread_affinity_np.3 pthread_getaffinity_np.3 \
 		pthread_affinity_np.3 pthread_setaffinity_np.3
 PTHREAD_MLINKS+=pthread_attr.3 pthread_attr_destroy.3 \
 		pthread_attr.3 pthread_attr_getdetachstate.3 \
 		pthread_attr.3 pthread_attr_getguardsize.3 \
 		pthread_attr.3 pthread_attr_getinheritsched.3 \
 		pthread_attr.3 pthread_attr_getschedparam.3 \
 		pthread_attr.3 pthread_attr_getschedpolicy.3 \
 		pthread_attr.3 pthread_attr_getscope.3 \
 		pthread_attr.3 pthread_attr_getstack.3 \
 		pthread_attr.3 pthread_attr_getstackaddr.3 \
 		pthread_attr.3 pthread_attr_getstacksize.3 \
 		pthread_attr.3 pthread_attr_init.3 \
 		pthread_attr.3 pthread_attr_setdetachstate.3 \
 		pthread_attr.3 pthread_attr_setguardsize.3 \
 		pthread_attr.3 pthread_attr_setinheritsched.3 \
 		pthread_attr.3 pthread_attr_setschedparam.3 \
 		pthread_attr.3 pthread_attr_setschedpolicy.3 \
 		pthread_attr.3 pthread_attr_setscope.3 \
 		pthread_attr.3 pthread_attr_setstack.3 \
 		pthread_attr.3 pthread_attr_setstackaddr.3 \
 		pthread_attr.3 pthread_attr_setstacksize.3
 PTHREAD_MLINKS+=pthread_attr_affinity_np.3 pthread_attr_getaffinity_np.3 \
 		pthread_attr_affinity_np.3 pthread_attr_setaffinity_np.3
 PTHREAD_MLINKS+=pthread_barrierattr.3 pthread_barrierattr_destroy.3 \
 		pthread_barrierattr.3 pthread_barrierattr_getpshared.3 \
 		pthread_barrierattr.3 pthread_barrierattr_init.3 \
 		pthread_barrierattr.3 pthread_barrierattr_setpshared.3
 PTHREAD_MLINKS+=pthread_barrier_destroy.3 pthread_barrier_init.3 \
 		pthread_barrier_destroy.3 pthread_barrier_wait.3
 PTHREAD_MLINKS+=pthread_condattr.3 pthread_condattr_destroy.3 \
 		pthread_condattr.3 pthread_condattr_init.3 \
 		pthread_condattr.3 pthread_condattr_getclock.3 \
 		pthread_condattr.3 pthread_condattr_setclock.3 \
 		pthread_condattr.3 pthread_condattr_getpshared.3 \
 		pthread_condattr.3 pthread_condattr_setpshared.3
 PTHREAD_MLINKS+=pthread_getconcurrency.3 pthread_setconcurrency.3
 PTHREAD_MLINKS+=pthread_multi_np.3 pthread_single_np.3
 PTHREAD_MLINKS+=pthread_mutexattr.3 pthread_mutexattr_destroy.3 \
 		pthread_mutexattr.3 pthread_mutexattr_getprioceiling.3 \
 		pthread_mutexattr.3 pthread_mutexattr_getprotocol.3 \
+		pthread_mutexattr.3 pthread_mutexattr_getrobust.3 \
 		pthread_mutexattr.3 pthread_mutexattr_gettype.3 \
 		pthread_mutexattr.3 pthread_mutexattr_init.3 \
 		pthread_mutexattr.3 pthread_mutexattr_setprioceiling.3 \
 		pthread_mutexattr.3 pthread_mutexattr_setprotocol.3 \
+		pthread_mutexattr.3 pthread_mutexattr_setrobust.3 \
 		pthread_mutexattr.3 pthread_mutexattr_settype.3
 PTHREAD_MLINKS+=pthread_mutexattr_getkind_np.3 pthread_mutexattr_setkind_np.3
 PTHREAD_MLINKS+=pthread_rwlock_rdlock.3 pthread_rwlock_tryrdlock.3
 PTHREAD_MLINKS+=pthread_rwlock_wrlock.3 pthread_rwlock_trywrlock.3
 PTHREAD_MLINKS+=pthread_schedparam.3 pthread_getschedparam.3 \
 		pthread_schedparam.3 pthread_setschedparam.3
 PTHREAD_MLINKS+=pthread_spin_init.3 pthread_spin_destroy.3 \
 		pthread_spin_lock.3 pthread_spin_trylock.3 \
 		pthread_spin_lock.3 pthread_spin_unlock.3
 PTHREAD_MLINKS+=pthread_switch_add_np.3 pthread_switch_delete_np.3
 PTHREAD_MLINKS+=pthread_testcancel.3 pthread_setcancelstate.3 \
 		pthread_testcancel.3 pthread_setcanceltype.3
 PTHREAD_MLINKS+=pthread_join.3 pthread_timedjoin_np.3
 .endif
 
 .include <bsd.prog.mk>
Index: head/share/man/man3/pthread_cond_wait.3
===================================================================
--- head/share/man/man3/pthread_cond_wait.3	(revision 300042)
+++ head/share/man/man3/pthread_cond_wait.3	(revision 300043)
@@ -1,89 +1,101 @@
 .\" Copyright (c) 1997 Brian Cully <shmit@kublai.com>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the author nor the names of any co-contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd February 16, 2006
+.Dd April 29, 2016
 .Dt PTHREAD_COND_WAIT 3
 .Os
 .Sh NAME
 .Nm pthread_cond_wait
 .Nd wait on a condition variable
 .Sh LIBRARY
 .Lb libpthread
 .Sh SYNOPSIS
 .In pthread.h
 .Ft int
 .Fn pthread_cond_wait "pthread_cond_t *cond" "pthread_mutex_t *mutex"
 .Sh DESCRIPTION
 The
 .Fn pthread_cond_wait
 function atomically blocks the current thread waiting on the condition
 variable specified by
 .Fa cond ,
 and releases the mutex specified by
 .Fa mutex .
 The waiting thread unblocks only after another thread calls
 .Xr pthread_cond_signal 3 ,
 or
 .Xr pthread_cond_broadcast 3
 with the same condition variable, and the current thread reacquires the lock
 on
 .Fa mutex .
 .Sh RETURN VALUES
 If successful, the
 .Fn pthread_cond_wait
 function will return zero.
 Otherwise an error number will be returned to
 indicate the error.
 .Sh ERRORS
 The
 .Fn pthread_cond_wait
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 The value specified by
 .Fa cond
 or the value specified by
 .Fa mutex
 is invalid.
 .It Bq Er EPERM
 The specified
 .Fa mutex
 was not locked by the calling thread.
+.It Bq Er EOWNERDEAD
+The argument
+.Fa mutex
+points to a robust mutex and the previous owning thread terminated
+while holding the mutex lock.
+The lock was granted to the caller and it is up to the new owner
+to make the state consistent.
+.It Bq Er ENOTRECOVERABLE
+The state protected by the
+.Fa mutex
+is not recoverable.
 .El
 .Sh SEE ALSO
 .Xr pthread_cond_broadcast 3 ,
 .Xr pthread_cond_destroy 3 ,
 .Xr pthread_cond_init 3 ,
 .Xr pthread_cond_signal 3 ,
-.Xr pthread_cond_timedwait 3
+.Xr pthread_cond_timedwait 3 ,
+.Xr pthread_mutex_consistent 3
 .Sh STANDARDS
 The
 .Fn pthread_cond_wait
 function conforms to
 .St -p1003.1-96 .
Index: head/share/man/man3/pthread_mutex_consistent.3
===================================================================
--- head/share/man/man3/pthread_mutex_consistent.3	(nonexistent)
+++ head/share/man/man3/pthread_mutex_consistent.3	(revision 300043)
@@ -0,0 +1,94 @@
+.\" Copyright (c) 2016 The FreeBSD Foundation, Inc.
+.\" All rights reserved.
+.\"
+.\" This documentation was written by
+.\" Konstantin Belousov <kib@FreeBSD.org> under sponsorship
+.\" from the FreeBSD Foundation.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd May 8, 2016
+.Dt PTHREAD_MUTEX_CONSISTENT 3
+.Os
+.Sh NAME
+.Nm pthread_mutex_consistent
+.Nd mark state protected by robust mutex as consistent
+.Sh LIBRARY
+.Lb libpthread
+.Sh SYNOPSIS
+.In pthread.h
+.Ft int
+.Fn pthread_mutex_consistent "pthread_mutex_t *mutex"
+.Sh DESCRIPTION
+If the thread owning a robust mutex terminates while holding the
+mutex, the mutex becomes inconsistent and the next thread that
+acquires the mutex lock is notified of the state by the return value
+.Er EOWNERDEAD .
+In this case, the mutex does not become normally usable again until
+the state is marked consistent.
+.Pp
+The
+.Fn pthread_mutex_consistent
+function, when called with a
+.Fa mutex
+argument that points to an initialized robust mutex in an
+inconsistent state, marks the mutex as consistent again.
+The subsequent unlock of the mutex, by either
+.Fn pthread_mutex_unlock
+or other methods, allows other contenders to lock the mutex.
+.Pp
+If the mutex in the inconsistent state is not marked consistent
+by the call to
+.Fn pthread_mutex_consistent
+before unlock,
+further attempts to lock the
+.Fa mutex
+result in the
+.Er ENOTRECOVERABLE
+condition reported by the locking functions.
+.Sh RETURN VALUES
+If successful,
+.Fn pthread_mutex_consistent
+will return zero, otherwise an error number will be returned to
+indicate the error.
+.Sh ERRORS
+The
+.Fn pthread_mutex_consistent
+function will fail if:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+The mutex pointed to by the
+.Fa mutex
+argument is not robust, or is not in the inconsistent state.
+.El
+.Sh SEE ALSO
+.Xr pthread_mutexattr_setrobust 3 ,
+.Xr pthread_mutex_init 3 ,
+.Xr pthread_mutex_lock 3 ,
+.Xr pthread_mutex_unlock 3
+.Sh STANDARDS
+The
+.Fn pthread_mutex_consistent
+function conforms to
+.St -susv4 .

Property changes on: head/share/man/man3/pthread_mutex_consistent.3
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: head/share/man/man3/pthread_mutex_lock.3
===================================================================
--- head/share/man/man3/pthread_mutex_lock.3	(revision 300042)
+++ head/share/man/man3/pthread_mutex_lock.3	(revision 300043)
@@ -1,76 +1,88 @@
 .\" Copyright (c) 1997 Brian Cully <shmit@kublai.com>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the author nor the names of any co-contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd January 31, 2006
+.Dd April 29, 2016
 .Dt PTHREAD_MUTEX_LOCK 3
 .Os
 .Sh NAME
 .Nm pthread_mutex_lock
 .Nd lock a mutex
 .Sh LIBRARY
 .Lb libpthread
 .Sh SYNOPSIS
 .In pthread.h
 .Ft int
 .Fn pthread_mutex_lock "pthread_mutex_t *mutex"
 .Sh DESCRIPTION
 The
 .Fn pthread_mutex_lock
 function locks
 .Fa mutex .
 If the mutex is already locked, the calling thread will block until the
 mutex becomes available.
 .Sh RETURN VALUES
 If successful,
 .Fn pthread_mutex_lock
 will return zero, otherwise an error number will be returned to
 indicate the error.
 .Sh ERRORS
 The
 .Fn pthread_mutex_lock
 function will fail if:
-.Bl -tag -width Er
+.Bl -tag -width "Er ENOTRECOVERABLE"
 .It Bq Er EINVAL
 The value specified by
 .Fa mutex
 is invalid.
 .It Bq Er EDEADLK
 A deadlock would occur if the thread blocked waiting for
 .Fa mutex .
+.It Bq Er EOWNERDEAD
+The argument
+.Fa mutex
+points to a robust mutex and the previous owning thread terminated
+while holding the mutex lock.
+The lock was granted to the caller and it is up to the new owner
+to make the state consistent.
+.It Bq Er ENOTRECOVERABLE
+The state protected by the
+.Fa mutex
+is not recoverable.
 .El
 .Sh SEE ALSO
+.Xr pthread_mutex_consistent 3 ,
 .Xr pthread_mutex_destroy 3 ,
 .Xr pthread_mutex_init 3 ,
 .Xr pthread_mutex_trylock 3 ,
 .Xr pthread_mutex_unlock 3
 .Sh STANDARDS
 The
 .Fn pthread_mutex_lock
 function conforms to
 .St -p1003.1-96 .
Index: head/share/man/man3/pthread_mutex_timedlock.3
===================================================================
--- head/share/man/man3/pthread_mutex_timedlock.3	(revision 300042)
+++ head/share/man/man3/pthread_mutex_timedlock.3	(revision 300043)
@@ -1,103 +1,115 @@
 .\" Copyright (c) 2003 Michael Telahun Makonnen
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd December 30, 2003
+.Dd April 29, 2016
 .Dt PTHREAD_MUTEX_TIMEDLOCK 3
 .Os
 .Sh NAME
 .Nm pthread_mutex_timedlock
 .Nd lock a mutex without blocking indefinitely
 .Sh LIBRARY
 .Lb libpthread
 .Sh SYNOPSIS
 .In pthread.h
 .In time.h
 .Ft int
 .Fn pthread_mutex_timedlock "pthread_mutex_t *mutex" "const struct timespec *abs_timeout"
 .Sh DESCRIPTION
 The
 .Fn pthread_mutex_timedlock
 function will lock
 .Fa mutex .
 If it is already locked the calling thread will block until
 the mutex becomes available or
 the timeout,
 specified by abs_timeout,
 expires.
 The time of the timeout is an absolute time and
 is not relative to the current time.
 .Sh RETURN VALUES
 If successful,
 .Fn pthread_mutex_timedlock
 will return zero, otherwise an error number will be returned to
 indicate the error.
 .Sh ERRORS
 The
 .Fn pthread_mutex_timedlock
 function will fail if:
-.Bl -tag -width Er
+.Bl -tag -width "Er ENOTRECOVERABLE"
 .It Bq Er EINVAL
 The
 .Fa mutex
 was created with the protocol attribute having the
 value PTHREAD_PRIO_PROTECT and
 the calling thread's priority is higher than the
 mutex's current priority ceiling.
 .It Bq Er EINVAL
 The process or thread would have blocked, and
 .Fa abs_timeout
 specified a nanosecond value less than zero or
 greater than or equal to 1 billion.
 .It Bq Er EINVAL
 The
 .Fa mutex
 parameter is invalid.
 .It Bq Er ETIMEDOUT
 The
 .Fa mutex
 could not be locked before the timeout expired.
 .It Bq Er EAGAIN
 The
 .Fa mutex
 could not be acquired because the
 maximum number of recursive locks for the
 .Fa mutex
 has been exceeded.
 .It Bq Er EDEADLK
 The current thread already owns the
 .Fa mutex .
+.It Bq Er EOWNERDEAD
+The argument
+.Fa mutex
+points to a robust mutex and the previous owning thread terminated
+while holding the mutex lock.
+The lock was granted to the caller and it is up to the new owner
+to make the state consistent.
+.It Bq Er ENOTRECOVERABLE
+The state protected by the
+.Fa mutex
+is not recoverable.
 .El
 .Sh SEE ALSO
+.Xr pthread_mutex_consistent 3 ,
 .Xr pthread_mutex_destroy 3 ,
 .Xr pthread_mutex_init 3 ,
 .Xr pthread_mutex_lock 3 ,
 .Xr pthread_mutex_trylock 3 ,
 .Xr pthread_mutex_unlock 3
 .Sh STANDARDS
 The
 .Fn pthread_mutex_timedlock
 function is expected to conform to
 .St -p1003.1-96 .
Index: head/share/man/man3/pthread_mutex_trylock.3
===================================================================
--- head/share/man/man3/pthread_mutex_trylock.3	(revision 300042)
+++ head/share/man/man3/pthread_mutex_trylock.3	(revision 300043)
@@ -1,77 +1,89 @@
 .\" Copyright (c) 1997 Brian Cully <shmit@kublai.com>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the author nor the names of any co-contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 30, 1998
+.Dd April 29, 2016
 .Dt PTHREAD_MUTEX_TRYLOCK 3
 .Os
 .Sh NAME
 .Nm pthread_mutex_trylock
 .Nd attempt to lock a mutex without blocking
 .Sh LIBRARY
 .Lb libpthread
 .Sh SYNOPSIS
 .In pthread.h
 .Ft int
 .Fn pthread_mutex_trylock "pthread_mutex_t *mutex"
 .Sh DESCRIPTION
 The
 .Fn pthread_mutex_trylock
 function locks
 .Fa mutex .
 If the mutex is already locked,
 .Fn pthread_mutex_trylock
 will not block waiting for the mutex, but will return an error condition.
 .Sh RETURN VALUES
 If successful,
 .Fn pthread_mutex_trylock
 will return zero, otherwise an error number will be returned to
 indicate the error.
 .Sh ERRORS
 The
 .Fn pthread_mutex_trylock
 function will fail if:
-.Bl -tag -width Er
+.Bl -tag -width "Er ENOTRECOVERABLE"
 .It Bq Er EINVAL
 The value specified by
 .Fa mutex
 is invalid.
 .It Bq Er EBUSY
 .Fa Mutex
 is already locked.
+.It Bq Er EOWNERDEAD
+The argument
+.Fa mutex
+points to a robust mutex and the previous owning thread terminated
+while holding the mutex lock.
+The lock was granted to the caller and it is up to the new owner
+to make the state consistent.
+.It Bq Er ENOTRECOVERABLE
+The state protected by the
+.Fa mutex
+is not recoverable.
 .El
 .Sh SEE ALSO
+.Xr pthread_mutex_consistent 3 ,
 .Xr pthread_mutex_destroy 3 ,
 .Xr pthread_mutex_init 3 ,
 .Xr pthread_mutex_lock 3 ,
 .Xr pthread_mutex_unlock 3
 .Sh STANDARDS
 The
 .Fn pthread_mutex_trylock
 function conforms to
 .St -p1003.1-96 .
Index: head/share/man/man3/pthread_mutex_unlock.3
===================================================================
--- head/share/man/man3/pthread_mutex_unlock.3	(revision 300042)
+++ head/share/man/man3/pthread_mutex_unlock.3	(revision 300043)
@@ -1,76 +1,87 @@
 .\" Copyright (c) 1997 Brian Cully <shmit@kublai.com>
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the author nor the names of any co-contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd July 30, 1998
+.Dd April 29, 2016
 .Dt PTHREAD_MUTEX_UNLOCK 3
 .Os
 .Sh NAME
 .Nm pthread_mutex_unlock
 .Nd unlock a mutex
 .Sh LIBRARY
 .Lb libpthread
 .Sh SYNOPSIS
 .In pthread.h
 .Ft int
 .Fn pthread_mutex_unlock "pthread_mutex_t *mutex"
 .Sh DESCRIPTION
 If the current thread holds the lock on
 .Fa mutex ,
 then the
 .Fn pthread_mutex_unlock
 function unlocks
 .Fa mutex .
+.Pp
+If the mutex pointed to by
+.Fa mutex
+is a robust mutex in the inconsistent state, and the
+.Fn pthread_mutex_consistent
+function was not called prior to unlocking, further locking attempts on
+the mutex
+.Fa mutex
+are denied and the locking functions return the
+.Er ENOTRECOVERABLE
+error.
 .Sh RETURN VALUES
 If successful,
 .Fn pthread_mutex_unlock
 will return zero, otherwise an error number will be returned to
 indicate the error.
 .Sh ERRORS
 The
 .Fn pthread_mutex_unlock
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 The value specified by
 .Fa mutex
 is invalid.
 .It Bq Er EPERM
 The current thread does not hold a lock on
 .Fa mutex .
 .El
 .Sh SEE ALSO
 .Xr pthread_mutex_destroy 3 ,
 .Xr pthread_mutex_init 3 ,
 .Xr pthread_mutex_lock 3 ,
 .Xr pthread_mutex_trylock 3
 .Sh STANDARDS
 The
 .Fn pthread_mutex_unlock
 function conforms to
 .St -p1003.1-96 .
Index: head/share/man/man3/pthread_mutexattr.3
===================================================================
--- head/share/man/man3/pthread_mutexattr.3	(revision 300042)
+++ head/share/man/man3/pthread_mutexattr.3	(revision 300043)
@@ -1,187 +1,219 @@
 .\" Copyright (C) 2000 Jason Evans <jasone@FreeBSD.org>.
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice(s), this list of conditions and the following disclaimer as
 .\"    the first lines of this file unmodified other than the possible
 .\"    addition of one or more copyright notices.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice(s), this list of conditions and the following disclaimer in
 .\"    the documentation and/or other materials provided with the
 .\"    distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
 .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 .\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
 .\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 .\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 .\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 .\" BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 .\" WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 .\" OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 .\" EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
-.Dd May 1, 2000
+.Dd April 29, 2016
 .Dt PTHREAD_MUTEXATTR 3
 .Os
 .Sh NAME
 .Nm pthread_mutexattr_init ,
 .Nm pthread_mutexattr_destroy ,
 .Nm pthread_mutexattr_setprioceiling ,
 .Nm pthread_mutexattr_getprioceiling ,
 .Nm pthread_mutexattr_setprotocol ,
 .Nm pthread_mutexattr_getprotocol ,
+.Nm pthread_mutexattr_setrobust ,
+.Nm pthread_mutexattr_getrobust ,
 .Nm pthread_mutexattr_settype ,
 .Nm pthread_mutexattr_gettype
 .Nd mutex attribute operations
 .Sh LIBRARY
 .Lb libpthread
 .Sh SYNOPSIS
 .In pthread.h
 .Ft int
 .Fn pthread_mutexattr_init "pthread_mutexattr_t *attr"
 .Ft int
 .Fn pthread_mutexattr_destroy "pthread_mutexattr_t *attr"
 .Ft int
 .Fn pthread_mutexattr_setprioceiling "pthread_mutexattr_t *attr" "int prioceiling"
 .Ft int
 .Fn pthread_mutexattr_getprioceiling "pthread_mutexattr_t *attr" "int *prioceiling"
 .Ft int
 .Fn pthread_mutexattr_setprotocol "pthread_mutexattr_t *attr" "int protocol"
 .Ft int
 .Fn pthread_mutexattr_getprotocol "pthread_mutexattr_t *attr" "int *protocol"
 .Ft int
+.Fn pthread_mutexattr_setrobust "pthread_mutexattr_t *attr" "int robust"
+.Ft int
+.Fn pthread_mutexattr_getrobust "pthread_mutexattr_t *attr" "int *robust"
+.Ft int
 .Fn pthread_mutexattr_settype "pthread_mutexattr_t *attr" "int type"
 .Ft int
 .Fn pthread_mutexattr_gettype "pthread_mutexattr_t *attr" "int *type"
 .Sh DESCRIPTION
 Mutex attributes are used to specify parameters to
 .Fn pthread_mutex_init .
 One attribute object can be used in multiple calls to
 .Fn pthread_mutex_init ,
 with or without modifications between calls.
 .Pp
 The
 .Fn pthread_mutexattr_init
 function initializes
 .Fa attr
 with all the default mutex attributes.
 .Pp
 The
 .Fn pthread_mutexattr_destroy
 function destroys
 .Fa attr .
 .Pp
 The
 .Fn pthread_mutexattr_set*
 functions set the attribute that corresponds to each function name.
 .Pp
 The
 .Fn pthread_mutexattr_get*
 functions copy the value of the attribute that corresponds to each function name
 to the location pointed to by the second function parameter.
 .Sh RETURN VALUES
 If successful, these functions return 0.
 Otherwise, an error number is returned to indicate the error.
 .Sh ERRORS
 The
 .Fn pthread_mutexattr_init
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er ENOMEM
 Out of memory.
 .El
 .Pp
 The
 .Fn pthread_mutexattr_destroy
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 Invalid value for
 .Fa attr .
 .El
 .Pp
 The
 .Fn pthread_mutexattr_setprioceiling
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 Invalid value for
 .Fa attr ,
 or invalid value for
 .Fa prioceiling .
 .El
 .Pp
 The
 .Fn pthread_mutexattr_getprioceiling
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 Invalid value for
 .Fa attr .
 .El
 .Pp
 The
 .Fn pthread_mutexattr_setprotocol
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 Invalid value for
 .Fa attr ,
 or invalid value for
 .Fa protocol .
 .El
 .Pp
 The
 .Fn pthread_mutexattr_getprotocol
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 Invalid value for
 .Fa attr .
 .El
 .Pp
 The
 .Fn pthread_mutexattr_settype
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 Invalid value for
 .Fa attr ,
 or invalid value for
 .Fa type .
 .El
 .Pp
 The
 .Fn pthread_mutexattr_gettype
 function will fail if:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 Invalid value for
 .Fa attr .
 .El
+.Pp
+The
+.Fn pthread_mutexattr_setrobust
+function will fail if:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+Invalid value for
+.Fa attr ,
+or invalid value for
+.Fa robust .
+.El
+.Pp
+The
+.Fn pthread_mutexattr_getrobust
+function will fail if:
+.Bl -tag -width Er
+.It Bq Er EINVAL
+Invalid value for
+.Fa attr .
+.El
 .Sh SEE ALSO
 .Xr pthread_mutex_init 3
 .Sh STANDARDS
 The
 .Fn pthread_mutexattr_init
 and
 .Fn pthread_mutexattr_destroy
 functions conform to
 .St -p1003.1-96
 .Pp
 The
 .Fn pthread_mutexattr_setprioceiling ,
 .Fn pthread_mutexattr_getprioceiling ,
 .Fn pthread_mutexattr_setprotocol ,
 .Fn pthread_mutexattr_getprotocol ,
 .Fn pthread_mutexattr_settype ,
 and
 .Fn pthread_mutexattr_gettype
 functions conform to
-.St -susv2
+.St -susv2 .
+The
+.Fn pthread_mutexattr_setrobust
+and
+.Fn pthread_mutexattr_getrobust
+functions conform to
+.St -susv4 .
Index: head/sys/compat/cloudabi/cloudabi_thread.c
===================================================================
--- head/sys/compat/cloudabi/cloudabi_thread.c	(revision 300042)
+++ head/sys/compat/cloudabi/cloudabi_thread.c	(revision 300043)
@@ -1,74 +1,77 @@
 /*-
  * Copyright (c) 2015 Nuxi, https://nuxi.nl/
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/syscallsubr.h>
+#include <sys/umtx.h>
 
 #include <contrib/cloudabi/cloudabi_types_common.h>
 
 #include <compat/cloudabi/cloudabi_proto.h>
 
 int
 cloudabi_sys_thread_exit(struct thread *td,
     struct cloudabi_sys_thread_exit_args *uap)
 {
 	struct cloudabi_sys_lock_unlock_args cloudabi_sys_lock_unlock_args = {
 		.lock = uap->lock,
 		.scope = uap->scope,
 	};
+
+	umtx_thread_exit(td);
 
         /* Wake up joining thread. */
 	cloudabi_sys_lock_unlock(td, &cloudabi_sys_lock_unlock_args);
 
         /*
 	 * Attempt to terminate the thread. Terminate the process if
 	 * it's the last thread.
 	 */
 	kern_thr_exit(td);
 	exit1(td, 0, 0);
 	/* NOTREACHED */
 }
 
 int
 cloudabi_sys_thread_tcb_set(struct thread *td,
     struct cloudabi_sys_thread_tcb_set_args *uap)
 {
 
 	return (cpu_set_user_tls(td, uap->tcb));
 }
 
 int
 cloudabi_sys_thread_yield(struct thread *td,
     struct cloudabi_sys_thread_yield_args *uap)
 {
 
 	sched_relinquish(td);
 	return (0);
 }
Index: head/sys/compat/linux/linux_fork.c
===================================================================
--- head/sys/compat/linux/linux_fork.c	(revision 300042)
+++ head/sys/compat/linux/linux_fork.c	(revision 300043)
@@ -1,482 +1,485 @@
 /*-
  * Copyright (c) 2004 Tim J. Robbins
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 2000 Marcel Moolenaar
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/sched.h>
 #include <sys/syscallsubr.h>
 #include <sys/sx.h>
+#include <sys/umtx.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 
 #ifdef COMPAT_LINUX32
 #include <machine/../linux32/linux.h>
 #include <machine/../linux32/linux32_proto.h>
 #else
 #include <machine/../linux/linux.h>
 #include <machine/../linux/linux_proto.h>
 #endif
 #include <compat/linux/linux_emul.h>
 #include <compat/linux/linux_futex.h>
 #include <compat/linux/linux_misc.h>
 #include <compat/linux/linux_util.h>
 
 int
 linux_fork(struct thread *td, struct linux_fork_args *args)
 {
 	struct fork_req fr;
 	int error;
 	struct proc *p2;
 	struct thread *td2;
 
 #ifdef DEBUG
 	if (ldebug(fork))
 		printf(ARGS(fork, ""));
 #endif
 
 	bzero(&fr, sizeof(fr));
 	fr.fr_flags = RFFDG | RFPROC | RFSTOPPED;
 	fr.fr_procp = &p2;
 	if ((error = fork1(td, &fr)) != 0)
 		return (error);
 
 	td2 = FIRST_THREAD_IN_PROC(p2);
 
 	linux_proc_init(td, td2, 0);
 
 	td->td_retval[0] = p2->p_pid;
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(td2);
 	TD_SET_CAN_RUN(td2);
 	sched_add(td2, SRQ_BORING);
 	thread_unlock(td2);
 
 	return (0);
 }
 
 int
 linux_vfork(struct thread *td, struct linux_vfork_args *args)
 {
 	struct fork_req fr;
 	int error;
 	struct proc *p2;
 	struct thread *td2;
 
 #ifdef DEBUG
 	if (ldebug(vfork))
 		printf(ARGS(vfork, ""));
 #endif
 
 	bzero(&fr, sizeof(fr));
 	fr.fr_flags = RFFDG | RFPROC | RFMEM | RFPPWAIT | RFSTOPPED;
 	fr.fr_procp = &p2;
 	if ((error = fork1(td, &fr)) != 0)
 		return (error);
 
 	td2 = FIRST_THREAD_IN_PROC(p2);
 
 	linux_proc_init(td, td2, 0);
 
    	td->td_retval[0] = p2->p_pid;
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(td2);
 	TD_SET_CAN_RUN(td2);
 	sched_add(td2, SRQ_BORING);
 	thread_unlock(td2);
 
 	return (0);
 }
 
 static int
 linux_clone_proc(struct thread *td, struct linux_clone_args *args)
 {
 	struct fork_req fr;
 	int error, ff = RFPROC | RFSTOPPED;
 	struct proc *p2;
 	struct thread *td2;
 	int exit_signal;
 	struct linux_emuldata *em;
 
 #ifdef DEBUG
 	if (ldebug(clone)) {
 		printf(ARGS(clone, "flags %x, stack %p, parent tid: %p, "
 		    "child tid: %p"), (unsigned)args->flags,
 		    args->stack, args->parent_tidptr, args->child_tidptr);
 	}
 #endif
 
 	exit_signal = args->flags & 0x000000ff;
 	if (LINUX_SIG_VALID(exit_signal)) {
 		exit_signal = linux_to_bsd_signal(exit_signal);
 	} else if (exit_signal != 0)
 		return (EINVAL);
 
 	if (args->flags & LINUX_CLONE_VM)
 		ff |= RFMEM;
 	if (args->flags & LINUX_CLONE_SIGHAND)
 		ff |= RFSIGSHARE;
 	/*
 	 * XXX: In Linux, sharing of fs info (chroot/cwd/umask)
 	 * and open files is independent.  In FreeBSD, its in one
 	 * structure but in reality it does not cause any problems
 	 * because both of these flags are usually set together.
 	 */
 	if (!(args->flags & (LINUX_CLONE_FILES | LINUX_CLONE_FS)))
 		ff |= RFFDG;
 
 	if (args->flags & LINUX_CLONE_PARENT_SETTID)
 		if (args->parent_tidptr == NULL)
 			return (EINVAL);
 
 	if (args->flags & LINUX_CLONE_VFORK)
 		ff |= RFPPWAIT;
 
 	bzero(&fr, sizeof(fr));
 	fr.fr_flags = ff;
 	fr.fr_procp = &p2;
 	error = fork1(td, &fr);
 	if (error)
 		return (error);
 
 	td2 = FIRST_THREAD_IN_PROC(p2);
 
 	/* create the emuldata */
 	linux_proc_init(td, td2, args->flags);
 
 	em = em_find(td2);
 	KASSERT(em != NULL, ("clone_proc: emuldata not found.\n"));
 
 	if (args->flags & LINUX_CLONE_CHILD_SETTID)
 		em->child_set_tid = args->child_tidptr;
 	else
 	   	em->child_set_tid = NULL;
 
 	if (args->flags & LINUX_CLONE_CHILD_CLEARTID)
 		em->child_clear_tid = args->child_tidptr;
 	else
 	   	em->child_clear_tid = NULL;
 
 	if (args->flags & LINUX_CLONE_PARENT_SETTID) {
 		error = copyout(&p2->p_pid, args->parent_tidptr,
 		    sizeof(p2->p_pid));
 		if (error)
 			printf(LMSG("copyout failed!"));
 	}
 
 	PROC_LOCK(p2);
 	p2->p_sigparent = exit_signal;
 	PROC_UNLOCK(p2);
 	/*
 	 * In a case of stack = NULL, we are supposed to COW calling process
 	 * stack. This is what normal fork() does, so we just keep tf_rsp arg
 	 * intact.
 	 */
 	linux_set_upcall_kse(td2, PTROUT(args->stack));
 
 	if (args->flags & LINUX_CLONE_SETTLS)
 		linux_set_cloned_tls(td2, args->tls);
 
 	/*
 	 * If CLONE_PARENT is set, then the parent of the new process will be 
 	 * the same as that of the calling process.
 	 */
 	if (args->flags & LINUX_CLONE_PARENT) {
 		sx_xlock(&proctree_lock);
 		PROC_LOCK(p2);
 		proc_reparent(p2, td->td_proc->p_pptr);
 		PROC_UNLOCK(p2);
 		sx_xunlock(&proctree_lock);
 	}
 
 #ifdef DEBUG
 	if (ldebug(clone))
 		printf(LMSG("clone: successful rfork to %d, "
 		    "stack %p sig = %d"), (int)p2->p_pid, args->stack,
 		    exit_signal);
 #endif
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(td2);
 	TD_SET_CAN_RUN(td2);
 	sched_add(td2, SRQ_BORING);
 	thread_unlock(td2);
 
 	td->td_retval[0] = p2->p_pid;
 
 	return (0);
 }
 
 static int
 linux_clone_thread(struct thread *td, struct linux_clone_args *args)
 {
 	struct linux_emuldata *em;
 	struct thread *newtd;
 	struct proc *p;
 	int error;
 
 #ifdef DEBUG
 	if (ldebug(clone)) {
 		printf(ARGS(clone, "thread: flags %x, stack %p, parent tid: %p, "
 		    "child tid: %p"), (unsigned)args->flags,
 		    args->stack, args->parent_tidptr, args->child_tidptr);
 	}
 #endif
 
 	LINUX_CTR4(clone_thread, "thread(%d) flags %x ptid %p ctid %p",
 	    td->td_tid, (unsigned)args->flags,
 	    args->parent_tidptr, args->child_tidptr);
 
 	if (args->flags & LINUX_CLONE_PARENT_SETTID)
 		if (args->parent_tidptr == NULL)
 			return (EINVAL);
 
 	/* Threads should be created with own stack */
 	if (args->stack == NULL)
 		return (EINVAL);
 
 	p = td->td_proc;
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		error = racct_add(p, RACCT_NTHR, 1);
 		PROC_UNLOCK(p);
 		if (error != 0)
 			return (EPROCLIM);
 	}
 #endif
 
 	/* Initialize our td */
 	error = kern_thr_alloc(p, 0, &newtd);
 	if (error)
 		goto fail;
 														
 	cpu_set_upcall(newtd, td);
 
 	bzero(&newtd->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 	bcopy(&td->td_startcopy, &newtd->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 
 	newtd->td_proc = p;
 	thread_cow_get(newtd, td);
 
 	/* create the emuldata */
 	linux_proc_init(td, newtd, args->flags);
 
 	em = em_find(newtd);
 	KASSERT(em != NULL, ("clone_thread: emuldata not found.\n"));
 
 	if (args->flags & LINUX_CLONE_SETTLS)
 		linux_set_cloned_tls(newtd, args->tls);
 
 	if (args->flags & LINUX_CLONE_CHILD_SETTID)
 		em->child_set_tid = args->child_tidptr;
 	else
 	   	em->child_set_tid = NULL;
 
 	if (args->flags & LINUX_CLONE_CHILD_CLEARTID)
 		em->child_clear_tid = args->child_tidptr;
 	else
 	   	em->child_clear_tid = NULL;
 
 	cpu_thread_clean(newtd);
 	
 	linux_set_upcall_kse(newtd, PTROUT(args->stack));
 
 	PROC_LOCK(p);
 	p->p_flag |= P_HADTHREADS;
 	bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
 
 	if (args->flags & LINUX_CLONE_PARENT)
 		thread_link(newtd, p->p_pptr);
 	else
 		thread_link(newtd, p);
 
 	thread_lock(td);
 	/* let the scheduler know about these things. */
 	sched_fork_thread(td, newtd);
 	thread_unlock(td);
 	if (P_SHOULDSTOP(p))
 		newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 	PROC_UNLOCK(p);
 
 	tidhash_add(newtd);
 
 #ifdef DEBUG
 	if (ldebug(clone))
 		printf(ARGS(clone, "successful clone to %d, stack %p"),
 		(int)newtd->td_tid, args->stack);
 #endif
 
 	LINUX_CTR2(clone_thread, "thread(%d) successful clone to %d",
 	    td->td_tid, newtd->td_tid);
 
 	if (args->flags & LINUX_CLONE_PARENT_SETTID) {
 		error = copyout(&newtd->td_tid, args->parent_tidptr,
 		    sizeof(newtd->td_tid));
 		if (error)
 			printf(LMSG("clone_thread: copyout failed!"));
 	}
 
 	/*
 	 * Make this runnable after we are finished with it.
 	 */
 	thread_lock(newtd);
 	TD_SET_CAN_RUN(newtd);
 	sched_add(newtd, SRQ_BORING);
 	thread_unlock(newtd);
 
 	td->td_retval[0] = newtd->td_tid;
 
 	return (0);
 
 fail:
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		racct_sub(p, RACCT_NTHR, 1);
 		PROC_UNLOCK(p);
 	}
 #endif
 	return (error);
 }
 
 int
 linux_clone(struct thread *td, struct linux_clone_args *args)
 {
 
 	if (args->flags & LINUX_CLONE_THREAD)
 		return (linux_clone_thread(td, args));
 	else
 		return (linux_clone_proc(td, args));
 }
 
 int
 linux_exit(struct thread *td, struct linux_exit_args *args)
 {
 	struct linux_emuldata *em;
 
 	em = em_find(td);
 	KASSERT(em != NULL, ("exit: emuldata not found.\n"));
 
 	LINUX_CTR2(exit, "thread(%d) (%d)", em->em_tid, args->rval);
+
+	umtx_thread_exit(td);
 
 	linux_thread_detach(td);
 
 	/*
 	 * XXX. When the last two threads of a process
 	 * exit via pthread_exit() try thr_exit() first.
 	 */
 	kern_thr_exit(td);
 	exit1(td, args->rval, 0);
 		/* NOTREACHED */
 }
 
 int
 linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args)
 {
 	struct linux_emuldata *em;
 
 	em = em_find(td);
 	KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n"));
 
 	em->child_clear_tid = args->tidptr;
 
 	td->td_retval[0] = em->em_tid;
 
 	LINUX_CTR3(set_tid_address, "tidptr(%d) %p, returns %d",
 	    em->em_tid, args->tidptr, td->td_retval[0]);
 
 	return (0);
 }
 
 void
 linux_thread_detach(struct thread *td)
 {
 	struct linux_sys_futex_args cup;
 	struct linux_emuldata *em;
 	int *child_clear_tid;
 	int error;
 
 	em = em_find(td);
 	KASSERT(em != NULL, ("thread_detach: emuldata not found.\n"));
 
 	LINUX_CTR1(thread_detach, "thread(%d)", em->em_tid);
 
 	release_futexes(td, em);
 
 	child_clear_tid = em->child_clear_tid;
 
 	if (child_clear_tid != NULL) {
 
 		LINUX_CTR2(thread_detach, "thread(%d) %p",
 		    em->em_tid, child_clear_tid);
 	
 		error = suword32(child_clear_tid, 0);
 		if (error != 0)
 			return;
 
 		cup.uaddr = child_clear_tid;
 		cup.op = LINUX_FUTEX_WAKE;
 		cup.val = 1;		/* wake one */
 		cup.timeout = NULL;
 		cup.uaddr2 = NULL;
 		cup.val3 = 0;
 		error = linux_sys_futex(td, &cup);
 		/*
 		 * this cannot happen at the moment and if this happens it
 		 * probably means there is a user space bug
 		 */
 		if (error != 0)
 			linux_msg(td, "futex stuff in thread_detach failed.");
 	}
 }
Index: head/sys/kern/kern_exit.c
===================================================================
--- head/sys/kern/kern_exit.c	(revision 300042)
+++ head/sys/kern/kern_exit.c	(revision 300043)
@@ -1,1327 +1,1329 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_exit.c	8.7 (Berkeley) 2/12/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/capsicum.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/procdesc.h>
 #include <sys/pioctl.h>
 #include <sys/jail.h>
 #include <sys/tty.h>
 #include <sys/wait.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/syslog.h>
 #include <sys/ptrace.h>
 #include <sys/acct.h>		/* for acct_process() function prototype */
 #include <sys/filedesc.h>
 #include <sys/sdt.h>
 #include <sys/shm.h>
 #include <sys/sem.h>
 #include <sys/umtx.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
 #include <vm/vm_domain.h>
 
 #ifdef KDTRACE_HOOKS
 #include <sys/dtrace_bsd.h>
 dtrace_execexit_func_t	dtrace_fasttrap_exit;
 #endif
 
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE1(proc, , , exit, "int");
 
 /* Hook for NFS teardown procedure. */
 void (*nlminfo_release_p)(struct proc *p);
 
 struct proc *
 proc_realparent(struct proc *child)
 {
 	struct proc *p, *parent;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	if ((child->p_treeflag & P_TREE_ORPHANED) == 0) {
 		if (child->p_oppid == 0 ||
 		    child->p_pptr->p_pid == child->p_oppid)
 			parent = child->p_pptr;
 		else
 			parent = initproc;
 		return (parent);
 	}
 	for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) {
 		/* Cannot use LIST_PREV(), since the list head is not known. */
 		p = __containerof(p->p_orphan.le_prev, struct proc,
 		    p_orphan.le_next);
 		KASSERT((p->p_treeflag & P_TREE_ORPHANED) != 0,
 		    ("missing P_ORPHAN %p", p));
 	}
 	parent = __containerof(p->p_orphan.le_prev, struct proc,
 	    p_orphans.lh_first);
 	return (parent);
 }
 
 void
 reaper_abandon_children(struct proc *p, bool exiting)
 {
 	struct proc *p1, *p2, *ptmp;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	KASSERT(p != initproc, ("reaper_abandon_children for initproc"));
 	if ((p->p_treeflag & P_TREE_REAPER) == 0)
 		return;
 	p1 = p->p_reaper;
 	LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) {
 		LIST_REMOVE(p2, p_reapsibling);
 		p2->p_reaper = p1;
 		p2->p_reapsubtree = p->p_reapsubtree;
 		LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling);
 		if (exiting && p2->p_pptr == p) {
 			PROC_LOCK(p2);
 			proc_reparent(p2, p1);
 			PROC_UNLOCK(p2);
 		}
 	}
 	KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty"));
 	p->p_treeflag &= ~P_TREE_REAPER;
 }
 
 static void
 clear_orphan(struct proc *p)
 {
 	struct proc *p1;
 
 	sx_assert(&proctree_lock, SA_XLOCKED);
 	if ((p->p_treeflag & P_TREE_ORPHANED) == 0)
 		return;
 	if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) {
 		p1 = LIST_NEXT(p, p_orphan);
 		if (p1 != NULL)
 			p1->p_treeflag |= P_TREE_FIRST_ORPHAN;
 		p->p_treeflag &= ~P_TREE_FIRST_ORPHAN;
 	}
 	LIST_REMOVE(p, p_orphan);
 	p->p_treeflag &= ~P_TREE_ORPHANED;
 }
 
 /*
  * exit -- death of process.
  */
 void
 sys_sys_exit(struct thread *td, struct sys_exit_args *uap)
 {
 
 	exit1(td, uap->rval, 0);
 	/* NOTREACHED */
 }
 
 /*
  * Exit: deallocate address space and other resources, change proc state to
  * zombie, and unlink proc from allproc and parent's lists.  Save exit status
  * and rusage for wait().  Check for child processes and orphan them.
  */
 void
 exit1(struct thread *td, int rval, int signo)
 {
 	struct proc *p, *nq, *q, *t;
 	struct thread *tdt;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	KASSERT(rval == 0 || signo == 0, ("exit1 rv %d sig %d", rval, signo));
 
 	p = td->td_proc;
 	/*
 	 * XXX in case we're rebooting we just let init die in order to
 	 * work around an unsolved stack overflow seen very late during
 	 * shutdown on sparc64 when the gmirror worker process exists.
 	 */
 	if (p == initproc && rebooting == 0) {
 		printf("init died (signal %d, exit %d)\n", signo, rval);
 		panic("Going nowhere without my init!");
 	}
 
 	/*
 	 * Deref SU mp, since the thread does not return to userspace.
 	 */
 	if (softdep_ast_cleanup != NULL)
 		softdep_ast_cleanup();
 
 	/*
 	 * MUST abort all other threads before proceeding past here.
 	 */
 	PROC_LOCK(p);
 	/*
 	 * First check if some other thread or external request got
 	 * here before us.  If so, act appropriately: exit or suspend.
 	 * We must ensure that stop requests are handled before we set
 	 * P_WEXIT.
 	 */
 	thread_suspend_check(0);
 	while (p->p_flag & P_HADTHREADS) {
 		/*
 		 * Kill off the other threads. This requires
 		 * some co-operation from other parts of the kernel
 		 * so it may not be instantaneous.  With this state set
 		 * any thread entering the kernel from userspace will
 		 * thread_exit() in trap().  Any thread attempting to
 		 * sleep will return immediately with EINTR or EWOULDBLOCK
 		 * which will hopefully force them to back out to userland
 		 * freeing resources as they go.  Any thread attempting
 		 * to return to userland will thread_exit() from userret().
 		 * thread_exit() will unsuspend us when the last of the
 		 * other threads exits.
 		 * If there is already a thread singler after resumption,
 		 * calling thread_single will fail; in that case, we just
 		 * re-check all suspension request, the thread should
 		 * either be suspended there or exit.
 		 */
 		if (!thread_single(p, SINGLE_EXIT))
 			/*
 			 * All other activity in this process is now
 			 * stopped.  Threading support has been turned
 			 * off.
 			 */
 			break;
 		/*
 		 * Recheck for new stop or suspend requests which
 		 * might appear while process lock was dropped in
 		 * thread_single().
 		 */
 		thread_suspend_check(0);
 	}
 	KASSERT(p->p_numthreads == 1,
 	    ("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
 	racct_sub(p, RACCT_NTHR, 1);
 
 	/* Let event handler change exit status */
 	p->p_xexit = rval;
 	p->p_xsig = signo;
 
 	/*
 	 * Wakeup anyone in procfs' PIOCWAIT.  They should have a hold
 	 * on our vmspace, so we should block below until they have
 	 * released their reference to us.  Note that if they have
 	 * requested S_EXIT stops we will block here until they ack
 	 * via PIOCCONT.
 	 */
 	_STOPEVENT(p, S_EXIT, 0);
 
 	/*
 	 * Ignore any pending request to stop due to a stop signal.
 	 * Once P_WEXIT is set, future requests will be ignored as
 	 * well.
 	 */
 	p->p_flag &= ~P_STOPPED_SIG;
 	KASSERT(!P_SHOULDSTOP(p), ("exiting process is stopped"));
 
 	/*
 	 * Note that we are exiting and do another wakeup of anyone in
 	 * PIOCWAIT in case they aren't listening for S_EXIT stops or
 	 * decided to wait again after we told them we are exiting.
 	 */
 	p->p_flag |= P_WEXIT;
 	wakeup(&p->p_stype);
 
 	/*
 	 * Wait for any processes that have a hold on our vmspace to
 	 * release their reference.
 	 */
 	while (p->p_lock > 0)
 		msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0);
 
 	PROC_UNLOCK(p);
 	/* Drain the limit callout while we don't have the proc locked */
 	callout_drain(&p->p_limco);
 
 #ifdef AUDIT
 	/*
 	 * The Sun BSM exit token contains two components: an exit status as
 	 * passed to exit(), and a return value to indicate what sort of exit
 	 * it was.  The exit status is WEXITSTATUS(rv), but it's not clear
 	 * what the return value is.
 	 */
 	AUDIT_ARG_EXIT(rval, 0);
 	AUDIT_SYSCALL_EXIT(0, td);
 #endif
 
 	/* Are we a task leader with peers? */
 	if (p->p_peers != NULL && p == p->p_leader) {
 		mtx_lock(&ppeers_lock);
 		q = p->p_peers;
 		while (q != NULL) {
 			PROC_LOCK(q);
 			kern_psignal(q, SIGKILL);
 			PROC_UNLOCK(q);
 			q = q->p_peers;
 		}
 		while (p->p_peers != NULL)
 			msleep(p, &ppeers_lock, PWAIT, "exit1", 0);
 		mtx_unlock(&ppeers_lock);
 	}
 
 	/*
 	 * Check if any loadable modules need anything done at process exit.
 	 * E.g. SYSV IPC stuff.
 	 * Event handler could change exit status.
 	 * XXX what if one of these generates an error?
 	 */
 	EVENTHANDLER_INVOKE(process_exit, p);
 
 	/*
 	 * If parent is waiting for us to exit or exec,
 	 * P_PPWAIT is set; we will wakeup the parent below.
 	 */
 	PROC_LOCK(p);
 	stopprofclock(p);
 	p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE);
 
 	/*
 	 * Stop the real interval timer.  If the handler is currently
 	 * executing, prevent it from rearming itself and let it finish.
 	 */
 	if (timevalisset(&p->p_realtimer.it_value) &&
 	    callout_stop(&p->p_itcallout) == 0) {
 		timevalclear(&p->p_realtimer.it_interval);
 		msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0);
 		KASSERT(!timevalisset(&p->p_realtimer.it_value),
 		    ("realtime timer is still armed"));
 	}
+
 	PROC_UNLOCK(p);
 
+	umtx_thread_exit(td);
+
 	/*
 	 * Reset any sigio structures pointing to us as a result of
 	 * F_SETOWN with our pid.
 	 */
 	funsetownlst(&p->p_sigiolst);
 
 	/*
 	 * If this process has an nlminfo data area (for lockd), release it
 	 */
 	if (nlminfo_release_p != NULL && p->p_nlminfo != NULL)
 		(*nlminfo_release_p)(p);
 
 	/*
 	 * Close open files and release open-file table.
 	 * This may block!
 	 */
 	fdescfree(td);
 
 	/*
 	 * If this thread tickled GEOM, we need to wait for the giggling to
 	 * stop before we return to userland
 	 */
 	if (td->td_pflags & TDP_GEOM)
 		g_waitidle();
 
 	/*
 	 * Remove ourself from our leader's peer list and wake our leader.
 	 */
 	if (p->p_leader->p_peers != NULL) {
 		mtx_lock(&ppeers_lock);
 		if (p->p_leader->p_peers != NULL) {
 			q = p->p_leader;
 			while (q->p_peers != p)
 				q = q->p_peers;
 			q->p_peers = p->p_peers;
 			wakeup(p->p_leader);
 		}
 		mtx_unlock(&ppeers_lock);
 	}
 
 	vmspace_exit(td);
 	killjobc();
 	(void)acct_process(td);
 
 #ifdef KTRACE
 	ktrprocexit(td);
 #endif
 	/*
 	 * Release reference to text vnode
 	 */
 	if (p->p_textvp != NULL) {
 		vrele(p->p_textvp);
 		p->p_textvp = NULL;
 	}
 
 	/*
 	 * Release our limits structure.
 	 */
 	lim_free(p->p_limit);
 	p->p_limit = NULL;
 
 	tidhash_remove(td);
 
 	/*
 	 * Remove proc from allproc queue and pidhash chain.
 	 * Place onto zombproc.  Unlink from parent's child list.
 	 */
 	sx_xlock(&allproc_lock);
 	LIST_REMOVE(p, p_list);
 	LIST_INSERT_HEAD(&zombproc, p, p_list);
 	LIST_REMOVE(p, p_hash);
 	sx_xunlock(&allproc_lock);
 
 	/*
 	 * Call machine-dependent code to release any
 	 * machine-dependent resources other than the address space.
 	 * The address space is released by "vmspace_exitfree(p)" in
 	 * vm_waitproc().
 	 */
 	cpu_exit(td);
 
 	WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
 
 	/*
 	 * Reparent all children processes:
 	 * - traced ones to the original parent (or init if we are that parent)
 	 * - the rest to init
 	 */
 	sx_xlock(&proctree_lock);
 	q = LIST_FIRST(&p->p_children);
 	if (q != NULL)		/* only need this if any child is S_ZOMB */
 		wakeup(q->p_reaper);
 	for (; q != NULL; q = nq) {
 		nq = LIST_NEXT(q, p_sibling);
 		PROC_LOCK(q);
 		q->p_sigparent = SIGCHLD;
 
 		if (!(q->p_flag & P_TRACED)) {
 			proc_reparent(q, q->p_reaper);
 		} else {
 			/*
 			 * Traced processes are killed since their existence
 			 * means someone is screwing up.
 			 */
 			t = proc_realparent(q);
 			if (t == p) {
 				proc_reparent(q, q->p_reaper);
 			} else {
 				PROC_LOCK(t);
 				proc_reparent(q, t);
 				PROC_UNLOCK(t);
 			}
 			/*
 			 * Since q was found on our children list, the
 			 * proc_reparent() call moved q to the orphan
 			 * list due to present P_TRACED flag. Clear
 			 * orphan link for q now while q is locked.
 			 */
 			clear_orphan(q);
 			q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
 			FOREACH_THREAD_IN_PROC(q, tdt)
 				tdt->td_dbgflags &= ~TDB_SUSPEND;
 			kern_psignal(q, SIGKILL);
 		}
 		PROC_UNLOCK(q);
 	}
 
 	/*
 	 * Also get rid of our orphans.
 	 */
 	while ((q = LIST_FIRST(&p->p_orphans)) != NULL) {
 		PROC_LOCK(q);
 		CTR2(KTR_PTRACE, "exit: pid %d, clearing orphan %d", p->p_pid,
 		    q->p_pid);
 		clear_orphan(q);
 		PROC_UNLOCK(q);
 	}
 
 	/* Save exit status. */
 	PROC_LOCK(p);
 	p->p_xthread = td;
 
 	/* Tell the prison that we are gone. */
 	prison_proc_free(p->p_ucred->cr_prison);
 
 #ifdef KDTRACE_HOOKS
 	/*
 	 * Tell the DTrace fasttrap provider about the exit if it
 	 * has declared an interest.
 	 */
 	if (dtrace_fasttrap_exit)
 		dtrace_fasttrap_exit(p);
 #endif
 
 	/*
 	 * Notify interested parties of our demise.
 	 */
 	KNOTE_LOCKED(&p->p_klist, NOTE_EXIT);
 
 #ifdef KDTRACE_HOOKS
 	int reason = CLD_EXITED;
 	if (WCOREDUMP(signo))
 		reason = CLD_DUMPED;
 	else if (WIFSIGNALED(signo))
 		reason = CLD_KILLED;
 	SDT_PROBE1(proc, , , exit, reason);
 #endif
 
 	/*
 	 * Just delete all entries in the p_klist. At this point we won't
 	 * report any more events, and there are nasty race conditions that
 	 * can beat us if we don't.
 	 */
 	knlist_clear(&p->p_klist, 1);
 
 	/*
 	 * If this is a process with a descriptor, we may not need to deliver
 	 * a signal to the parent.  proctree_lock is held over
 	 * procdesc_exit() to serialize concurrent calls to close() and
 	 * exit().
 	 */
 	if (p->p_procdesc == NULL || procdesc_exit(p)) {
 		/*
 		 * Notify parent that we're gone.  If parent has the
 		 * PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN,
 		 * notify process 1 instead (and hope it will handle this
 		 * situation).
 		 */
 		PROC_LOCK(p->p_pptr);
 		mtx_lock(&p->p_pptr->p_sigacts->ps_mtx);
 		if (p->p_pptr->p_sigacts->ps_flag &
 		    (PS_NOCLDWAIT | PS_CLDSIGIGN)) {
 			struct proc *pp;
 
 			mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
 			pp = p->p_pptr;
 			PROC_UNLOCK(pp);
 			proc_reparent(p, p->p_reaper);
 			p->p_sigparent = SIGCHLD;
 			PROC_LOCK(p->p_pptr);
 
 			/*
 			 * Notify parent, so in case he was wait(2)ing or
 			 * executing waitpid(2) with our pid, he will
 			 * continue.
 			 */
 			wakeup(pp);
 		} else
 			mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
 
 		if (p->p_pptr == p->p_reaper || p->p_pptr == initproc)
 			childproc_exited(p);
 		else if (p->p_sigparent != 0) {
 			if (p->p_sigparent == SIGCHLD)
 				childproc_exited(p);
 			else	/* LINUX thread */
 				kern_psignal(p->p_pptr, p->p_sigparent);
 		}
 	} else
 		PROC_LOCK(p->p_pptr);
 	sx_xunlock(&proctree_lock);
 
 	/*
 	 * The state PRS_ZOMBIE prevents other processes from sending
 	 * signals to the process.  To avoid a memory leak, we free the
 	 * memory for the signal queues at the time when the state is set.
 	 */
 	sigqueue_flush(&p->p_sigqueue);
 	sigqueue_flush(&td->td_sigqueue);
 
 	/*
 	 * We have to wait until after acquiring all locks before
 	 * changing p_state.  We need to avoid all possible context
 	 * switches (including ones from blocking on a mutex) while
 	 * marked as a zombie.  We also have to set the zombie state
 	 * before we release the parent process' proc lock to avoid
 	 * a lost wakeup.  So, we first call wakeup, then we grab the
 	 * sched lock, update the state, and release the parent process'
 	 * proc lock.
 	 */
 	wakeup(p->p_pptr);
 	cv_broadcast(&p->p_pwait);
 	sched_exit(p->p_pptr, td);
-	umtx_thread_exit(td);
 	PROC_SLOCK(p);
 	p->p_state = PRS_ZOMBIE;
 	PROC_UNLOCK(p->p_pptr);
 
 	/*
 	 * Hopefully no one will try to deliver a signal to the process this
 	 * late in the game.
 	 */
 	knlist_destroy(&p->p_klist);
 
 	/*
 	 * Save our children's rusage information in our exit rusage.
 	 */
 	PROC_STATLOCK(p);
 	ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
 	PROC_STATUNLOCK(p);
 
 	/*
 	 * Make sure the scheduler takes this thread out of its tables etc.
 	 * This will also release this thread's reference to the ucred.
 	 * Other thread parts to release include pcb bits and such.
 	 */
 	thread_exit();
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 /* Argument block for the abort2() system call, consumed by sys_abort2(). */
 struct abort2_args {
 	char *why;	/* user-space reason string; NULL is accepted */
 	int nargs;	/* number of entries in args; 0 through 16 accepted */
 	void **args;	/* user-space array of nargs pointers to be logged */
 };
 #endif
 
 int
 sys_abort2(struct thread *td, struct abort2_args *uap)
 {
 	struct proc *proc;
 	struct sbuf *msg;
 	void *ptrs[16];
 	int idx, sig;
 
 	proc = td->td_proc;
 
 	/*
 	 * Start the log message right away so that both a proper
 	 * abort2() call and one with invalid arguments get logged.
 	 * The fixed 512 byte buffer is sized for the reason string
 	 * plus the descriptions of up to 16 arguments.
 	 */
 	msg = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN);
 	sbuf_clear(msg);
 	sbuf_printf(msg, "%s(pid %d uid %d) aborted: ",
 	    proc->p_comm, proc->p_pid, td->td_ucred->cr_uid);
 
 	/*
 	 * abort2() cannot return to its caller, so an improper call
 	 * ends in SIGKILL.  The signal is only switched to SIGABRT
 	 * after every argument has been checked and copied in.
 	 */
 	sig = SIGKILL;
 
 	/* Bound the argument count to guard against user-space DoS. */
 	if (uap->nargs < 0 || uap->nargs > 16)
 		goto report;
 	if (uap->nargs > 0 && (uap->args == NULL ||
 	    copyin(uap->args, ptrs, uap->nargs * sizeof(void *)) != 0))
 		goto report;
 
 	/* The reason string is limited to 128 bytes. */
 	if (uap->why == NULL)
 		sbuf_printf(msg, "(null)");
 	else if (sbuf_copyin(msg, uap->why, 128) < 0)
 		goto report;
 
 	/* Append the pointer arguments as a comma-separated list. */
 	if (uap->nargs > 0) {
 		sbuf_printf(msg, "(");
 		for (idx = 0; idx < uap->nargs; idx++) {
 			sbuf_printf(msg, "%s%p", idx != 0 ? ", " : "",
 			    ptrs[idx]);
 		}
 		sbuf_printf(msg, ")");
 	}
 
 	/* All arguments were valid and accessible. */
 	sig = SIGABRT;
 report:
 	if (sig == SIGKILL) {
 		sbuf_trim(msg);
 		sbuf_printf(msg, " (Reason text inaccessible)");
 	}
 	sbuf_cat(msg, "\n");
 	sbuf_finish(msg);
 	log(LOG_INFO, "%s", sbuf_data(msg));
 	sbuf_delete(msg);
 	exit1(td, 0, sig);
 	return (0);
 }
 
 
 #ifdef COMPAT_43
 /*
  * The dirty work is handled by kern_wait().
  */
 int
 owait(struct thread *td, struct owait_args *uap __unused)
 {
 	int rv, status;
 
 	/* Wait for any child, with no options and no rusage. */
 	rv = kern_wait(td, WAIT_ANY, &status, 0, NULL);
 	if (rv != 0)
 		return (rv);
 
 	/* On success the status is handed back in the second retval. */
 	td->td_retval[1] = status;
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 /*
  * The dirty work is handled by kern_wait().
  */
 int
 sys_wait4(struct thread *td, struct wait4_args *uap)
 {
 	struct rusage ru, *rup;
 	int error, status;
 
 	if (uap->rusage != NULL)
 		rup = &ru;
 	else
 		rup = NULL;
 	error = kern_wait(td, uap->pid, &status, uap->options, rup);
 	if (uap->status != NULL && error == 0)
 		error = copyout(&status, uap->status, sizeof(status));
 	if (uap->rusage != NULL && error == 0)
 		error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 	return (error);
 }
 
 int
 sys_wait6(struct thread *td, struct wait6_args *uap)
 {
 	struct __wrusage wru, *wrup;
 	siginfo_t si, *sip;
 	idtype_t idtype;
 	id_t id;
 	int error, status;
 
 	idtype = uap->idtype;
 	id = uap->id;
 
 	if (uap->wrusage != NULL)
 		wrup = &wru;
 	else
 		wrup = NULL;
 
 	if (uap->info != NULL) {
 		sip = &si;
 		bzero(sip, sizeof(*sip));
 	} else
 		sip = NULL;
 
 	/*
 	 *  We expect all callers of wait6() to know about WEXITED and
 	 *  WTRAPPED.
 	 */
 	error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip);
 
 	if (uap->status != NULL && error == 0)
 		error = copyout(&status, uap->status, sizeof(status));
 	if (uap->wrusage != NULL && error == 0)
 		error = copyout(&wru, uap->wrusage, sizeof(wru));
 	if (uap->info != NULL && error == 0)
 		error = copyout(&si, uap->info, sizeof(si));
 	return (error);
 }
 
 /*
  * Reap the remains of a zombie process and optionally return status and
  * rusage.  Asserts and will release both the proctree_lock and the process
  * lock as part of its work.
  */
 void
 proc_reap(struct thread *td, struct proc *p, int *status, int options)
 {
 	struct proc *q, *t;
 
 	/*
 	 * Entry contract: proctree_lock held exclusively, p's proc lock
 	 * and proc spin lock held, and p already a zombie.
 	 */
 	sx_assert(&proctree_lock, SA_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE"));
 
 	/* q is the process of the thread doing the wait. */
 	q = td->td_proc;
 
 	PROC_SUNLOCK(p);
 	/* Report the exit status, if the caller supplied a place for it. */
 	if (status)
 		*status = KW_EXITCODE(p->p_xexit, p->p_xsig);
 	if (options & WNOWAIT) {
 		/*
 		 *  Only poll, returning the status.  Caller does not wish to
 		 * release the proc struct just yet.
 		 */
 		PROC_UNLOCK(p);
 		sx_xunlock(&proctree_lock);
 		return;
 	}
 
 	/* p's p_ksi is taken while holding the waiter's proc lock. */
 	PROC_LOCK(q);
 	sigqueue_take(p->p_ksi);
 	PROC_UNLOCK(q);
 
 	/*
 	 * If we got the child via a ptrace 'attach', we need to give it back
 	 * to the old parent.
 	 */
 	if (p->p_oppid != 0 && p->p_oppid != p->p_pptr->p_pid) {
 		/*
 		 * p's lock is dropped and re-taken so that the real
 		 * parent t is locked before p.
 		 */
 		PROC_UNLOCK(p);
 		t = proc_realparent(p);
 		PROC_LOCK(t);
 		PROC_LOCK(p);
 		CTR2(KTR_PTRACE,
 		    "wait: traced child %d moved back to parent %d", p->p_pid,
 		    t->p_pid);
 		proc_reparent(p, t);
 		p->p_oppid = 0;
 		PROC_UNLOCK(p);
 		/* Notify the real parent; p itself is not freed on this path. */
 		pksignal(t, SIGCHLD, p->p_ksi);
 		wakeup(t);
 		cv_broadcast(&p->p_pwait);
 		PROC_UNLOCK(t);
 		sx_xunlock(&proctree_lock);
 		return;
 	}
 	p->p_oppid = 0;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Remove other references to this process to ensure we have an
 	 * exclusive reference.
 	 */
 	sx_xlock(&allproc_lock);
 	LIST_REMOVE(p, p_list);	/* off zombproc */
 	sx_xunlock(&allproc_lock);
 	LIST_REMOVE(p, p_sibling);
 	reaper_abandon_children(p, true);
 	LIST_REMOVE(p, p_reapsibling);
 	PROC_LOCK(p);
 	clear_orphan(p);
 	PROC_UNLOCK(p);
 	leavepgrp(p);
 	if (p->p_procdesc != NULL)
 		procdesc_reap(p);
 	sx_xunlock(&proctree_lock);
 
 	/*
 	 * Removal from allproc list and process group list paired with
 	 * PROC_LOCK which was executed during that time should guarantee
 	 * nothing can reach this process anymore. As such further locking
 	 * is unnecessary.
 	 */
 	p->p_xexit = p->p_xsig = 0;		/* XXX: why? */
 
 	/* Add p's resource usage to the waiter's child usage totals. */
 	PROC_LOCK(q);
 	ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux);
 	PROC_UNLOCK(q);
 
 	/*
 	 * Decrement the count of procs running with this uid.
 	 */
 	(void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
 
 	/*
 	 * Destroy resource accounting information associated with the process.
 	 */
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		racct_sub(p, RACCT_NPROC, 1);
 		PROC_UNLOCK(p);
 	}
 #endif
 	racct_proc_exit(p);
 
 	/*
 	 * Free credentials, arguments, and sigacts.
 	 */
 	crfree(p->p_ucred);
 	proc_set_cred(p, NULL);
 	pargs_drop(p->p_args);
 	p->p_args = NULL;
 	sigacts_free(p->p_sigacts);
 	p->p_sigacts = NULL;
 
 	/*
 	 * Do any thread-system specific cleanups.
 	 */
 	thread_wait(p);
 
 	/*
 	 * Give vm and machine-dependent layer a chance to free anything that
 	 * cpu_exit couldn't release while still running in process context.
 	 */
 	vm_waitproc(p);
 #ifdef MAC
 	mac_proc_destroy(p);
 #endif
 	/*
 	 * Free any domain policy that's still hiding around.
 	 */
 	vm_domain_policy_cleanup(&p->p_vm_dom_policy);
 
 	/* Return the proc structure to its zone and drop the proc count. */
 	KASSERT(FIRST_THREAD_IN_PROC(p),
 	    ("proc_reap: no residual thread!"));
 	uma_zfree(proc_zone, p);
 	atomic_add_int(&nprocs, -1);
 }
 
 /*
  * Decide whether process p matches the (idtype, id) selector and the
  * wait options of the calling thread td, and reap it if it is a zombie.
  * Must be called with proctree_lock held exclusively.
  *
  * When p matches, siginfo and wrusage (if non-NULL) are filled in from
  * p's current state, whether or not p has exited.
  *
  * Returns:
  *    0  p does not match or may not be waited for; p is left unlocked.
  *    1  p matches but was not reaped; p is left unlocked.
  *   -1  p was a zombie and was handed to proc_reap(), which releases
  *       both proctree_lock and p's proc lock.  Never returned when
  *       check_only is non-zero.
  */
 static int
 proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id,
     int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo,
     int check_only)
 {
 	struct rusage *rup;
 
 	sx_assert(&proctree_lock, SA_XLOCKED);
 
 	PROC_LOCK(p);
 
 	/* Filter on the selector; every mismatch unlocks p and returns 0. */
 	switch (idtype) {
 	case P_ALL:
 		/* Processes with a process descriptor are skipped by P_ALL. */
 		if (p->p_procdesc != NULL) {
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		break;
 	case P_PID:
 		if (p->p_pid != (pid_t)id) {
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		break;
 	case P_PGID:
 		if (p->p_pgid != (pid_t)id) {
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		break;
 	case P_SID:
 		if (p->p_session->s_sid != (pid_t)id) {
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		break;
 	case P_UID:
 		if (p->p_ucred->cr_uid != (uid_t)id) {
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		break;
 	case P_GID:
 		if (p->p_ucred->cr_gid != (gid_t)id) {
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		break;
 	case P_JAILID:
 		if (p->p_ucred->cr_prison->pr_id != (int)id) {
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		break;
 	/*
 	 * It seems that the thread structures get zeroed out
 	 * at process exit.  This makes it impossible to
 	 * support P_SETID, P_CID or P_CPUID.
 	 */
 	default:
 		PROC_UNLOCK(p);
 		return (0);
 	}
 
 	/* A non-zero result from p_canwait() means td may not wait for p. */
 	if (p_canwait(td, p)) {
 		PROC_UNLOCK(p);
 		return (0);
 	}
 
 	/* Zombies are only of interest when WEXITED was requested. */
 	if (((options & WEXITED) == 0) && (p->p_state == PRS_ZOMBIE)) {
 		PROC_UNLOCK(p);
 		return (0);
 	}
 
 	/*
 	 * This special case handles a kthread spawned by linux_clone
 	 * (see linux_misc.c).  The linux_wait4 and linux_waitpid
 	 * functions need to be able to distinguish between waiting
 	 * on a process and waiting on a thread.  It is a thread if
 	 * p_sigparent is not SIGCHLD, and the WLINUXCLONE option
 	 * signifies we want to wait for threads and not processes.
 	 */
 	if ((p->p_sigparent != SIGCHLD) ^
 	    ((options & WLINUXCLONE) != 0)) {
 		PROC_UNLOCK(p);
 		return (0);
 	}
 
 	if (siginfo != NULL) {
 		bzero(siginfo, sizeof(*siginfo));
 		siginfo->si_errno = 0;
 
 		/*
 		 * SUSv4 requires that the si_signo value is always
 		 * SIGCHLD. Obey it despite the rfork(2) interface
 		 * allows to request other signal for child exit
 		 * notification.
 		 */
 		siginfo->si_signo = SIGCHLD;
 
 		/*
 		 *  This is still a rough estimate.  We will fix the
 		 *  cases TRAPPED, STOPPED, and CONTINUED later.
 		 */
 		if (WCOREDUMP(p->p_xsig)) {
 			siginfo->si_code = CLD_DUMPED;
 			siginfo->si_status = WTERMSIG(p->p_xsig);
 		} else if (WIFSIGNALED(p->p_xsig)) {
 			siginfo->si_code = CLD_KILLED;
 			siginfo->si_status = WTERMSIG(p->p_xsig);
 		} else {
 			siginfo->si_code = CLD_EXITED;
 			siginfo->si_status = p->p_xexit;
 		}
 
 		siginfo->si_pid = p->p_pid;
 		siginfo->si_uid = p->p_ucred->cr_uid;
 
 		/*
 		 * The si_addr field would be useful additional
 		 * detail, but apparently the PC value may be lost
 		 * when we reach this point.  bzero() above sets
 		 * siginfo->si_addr to NULL.
 		 */
 	}
 
 	/*
 	 * There should be no reason to limit resources usage info to
 	 * exited processes only.  A snapshot about any resources used
 	 * by a stopped process may be exactly what is needed.
 	 */
 	if (wrusage != NULL) {
 		/* p's own usage, with user/system times recomputed. */
 		rup = &wrusage->wru_self;
 		*rup = p->p_ru;
 		PROC_STATLOCK(p);
 		calcru(p, &rup->ru_utime, &rup->ru_stime);
 		PROC_STATUNLOCK(p);
 
 		/* Usage accumulated from p's own children. */
 		rup = &wrusage->wru_children;
 		*rup = p->p_stats->p_cru;
 		calccru(p, &rup->ru_utime, &rup->ru_stime);
 	}
 
 	if (p->p_state == PRS_ZOMBIE && !check_only) {
 		/*
 		 * proc_reap() asserts that the proc spin lock is held and
 		 * takes over releasing every lock held here.
 		 */
 		PROC_SLOCK(p);
 		proc_reap(td, p, status, options);
 		return (-1);
 	}
 	PROC_UNLOCK(p);
 	return (1);
 }
 
 int
 kern_wait(struct thread *td, pid_t pid, int *status, int options,
     struct rusage *rusage)
 {
 	struct __wrusage usage;
 	idtype_t idtype;
 	id_t id;
 	int error;
 
 	/*
 	 * Convert the wait4()-style pid argument into the (idtype, id)
 	 * pair that kern_wait6() expects.  WAIT_MYPGRP needs no special
 	 * treatment here; kern_wait6() resolves it itself.
 	 */
 	if (pid == WAIT_ANY) {
 		idtype = P_ALL;
 		id = 0;
 	} else if (pid >= 0) {
 		idtype = P_PID;
 		id = (id_t)pid;
 	} else {
 		idtype = P_PGID;
 		id = (id_t)-pid;
 	}
 
 	/*
 	 * WEXITED and WTRAPPED are implied for callers of this older
 	 * interface, for backward compatibility.
 	 */
 	options |= WEXITED | WTRAPPED;
 
 	/* Collect usage into a local only if the caller wants rusage. */
 	error = kern_wait6(td, idtype, id, status, options,
 	    rusage != NULL ? &usage : NULL, NULL);
 	if (rusage != NULL)
 		*rusage = usage.wru_self;
 	return (error);
 }
 
 /*
  * Wait for a state change in a child of the calling process selected by
  * (idtype, id), as filtered by options.
  *
  * On success returns 0 with the pid of the reported child stored in
  * td->td_retval[0]; with WNOHANG and nothing to report, td_retval[0] is
  * set to 0 and *status is not written.  Returns EINVAL for unknown
  * options or when no event type is requested, ECHILD when no child or
  * orphan matches, or the error returned by msleep().
  */
 int
 kern_wait6(struct thread *td, idtype_t idtype, id_t id, int *status,
     int options, struct __wrusage *wrusage, siginfo_t *siginfo)
 {
 	struct proc *p, *q;
 	pid_t pid;
 	int error, nfound, ret;
 
 	AUDIT_ARG_VALUE((int)idtype);	/* XXX - This is likely wrong! */
 	AUDIT_ARG_PID((pid_t)id);	/* XXX - This may be wrong! */
 	AUDIT_ARG_VALUE(options);
 
 	/* q is the waiting (calling) process. */
 	q = td->td_proc;
 
 	/* WAIT_MYPGRP is rewritten to the caller's own process group id. */
 	if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) {
 		PROC_LOCK(q);
 		id = (id_t)q->p_pgid;
 		PROC_UNLOCK(q);
 		idtype = P_PGID;
 	}
 
 	/* If we don't know the option, just return. */
 	if ((options & ~(WUNTRACED | WNOHANG | WCONTINUED | WNOWAIT |
 	    WEXITED | WTRAPPED | WLINUXCLONE)) != 0)
 		return (EINVAL);
 	if ((options & (WEXITED | WUNTRACED | WCONTINUED | WTRAPPED)) == 0) {
 		/*
 		 * We will be unable to find any matching processes,
 		 * because there are no known events to look for.
 		 * Prefer to return error instead of blocking
 		 * indefinitely.
 		 */
 		return (EINVAL);
 	}
 
 loop:
 	/*
 	 * Clear P_STATCHILD before scanning; it is tested again after
 	 * the scan to decide between rescanning and sleeping.
 	 */
 	if (q->p_flag & P_STATCHILD) {
 		PROC_LOCK(q);
 		q->p_flag &= ~P_STATCHILD;
 		PROC_UNLOCK(q);
 	}
 	nfound = 0;
 	sx_xlock(&proctree_lock);
 	LIST_FOREACH(p, &q->p_children, p_sibling) {
 		/* Saved up front: p may already be freed if it gets reaped. */
 		pid = p->p_pid;
 		ret = proc_to_reap(td, p, idtype, id, status, options,
 		    wrusage, siginfo, 0);
 		if (ret == 0)
 			continue;
 		else if (ret == 1)
 			nfound++;
 		else {
 			/*
 			 * p was passed to proc_reap(), which has already
 			 * released proctree_lock.
 			 */
 			td->td_retval[0] = pid;
 			return (0);
 		}
 
 		/* p matches but is not a zombie: look for stop/continue. */
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 
 		/* Traced child with all of its threads suspended. */
 		if ((options & WTRAPPED) != 0 &&
 		    (p->p_flag & P_TRACED) != 0 &&
 		    (p->p_flag & (P_STOPPED_TRACE | P_STOPPED_SIG)) != 0 &&
 		    (p->p_suspcount == p->p_numthreads) &&
 		    ((p->p_flag & P_WAITED) == 0)) {
 			PROC_SUNLOCK(p);
 			/* Unless WNOWAIT, mark the stop as reported. */
 			if ((options & WNOWAIT) == 0)
 				p->p_flag |= P_WAITED;
 			sx_xunlock(&proctree_lock);
 
 			if (status != NULL)
 				*status = W_STOPCODE(p->p_xsig);
 			if (siginfo != NULL) {
 				siginfo->si_status = p->p_xsig;
 				siginfo->si_code = CLD_TRAPPED;
 			}
 			if ((options & WNOWAIT) == 0) {
 				PROC_LOCK(q);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(q);
 			}
 
 			CTR4(KTR_PTRACE,
 	    "wait: returning trapped pid %d status %#x (xstat %d) xthread %d",
 			    p->p_pid, W_STOPCODE(p->p_xsig), p->p_xsig,
 			    p->p_xthread != NULL ? p->p_xthread->td_tid : -1);
 			PROC_UNLOCK(p);
 			td->td_retval[0] = pid;
 			return (0);
 		}
 		/* Child stopped by a signal with all of its threads suspended. */
 		if ((options & WUNTRACED) != 0 &&
 		    (p->p_flag & P_STOPPED_SIG) != 0 &&
 		    (p->p_suspcount == p->p_numthreads) &&
 		    ((p->p_flag & P_WAITED) == 0)) {
 			PROC_SUNLOCK(p);
 			if ((options & WNOWAIT) == 0)
 				p->p_flag |= P_WAITED;
 			sx_xunlock(&proctree_lock);
 
 			if (status != NULL)
 				*status = W_STOPCODE(p->p_xsig);
 			if (siginfo != NULL) {
 				siginfo->si_status = p->p_xsig;
 				siginfo->si_code = CLD_STOPPED;
 			}
 			if ((options & WNOWAIT) == 0) {
 				PROC_LOCK(q);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(q);
 			}
 
 			PROC_UNLOCK(p);
 			td->td_retval[0] = pid;
 			return (0);
 		}
 		PROC_SUNLOCK(p);
 		/* Child that has been continued. */
 		if ((options & WCONTINUED) != 0 &&
 		    (p->p_flag & P_CONTINUED) != 0) {
 			sx_xunlock(&proctree_lock);
 			/* Unless WNOWAIT, consume the continued indication. */
 			if ((options & WNOWAIT) == 0) {
 				p->p_flag &= ~P_CONTINUED;
 				PROC_LOCK(q);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(q);
 			}
 			PROC_UNLOCK(p);
 
 			if (status != NULL)
 				*status = SIGCONT;
 			if (siginfo != NULL) {
 				siginfo->si_status = SIGCONT;
 				siginfo->si_code = CLD_CONTINUED;
 			}
 			td->td_retval[0] = pid;
 			return (0);
 		}
 		PROC_UNLOCK(p);
 	}
 
 	/*
 	 * Look in the orphans list too, to allow the parent to
 	 * collect its child's exit status even if the child is being
 	 * debugged.
 	 *
 	 * Debugger detaches from the parent upon successful
 	 * switch-over from parent to child.  At this point due to
 	 * re-parenting the parent loses the child to debugger and a
 	 * wait4(2) call would report that it has no children to wait
 	 * for.  By maintaining a list of orphans we allow the parent
 	 * to successfully wait until the child becomes a zombie.
 	 */
 	if (nfound == 0) {
 		LIST_FOREACH(p, &q->p_orphans, p_orphan) {
 			/*
 			 * check_only is set, so orphans are only counted
 			 * here and never reaped; no status, rusage or
 			 * siginfo is collected for them.
 			 */
 			ret = proc_to_reap(td, p, idtype, id, NULL, options,
 			    NULL, NULL, 1);
 			if (ret != 0) {
 				KASSERT(ret != -1, ("reaped an orphan (pid %d)",
 				    (int)td->td_retval[0]));
 				nfound++;
 				break;
 			}
 		}
 	}
 	/* Nothing matched at all: there is nothing to wait for. */
 	if (nfound == 0) {
 		sx_xunlock(&proctree_lock);
 		return (ECHILD);
 	}
 	/* Matches exist but none is reportable; WNOHANG returns pid 0. */
 	if (options & WNOHANG) {
 		sx_xunlock(&proctree_lock);
 		td->td_retval[0] = 0;
 		return (0);
 	}
 	/*
 	 * Take our own proc lock before dropping proctree_lock, then
 	 * re-check P_STATCHILD under it: if it was set in the meantime,
 	 * rescan instead of sleeping.
 	 */
 	PROC_LOCK(q);
 	sx_xunlock(&proctree_lock);
 	if (q->p_flag & P_STATCHILD) {
 		q->p_flag &= ~P_STATCHILD;
 		error = 0;
 	} else
 		error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
 	PROC_UNLOCK(q);
 	if (error)
 		return (error);
 	goto loop;
 }
 
 /*
  * Make process 'parent' the new parent of process 'child'.
  * Must be called with an exclusive hold of proctree lock.
  */
 void
 proc_reparent(struct proc *child, struct proc *parent)
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(child, MA_OWNED);
 	/* Already a child of 'parent': nothing to do. */
 	if (child->p_pptr == parent)
 		return;
 
 	/* child's p_ksi is taken under the old parent's proc lock. */
 	PROC_LOCK(child->p_pptr);
 	sigqueue_take(child->p_ksi);
 	PROC_UNLOCK(child->p_pptr);
 	/* Move child from its current sibling list to the new parent's. */
 	LIST_REMOVE(child, p_sibling);
 	LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
 
 	clear_orphan(child);
 	if (child->p_flag & P_TRACED) {
 		/*
 		 * A traced child is put on the orphan list of its old
 		 * parent; child->p_pptr still refers to the old parent
 		 * here because it is only updated at the end.  The first
 		 * orphan on an empty list is flagged
 		 * P_TREE_FIRST_ORPHAN; later ones are inserted after the
 		 * current head.
 		 */
 		if (LIST_EMPTY(&child->p_pptr->p_orphans)) {
 			child->p_treeflag |= P_TREE_FIRST_ORPHAN;
 			LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child,
 			    p_orphan);
 		} else {
 			LIST_INSERT_AFTER(LIST_FIRST(&child->p_pptr->p_orphans),
 			    child, p_orphan);
 		}
 		child->p_treeflag |= P_TREE_ORPHANED;
 	}
 
 	/* Finally record the new parent. */
 	child->p_pptr = parent;
 }
Index: head/sys/kern/kern_thr.c
===================================================================
--- head/sys/kern/kern_thr.c	(revision 300042)
+++ head/sys/kern/kern_thr.c	(revision 300043)
@@ -1,610 +1,611 @@
 /*-
  * Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_posix.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/ucontext.h>
 #include <sys/thr.h>
 #include <sys/rtprio.h>
 #include <sys/umtx.h>
 #include <sys/limits.h>
 
 #include <vm/vm_domain.h>
 
 #include <machine/frame.h>
 
 #include <security/audit/audit.h>
 
 static SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0,
     "thread allocation");
 
 static int max_threads_per_proc = 1500;
 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
     &max_threads_per_proc, 0, "Limit on threads per proc");
 
 static int max_threads_hits;
 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
     &max_threads_hits, 0, "kern.threads.max_threads_per_proc hit count");
 
 #ifdef COMPAT_FREEBSD32
 
 /*
  * Store a thread id at user address addr, using a native word for
  * processes flagged SV_LP64 and a 32-bit word otherwise.  Returns the
  * result of the underlying suword()/suword32() call.
  */
 static inline int
 suword_lwpid(void *addr, lwpid_t lwpid)
 {
 
 	if (SV_CURPROC_FLAG(SV_LP64))
 		return (suword(addr, lwpid));
 	return (suword32(addr, lwpid));
 }
 
 #else
 #define suword_lwpid	suword
 #endif
 
 /*
  * System call interface.
  */
 
 /* Thunk handed from sys_thr_create() to thr_create_initthr(). */
 struct thr_create_initthr_args {
 	ucontext_t ctx;	/* context copied in from user space */
 	long *tid;	/* user address to receive the new tid, or NULL */
 };
 
 /*
  * thread_create() initialization callback used by thr_create(): report
  * the thread id to user space and install the supplied machine context.
  */
 static int
 thr_create_initthr(struct thread *td, void *thunk)
 {
 	struct thr_create_initthr_args *init = thunk;
 
 	/* Copy out the child tid, if a destination was given. */
 	if (init->tid != NULL) {
 		if (suword_lwpid(init->tid, td->td_tid) != 0)
 			return (EFAULT);
 	}
 
 	return (set_mcontext(td, &init->ctx.uc_mcontext));
 }
 
 int
 sys_thr_create(struct thread *td, struct thr_create_args *uap)
     /* ucontext_t *ctx, long *id, int flags */
 {
 	struct thr_create_initthr_args args;
 	int error;
 
 	if ((error = copyin(uap->ctx, &args.ctx, sizeof(args.ctx))))
 		return (error);
 	args.tid = uap->id;
 	return (thread_create(td, NULL, thr_create_initthr, &args));
 }
 
 int
 sys_thr_new(struct thread *td, struct thr_new_args *uap)
     /* struct thr_param * */
 {
 	struct thr_param param;
 	int error;
 
 	if (uap->param_size < 0 || uap->param_size > sizeof(param))
 		return (EINVAL);
 	bzero(&param, sizeof(param));
 	if ((error = copyin(uap->param, &param, uap->param_size)))
 		return (error);
 	return (kern_thr_new(td, &param));
 }
 
 /*
  * thread_create() initialization callback used by thr_new(): publish
  * the new thread id and set up its stack, entry point and TLS.
  */
 static int
 thr_new_initthr(struct thread *td, void *thunk)
 {
 	struct thr_param *tp = thunk;
 	stack_t stk;
 
 	/*
 	 * The tid is written to two places, one for the child and one
 	 * for the parent.  pthread may create a detached thread, and a
 	 * parent that wants to access the child tid safely must supply
 	 * its own storage: the child may exit quickly and have its
 	 * memory freed before the parent gets to look at it.
 	 */
 	if (tp->child_tid != NULL &&
 	    suword_lwpid(tp->child_tid, td->td_tid) != 0)
 		return (EFAULT);
 	if (tp->parent_tid != NULL &&
 	    suword_lwpid(tp->parent_tid, td->td_tid) != 0)
 		return (EFAULT);
 
 	/* Describe the user stack for the machine-dependent setup. */
 	stk.ss_sp = tp->stack_base;
 	stk.ss_size = tp->stack_size;
 
 	/* Point the upcall at the user thread entry function. */
 	cpu_set_upcall_kse(td, tp->start_func, tp->arg, &stk);
 
 	/* Set up the user TLS address and TLS pointer register. */
 	return (cpu_set_user_tls(td, tp->tls_base));
 }
 
 int
 kern_thr_new(struct thread *td, struct thr_param *param)
 {
 	struct rtprio rtp;
 	int rv;
 
 	/* Without a requested rtprio, create the thread directly. */
 	if (param->rtp == 0)
 		return (thread_create(td, NULL, thr_new_initthr, param));
 
 	/* Otherwise fetch the rtprio from user space first. */
 	rv = copyin(param->rtp, &rtp, sizeof(struct rtprio));
 	if (rv)
 		return (rv);
 	return (thread_create(td, &rtp, thr_new_initthr, param));
 }
 
 /*
  * Create a new thread in the process of td and make it runnable.
  *
  * rtp, if non-NULL, is validated and then used to set the new thread's
  * priority.  initialize_thread(newtd, thunk) is called on the new
  * thread before it is linked into the process; a non-zero return from
  * it aborts the creation and is passed back to the caller.
  *
  * Returns 0 on success, EPERM or EINVAL for an unacceptable rtp,
  * EPROCLIM when racct_add() refuses the thread, or the error from
  * kern_thr_alloc() or initialize_thread().
  */
 int
 thread_create(struct thread *td, struct rtprio *rtp,
     int (*initialize_thread)(struct thread *, void *), void *thunk)
 {
 	struct thread *newtd;
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 
 	/* Validate the requested priority before allocating anything. */
 	if (rtp != NULL) {
 		switch(rtp->type) {
 		case RTP_PRIO_REALTIME:
 		case RTP_PRIO_FIFO:
 			/* Only root can set scheduler policy */
 			if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0)
 				return (EPERM);
 			if (rtp->prio > RTP_PRIO_MAX)
 				return (EINVAL);
 			break;
 		case RTP_PRIO_NORMAL:
 			/* The caller's prio value is overwritten with 0. */
 			rtp->prio = 0;
 			break;
 		default:
 			return (EINVAL);
 		}
 	}
 
 #ifdef RACCT
 	/* Charge the thread up front; the 'fail' path below undoes it. */
 	if (racct_enable) {
 		PROC_LOCK(p);
 		error = racct_add(p, RACCT_NTHR, 1);
 		PROC_UNLOCK(p);
 		if (error != 0)
 			return (EPROCLIM);
 	}
 #endif
 
 	/* Initialize our td */
 	error = kern_thr_alloc(p, 0, &newtd);
 	if (error)
 		goto fail;
 
 	cpu_set_upcall(newtd, td);
 
 	/*
 	 * Zero the td_startzero..td_endzero range of the new thread and
 	 * copy the td_startcopy..td_endcopy range from the creator.
 	 */
 	bzero(&newtd->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 	bcopy(&td->td_startcopy, &newtd->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 	newtd->td_proc = td->td_proc;
 	thread_cow_get(newtd, td);
 
 	/* Caller-specific setup; on failure the new thread is released. */
 	error = initialize_thread(newtd, thunk);
 	if (error != 0) {
 		thread_cow_free(newtd);
 		thread_free(newtd);
 		goto fail;
 	}
 
 	PROC_LOCK(p);
 	p->p_flag |= P_HADTHREADS;
 	thread_link(newtd, p);
 	/* The new thread's name starts out as the process name. */
 	bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
 	thread_lock(td);
 	/* let the scheduler know about these things. */
 	sched_fork_thread(td, newtd);
 	thread_unlock(td);
 	if (P_SHOULDSTOP(p))
 		newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 	if (p->p_flag2 & P2_LWP_EVENTS)
 		newtd->td_dbgflags |= TDB_BORN;
 
 	/*
 	 * Copy the existing thread VM policy into the new thread.
 	 */
 	vm_domain_policy_localcopy(&newtd->td_vm_dom_policy,
 	    &td->td_vm_dom_policy);
 
 	PROC_UNLOCK(p);
 
 	tidhash_add(newtd);
 
 	thread_lock(newtd);
 	if (rtp != NULL) {
 		/*
 		 * Apply the requested priority, except when a timeshare
 		 * creator asked for RTP_PRIO_NORMAL.
 		 */
 		if (!(td->td_pri_class == PRI_TIMESHARE &&
 		      rtp->type == RTP_PRIO_NORMAL)) {
 			rtp_to_pri(rtp, newtd);
 			sched_prio(newtd, newtd->td_user_pri);
 		} /* ignore timesharing class */
 	}
 	/* Mark the thread runnable and hand it to the scheduler. */
 	TD_SET_CAN_RUN(newtd);
 	sched_add(newtd, SRQ_BORING);
 	thread_unlock(newtd);
 
 	return (0);
 
 fail:
 #ifdef RACCT
 	/* Give back the RACCT_NTHR charge taken above. */
 	if (racct_enable) {
 		PROC_LOCK(p);
 		racct_sub(p, RACCT_NTHR, 1);
 		PROC_UNLOCK(p);
 	}
 #endif
 	return (error);
 }
 
 /*
  * Store the calling thread's id (td_tid) through the user pointer
  * uap->id.  Returns EFAULT when suword_lwpid() reports -1, else 0.
  */
 int
 sys_thr_self(struct thread *td, struct thr_self_args *uap)
     /* long *id */
 {
 
 	if (suword_lwpid(uap->id, (unsigned)td->td_tid) == -1)
 		return (EFAULT);
 	return (0);
 }
 
 /*
  * Thread-exit system call entry point.
  *
  * Runs umtx_thread_exit() for the caller first, then, when uap->state
  * is non-NULL, stores 1 at that user address and wakes umtx waiters on
  * it, and finally hands off to kern_thr_exit().  The return value is
  * whatever kern_thr_exit() returns.
  */
 int
 sys_thr_exit(struct thread *td, struct thr_exit_args *uap)
     /* long *state */
 {
 
+	umtx_thread_exit(td);
+
 	/* Signal userland that it can free the stack. */
 	if ((void *)uap->state != NULL) {
 		/* The result of the user-space store is not checked. */
 		suword_lwpid(uap->state, 1);
 		kern_umtx_wake(td, uap->state, INT_MAX, 0);
 	}
 
 	return (kern_thr_exit(td));
 }
 
 /*
  * Terminate the calling thread unless it is the last thread of its
  * process that is not already committed to exiting.
  *
  * Returns 0 without exiting in that "last thread" case; otherwise the
  * function ends in thread_exit() and does not return.
  */
 int
 kern_thr_exit(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 
 	/*
 	 * If all of the threads in a process call this routine to
 	 * exit (e.g. all threads call pthread_exit()), exactly one
 	 * thread should return to the caller to terminate the process
 	 * instead of the thread.
 	 *
 	 * Checking p_numthreads alone is not sufficient since threads
 	 * might be committed to terminating while the PROC_LOCK is
 	 * dropped in either ptracestop() or while removing this thread
 	 * from the tidhash.  Instead, the p_pendingexits field holds
 	 * the count of threads in either of those states and a thread
 	 * is considered the "last" thread if all of the other threads
 	 * in a process are already terminating.
 	 */
 	PROC_LOCK(p);
 	if (p->p_numthreads == p->p_pendingexits + 1) {
 		/*
 		 * Ignore attempts to shut down last thread in the
 		 * proc.  This will actually call _exit(2) in the
 		 * usermode trampoline when it returns.
 		 */
 		PROC_UNLOCK(p);
 		return (0);
 	}
 
 	/*
 	 * Mark this thread as committed to exiting before the proc
 	 * lock can be dropped; the count is undone once the lock is
 	 * held again below.
 	 */
 	p->p_pendingexits++;
 	td->td_dbgflags |= TDB_EXIT;
 	if (p->p_flag & P_TRACED && p->p_flag2 & P2_LWP_EVENTS)
 		ptracestop(td, SIGTRAP);
 	/* tidhash_remove() is called with the proc lock released. */
 	PROC_UNLOCK(p);
 	tidhash_remove(td);
 	PROC_LOCK(p);
 	p->p_pendingexits--;
 
 	/*
 	 * The check above should prevent all other threads from this
 	 * process from exiting while the PROC_LOCK is dropped, so
 	 * there must be at least one other thread other than the
 	 * current thread.
 	 */
 	KASSERT(p->p_numthreads > 1, ("too few threads"));
 	racct_sub(p, RACCT_NTHR, 1);
 	tdsigcleanup(td);
-	umtx_thread_exit(td);
 	/* thread_exit() asserts both the proc lock and the proc spin lock. */
 	PROC_SLOCK(p);
 	thread_stopped(p);
 	thread_exit();
 	/* NOTREACHED */
 }
 
 /*
  * Send signal uap->sig to a thread of the calling process.
  *
  * uap->id == -1 targets every thread except the caller; any other value
  * names a single thread.  A signal number of 0 only probes for the
  * target.  Returns 0 on success, ESRCH when no target thread is found
  * and EINVAL for an invalid non-zero signal number.
  */
 int
 sys_thr_kill(struct thread *td, struct thr_kill_args *uap)
     /* long id, int sig */
 {
 	ksiginfo_t ksi;
 	struct thread *target;
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 	ksiginfo_init(&ksi);
 	ksi.ksi_signo = uap->sig;
 	ksi.ksi_code = SI_LWP;
 	ksi.ksi_pid = p->p_pid;
 	ksi.ksi_uid = td->td_ucred->cr_ruid;
 
 	if (uap->id != -1) {
 		/*
 		 * Single target.  The process is unlocked below without
 		 * being locked here, so tdfind() presumably returns with
 		 * the target's process locked.
 		 */
 		target = tdfind((lwpid_t)uap->id, p->p_pid);
 		if (target == NULL)
 			return (ESRCH);
 		error = 0;
 		if (uap->sig != 0) {
 			if (_SIG_VALID(uap->sig))
 				tdksignal(target, uap->sig, &ksi);
 			else
 				error = EINVAL;
 		}
 		PROC_UNLOCK(target->td_proc);
 		return (error);
 	}
 
 	/* Broadcast to all sibling threads. */
 	if (uap->sig != 0 && !_SIG_VALID(uap->sig))
 		return (EINVAL);
 
 	error = ESRCH;
 	PROC_LOCK(p);
 	FOREACH_THREAD_IN_PROC(p, target) {
 		if (target == td)
 			continue;
 		error = 0;
 		/* Signal 0: one sibling is enough to report success. */
 		if (uap->sig == 0)
 			break;
 		tdksignal(target, uap->sig, &ksi);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 int
 sys_thr_kill2(struct thread *td, struct thr_kill2_args *uap)
     /* pid_t pid, long id, int sig */
 {
 	ksiginfo_t ksi;
 	struct thread *ttd;
 	struct proc *p;
 	int error;
 
 	AUDIT_ARG_SIGNUM(uap->sig);
 
 	ksiginfo_init(&ksi);
 	ksi.ksi_signo = uap->sig;
 	ksi.ksi_code = SI_LWP;
 	ksi.ksi_pid = td->td_proc->p_pid;
 	ksi.ksi_uid = td->td_ucred->cr_ruid;
 	if (uap->id == -1) {
 		if ((p = pfind(uap->pid)) == NULL)
 			return (ESRCH);
 		AUDIT_ARG_PROCESS(p);
 		error = p_cansignal(td, p, uap->sig);
 		if (error) {
 			PROC_UNLOCK(p);
 			return (error);
 		}
 		if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
 			error = EINVAL;
 		} else {
 			error = ESRCH;
 			FOREACH_THREAD_IN_PROC(p, ttd) {
 				if (ttd != td) {
 					error = 0;
 					if (uap->sig == 0)
 						break;
 					tdksignal(ttd, uap->sig, &ksi);
 				}
 			}
 		}
 		PROC_UNLOCK(p);
 	} else {
 		ttd = tdfind((lwpid_t)uap->id, uap->pid);
 		if (ttd == NULL)
 			return (ESRCH);
 		p = ttd->td_proc;
 		AUDIT_ARG_PROCESS(p);
 		error = p_cansignal(td, p, uap->sig);
 		if (uap->sig == 0)
 			;
 		else if (!_SIG_VALID(uap->sig))
 			error = EINVAL;
 		else
 			tdksignal(ttd, uap->sig, &ksi);
 		PROC_UNLOCK(p);
 	}
 	return (error);
 }
 
 /*
  * System call wrapper around kern_thr_suspend().  When uap->timeout is
  * non-NULL it is copied in with umtx_copyin_timeout() and a copy-in
  * failure is returned as-is; a NULL timeout is passed through as NULL.
  */
 int
 sys_thr_suspend(struct thread *td, struct thr_suspend_args *uap)
 	/* const struct timespec *timeout */
 {
 	struct timespec ts;
 	int error;
 
 	if (uap->timeout == NULL)
 		return (kern_thr_suspend(td, NULL));
 
 	error = umtx_copyin_timeout(uap->timeout, &ts);
 	if (error != 0)
 		return (error);
 	return (kern_thr_suspend(td, &ts));
 }
 
 int
 kern_thr_suspend(struct thread *td, struct timespec *tsp)
 {
 	struct proc *p = td->td_proc;
 	struct timeval tv;
 	int error = 0;
 	int timo = 0;
 
 	if (td->td_pflags & TDP_WAKEUP) {
 		td->td_pflags &= ~TDP_WAKEUP;
 		return (0);
 	}
 
 	if (tsp != NULL) {
 		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
 			error = EWOULDBLOCK;
 		else {
 			TIMESPEC_TO_TIMEVAL(&tv, tsp);
 			timo = tvtohz(&tv);
 		}
 	}
 
 	PROC_LOCK(p);
 	if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0)
 		error = msleep((void *)td, &p->p_mtx,
 			 PCATCH, "lthr", timo);
 
 	if (td->td_flags & TDF_THRWAKEUP) {
 		thread_lock(td);
 		td->td_flags &= ~TDF_THRWAKEUP;
 		thread_unlock(td);
 		PROC_UNLOCK(p);
 		return (0);
 	}
 	PROC_UNLOCK(p);
 	if (error == EWOULDBLOCK)
 		error = ETIMEDOUT;
 	else if (error == ERESTART) {
 		if (timo != 0)
 			error = EINTR;
 	}
 	return (error);
 }
 
 /*
  * Post a wakeup for thread uap->id of the calling process.
  *
  * Waking oneself only sets TDP_WAKEUP in td_pflags.  For another thread
  * TDF_THRWAKEUP is set under the thread lock and wakeup() is issued on
  * the thread pointer.  Returns ESRCH when the thread is not found.
  */
 int
 sys_thr_wake(struct thread *td, struct thr_wake_args *uap)
 	/* long id */
 {
 	struct thread *target;
 
 	if (uap->id != td->td_tid) {
 		target = tdfind((lwpid_t)uap->id, td->td_proc->p_pid);
 		if (target == NULL)
 			return (ESRCH);
 		thread_lock(target);
 		target->td_flags |= TDF_THRWAKEUP;
 		thread_unlock(target);
 		wakeup((void *)target);
 		PROC_UNLOCK(td->td_proc);
 	} else {
 		td->td_pflags |= TDP_WAKEUP;
 	}
 	return (0);
 }
 
 int
 sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap)
 {
 	struct proc *p;
 	char name[MAXCOMLEN + 1];
 	struct thread *ttd;
 	int error;
 
 	error = 0;
 	name[0] = '\0';
 	if (uap->name != NULL) {
 		error = copyinstr(uap->name, name, sizeof(name),
 			NULL);
 		if (error)
 			return (error);
 	}
 	p = td->td_proc;
 	ttd = tdfind((lwpid_t)uap->id, p->p_pid);
 	if (ttd == NULL)
 		return (ESRCH);
 	strcpy(ttd->td_name, name);
 #ifdef KTR
 	sched_clear_tdname(ttd);
 #endif
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Allocate a thread for process p and hand it back through *ntd.
  *
  * Returns EPROCLIM (and bumps max_threads_hits) when p already has
  * max_threads_per_proc threads, ENOMEM when thread_alloc() fails, and
  * 0 on success.  On ENOMEM *ntd is left set to NULL.
  */
 int
 kern_thr_alloc(struct proc *p, int pages, struct thread **ntd)
 {
 	struct thread *newtd;
 
 	/* p_numthreads is read without a lock; the race is accepted. */
 	if (p->p_numthreads >= max_threads_per_proc) {
 		++max_threads_hits;
 		return (EPROCLIM);
 	}
 
 	newtd = thread_alloc(pages);
 	*ntd = newtd;
 	return (newtd == NULL ? ENOMEM : 0);
 }
Index: head/sys/kern/kern_thread.c
===================================================================
--- head/sys/kern/kern_thread.c	(revision 300042)
+++ head/sys/kern/kern_thread.c	(revision 300043)
@@ -1,1205 +1,1206 @@
 /*-
  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
  *  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  */
 
 #include "opt_witness.h"
 #include "opt_hwpmc_hooks.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rangelock.h>
 #include <sys/resourcevar.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/selinfo.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/turnstile.h>
 #include <sys/ktr.h>
 #include <sys/rwlock.h>
 #include <sys/umtx.h>
 #include <sys/cpuset.h>
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/vm_domain.h>
 #include <sys/eventhandler.h>
 
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE(proc, , , lwp__exit);
 
 /*
  * thread related storage.
  */
 /* UMA zone that backs struct thread; created in threadinit(). */
 static uma_zone_t thread_zone;
 
 /*
  * Exited threads waiting to be freed.  thread_zombie() inserts and
  * thread_reap() drains, both under the zombie_lock spin mutex.
  */
 TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
 static struct mtx zombie_lock;
 MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN);
 
 static void thread_zombie(struct thread *);
 static int thread_unsuspend_one(struct thread *td, struct proc *p,
     bool boundary);
 
 /* Number of slots in the circular buffer of freed thread ids. */
 #define TID_BUFFER_SIZE	1024
 
 /*
  * Thread id allocation state.  tid_unrhdr is the unit-number allocator
  * set up in threadinit(); tid_buffer is a circular buffer of freed ids
  * indexed by tid_head and tid_tail, which tid_alloc() and tid_free()
  * access under tid_lock.
  */
 struct mtx tid_lock;
 static struct unrhdr *tid_unrhdr;
 static lwpid_t tid_buffer[TID_BUFFER_SIZE];
 static int tid_head, tid_tail;
 static MALLOC_DEFINE(M_TIDHASH, "tidhash", "thread hash");
 
 /* Thread id hash table, its mask and its lock; set up in threadinit(). */
 struct	tidhashhead *tidhashtbl;
 u_long	tidhash;
 struct	rwlock tidhash_lock;
 
 /*
  * Hand out a thread id.  The unit-number allocator is tried first; if
  * it returns -1, the oldest entry of the freed-id buffer is taken
  * instead.  Returns -1 when that buffer is empty as well.
  */
 static lwpid_t
 tid_alloc(void)
 {
 	lwpid_t newtid;
 
 	newtid = alloc_unr(tid_unrhdr);
 	if (newtid == -1) {
 		mtx_lock(&tid_lock);
 		if (tid_head != tid_tail) {
 			newtid = tid_buffer[tid_head];
 			tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
 		}
 		mtx_unlock(&tid_lock);
 	}
 	return (newtid);
 }
 
 /*
  * Return a thread id.  The id is appended to the freed-id buffer; when
  * the buffer is full, its oldest entry is evicted first and given back
  * to the unit-number allocator after tid_lock has been dropped.
  */
 static void
 tid_free(lwpid_t tid)
 {
 	lwpid_t evicted;
 	int next_tail;
 
 	evicted = -1;
 	mtx_lock(&tid_lock);
 	next_tail = (tid_tail + 1) % TID_BUFFER_SIZE;
 	if (next_tail == tid_head) {
 		/* Buffer full: make room by dropping the head entry. */
 		evicted = tid_buffer[tid_head];
 		tid_head = (tid_head + 1) % TID_BUFFER_SIZE;
 	}
 	tid_buffer[tid_tail] = tid;
 	tid_tail = next_tail;
 	mtx_unlock(&tid_lock);
 
 	if (evicted != -1)
 		free_unr(tid_unrhdr, evicted);
 }
 
 /*
  * Constructor passed to uma_zcreate() for the thread zone: puts a
  * struct thread into its initial state, assigns it a thread id, and
  * runs the thread_ctor event handlers plus the audit and umtx hooks.
  * Always returns 0.
  */
 static int
 thread_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct thread *td = mem;
 
 	td->td_state = TDS_INACTIVE;
 	td->td_oncpu = NOCPU;
 	td->td_tid = tid_alloc();
 	td->td_lend_user_pri = PRI_MAX;
 
 	/*
 	 * td_critnest starts at 1: the thread is not running yet, so it
 	 * is implicitly waiting to be on the receiving end of a context
 	 * switch.
 	 */
 	td->td_critnest = 1;
 
 	EVENTHANDLER_INVOKE(thread_ctor, td);
 #ifdef AUDIT
 	audit_thread_alloc(td);
 #endif
 	umtx_thread_alloc(td);
 	return (0);
 }
 
 /*
  * Destructor passed to uma_zcreate() for the thread zone: releases
  * per-thread audit and OSD state, runs the thread_dtor event handlers
  * and returns the thread id.  With INVARIANTS, panics unless the thread
  * is in TDS_INACTIVE state.
  */
 static void
 thread_dtor(void *mem, int size, void *arg)
 {
 	struct thread *td = mem;
 
 #ifdef INVARIANTS
 	/* Verify that this thread is in a safe state to free. */
 	if (td->td_state == TDS_INHIBITED || td->td_state == TDS_RUNNING ||
 	    td->td_state == TDS_CAN_RUN || td->td_state == TDS_RUNQ) {
 		/*
 		 * A thread in any of these states is still active and
 		 * must never be unlinked.
 		 */
 		panic("bad state for thread unlinking");
 		/* NOTREACHED */
 	} else if (td->td_state != TDS_INACTIVE) {
 		panic("bad thread state");
 		/* NOTREACHED */
 	}
 #endif
 #ifdef AUDIT
 	audit_thread_free(td);
 #endif
 	/* Drop every OSD slot attached to this thread. */
 	osd_thread_exit(td);
 
 	EVENTHANDLER_INVOKE(thread_dtor, td);
 	tid_free(td->td_tid);
 }
 
 /*
  * Zone init routine passed to uma_zcreate(): sets up the type-stable
  * parts of a newly created struct thread (sleep queue, turnstile,
  * scheduler area, umtx state).  Always returns 0.
  */
 static int
 thread_init(void *mem, int size, int flags)
 {
 	struct thread *td = mem;
 
 	td->td_sleepqueue = sleepq_alloc();
 	td->td_turnstile = turnstile_alloc();
 	td->td_rlqe = NULL;
 	EVENTHANDLER_INVOKE(thread_init, td);
 	/* td_sched points at the memory immediately after struct thread. */
 	td->td_sched = (struct td_sched *)(td + 1);
 	umtx_thread_init(td);
 	td->td_kstack = 0;
 	td->td_sel = NULL;
 	return (0);
 }
 
 /*
  * Zone fini routine passed to uma_zcreate(): runs the thread_fini
  * event handlers, then releases the range-lock entry, turnstile, sleep
  * queue, umtx state and select state of a struct thread that is about
  * to be discarded.
  */
 static void
 thread_fini(void *mem, int size)
 {
 	struct thread *td = mem;
 
 	EVENTHANDLER_INVOKE(thread_fini, td);
 	rlqentry_free(td->td_rlqe);
 	turnstile_free(td->td_turnstile);
 	sleepq_free(td->td_sleepqueue);
 	umtx_thread_fini(td);
 	seltdfini(td);
 }
 
 /*
  * Link-up for a newly created process: initialize the process's thread
  * list, then let proc_linkup() wire up the structures and the initial
  * thread.
  *
  * Called from:
  * {arch}/{arch}/machdep.c   {arch}_init(), init386() etc.
  * proc_dtor() (should go away)
  * proc_init()
  */
 void
 proc_linkup0(struct proc *p, struct thread *td)
 {
 
 	/* List of all threads in the process. */
 	TAILQ_INIT(&p->p_threads);
 	proc_linkup(p, td);
 }
 
 void
 proc_linkup(struct proc *p, struct thread *td)
 {
 
 	sigqueue_init(&p->p_sigqueue, p);
 	p->p_ksi = ksiginfo_alloc(1);
 	if (p->p_ksi != NULL) {
 		/* XXX p_ksi may be null if ksiginfo zone is not ready */
 		p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
 	}
 	LIST_INIT(&p->p_mqnotifier);
 	p->p_numthreads = 0;
 	thread_link(td, p);
 }
 
 /*
  * Set up the global thread allocation state: the TID lock and
  * unit-number allocator, the UMA zone for struct thread, and the
  * thread id hash table with its lock.
  */
 void
 threadinit(void)
 {
 
 	mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);
 
 	/*
 	 * Thread ids start above PID_MAX because pid_max cannot exceed
 	 * PID_MAX; one number is left out for thread0.
 	 */
 	tid_unrhdr = new_unrhdr(PID_MAX + 2, INT_MAX, &tid_lock);
 
 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
 	    thread_ctor, thread_dtor, thread_init, thread_fini,
 	    16 - 1, UMA_ZONE_NOFREE);
 
 	tidhashtbl = hashinit(maxproc / 2, M_TIDHASH, &tidhash);
 	rw_init(&tidhash_lock, "tidhash");
 }
 
 /*
  * Queue an unused thread at the head of the zombie list, under
  * zombie_lock.  The thread's td_slpq linkage is reused for the list
  * because it must be unused by this point.
  */
 void
 thread_zombie(struct thread *td)
 {
 
 	mtx_lock_spin(&zombie_lock);
 	TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq);
 	mtx_unlock_spin(&zombie_lock);
 }
 
 /*
  * Release a thread that has exited after cpu_throw().
  */
 void
 thread_stash(struct thread *td)
 {
 	atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
 	thread_zombie(td);
 }
 
 /*
  * Reap zombie resources.
  */
 void
 thread_reap(void)
 {
 	struct thread *td_first, *td_next;
 
 	/*
 	 * Don't even bother to lock if none at this instant,
 	 * we really don't care about the next instant..
 	 */
 	if (!TAILQ_EMPTY(&zombie_threads)) {
 		mtx_lock_spin(&zombie_lock);
 		td_first = TAILQ_FIRST(&zombie_threads);
 		if (td_first)
 			TAILQ_INIT(&zombie_threads);
 		mtx_unlock_spin(&zombie_lock);
 		while (td_first) {
 			td_next = TAILQ_NEXT(td_first, td_slpq);
 			thread_cow_free(td_first);
 			thread_free(td_first);
 			td_first = td_next;
 		}
 	}
 }
 
 /*
  * Allocate a thread from the thread zone and give it a kernel stack
  * of `pages' pages.  Returns NULL, after returning the thread to the
  * zone, when vm_thread_new() fails.
  */
 struct thread *
 thread_alloc(int pages)
 {
 	struct thread *newtd;
 
 	/* Free any zombie threads first. */
 	thread_reap();
 
 	newtd = uma_zalloc(thread_zone, M_WAITOK);
 	KASSERT(newtd->td_kstack == 0, ("thread_alloc got thread with kstack"));
 	if (vm_thread_new(newtd, pages) != 0) {
 		cpu_thread_alloc(newtd);
 		vm_domain_policy_init(&newtd->td_vm_dom_policy);
 		return (newtd);
 	}
 	uma_zfree(thread_zone, newtd);
 	return (NULL);
 }
 
 /*
  * Give an existing thread that has no kernel stack a new one of
  * `pages' pages.  Returns 1 on success and 0 when vm_thread_new()
  * fails.
  */
 int
 thread_alloc_stack(struct thread *td, int pages)
 {
 
 	KASSERT(td->td_kstack == 0,
 	    ("thread_alloc_stack called on a thread with kstack"));
 	if (vm_thread_new(td, pages) != 0) {
 		cpu_thread_alloc(td);
 		return (1);
 	}
 	return (0);
 }
 
 /*
  * Release a thread: lock-profiling state, cpuset reference,
  * machine-dependent state, kernel stack (if any) and VM domain policy,
  * then return the structure to the thread zone.
  */
 void
 thread_free(struct thread *td)
 {
 
 	lock_profile_thread_exit(td);
 	if (td->td_cpuset != NULL)
 		cpuset_rel(td->td_cpuset);
 	td->td_cpuset = NULL;
 	cpu_thread_free(td);
 	if (td->td_kstack != 0)
 		vm_thread_dispose(td);
 	vm_domain_policy_cleanup(&td->td_vm_dom_policy);
 	uma_zfree(thread_zone, td);
 }
 
 /*
  * Give newtd references to the credentials and resource limits of
  * process p and copy p's cowgen value.  The proc lock must be held.
  */
 void
 thread_cow_get_proc(struct thread *newtd, struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	newtd->td_ucred = crhold(p->p_ucred);
 	newtd->td_limit = lim_hold(p->p_limit);
 	newtd->td_cowgen = p->p_cowgen;
 }
 
 /*
  * Give newtd references to the credentials and resource limits held
  * by thread td and copy td's cowgen value.
  */
 void
 thread_cow_get(struct thread *newtd, struct thread *td)
 {
 
 	newtd->td_cowgen = td->td_cowgen;
 	newtd->td_ucred = crhold(td->td_ucred);
 	newtd->td_limit = lim_hold(td->td_limit);
 }
 
 /*
  * Drop the thread's credential and resource-limit references, skipping
  * whichever of the two is NULL.  The pointers themselves are left as
  * they are.
  */
 void
 thread_cow_free(struct thread *td)
 {
 
 	if (td->td_ucred != NULL) {
 		crfree(td->td_ucred);
 	}
 	if (td->td_limit != NULL) {
 		lim_free(td->td_limit);
 	}
 }
 
 /*
  * Bring the thread's credential and resource-limit references and its
  * cowgen value in line with its process.  The new references are taken
  * under the proc lock; the ones they replace are released after the
  * lock has been dropped.
  */
 void
 thread_cow_update(struct thread *td)
 {
 	struct plimit *stale_limit;
 	struct ucred *stale_cred;
 	struct proc *p;
 
 	stale_cred = NULL;
 	stale_limit = NULL;
 	p = td->td_proc;
 
 	PROC_LOCK(p);
 	if (td->td_ucred != p->p_ucred) {
 		stale_cred = td->td_ucred;
 		td->td_ucred = crhold(p->p_ucred);
 	}
 	if (td->td_limit != p->p_limit) {
 		stale_limit = td->td_limit;
 		td->td_limit = lim_hold(p->p_limit);
 	}
 	td->td_cowgen = p->p_cowgen;
 	PROC_UNLOCK(p);
 
 	if (stale_cred != NULL)
 		crfree(stale_cred);
 	if (stale_limit != NULL)
 		lim_free(stale_limit);
 }
 
 /*
  * Discard the current thread and exit from its context.
  * Always called with scheduler locked.
  *
  * Because we can't free a thread while we're operating under its context,
  * push the current thread into our CPU's deadthread holder. This means
  * we needn't worry about someone else grabbing our context before we
  * do a cpu_throw().
  *
  * The caller must hold both the proc lock and the proc spin lock (both
  * are asserted below).  The function operates on curthread and does not
  * return: it ends in sched_throw().
  */
 void
 thread_exit(void)
 {
 	uint64_t runtime, new_switchtime;
 	struct thread *td;
 	struct thread *td2;
 	struct proc *p;
 	int wakeup_swapper;
 
 	td = curthread;
 	p = td->td_proc;
 
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(p != NULL, ("thread exiting without a process"));
 	CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
 	    (long)p->p_pid, td->td_name);
 	KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
 
 #ifdef AUDIT
 	AUDIT_SYSCALL_EXIT(0, td);
 #endif
 	/*
 	 * drop FPU & debug register state storage, or any other
 	 * architecture specific resources that
 	 * would not be on a new untouched process.
 	 */
 	cpu_thread_exit(td);	/* XXXSMP */
 
 	/*
 	 * The last thread is left attached to the process
 	 * So that the whole bundle gets recycled. Skip
 	 * all this stuff if we never had threads.
 	 * EXIT clears all sign of other threads when
 	 * it goes to single threading, so the last thread always
 	 * takes the short path.
 	 */
 	if (p->p_flag & P_HADTHREADS) {
 		if (p->p_numthreads > 1) {
 			/*
 			 * Count this thread as exiting; thread_stash()
 			 * decrements p_exitthreads again.
 			 */
 			atomic_add_int(&td->td_proc->p_exitthreads, 1);
 			thread_unlink(td);
 			/* td is unlinked, so td2 is one of the remaining threads. */
 			td2 = FIRST_THREAD_IN_PROC(p);
 			sched_exit_thread(td2, td);
 
 			/*
 			 * The test below is NOT true if we are the
 			 * sole exiting thread. P_STOPPED_SINGLE is unset
 			 * in exit1() after it is the only survivor.
 			 */
 			if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 				/*
 				 * All remaining threads are suspended:
 				 * unsuspend the single-threading thread.
 				 */
 				if (p->p_numthreads == p->p_suspcount) {
 					thread_lock(p->p_singlethread);
 					wakeup_swapper = thread_unsuspend_one(
 						p->p_singlethread, p, false);
 					thread_unlock(p->p_singlethread);
 					if (wakeup_swapper)
 						kick_proc0();
 				}
 			}
 
 			PCPU_SET(deadthread, td);
 		} else {
 			/*
 			 * The last thread is exiting.. but not through exit()
 			 */
 			panic ("thread_exit: Last thread exiting on its own");
 		}
 	} 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * If this thread is part of a process that is being tracked by hwpmc(4),
 	 * inform the module of the thread's impending exit.
 	 */
 	if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 		PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
 	/*
 	 * Lock hand-off: release the proc lock, take the stat lock and
 	 * the thread lock, then release the proc spin lock.
 	 */
 	PROC_UNLOCK(p);
 	PROC_STATLOCK(p);
 	thread_lock(td);
 	PROC_SUNLOCK(p);
 
 	/* Do the same timestamp bookkeeping that mi_switch() would do. */
 	new_switchtime = cpu_ticks();
 	runtime = new_switchtime - PCPU_GET(switchtime);
 	td->td_runtime += runtime;
 	td->td_incruntime += runtime;
 	PCPU_SET(switchtime, new_switchtime);
 	PCPU_SET(switchticks, ticks);
 	PCPU_INC(cnt.v_swtch);
 
 	/* Save our resource usage in our process. */
 	td->td_ru.ru_nvcsw++;
 	ruxagg(p, td);
 	rucollect(&p->p_ru, &td->td_ru);
 	PROC_STATUNLOCK(p);
 
 	td->td_state = TDS_INACTIVE;
 #ifdef WITNESS
 	witness_thread_exit(td);
 #endif
 	CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
 	sched_throw(td);
 	panic("I'm a teapot!");
 	/* NOTREACHED */
 }
 
 /*
  * Thread-specific cleanup for wait(): release the resources still held
  * by the one remaining thread of process p and reap any zombies.
  * Called with Giant, proc and schedlock not held.
  */
 void
 thread_wait(struct proc *p)
 {
 	struct thread *last;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	KASSERT(p->p_numthreads == 1, ("multiple threads in thread_wait()"));
 	KASSERT(p->p_exitthreads == 0, ("p_exitthreads leaking"));
 
 	last = FIRST_THREAD_IN_PROC(p);
 
 	/*
 	 * Taking and dropping the thread lock makes us spin until the
 	 * last thread has left cpu_throw().
 	 */
 	thread_lock(last);
 	thread_unlock(last);
 
 	lock_profile_thread_exit(last);
 	cpuset_rel(last->td_cpuset);
 	last->td_cpuset = NULL;
 	cpu_thread_clean(last);
 	thread_cow_free(last);
 
 	/* Pick up zombie threads as well. */
 	thread_reap();
 }
 
 /*
  * Attach thread td to process p: reset the per-thread state the
  * process relies on, append td to p's thread list and bump
  * p_numthreads.
  */
 void
 thread_link(struct thread *td, struct proc *p)
 {
 
 	/*
 	 * XXX PROC_LOCK_ASSERT(p, MA_OWNED) cannot be enabled here:
 	 * this is also called for proc0 before its lock has been
 	 * created.
 	 */
 	td->td_proc = p;
 	td->td_state = TDS_INACTIVE;
 	td->td_flags = TDF_INMEM;
 
 	LIST_INIT(&td->td_contested);
 	LIST_INIT(&td->td_lprof[0]);
 	LIST_INIT(&td->td_lprof[1]);
 	sigqueue_init(&td->td_sigqueue, p);
 	callout_init(&td->td_slpcallout, 1);
 
 	TAILQ_INSERT_TAIL(&p->p_threads, td, td_plist);
 	p->p_numthreads++;
 }
 
 /*
  * Remove td from its process's thread list and decrement
  * p_numthreads.  The proc lock must be held.  td_proc is deliberately
  * left pointing at the process.
  *
  * Called from:
  *  thread_exit()
  */
 void
 thread_unlink(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	TAILQ_REMOVE(&p->p_threads, td, td_plist);
 	p->p_numthreads--;
 	/* Other fields could be cleared here, but NOT the links to proc. */
 }
 
 /*
  * Compute the thread count that thread_single() compares against
  * remain_for_mode() for the given single-threading mode: all threads
  * for SINGLE_EXIT, threads not counted in p_boundary_count for
  * SINGLE_BOUNDARY, and threads not counted in p_suspcount for
  * SINGLE_NO_EXIT and SINGLE_ALLPROC.  Panics on any other mode.
  * Both the proc lock and the proc spin lock must be held.
  */
 static int
 calc_remaining(struct proc *p, int mode)
 {
 	int remaining;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	switch (mode) {
 	case SINGLE_EXIT:
 		remaining = p->p_numthreads;
 		break;
 	case SINGLE_BOUNDARY:
 		remaining = p->p_numthreads - p->p_boundary_count;
 		break;
 	case SINGLE_NO_EXIT:
 	case SINGLE_ALLPROC:
 		remaining = p->p_numthreads - p->p_suspcount;
 		break;
 	default:
 		panic("calc_remaining: wrong mode %d", mode);
 	}
 	return (remaining);
 }
 
 /*
  * Target value for calc_remaining() in thread_single(): 0 for
  * SINGLE_ALLPROC, 1 for every other mode.
  */
 static int
 remain_for_mode(int mode)
 {
 
 	if (mode == SINGLE_ALLPROC)
 		return (0);
 	return (1);
 }
 
 /*
  * Nudge an inhibited thread td2 of process p so that it can react to
  * a single-threading request of the given mode: un-suspend it and/or
  * abort (or, for SINGLE_ALLPROC, directly suspend) an interruptible
  * sleep.
  *
  * The proc lock, the proc spin lock and td2's thread lock must all be
  * held.  Returns the OR of the results of thread_unsuspend_one() and
  * sleepq_abort(), which thread_single() uses to decide whether to call
  * kick_proc0().
  */
 static int
 weed_inhib(int mode, struct thread *td2, struct proc *p)
 {
 	int kick;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	THREAD_LOCK_ASSERT(td2, MA_OWNED);
 
 	kick = 0;
 	switch (mode) {
 	case SINGLE_EXIT:
 		if (TD_IS_SUSPENDED(td2))
 			kick |= thread_unsuspend_one(td2, p, true);
 		if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0)
 			kick |= sleepq_abort(td2, EINTR);
 		break;
 	case SINGLE_BOUNDARY:
 	case SINGLE_NO_EXIT:
 		/* Both modes are handled identically here. */
 		if (TD_IS_SUSPENDED(td2) &&
 		    (td2->td_flags & TDF_BOUNDARY) == 0)
 			kick |= thread_unsuspend_one(td2, p, false);
 		if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0)
 			kick |= sleepq_abort(td2, ERESTART);
 		break;
 	case SINGLE_ALLPROC:
 		/*
 		 * ALLPROC suspend tries to avoid spurious EINTR for
 		 * threads in an interruptible sleep by suspending the
 		 * thread directly, similarly to sig_suspend_threads().
 		 * Such a sleep is not at the user boundary, so
 		 * TDF_BOUNDARY is not set; TDF_ALLPROCSUSP is used
 		 * instead to avoid an immediate un-suspend.
 		 */
 		if (TD_IS_SUSPENDED(td2) &&
 		    (td2->td_flags & (TDF_BOUNDARY | TDF_ALLPROCSUSP)) == 0)
 			kick |= thread_unsuspend_one(td2, p, false);
 		if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR) != 0) {
 			if ((td2->td_flags & TDF_SBDRY) != 0) {
 				kick |= sleepq_abort(td2, ERESTART);
 			} else {
 				thread_suspend_one(td2);
 				td2->td_flags |= TDF_ALLPROCSUSP;
 			}
 		}
 		break;
 	}
 	return (kick);
 }
 
 /*
  * Enforce single-threading.
  *
  * Returns 1 if the caller must abort (another thread is waiting to
  * exit the process or similar). Process is locked!
  * Returns 0 when you are successfully the only thread running.
  * A process has successfully single threaded in the suspend mode when
  * There are no threads in user mode. Threads in the kernel must be
  * allowed to continue until they get to the user boundary. They may even
  * copy out their return values and data before suspending. They may however be
  * accelerated in reaching the user boundary as we will wake up
  * any sleeping threads that are interruptable. (PCATCH).
  *
  * 'mode' is one of SINGLE_EXIT, SINGLE_BOUNDARY, SINGLE_NO_EXIT or
  * SINGLE_ALLPROC (asserted below).  SINGLE_ALLPROC must target a process
  * other than the caller's; the other modes must target the caller's own
  * process.  The proc lock must be held on entry.
  */
 int
 thread_single(struct proc *p, int mode)
 {
 	struct thread *td;
 	struct thread *td2;
 	int remaining, wakeup_swapper;
 
 	td = curthread;
 	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
 	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
 	    ("invalid mode %d", mode));
 	/*
 	 * If allowing non-ALLPROC singlethreading for non-curproc
 	 * callers, calc_remaining() and remain_for_mode() should be
 	 * adjusted to also account for td->td_proc != p.  For now
 	 * this is not implemented because it is not used.
 	 */
 	KASSERT((mode == SINGLE_ALLPROC && td->td_proc != p) ||
 	    (mode != SINGLE_ALLPROC && td->td_proc == p),
 	    ("mode %d proc %p curproc %p", mode, p, td->td_proc));
 	mtx_assert(&Giant, MA_NOTOWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* A process that never had threads has nothing to single-thread. */
 	if ((p->p_flag & P_HADTHREADS) == 0 && mode != SINGLE_ALLPROC)
 		return (0);
 
 	/* Is someone already single threading? */
 	if (p->p_singlethread != NULL && p->p_singlethread != td)
 		return (1);
 
 	/* Record the requested mode in the process flags. */
 	if (mode == SINGLE_EXIT) {
 		p->p_flag |= P_SINGLE_EXIT;
 		p->p_flag &= ~P_SINGLE_BOUNDARY;
 	} else {
 		p->p_flag &= ~P_SINGLE_EXIT;
 		if (mode == SINGLE_BOUNDARY)
 			p->p_flag |= P_SINGLE_BOUNDARY;
 		else
 			p->p_flag &= ~P_SINGLE_BOUNDARY;
 	}
 	if (mode == SINGLE_ALLPROC)
 		p->p_flag |= P_TOTAL_STOP;
 	p->p_flag |= P_STOPPED_SINGLE;
 	PROC_SLOCK(p);
 	p->p_singlethread = td;
 	remaining = calc_remaining(p, mode);
 	/* Loop until the count for this mode reaches its target value. */
 	while (remaining != remain_for_mode(mode)) {
 		if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
 			goto stopme;
 		wakeup_swapper = 0;
 		/* Flag every other thread for a suspend check and prod it. */
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (td2 == td)
 				continue;
 			thread_lock(td2);
 			td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 			if (TD_IS_INHIBITED(td2)) {
 				wakeup_swapper |= weed_inhib(mode, td2, p);
 #ifdef SMP
 			} else if (TD_IS_RUNNING(td2) && td != td2) {
 				forward_signal(td2);
 #endif
 			}
 			thread_unlock(td2);
 		}
 		if (wakeup_swapper)
 			kick_proc0();
 		remaining = calc_remaining(p, mode);
 
 		/*
 		 * Maybe we suspended some threads.. was it enough?
 		 */
 		if (remaining == remain_for_mode(mode))
 			break;
 
 stopme:
 		/*
 		 * Wake us up when everyone else has suspended.
 		 * In the mean time we suspend as well.
 		 */
 		thread_suspend_switch(td, p);
 		remaining = calc_remaining(p, mode);
 	}
 	if (mode == SINGLE_EXIT) {
 		/*
 		 * Convert the process to an unthreaded process.  The
 		 * SINGLE_EXIT is called by exit1() or execve(), in
 		 * both cases other threads must be retired.
 		 */
 		KASSERT(p->p_numthreads == 1, ("Unthreading with >1 threads"));
 		p->p_singlethread = NULL;
 		p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_HADTHREADS);
 
 		/*
 		 * Wait for any remaining threads to exit cpu_throw().
 		 */
 		while (p->p_exitthreads != 0) {
 			/* Both locks are dropped around sched_relinquish(). */
 			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 			sched_relinquish(td);
 			PROC_LOCK(p);
 			PROC_SLOCK(p);
 		}
 	} else if (mode == SINGLE_BOUNDARY) {
 		/*
 		 * Wait until all suspended threads are removed from
 		 * the processors.  The thread_suspend_check()
 		 * increments p_boundary_count while it is still
 		 * running, which makes it possible for the execve()
 		 * to destroy vmspace while our other threads are
 		 * still using the address space.
 		 *
 		 * We lock the thread, which is only allowed to
 		 * succeed after context switch code finished using
 		 * the address space.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (td2 == td)
 				continue;
 			thread_lock(td2);
 			KASSERT((td2->td_flags & TDF_BOUNDARY) != 0,
 			    ("td %p not on boundary", td2));
 			KASSERT(TD_IS_SUSPENDED(td2),
 			    ("td %p is not suspended", td2));
 			thread_unlock(td2);
 		}
 	}
 	/* Only the spin lock is released; the proc lock stays held. */
 	PROC_SUNLOCK(p);
 	return (0);
 }
 
 bool
 thread_suspend_check_needed(void)
 {
 	struct proc *p;
 	struct thread *td;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	return (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) != 0 &&
 	    (td->td_dbgflags & TDB_SUSPEND) != 0));
 }
 
 /*
  * Called in from locations that can safely check to see
  * whether we have to suspend or at least throttle for a
  * single-thread event (e.g. fork).
  *
  * Such locations include userret().
  * If the "return_instead" argument is non zero, the thread must be able to
  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
  *
  * The 'return_instead' argument tells the function if it may do a
  * thread_exit() or suspend, or whether the caller must abort and back
  * out instead.
  *
  * If the thread that set the single_threading request has set the
  * P_SINGLE_EXIT bit in the process flags then this call will never return
  * if 'return_instead' is false, but will exit.
  *
  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
  *---------------+--------------------+---------------------
  *       0       | returns 0          |   returns 0 or 1
  *               | when ST ends       |   immediately
  *---------------+--------------------+---------------------
  *       1       | thread exits       |   returns 1
  *               |                    |  immediately
  * 0 = thread_exit() or suspension ok,
  * other = return error instead of stopping the thread.
  *
  * While a full suspension is under effect, even a single threading
  * thread would be suspended if it made this call (but it shouldn't).
  * This call should only be made from places where
  * thread_exit() would be safe as that may be the outcome unless
  * return_instead is set.
  *
  * Note: the non-zero values actually returned by the code below are
  * EINTR (P_SINGLE_EXIT with return_instead) and ERESTART
  * (P_SINGLE_BOUNDARY with return_instead).  The proc lock must be held
  * on entry and is held again on every return.
  */
 int
 thread_suspend_check(int return_instead)
 {
 	struct thread *td;
 	struct proc *p;
 	int wakeup_swapper;
 
 	td = curthread;
 	p = td->td_proc;
 	mtx_assert(&Giant, MA_NOTOWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* Re-evaluated after every suspension, once the proc lock is retaken. */
 	while (thread_suspend_check_needed()) {
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 			KASSERT(p->p_singlethread != NULL,
 			    ("singlethread not set"));
 			/*
 			 * The only suspension in action is a
 			 * single-threading. Single threader need not stop.
 			 * XXX Should be safe to access unlocked
 			 * as it can only be set to be true by us.
 			 */
 			if (p->p_singlethread == td)
 				return (0);	/* Exempt from stopping. */
 		}
 		if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
 			return (EINTR);
 
 		/* Should we goto user boundary if we didn't come from there? */
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
 		    (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
 			return (ERESTART);
 
 		/*
 		 * Ignore suspend requests if they are deferred.
 		 */
 		if ((td->td_flags & TDF_SBDRY) != 0) {
 			KASSERT(return_instead,
 			    ("TDF_SBDRY set for unsafe thread_suspend_check"));
 			return (0);
 		}
 
 		/*
 		 * If the process is waiting for us to exit,
 		 * this thread should just suicide.
 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
 		 */
 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) {
 			PROC_UNLOCK(p);
 
 			/*
 			 * Allow Linux emulation layer to do some work
 			 * before thread suicide.
 			 */
 			if (__predict_false(p->p_sysent->sv_thread_detach != NULL))
 				(p->p_sysent->sv_thread_detach)(td);
+			umtx_thread_exit(td);
 			kern_thr_exit(td);
 			panic("stopped thread did not exit");
 		}
 
 		PROC_SLOCK(p);
 		thread_stopped(p);
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 			/*
 			 * Every thread except this one is already counted
 			 * as suspended: unsuspend the single-threading
 			 * thread.
 			 */
 			if (p->p_numthreads == p->p_suspcount + 1) {
 				thread_lock(p->p_singlethread);
 				wakeup_swapper = thread_unsuspend_one(
 				    p->p_singlethread, p, false);
 				thread_unlock(p->p_singlethread);
 				if (wakeup_swapper)
 					kick_proc0();
 			}
 		}
 		PROC_UNLOCK(p);
 		thread_lock(td);
 		/*
 		 * When a thread suspends, it just
 		 * gets taken off all queues.
 		 */
 		thread_suspend_one(td);
 		/* Only suspensions with return_instead == 0 are marked as boundary. */
 		if (return_instead == 0) {
 			p->p_boundary_count++;
 			td->td_flags |= TDF_BOUNDARY;
 		}
 		PROC_SUNLOCK(p);
 		mi_switch(SW_INVOL | SWT_SUSPEND, NULL);
 		thread_unlock(td);
 		PROC_LOCK(p);
 	}
 	return (0);
 }
 
 void
 thread_suspend_switch(struct thread *td, struct proc *p)
 {
 
 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * We implement thread_suspend_one in stages here to avoid
 	 * dropping the proc lock while the thread lock is owned.
 	 */
 	if (p == td->td_proc) {
 		thread_stopped(p);
 		p->p_suspcount++;
 	}
 	PROC_UNLOCK(p);
 	thread_lock(td);
 	td->td_flags &= ~TDF_NEEDSUSPCHK;
 	TD_SET_SUSPENDED(td);
 	sched_sleep(td, 0);
 	PROC_SUNLOCK(p);
 	DROP_GIANT();
 	mi_switch(SW_VOL | SWT_SUSPEND, NULL);
 	thread_unlock(td);
 	PICKUP_GIANT();
 	PROC_LOCK(p);
 	PROC_SLOCK(p);
 }
 
 void
 thread_suspend_one(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
 	p->p_suspcount++;
 	td->td_flags &= ~TDF_NEEDSUSPCHK;
 	TD_SET_SUSPENDED(td);
 	sched_sleep(td, 0);
 }
 
 static int
 thread_unsuspend_one(struct thread *td, struct proc *p, bool boundary)
 {
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
 	TD_CLR_SUSPENDED(td);
 	td->td_flags &= ~TDF_ALLPROCSUSP;
 	if (td->td_proc == p) {
 		PROC_SLOCK_ASSERT(p, MA_OWNED);
 		p->p_suspcount--;
 		if (boundary && (td->td_flags & TDF_BOUNDARY) != 0) {
 			td->td_flags &= ~TDF_BOUNDARY;
 			p->p_boundary_count--;
 		}
 	}
 	return (setrunnable(td));
 }
 
 /*
  * Allow all threads blocked by single threading to continue running.
  */
 void
 thread_unsuspend(struct proc *p)
 {
 	struct thread *td;
 	int wakeup_swapper;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	wakeup_swapper = 0;
 	if (!P_SHOULDSTOP(p)) {
                 FOREACH_THREAD_IN_PROC(p, td) {
 			thread_lock(td);
 			if (TD_IS_SUSPENDED(td)) {
 				wakeup_swapper |= thread_unsuspend_one(td, p,
 				    true);
 			}
 			thread_unlock(td);
 		}
 	} else if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
 	    p->p_numthreads == p->p_suspcount) {
 		/*
 		 * Stopping everything also did the job for the single
 		 * threading request. Now we've downgraded to single-threaded,
 		 * let it continue.
 		 */
 		if (p->p_singlethread->td_proc == p) {
 			thread_lock(p->p_singlethread);
 			wakeup_swapper = thread_unsuspend_one(
 			    p->p_singlethread, p, false);
 			thread_unlock(p->p_singlethread);
 		}
 	}
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * End the single threading mode..
  */
 void
 thread_single_end(struct proc *p, int mode)
 {
 	struct thread *td;
 	int wakeup_swapper;
 
 	KASSERT(mode == SINGLE_EXIT || mode == SINGLE_BOUNDARY ||
 	    mode == SINGLE_ALLPROC || mode == SINGLE_NO_EXIT,
 	    ("invalid mode %d", mode));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT((mode == SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) != 0) ||
 	    (mode != SINGLE_ALLPROC && (p->p_flag & P_TOTAL_STOP) == 0),
 	    ("mode %d does not match P_TOTAL_STOP", mode));
 	KASSERT(mode == SINGLE_ALLPROC || p->p_singlethread == curthread,
 	    ("thread_single_end from other thread %p %p",
 	    curthread, p->p_singlethread));
 	KASSERT(mode != SINGLE_BOUNDARY ||
 	    (p->p_flag & P_SINGLE_BOUNDARY) != 0,
 	    ("mis-matched SINGLE_BOUNDARY flags %x", p->p_flag));
 	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY |
 	    P_TOTAL_STOP);
 	PROC_SLOCK(p);
 	p->p_singlethread = NULL;
 	wakeup_swapper = 0;
 	/*
 	 * If there are other threads they may now run,
 	 * unless of course there is a blanket 'stop order'
 	 * on the process. The single threader must be allowed
 	 * to continue however as this is a bad place to stop.
 	 */
 	if (p->p_numthreads != remain_for_mode(mode) && !P_SHOULDSTOP(p)) {
                 FOREACH_THREAD_IN_PROC(p, td) {
 			thread_lock(td);
 			if (TD_IS_SUSPENDED(td)) {
 				wakeup_swapper |= thread_unsuspend_one(td, p,
 				    mode == SINGLE_BOUNDARY);
 			}
 			thread_unlock(td);
 		}
 	}
 	KASSERT(mode != SINGLE_BOUNDARY || p->p_boundary_count == 0,
 	    ("inconsistent boundary count %d", p->p_boundary_count));
 	PROC_SUNLOCK(p);
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 struct thread *
 thread_find(struct proc *p, lwpid_t tid)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	FOREACH_THREAD_IN_PROC(p, td) {
 		if (td->td_tid == tid)
 			break;
 	}
 	return (td);
 }
 
 /* Locate a thread by number; return with proc lock held. */
 struct thread *
 tdfind(lwpid_t tid, pid_t pid)
 {
 #define RUN_THRESH	16
 	struct thread *td;
 	int run = 0;
 
 	rw_rlock(&tidhash_lock);
 	LIST_FOREACH(td, TIDHASH(tid), td_hash) {
 		if (td->td_tid == tid) {
 			if (pid != -1 && td->td_proc->p_pid != pid) {
 				td = NULL;
 				break;
 			}
 			PROC_LOCK(td->td_proc);
 			if (td->td_proc->p_state == PRS_NEW) {
 				PROC_UNLOCK(td->td_proc);
 				td = NULL;
 				break;
 			}
 			if (run > RUN_THRESH) {
 				if (rw_try_upgrade(&tidhash_lock)) {
 					LIST_REMOVE(td, td_hash);
 					LIST_INSERT_HEAD(TIDHASH(td->td_tid),
 						td, td_hash);
 					rw_wunlock(&tidhash_lock);
 					return (td);
 				}
 			}
 			break;
 		}
 		run++;
 	}
 	rw_runlock(&tidhash_lock);
 	return (td);
 }
 
 void
 tidhash_add(struct thread *td)
 {
 	rw_wlock(&tidhash_lock);
 	LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
 	rw_wunlock(&tidhash_lock);
 }
 
 void
 tidhash_remove(struct thread *td)
 {
 	rw_wlock(&tidhash_lock);
 	LIST_REMOVE(td, td_hash);
 	rw_wunlock(&tidhash_lock);
 }
Index: head/sys/kern/kern_umtx.c
===================================================================
--- head/sys/kern/kern_umtx.c	(revision 300042)
+++ head/sys/kern/kern_umtx.c	(revision 300043)
@@ -1,4178 +1,4494 @@
 /*-
- * Copyright (c) 2015 The FreeBSD Foundation
+ * Copyright (c) 2015, 2016 The FreeBSD Foundation
  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_umtx_profiling.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/syscallsubr.h>
 #include <sys/taskqueue.h>
 #include <sys/eventhandler.h>
 #include <sys/umtx.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
 #include <machine/cpu.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32_proto.h>
 #endif
 
 #define _UMUTEX_TRY		1
 #define _UMUTEX_WAIT		2
 
 #ifdef UMTX_PROFILING
 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
 #endif
 
 /* Priority inheritance mutex info. */
 struct umtx_pi {
 	/* Owner thread */
 	struct thread		*pi_owner;
 
 	/* Reference count */
 	int			pi_refcount;
 
  	/* List entry to link umtx holding by thread */
 	TAILQ_ENTRY(umtx_pi)	pi_link;
 
 	/* List entry in hash */
 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
 
 	/* List for waiters */
 	TAILQ_HEAD(,umtx_q)	pi_blocked;
 
 	/* Identify a userland lock object */
 	struct umtx_key		pi_key;
 };
 
 /* A userland synchronous object user. */
 struct umtx_q {
 	/* Linked list for the hash. */
 	TAILQ_ENTRY(umtx_q)	uq_link;
 
 	/* Umtx key. */
 	struct umtx_key		uq_key;
 
 	/* Umtx flags. */
 	int			uq_flags;
 #define UQF_UMTXQ	0x0001
 
 	/* The thread waits on. */
 	struct thread		*uq_thread;
 
 	/*
 	 * Blocked on PI mutex. read can use chain lock
 	 * or umtx_lock, write must have both chain lock and
 	 * umtx_lock being hold.
 	 */
 	struct umtx_pi		*uq_pi_blocked;
 
 	/* On blocked list */
 	TAILQ_ENTRY(umtx_q)	uq_lockq;
 
 	/* Thread contending with us */
 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
 
 	/* Inherited priority from PP mutex */
 	u_char			uq_inherited_pri;
 	
 	/* Spare queue ready to be reused */
 	struct umtxq_queue	*uq_spare_queue;
 
 	/* The queue we on */
 	struct umtxq_queue	*uq_cur_queue;
 };
 
 TAILQ_HEAD(umtxq_head, umtx_q);
 
 /* Per-key wait-queue */
 struct umtxq_queue {
 	struct umtxq_head	head;
 	struct umtx_key		key;
 	LIST_ENTRY(umtxq_queue)	link;
 	int			length;
 };
 
 LIST_HEAD(umtxq_list, umtxq_queue);
 
 /* Userland lock object's wait-queue chain */
 struct umtxq_chain {
 	/* Lock for this chain. */
 	struct mtx		uc_lock;
 
 	/* List of sleep queues. */
 	struct umtxq_list	uc_queue[2];
 #define UMTX_SHARED_QUEUE	0
 #define UMTX_EXCLUSIVE_QUEUE	1
 
 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
 
 	/* Busy flag */
 	char			uc_busy;
 
 	/* Chain lock waiters */
 	int			uc_waiters;
 
 	/* All PI in the list */
 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
 
 #ifdef UMTX_PROFILING
 	u_int 			length;
 	u_int			max_length;
 #endif
 };
 
 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
 
 /*
  * Don't propagate time-sharing priority, there is a security reason,
  * a user can simply introduce PI-mutex, let thread A lock the mutex,
  * and let another thread B block on the mutex, because B is
  * sleeping, its priority will be boosted, this causes A's priority to
  * be boosted via priority propagating too and will never be lowered even
  * if it is using 100%CPU, this is unfair to other processes.
  */
 
 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
 
 #define	GOLDEN_RATIO_PRIME	2654404609U
 #define	UMTX_CHAINS		512
 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
 
 #define	GET_SHARE(flags)	\
     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
 
 #define BUSY_SPINS		200
 
 struct abs_timeout {
 	int clockid;
 	struct timespec cur;
 	struct timespec end;
 };
 
+#ifdef COMPAT_FREEBSD32
+struct umutex32 {
+	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
+	__uint32_t		m_flags;	/* Flags of the mutex */
+	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
+	__uint32_t		m_rb_lnk;	/* Robust linkage */
+	__uint32_t		m_pad;		/* Explicit padding */
+	__uint32_t		m_spare[2];	/* Spare; offset asserted */
+};
+/* umutex32 must match struct umutex in total size and m_spare offset. */
+_Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
+_Static_assert(__offsetof(struct umutex, m_spare[0]) ==
+    __offsetof(struct umutex32, m_spare[0]), "m_spare32");
+#endif /* COMPAT_FREEBSD32 */
+
+int umtx_shm_vnobj_persistent = 0;
+SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
+    &umtx_shm_vnobj_persistent, 0,
+    "False forces destruction of umtx attached to file, on last close");
+static int umtx_max_rb = 1000;		/* kern.ipc.umtx_max_robust */
+SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
+    &umtx_max_rb, 0,
+    "Maximum number of robust mutexes allowed for each thread");
+
 static uma_zone_t		umtx_pi_zone;
 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
 static int			umtx_pi_allocated;
 
 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
     &umtx_pi_allocated, 0, "Allocated umtx_pi");
+static int umtx_verbose_rb = 1;	/* debug.umtx.robust_faults_verbose */
+SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
+    &umtx_verbose_rb, 0,
+    "Report faults encountered while processing robust mutex lists");
 
 #ifdef UMTX_PROFILING
 static long max_length;
 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
 #endif
 
 static void umtx_shm_init(void);
 static void umtxq_sysinit(void *);
 static void umtxq_hash(struct umtx_key *key);
 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
 static void umtxq_lock(struct umtx_key *key);
 static void umtxq_unlock(struct umtx_key *key);
 static void umtxq_busy(struct umtx_key *key);
 static void umtxq_unbusy(struct umtx_key *key);
 static void umtxq_insert_queue(struct umtx_q *uq, int q);
 static void umtxq_remove_queue(struct umtx_q *uq, int q);
 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
 static int umtxq_count(struct umtx_key *key);
 static struct umtx_pi *umtx_pi_alloc(int);
 static void umtx_pi_free(struct umtx_pi *pi);
-static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
+static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
+    bool rb);
 static void umtx_thread_cleanup(struct thread *td);
 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
-	struct image_params *imgp __unused);
+    struct image_params *imgp __unused);
 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
 
 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
 
 static struct mtx umtx_lock;
 
 #ifdef UMTX_PROFILING
 static void
 umtx_init_profiling(void) 
 {
 	struct sysctl_oid *chain_oid;
 	char chain_name[10];
 	int i;
 
 	for (i = 0; i < UMTX_CHAINS; ++i) {
 		snprintf(chain_name, sizeof(chain_name), "%d", i);
 		chain_oid = SYSCTL_ADD_NODE(NULL, 
 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, 
 		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
 		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
 	}
 }
 
 static int
 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
 {
 	char buf[512];
 	struct sbuf sb;
 	struct umtxq_chain *uc;
 	u_int fract, i, j, tot, whole;
 	u_int sf0, sf1, sf2, sf3, sf4;
 	u_int si0, si1, si2, si3, si4;
 	u_int sw0, sw1, sw2, sw3, sw4;
 
 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
 	for (i = 0; i < 2; i++) {
 		tot = 0;
 		for (j = 0; j < UMTX_CHAINS; ++j) {
 			uc = &umtxq_chains[i][j];
 			mtx_lock(&uc->uc_lock);
 			tot += uc->max_length;
 			mtx_unlock(&uc->uc_lock);
 		}
 		if (tot == 0)
 			sbuf_printf(&sb, "%u) Empty ", i);
 		else {
 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
 			si0 = si1 = si2 = si3 = si4 = 0;
 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
 			for (j = 0; j < UMTX_CHAINS; j++) {
 				uc = &umtxq_chains[i][j];
 				mtx_lock(&uc->uc_lock);
 				whole = uc->max_length * 100;
 				mtx_unlock(&uc->uc_lock);
 				fract = (whole % tot) * 100;
 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
 					sf0 = fract;
 					si0 = j;
 					sw0 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
 				    sf1)) {
 					sf1 = fract;
 					si1 = j;
 					sw1 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
 				    sf2)) {
 					sf2 = fract;
 					si2 = j;
 					sw2 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
 				    sf3)) {
 					sf3 = fract;
 					si3 = j;
 					sw3 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
 				    sf4)) {
 					sf4 = fract;
 					si4 = j;
 					sw4 = whole;
 				}
 			}
 			sbuf_printf(&sb, "queue %u:\n", i);
 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
 			    sf0 / tot, si0);
 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
 			    sf1 / tot, si1);
 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
 			    sf2 / tot, si2);
 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
 			    sf3 / tot, si3);
 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
 			    sf4 / tot, si4);
 		}
 	}
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 	return (0);
 }
 
 static int
 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
 {
 	struct umtxq_chain *uc;
 	u_int i, j;
 	int clear, error;
 
 	clear = 0;
 	error = sysctl_handle_int(oidp, &clear, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	if (clear != 0) {
 		for (i = 0; i < 2; ++i) {
 			for (j = 0; j < UMTX_CHAINS; ++j) {
 				uc = &umtxq_chains[i][j];
 				mtx_lock(&uc->uc_lock);
 				uc->length = 0;
 				uc->max_length = 0;	
 				mtx_unlock(&uc->uc_lock);
 			}
 		}
 	}
 	return (0);
 }
 
 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
 #endif
 
 static void
 umtxq_sysinit(void *arg __unused)
 {
 	int i, j;
 
 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
 		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	for (i = 0; i < 2; ++i) {
 		for (j = 0; j < UMTX_CHAINS; ++j) {
 			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
 				 MTX_DEF | MTX_DUPOK);
 			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
 			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
 			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
 			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
 			umtxq_chains[i][j].uc_busy = 0;
 			umtxq_chains[i][j].uc_waiters = 0;
 #ifdef UMTX_PROFILING
 			umtxq_chains[i][j].length = 0;
 			umtxq_chains[i][j].max_length = 0;	
 #endif
 		}
 	}
 #ifdef UMTX_PROFILING
 	umtx_init_profiling();
 #endif
 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	umtx_shm_init();
 }
 
 struct umtx_q *
 umtxq_alloc(void)
 {
 	struct umtx_q *uq;
 
 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
-	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
+	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
+	    M_WAITOK | M_ZERO);
 	TAILQ_INIT(&uq->uq_spare_queue->head);
 	TAILQ_INIT(&uq->uq_pi_contested);
 	uq->uq_inherited_pri = PRI_MAX;
 	return (uq);
 }
 
 void
 umtxq_free(struct umtx_q *uq)
 {
+
 	MPASS(uq->uq_spare_queue != NULL);
 	free(uq->uq_spare_queue, M_UMTX);
 	free(uq, M_UMTX);
 }
 
 static inline void
 umtxq_hash(struct umtx_key *key)
 {
-	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
+	unsigned n;
+
+	n = (uintptr_t)key->info.both.a + key->info.both.b;
 	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
 }
 
 static inline struct umtxq_chain *
 umtxq_getchain(struct umtx_key *key)
 {
+
 	if (key->type <= TYPE_SEM)
 		return (&umtxq_chains[1][key->hash]);
 	return (&umtxq_chains[0][key->hash]);
 }
 
 /*
  * Lock a chain.
  */
 static inline void
 umtxq_lock(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_lock(&uc->uc_lock);
 }
 
 /*
  * Unlock a chain.
  */
 static inline void
 umtxq_unlock(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_unlock(&uc->uc_lock);
 }
 
 /*
  * Set chain to busy state when following operation
  * may be blocked (kernel mutex can not be used).
  */
 static inline void
 umtxq_busy(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_assert(&uc->uc_lock, MA_OWNED);
 	if (uc->uc_busy) {
 #ifdef SMP
 		if (smp_cpus > 1) {
 			int count = BUSY_SPINS;
 			if (count > 0) {
 				umtxq_unlock(key);
 				while (uc->uc_busy && --count > 0)
 					cpu_spinwait();
 				umtxq_lock(key);
 			}
 		}
 #endif
 		while (uc->uc_busy) {
 			uc->uc_waiters++;
 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
 			uc->uc_waiters--;
 		}
 	}
 	uc->uc_busy = 1;
 }
 
 /*
  * Unbusy a chain.
  */
 static inline void
 umtxq_unbusy(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_assert(&uc->uc_lock, MA_OWNED);
 	KASSERT(uc->uc_busy != 0, ("not busy"));
 	uc->uc_busy = 0;
 	if (uc->uc_waiters)
 		wakeup_one(uc);
 }
 
 static inline void
 umtxq_unbusy_unlocked(struct umtx_key *key)
 {
 
 	umtxq_lock(key);
 	umtxq_unbusy(key);
 	umtxq_unlock(key);
 }
 
 static struct umtxq_queue *
 umtxq_queue_lookup(struct umtx_key *key, int q)
 {
 	struct umtxq_queue *uh;
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
 		if (umtx_key_match(&uh->key, key))
 			return (uh);
 	}
 
 	return (NULL);
 }
 
 static inline void
 umtxq_insert_queue(struct umtx_q *uq, int q)
 {
 	struct umtxq_queue *uh;
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
 	uh = umtxq_queue_lookup(&uq->uq_key, q);
 	if (uh != NULL) {
 		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
 	} else {
 		uh = uq->uq_spare_queue;
 		uh->key = uq->uq_key;
 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
 #ifdef UMTX_PROFILING
 		uc->length++;
 		if (uc->length > uc->max_length) {
 			uc->max_length = uc->length;
 			if (uc->max_length > max_length)
 				max_length = uc->max_length;	
 		}
 #endif
 	}
 	uq->uq_spare_queue = NULL;
 
 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
 	uh->length++;
 	uq->uq_flags |= UQF_UMTXQ;
 	uq->uq_cur_queue = uh;
 	return;
 }
 
 static inline void
 umtxq_remove_queue(struct umtx_q *uq, int q)
 {
 	struct umtxq_chain *uc;
 	struct umtxq_queue *uh;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	if (uq->uq_flags & UQF_UMTXQ) {
 		uh = uq->uq_cur_queue;
 		TAILQ_REMOVE(&uh->head, uq, uq_link);
 		uh->length--;
 		uq->uq_flags &= ~UQF_UMTXQ;
 		if (TAILQ_EMPTY(&uh->head)) {
 			KASSERT(uh->length == 0,
 			    ("inconsistent umtxq_queue length"));
 #ifdef UMTX_PROFILING
 			uc->length--;
 #endif
 			LIST_REMOVE(uh, link);
 		} else {
 			uh = LIST_FIRST(&uc->uc_spare_queue);
 			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
 			LIST_REMOVE(uh, link);
 		}
 		uq->uq_spare_queue = uh;
 		uq->uq_cur_queue = NULL;
 	}
 }
 
 /*
  * Check if there are multiple waiters
  */
 static int
 umtxq_count(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 	struct umtxq_queue *uh;
 
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
 	if (uh != NULL)
 		return (uh->length);
 	return (0);
 }
 
 /*
  * Check if there are multiple PI waiters and returns first
  * waiter.
  */
 static int
 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
 {
 	struct umtxq_chain *uc;
 	struct umtxq_queue *uh;
 
 	*first = NULL;
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
 	if (uh != NULL) {
 		*first = TAILQ_FIRST(&uh->head);
 		return (uh->length);
 	}
 	return (0);
 }
 
 static int
 umtxq_check_susp(struct thread *td)
 {
 	struct proc *p;
 	int error;
 
 	/*
 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
 	 * eventually break the lockstep loop.
 	 */
 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
 		return (0);
 	error = 0;
 	p = td->td_proc;
 	PROC_LOCK(p);
 	if (P_SHOULDSTOP(p) ||
 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
 		if (p->p_flag & P_SINGLE_EXIT)
 			error = EINTR;
 		else
 			error = ERESTART;
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Wake up threads waiting on an userland object.
  */
 
 static int
 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
 {
 	struct umtxq_chain *uc;
 	struct umtxq_queue *uh;
 	struct umtx_q *uq;
 	int ret;
 
 	ret = 0;
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	uh = umtxq_queue_lookup(key, q);
 	if (uh != NULL) {
 		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
 			umtxq_remove_queue(uq, q);
 			wakeup(uq);
 			if (++ret >= n_wake)
 				return (ret);
 		}
 	}
 	return (ret);
 }
 
 
 /*
  * Wake up specified thread.
  */
 static inline void
 umtxq_signal_thread(struct umtx_q *uq)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	umtxq_remove(uq);
 	wakeup(uq);
 }
 
 static inline int 
 tstohz(const struct timespec *tsp)
 {
 	struct timeval tv;
 
 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
 	return tvtohz(&tv);
 }
 
 static void
 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
 	const struct timespec *timeout)
 {
 
 	timo->clockid = clockid;
 	if (!absolute) {
 		kern_clock_gettime(curthread, clockid, &timo->end);
 		timo->cur = timo->end;
 		timespecadd(&timo->end, timeout);
 	} else {
 		timo->end = *timeout;
 		kern_clock_gettime(curthread, clockid, &timo->cur);
 	}
 }
 
 static void
 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
 {
 
 	abs_timeout_init(timo, umtxtime->_clockid,
-		(umtxtime->_flags & UMTX_ABSTIME) != 0,
-		&umtxtime->_timeout);
+	    (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout);
 }
 
 static inline void
 abs_timeout_update(struct abs_timeout *timo)
 {
+
 	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
 }
 
 static int
 abs_timeout_gethz(struct abs_timeout *timo)
 {
 	struct timespec tts;
 
 	if (timespeccmp(&timo->end, &timo->cur, <=))
 		return (-1); 
 	tts = timo->end;
 	timespecsub(&tts, &timo->cur);
 	return (tstohz(&tts));
 }
 
+static uint32_t
+umtx_unlock_val(uint32_t flags, bool rb)
+{
+
+	/* rb takes precedence over flags and yields UMUTEX_RB_OWNERDEAD. */
+	if (rb)
+		return (UMUTEX_RB_OWNERDEAD);
+	/* Otherwise UMUTEX_NONCONSISTENT selects UMUTEX_RB_NOTRECOV. */
+	if ((flags & UMUTEX_NONCONSISTENT) != 0)
+		return (UMUTEX_RB_NOTRECOV);
+	return (UMUTEX_UNOWNED);
+}
+
 /*
  * Put thread into sleep state, before sleeping, check if
  * thread was removed from umtx queue.
  */
 static inline int
 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
 {
 	struct umtxq_chain *uc;
 	int error, timo;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	for (;;) {
 		if (!(uq->uq_flags & UQF_UMTXQ))
 			return (0);
 		if (abstime != NULL) {
 			timo = abs_timeout_gethz(abstime);
 			if (timo < 0)
 				return (ETIMEDOUT);
 		} else
 			timo = 0;
 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
 		if (error != EWOULDBLOCK) {
 			umtxq_lock(&uq->uq_key);
 			break;
 		}
 		if (abstime != NULL)
 			abs_timeout_update(abstime);
 		umtxq_lock(&uq->uq_key);
 	}
 	return (error);
 }
 
 /*
  * Convert userspace address into unique logical address.
  */
 int
 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
 {
 	struct thread *td = curthread;
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 
 	key->type = type;
 	if (share == THREAD_SHARE) {
 		key->shared = 0;
 		key->info.private.vs = td->td_proc->p_vmspace;
 		key->info.private.addr = (uintptr_t)addr;
 	} else {
 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
 		map = &td->td_proc->p_vmspace->vm_map;
 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
 		    &entry, &key->info.shared.object, &pindex, &prot,
 		    &wired) != KERN_SUCCESS) {
 			return (EFAULT);
 		}
 
 		if ((share == PROCESS_SHARE) ||
 		    (share == AUTO_SHARE &&
 		     VM_INHERIT_SHARE == entry->inheritance)) {
 			key->shared = 1;
 			key->info.shared.offset = (vm_offset_t)addr -
 			    entry->start + entry->offset;
 			vm_object_reference(key->info.shared.object);
 		} else {
 			key->shared = 0;
 			key->info.private.vs = td->td_proc->p_vmspace;
 			key->info.private.addr = (uintptr_t)addr;
 		}
 		vm_map_lookup_done(map, entry);
 	}
 
 	umtxq_hash(key);
 	return (0);
 }
 
 /*
  * Release key.
  */
 void
 umtx_key_release(struct umtx_key *key)
 {
 	if (key->shared)
 		vm_object_deallocate(key->info.shared.object);
 }
 
 /*
  * Fetch and compare value, sleep on the address if value is not changed.
  */
 static int
 do_wait(struct thread *td, void *addr, u_long id,
-	struct _umtx_time *timeout, int compat32, int is_private)
+    struct _umtx_time *timeout, int compat32, int is_private)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	u_long tmp;
 	uint32_t tmp32;
 	int error = 0;
 
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	umtxq_lock(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	if (compat32 == 0) {
 		error = fueword(addr, &tmp);
 		if (error != 0)
 			error = EFAULT;
 	} else {
 		error = fueword32(addr, &tmp32);
 		if (error == 0)
 			tmp = tmp32;
 		else
 			error = EFAULT;
 	}
 	umtxq_lock(&uq->uq_key);
 	if (error == 0) {
 		if (tmp == id)
 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
 			    NULL : &timo);
 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
 			error = 0;
 		else
 			umtxq_remove(uq);
 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
 		umtxq_remove(uq);
 	}
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 /*
  * Wake up threads sleeping on the specified address.
  */
 int
 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
 {
 	struct umtx_key key;
 	int ret;
 	
 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
-		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
+	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
 		return (ret);
 	umtxq_lock(&key);
 	umtxq_signal(&key, n_wake);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (0);
 }
 
 /*
  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
  */
 static int
 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
-	struct _umtx_time *timeout, int mode)
+    struct _umtx_time *timeout, int mode)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t owner, old, id;
 	int error, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	error = 0;
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/*
 	 * Care must be exercised when dealing with umtx structure. It
 	 * can fault on any access.
 	 */
 	for (;;) {
 		rv = fueword32(&m->m_owner, &owner);
 		if (rv == -1)
 			return (EFAULT);
 		if (mode == _UMUTEX_WAIT) {
-			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
+			if (owner == UMUTEX_UNOWNED ||
+			    owner == UMUTEX_CONTESTED ||
+			    owner == UMUTEX_RB_OWNERDEAD ||
+			    owner == UMUTEX_RB_NOTRECOV)
 				return (0);
 		} else {
 			/*
-			 * Try the uncontested case.  This should be done in userland.
+			 * The owner of the robust mutex terminated.  The
+			 * kernel's duty is to return EOWNERDEAD to userspace.
+			 * The UMUTEX_NONCONSISTENT flag in umutex.m_flags
+			 * is set by the common userspace code.
 			 */
+			if (owner == UMUTEX_RB_OWNERDEAD) {
+				rv = casueword32(&m->m_owner,
+				    UMUTEX_RB_OWNERDEAD, &owner,
+				    id | UMUTEX_CONTESTED);
+				if (rv == -1)
+					return (EFAULT);
+				if (owner == UMUTEX_RB_OWNERDEAD)
+					return (EOWNERDEAD); /* success */
+				rv = umtxq_check_susp(td);
+				if (rv != 0)
+					return (rv);
+				continue;
+			}
+			if (owner == UMUTEX_RB_NOTRECOV)
+				return (ENOTRECOVERABLE);
+
+
+			/*
+			 * Try the uncontested case.  This should be
+			 * done in userland.
+			 */
 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
 			    &owner, id);
 			/* The address was invalid. */
 			if (rv == -1)
 				return (EFAULT);
 
 			/* The acquire succeeded. */
 			if (owner == UMUTEX_UNOWNED)
 				return (0);
 
-			/* If no one owns it but it is contested try to acquire it. */
+			/*
+			 * If no one owns it but it is contested try
+			 * to acquire it.
+			 */
 			if (owner == UMUTEX_CONTESTED) {
 				rv = casueword32(&m->m_owner,
 				    UMUTEX_CONTESTED, &owner,
 				    id | UMUTEX_CONTESTED);
 				/* The address was invalid. */
 				if (rv == -1)
 					return (EFAULT);
 
 				if (owner == UMUTEX_CONTESTED)
 					return (0);
 
 				rv = umtxq_check_susp(td);
 				if (rv != 0)
 					return (rv);
 
-				/* If this failed the lock has changed, restart. */
+				/*
+				 * If this failed the lock has
+				 * changed, restart.
+				 */
 				continue;
 			}
 		}
 
 		if (mode == _UMUTEX_TRY)
 			return (EBUSY);
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			return (error);
 
 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
 		    GET_SHARE(flags), &uq->uq_key)) != 0)
 			return (error);
 
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
 		rv = casueword32(&m->m_owner, owner, &old,
 		    owner | UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
 		if (rv == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		if (old == owner)
 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
 			    NULL : &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 
 		if (error == 0)
 			error = umtxq_check_susp(td);
 	}
 
 	return (0);
 }
 
 /*
  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
  */
 static int
-do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
+do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
 {
 	struct umtx_key key;
-	uint32_t owner, old, id;
-	int error;
-	int count;
+	uint32_t owner, old, id, newlock;
+	int error, count;
 
 	id = td->td_tid;
 	/*
 	 * Make sure we own this mtx.
 	 */
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
+	newlock = umtx_unlock_val(flags, rb);
 	if ((owner & UMUTEX_CONTESTED) == 0) {
-		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
+		error = casueword32(&m->m_owner, owner, &old, newlock);
 		if (error == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
-	error = casueword32(&m->m_owner, owner, &old,
-	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+	if (count > 1)
+		newlock |= UMUTEX_CONTESTED;
+	error = casueword32(&m->m_owner, owner, &old, newlock);
 	umtxq_lock(&key);
-	umtxq_signal(&key,1);
+	umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	if (error == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Check if the mutex is available and wake up a waiter,
  * only for simple mutex.
  */
 static int
 do_wake_umutex(struct thread *td, struct umutex *m)
 {
 	struct umtx_key key;
 	uint32_t owner;
 	uint32_t flags;
 	int error;
 	int count;
 
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
-	if ((owner & ~UMUTEX_CONTESTED) != 0)
+	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
+	    owner != UMUTEX_RB_NOTRECOV)
 		return (0);
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
-	if (count <= 1) {
+	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
+	    owner != UMUTEX_RB_NOTRECOV) {
 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
 		    UMUTEX_UNOWNED);
 		if (error == -1)
 			error = EFAULT;
 	}
 
 	umtxq_lock(&key);
-	if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
+	if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
+	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 /*
  * Check if the mutex has waiters and tries to fix contention bit.
  */
 static int
 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
 {
 	struct umtx_key key;
 	uint32_t owner, old;
 	int type;
 	int error;
 	int count;
 
-	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
+	    UMUTEX_ROBUST)) {
 	case 0:
+	case UMUTEX_ROBUST:
 		type = TYPE_NORMAL_UMUTEX;
 		break;
 	case UMUTEX_PRIO_INHERIT:
 		type = TYPE_PI_UMUTEX;
 		break;
+	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
+		type = TYPE_PI_ROBUST_UMUTEX;
+		break;
 	case UMUTEX_PRIO_PROTECT:
 		type = TYPE_PP_UMUTEX;
 		break;
+	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
+		type = TYPE_PP_ROBUST_UMUTEX;
+		break;
 	default:
 		return (EINVAL);
 	}
-	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
-	    &key)) != 0)
+	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
 		return (error);
 
 	owner = 0;
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 	/*
 	 * Only repair contention bit if there is a waiter, this means the mutex
 	 * is still being referenced by userland code, otherwise don't update
 	 * any memory.
 	 */
 	if (count > 1) {
 		error = fueword32(&m->m_owner, &owner);
 		if (error == -1)
 			error = EFAULT;
 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
 			error = casueword32(&m->m_owner, owner, &old,
 			    owner | UMUTEX_CONTESTED);
 			if (error == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (old == owner)
 				break;
 			owner = old;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 	} else if (count == 1) {
 		error = fueword32(&m->m_owner, &owner);
 		if (error == -1)
 			error = EFAULT;
 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
-		       (owner & UMUTEX_CONTESTED) == 0) {
+		    (owner & UMUTEX_CONTESTED) == 0) {
 			error = casueword32(&m->m_owner, owner, &old,
 			    owner | UMUTEX_CONTESTED);
 			if (error == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (old == owner)
 				break;
 			owner = old;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 	}
 	umtxq_lock(&key);
 	if (error == EFAULT) {
 		umtxq_signal(&key, INT_MAX);
-	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
+	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
+	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 static inline struct umtx_pi *
 umtx_pi_alloc(int flags)
 {
 	struct umtx_pi *pi;
 
 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
 	TAILQ_INIT(&pi->pi_blocked);
 	atomic_add_int(&umtx_pi_allocated, 1);
 	return (pi);
 }
 
 static inline void
 umtx_pi_free(struct umtx_pi *pi)
 {
 	uma_zfree(umtx_pi_zone, pi);
 	atomic_add_int(&umtx_pi_allocated, -1);
 }
 
 /*
  * Adjust the thread's position on a pi_state after its priority has been
  * changed.
  */
 static int
 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
 {
 	struct umtx_q *uq, *uq1, *uq2;
 	struct thread *td1;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	if (pi == NULL)
 		return (0);
 
 	uq = td->td_umtxq;
 
 	/*
 	 * Check if the thread needs to be moved on the blocked chain.
 	 * It needs to be moved if either its priority is lower than
 	 * the previous thread or higher than the next thread.
 	 */
 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
 	uq2 = TAILQ_NEXT(uq, uq_lockq);
 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
 		/*
 		 * Remove thread from blocked chain and determine where
 		 * it should be moved to.
 		 */
 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
 			td1 = uq1->uq_thread;
 			MPASS(td1->td_proc->p_magic == P_MAGIC);
 			if (UPRI(td1) > UPRI(td))
 				break;
 		}
 
 		if (uq1 == NULL)
 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
 		else
 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
 	}
 	return (1);
 }
 
 static struct umtx_pi *
 umtx_pi_next(struct umtx_pi *pi)
 {
 	struct umtx_q *uq_owner;
 
 	if (pi->pi_owner == NULL)
 		return (NULL);
 	uq_owner = pi->pi_owner->td_umtxq;
 	if (uq_owner == NULL)
 		return (NULL);
 	return (uq_owner->uq_pi_blocked);
 }
 
 /*
  * Floyd's Cycle-Finding Algorithm.
  */
 static bool
 umtx_pi_check_loop(struct umtx_pi *pi)
 {
 	struct umtx_pi *pi1;	/* fast iterator */
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	if (pi == NULL)
 		return (false);
 	pi1 = pi;
 	for (;;) {
 		pi = umtx_pi_next(pi);
 		if (pi == NULL)
 			break;
 		pi1 = umtx_pi_next(pi1);
 		if (pi1 == NULL)
 			break;
 		pi1 = umtx_pi_next(pi1);
 		if (pi1 == NULL)
 			break;
 		if (pi == pi1)
 			return (true);
 	}
 	return (false);
 }
 
 /*
  * Propagate priority when a thread is blocked on POSIX
  * PI mutex.
  */ 
 static void
 umtx_propagate_priority(struct thread *td)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi;
 	int pri;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	pri = UPRI(td);
 	uq = td->td_umtxq;
 	pi = uq->uq_pi_blocked;
 	if (pi == NULL)
 		return;
 	if (umtx_pi_check_loop(pi))
 		return;
 
 	for (;;) {
 		td = pi->pi_owner;
 		if (td == NULL || td == curthread)
 			return;
 
 		MPASS(td->td_proc != NULL);
 		MPASS(td->td_proc->p_magic == P_MAGIC);
 
 		thread_lock(td);
 		if (td->td_lend_user_pri > pri)
 			sched_lend_user_prio(td, pri);
 		else {
 			thread_unlock(td);
 			break;
 		}
 		thread_unlock(td);
 
 		/*
 		 * Pick up the lock that td is blocked on.
 		 */
 		uq = td->td_umtxq;
 		pi = uq->uq_pi_blocked;
 		if (pi == NULL)
 			break;
 		/* Resort td on the list if needed. */
 		umtx_pi_adjust_thread(pi, td);
 	}
 }
 
 /*
  * Unpropagate priority for a PI mutex when a thread blocked on
  * it is interrupted by signal or resumed by others.
  */
 static void
 umtx_repropagate_priority(struct umtx_pi *pi)
 {
 	struct umtx_q *uq, *uq_owner;
 	struct umtx_pi *pi2;
 	int pri;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 
 	if (umtx_pi_check_loop(pi))
 		return;
 	while (pi != NULL && pi->pi_owner != NULL) {
 		pri = PRI_MAX;
 		uq_owner = pi->pi_owner->td_umtxq;
 
 		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
 			uq = TAILQ_FIRST(&pi2->pi_blocked);
 			if (uq != NULL) {
 				if (pri > UPRI(uq->uq_thread))
 					pri = UPRI(uq->uq_thread);
 			}
 		}
 
 		if (pri > uq_owner->uq_inherited_pri)
 			pri = uq_owner->uq_inherited_pri;
 		thread_lock(pi->pi_owner);
 		sched_lend_user_prio(pi->pi_owner, pri);
 		thread_unlock(pi->pi_owner);
 		if ((pi = uq_owner->uq_pi_blocked) != NULL)
 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
 	}
 }
 
 /*
  * Insert a PI mutex into owned list.
  */
 static void
 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
 {
 	struct umtx_q *uq_owner;
 
 	uq_owner = owner->td_umtxq;
 	mtx_assert(&umtx_lock, MA_OWNED);
 	if (pi->pi_owner != NULL)
 		panic("pi_owner != NULL");
 	pi->pi_owner = owner;
 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
 }
 
 
 /*
  * Disown a PI mutex, and remove it from the owned list.
  */
 static void
 umtx_pi_disown(struct umtx_pi *pi)
 {
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
 	pi->pi_owner = NULL;
 }
 
 /*
  * Claim ownership of a PI mutex.
  */
 static int
 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
 {
 	struct umtx_q *uq;
+	int pri;
 
 	mtx_lock(&umtx_lock);
 	if (pi->pi_owner == owner) {
 		mtx_unlock(&umtx_lock);
 		return (0);
 	}
 
 	if (pi->pi_owner != NULL) {
 		/*
 		 * userland may have already messed the mutex, sigh.
 		 */
 		mtx_unlock(&umtx_lock);
 		return (EPERM);
 	}
 	umtx_pi_setowner(pi, owner);
 	uq = TAILQ_FIRST(&pi->pi_blocked);
 	if (uq != NULL) {
-		int pri;
-
 		pri = UPRI(uq->uq_thread);
 		thread_lock(owner);
 		if (pri < UPRI(owner))
 			sched_lend_user_prio(owner, pri);
 		thread_unlock(owner);
 	}
 	mtx_unlock(&umtx_lock);
 	return (0);
 }
 
 /*
  * Adjust a thread's order position in its blocked PI mutex,
  * this may result new priority propagating process.
  */
 void
 umtx_pi_adjust(struct thread *td, u_char oldpri)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi;
 
 	uq = td->td_umtxq;
 	mtx_lock(&umtx_lock);
 	/*
 	 * Pick up the lock that td is blocked on.
 	 */
 	pi = uq->uq_pi_blocked;
 	if (pi != NULL) {
 		umtx_pi_adjust_thread(pi, td);
 		umtx_repropagate_priority(pi);
 	}
 	mtx_unlock(&umtx_lock);
 }
 
 /*
  * Sleep on a PI mutex.
  */
 static int
-umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
-	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
+umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
+    const char *wmesg, struct abs_timeout *timo, bool shared)
 {
 	struct umtxq_chain *uc;
 	struct thread *td, *td1;
 	struct umtx_q *uq1;
-	int pri;
-	int error = 0;
+	int error, pri;
 
+	error = 0;
 	td = uq->uq_thread;
 	KASSERT(td == curthread, ("inconsistent uq_thread"));
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
 	umtxq_insert(uq);
 	mtx_lock(&umtx_lock);
 	if (pi->pi_owner == NULL) {
 		mtx_unlock(&umtx_lock);
-		/* XXX Only look up thread in current process. */
-		td1 = tdfind(owner, curproc->p_pid);
+		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
 		mtx_lock(&umtx_lock);
 		if (td1 != NULL) {
 			if (pi->pi_owner == NULL)
 				umtx_pi_setowner(pi, td1);
 			PROC_UNLOCK(td1->td_proc);
 		}
 	}
 
 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
 		pri = UPRI(uq1->uq_thread);
 		if (pri > UPRI(td))
 			break;
 	}
 
 	if (uq1 != NULL)
 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
 	else
 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
 
 	uq->uq_pi_blocked = pi;
 	thread_lock(td);
 	td->td_flags |= TDF_UPIBLOCKED;
 	thread_unlock(td);
 	umtx_propagate_priority(td);
 	mtx_unlock(&umtx_lock);
 	umtxq_unbusy(&uq->uq_key);
 
 	error = umtxq_sleep(uq, wmesg, timo);
 	umtxq_remove(uq);
 
 	mtx_lock(&umtx_lock);
 	uq->uq_pi_blocked = NULL;
 	thread_lock(td);
 	td->td_flags &= ~TDF_UPIBLOCKED;
 	thread_unlock(td);
 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
 	umtx_repropagate_priority(pi);
 	mtx_unlock(&umtx_lock);
 	umtxq_unlock(&uq->uq_key);
 
 	return (error);
 }
 
 /*
  * Add reference count for a PI mutex.
  */
 static void
 umtx_pi_ref(struct umtx_pi *pi)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	pi->pi_refcount++;
 }
 
 /*
  * Decrease reference count for a PI mutex, if the counter
  * is decreased to zero, its memory space is freed.
  */ 
 static void
 umtx_pi_unref(struct umtx_pi *pi)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
 	if (--pi->pi_refcount == 0) {
 		mtx_lock(&umtx_lock);
 		if (pi->pi_owner != NULL)
 			umtx_pi_disown(pi);
 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
 			("blocked queue not empty"));
 		mtx_unlock(&umtx_lock);
 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
 		umtx_pi_free(pi);
 	}
 }
 
 /*
  * Find a PI mutex in hash table.
  */
 static struct umtx_pi *
 umtx_pi_lookup(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 	struct umtx_pi *pi;
 
 	uc = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(uc);
 
 	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
 		if (umtx_key_match(&pi->pi_key, key)) {
 			return (pi);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Insert a PI mutex into hash table.
  */
 static inline void
 umtx_pi_insert(struct umtx_pi *pi)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
 }
 
 /*
  * Lock a PI mutex.
  */
 static int
 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
     struct _umtx_time *timeout, int try)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	struct umtx_pi *pi, *new_pi;
-	uint32_t id, owner, old;
+	uint32_t id, old_owner, owner, old;
 	int error, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 
-	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
+	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	umtxq_lock(&uq->uq_key);
 	pi = umtx_pi_lookup(&uq->uq_key);
 	if (pi == NULL) {
 		new_pi = umtx_pi_alloc(M_NOWAIT);
 		if (new_pi == NULL) {
 			umtxq_unlock(&uq->uq_key);
 			new_pi = umtx_pi_alloc(M_WAITOK);
 			umtxq_lock(&uq->uq_key);
 			pi = umtx_pi_lookup(&uq->uq_key);
 			if (pi != NULL) {
 				umtx_pi_free(new_pi);
 				new_pi = NULL;
 			}
 		}
 		if (new_pi != NULL) {
 			new_pi->pi_key = uq->uq_key;
 			umtx_pi_insert(new_pi);
 			pi = new_pi;
 		}
 	}
 	umtx_pi_ref(pi);
 	umtxq_unlock(&uq->uq_key);
 
 	/*
 	 * Care must be exercised when dealing with umtx structure.  It
 	 * can fault on any access.
 	 */
 	for (;;) {
 		/*
 		 * Try the uncontested case.  This should be done in userland.
 		 */
 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
 		/* The address was invalid. */
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		/* The acquire succeeded. */
 		if (owner == UMUTEX_UNOWNED) {
 			error = 0;
 			break;
 		}
 
+		if (owner == UMUTEX_RB_NOTRECOV) {
+			error = ENOTRECOVERABLE;
+			break;
+		}
+
 		/* If no one owns it but it is contested try to acquire it. */
-		if (owner == UMUTEX_CONTESTED) {
-			rv = casueword32(&m->m_owner,
-			    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
+		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
+			old_owner = owner;
+			rv = casueword32(&m->m_owner, owner, &owner,
+			    id | UMUTEX_CONTESTED);
 			/* The address was invalid. */
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 
-			if (owner == UMUTEX_CONTESTED) {
+			if (owner == old_owner) {
 				umtxq_lock(&uq->uq_key);
 				umtxq_busy(&uq->uq_key);
 				error = umtx_pi_claim(pi, td);
 				umtxq_unbusy(&uq->uq_key);
 				umtxq_unlock(&uq->uq_key);
 				if (error != 0) {
 					/*
 					 * Since we're going to return an
 					 * error, restore the m_owner to its
 					 * previous, unowned state to avoid
 					 * compounding the problem.
 					 */
 					(void)casuword32(&m->m_owner,
 					    id | UMUTEX_CONTESTED,
-					    UMUTEX_CONTESTED);
+					    old_owner);
 				}
+				if (error == 0 &&
+				    old_owner == UMUTEX_RB_OWNERDEAD)
+					error = EOWNERDEAD;
 				break;
 			}
 
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
 
 		if ((owner & ~UMUTEX_CONTESTED) == id) {
 			error = EDEADLK;
 			break;
 		}
 
 		if (try != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 			
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
-		rv = casueword32(&m->m_owner, owner, &old,
-		    owner | UMUTEX_CONTESTED);
+		rv = casueword32(&m->m_owner, owner, &old, owner |
+		    UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 
 		umtxq_lock(&uq->uq_key);
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
-		 * unlocking the umtx.
+		 * unlocking the umtx.  Note that the UMUTEX_RB_OWNERDEAD
+		 * value for owner is impossible here.
 		 */
 		if (old == owner) {
-			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
-			    "umtxpi", timeout == NULL ? NULL : &timo);
+			error = umtxq_sleep_pi(uq, pi,
+			    owner & ~UMUTEX_CONTESTED,
+			    "umtxpi", timeout == NULL ? NULL : &timo,
+			    (flags & USYNC_PROCESS_SHARED) != 0);
 			if (error != 0)
 				continue;
 		} else {
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_unlock(&uq->uq_key);
 		}
 
 		error = umtxq_check_susp(td);
 		if (error != 0)
 			break;
 	}
 
 	umtxq_lock(&uq->uq_key);
 	umtx_pi_unref(pi);
 	umtxq_unlock(&uq->uq_key);
 
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Unlock a PI mutex.
  */
 static int
-do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
+do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
 {
 	struct umtx_key key;
 	struct umtx_q *uq_first, *uq_first2, *uq_me;
 	struct umtx_pi *pi, *pi2;
-	uint32_t owner, old, id;
-	int error;
-	int count;
-	int pri;
+	uint32_t id, new_owner, old, owner;
+	int count, error, pri;
 
 	id = td->td_tid;
 	/*
 	 * Make sure we own this mtx.
 	 */
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
+	new_owner = umtx_unlock_val(flags, rb);
+
 	/* This should be done in userland */
 	if ((owner & UMUTEX_CONTESTED) == 0) {
-		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
+		error = casueword32(&m->m_owner, owner, &old, new_owner);
 		if (error == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
-	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
+	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count_pi(&key, &uq_first);
 	if (uq_first != NULL) {
 		mtx_lock(&umtx_lock);
 		pi = uq_first->uq_pi_blocked;
 		KASSERT(pi != NULL, ("pi == NULL?"));
-		if (pi->pi_owner != td) {
+		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
 			mtx_unlock(&umtx_lock);
 			umtxq_unbusy(&key);
 			umtxq_unlock(&key);
 			umtx_key_release(&key);
 			/* userland messed the mutex */
 			return (EPERM);
 		}
 		uq_me = td->td_umtxq;
-		umtx_pi_disown(pi);
+		if (pi->pi_owner == td)
+			umtx_pi_disown(pi);
 		/* get highest priority thread which is still sleeping. */
 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
 		while (uq_first != NULL && 
-		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
+		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
 		}
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
 			if (uq_first2 != NULL) {
 				if (pri > UPRI(uq_first2->uq_thread))
 					pri = UPRI(uq_first2->uq_thread);
 			}
 		}
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 		if (uq_first)
 			umtxq_signal_thread(uq_first);
 	} else {
 		pi = umtx_pi_lookup(&key);
 		/*
 		 * A umtx_pi can exist if a signal or timeout removed the
 		 * last waiter from the umtxq, but there is still
 		 * a thread in do_lock_pi() holding the umtx_pi.
 		 */
 		if (pi != NULL) {
 			/*
 			 * The umtx_pi can be unowned, such as when a thread
 			 * has just entered do_lock_pi(), allocated the
 			 * umtx_pi, and unlocked the umtxq.
 			 * If the current thread owns it, it must disown it.
 			 */
 			mtx_lock(&umtx_lock);
 			if (pi->pi_owner == td)
 				umtx_pi_disown(pi);
 			mtx_unlock(&umtx_lock);
 		}
 	}
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
-	error = casueword32(&m->m_owner, owner, &old,
-	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
 
+	if (count > 1)
+		new_owner |= UMUTEX_CONTESTED;
+	error = casueword32(&m->m_owner, owner, &old, new_owner);
+
 	umtxq_unbusy_unlocked(&key);
 	umtx_key_release(&key);
 	if (error == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Lock a PP mutex.
  */
 static int
 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
     struct _umtx_time *timeout, int try)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq, *uq2;
 	struct umtx_pi *pi;
 	uint32_t ceiling;
 	uint32_t owner, id;
 	int error, pri, old_inherited_pri, su, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
-	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
+	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
 	for (;;) {
 		old_inherited_pri = uq->uq_inherited_pri;
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		rv = fueword32(&m->m_ceilings[0], &ceiling);
 		if (rv == -1) {
 			error = EFAULT;
 			goto out;
 		}
 		ceiling = RTP_PRIO_MAX - ceiling;
 		if (ceiling > RTP_PRIO_MAX) {
 			error = EINVAL;
 			goto out;
 		}
 
 		mtx_lock(&umtx_lock);
 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
 			mtx_unlock(&umtx_lock);
 			error = EINVAL;
 			goto out;
 		}
 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
 			thread_lock(td);
 			if (uq->uq_inherited_pri < UPRI(td))
 				sched_lend_user_prio(td, uq->uq_inherited_pri);
 			thread_unlock(td);
 		}
 		mtx_unlock(&umtx_lock);
 
-		rv = casueword32(&m->m_owner,
-		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
+		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
+		    id | UMUTEX_CONTESTED);
 		/* The address was invalid. */
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		if (owner == UMUTEX_CONTESTED) {
 			error = 0;
 			break;
+		} else if (owner == UMUTEX_RB_OWNERDEAD) {
+			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
+			    &owner, id | UMUTEX_CONTESTED);
+			if (rv == -1) {
+				error = EFAULT;
+				break;
+			}
+			if (owner == UMUTEX_RB_OWNERDEAD) {
+				error = EOWNERDEAD; /* success */
+				break;
+			}
+			error = 0;
+		} else if (owner == UMUTEX_RB_NOTRECOV) {
+			error = ENOTRECOVERABLE;
+			break;
 		}
 
 		if (try != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
 		    NULL : &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 
 		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 	}
 
-	if (error != 0) {
+	if (error != 0 && error != EOWNERDEAD) {
 		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 	}
 
 out:
 	umtxq_unbusy_unlocked(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Unlock a PP mutex.
  */
 static int
-do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
+do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
 {
 	struct umtx_key key;
 	struct umtx_q *uq, *uq2;
 	struct umtx_pi *pi;
-	uint32_t owner, id;
-	uint32_t rceiling;
+	uint32_t id, owner, rceiling;
 	int error, pri, new_inherited_pri, su;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
 
 	/*
 	 * Make sure we own this mtx.
 	 */
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
 	if (error != 0)
 		return (error);
 
 	if (rceiling == -1)
 		new_inherited_pri = PRI_MAX;
 	else {
 		rceiling = RTP_PRIO_MAX - rceiling;
 		if (rceiling > RTP_PRIO_MAX)
 			return (EINVAL);
 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
 	}
 
-	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
+	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	umtxq_unlock(&key);
 	/*
 	 * For priority protected mutex, always set unlocked state
 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
 	 * to lock the mutex, it is necessary because thread priority
 	 * has to be adjusted for such mutex.
 	 */
-	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
+	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
+	    UMUTEX_CONTESTED);
 
 	umtxq_lock(&key);
 	if (error == 0)
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 
 	if (error == -1)
 		error = EFAULT;
 	else {
 		mtx_lock(&umtx_lock);
 		if (su != 0)
 			uq->uq_inherited_pri = new_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 	}
 	umtx_key_release(&key);
 	return (error);
 }
 
 static int
 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
-	uint32_t *old_ceiling)
+    uint32_t *old_ceiling)
 {
 	struct umtx_q *uq;
-	uint32_t save_ceiling;
-	uint32_t owner, id;
-	uint32_t flags;
-	int error, rv;
+	uint32_t flags, id, owner, save_ceiling;
+	int error, rv, rv1;
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
 		return (EINVAL);
 	if (ceiling > RTP_PRIO_MAX)
 		return (EINVAL);
 	id = td->td_tid;
 	uq = td->td_umtxq;
-	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
-	   &uq->uq_key)) != 0)
+	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
+	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
+	    &uq->uq_key)) != 0)
 		return (error);
 	for (;;) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
-		rv = casueword32(&m->m_owner,
-		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
+		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
+		    id | UMUTEX_CONTESTED);
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		if (owner == UMUTEX_CONTESTED) {
-			suword32(&m->m_ceilings[0], ceiling);
-			suword32(&m->m_owner, UMUTEX_CONTESTED);
-			error = 0;
+			rv = suword32(&m->m_ceilings[0], ceiling);
+			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
+			error = (rv == 0 && rv1 == 0) ? 0: EFAULT;
 			break;
 		}
 
 		if ((owner & ~UMUTEX_CONTESTED) == id) {
-			suword32(&m->m_ceilings[0], ceiling);
-			error = 0;
+			rv = suword32(&m->m_ceilings[0], ceiling);
+			error = rv == 0 ? 0 : EFAULT;
 			break;
 		}
 
+		if (owner == UMUTEX_RB_OWNERDEAD) {
+			error = EOWNERDEAD;
+			break;
+		} else if (owner == UMUTEX_RB_NOTRECOV) {
+			error = ENOTRECOVERABLE;
+			break;
+		}
+
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		error = umtxq_sleep(uq, "umtxpp", NULL);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 	}
 	umtxq_lock(&uq->uq_key);
 	if (error == 0)
 		umtxq_signal(&uq->uq_key, INT_MAX);
 	umtxq_unbusy(&uq->uq_key);
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
-	if (error == 0 && old_ceiling != NULL)
-		suword32(old_ceiling, save_ceiling);
+	if (error == 0 && old_ceiling != NULL) {
+		rv = suword32(old_ceiling, save_ceiling);
+		error = rv == 0 ? 0 : EFAULT;
+	}
 	return (error);
 }
 
 /*
  * Lock a userland POSIX mutex.
  */
 static int
 do_lock_umutex(struct thread *td, struct umutex *m,
     struct _umtx_time *timeout, int mode)
 {
 	uint32_t flags;
 	int error;
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 
-	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
 	case 0:
 		error = do_lock_normal(td, m, flags, timeout, mode);
 		break;
 	case UMUTEX_PRIO_INHERIT:
 		error = do_lock_pi(td, m, flags, timeout, mode);
 		break;
 	case UMUTEX_PRIO_PROTECT:
 		error = do_lock_pp(td, m, flags, timeout, mode);
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (timeout == NULL) {
 		if (error == EINTR && mode != _UMUTEX_WAIT)
 			error = ERESTART;
 	} else {
 		/* Timed-locking is not restarted. */
 		if (error == ERESTART)
 			error = EINTR;
 	}
 	return (error);
 }
 
 /*
  * Unlock a userland POSIX mutex.
  */
 static int
-do_unlock_umutex(struct thread *td, struct umutex *m)
+do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
 {
 	uint32_t flags;
 	int error;
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 
-	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
 	case 0:
-		return (do_unlock_normal(td, m, flags));
+		return (do_unlock_normal(td, m, flags, rb));
 	case UMUTEX_PRIO_INHERIT:
-		return (do_unlock_pi(td, m, flags));
+		return (do_unlock_pi(td, m, flags, rb));
 	case UMUTEX_PRIO_PROTECT:
-		return (do_unlock_pp(td, m, flags));
+		return (do_unlock_pp(td, m, flags, rb));
 	}
 
 	return (EINVAL);
 }
 
 static int
 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
-	struct timespec *timeout, u_long wflags)
+    struct timespec *timeout, u_long wflags)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, clockid, hasw;
 	int error;
 
 	uq = td->td_umtxq;
 	error = fueword32(&cv->c_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if ((wflags & CVWAIT_CLOCKID) != 0) {
 		error = fueword32(&cv->c_clockid, &clockid);
 		if (error == -1) {
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 		if (clockid < CLOCK_REALTIME ||
 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
 			/* hmm, only HW clock id will work. */
 			umtx_key_release(&uq->uq_key);
 			return (EINVAL);
 		}
 	} else {
 		clockid = CLOCK_REALTIME;
 	}
 
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 
 	/*
 	 * Set c_has_waiters to 1 before releasing user mutex, also
 	 * don't modify cache line when unnecessary.
 	 */
 	error = fueword32(&cv->c_has_waiters, &hasw);
 	if (error == 0 && hasw == 0)
 		suword32(&cv->c_has_waiters, 1);
 
 	umtxq_unbusy_unlocked(&uq->uq_key);
 
-	error = do_unlock_umutex(td, m);
+	error = do_unlock_umutex(td, m, false);
 
 	if (timeout != NULL)
-		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
-			timeout);
+		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
+		    timeout);
 	
 	umtxq_lock(&uq->uq_key);
 	if (error == 0) {
 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
 		    NULL : &timo);
 	}
 
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
 	else {
 		/*
 		 * This must be timeout,interrupted by signal or
 		 * surprious wakeup, clear c_has_waiter flag when
 		 * necessary.
 		 */
 		umtxq_busy(&uq->uq_key);
 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
 			int oldlen = uq->uq_cur_queue->length;
 			umtxq_remove(uq);
 			if (oldlen == 1) {
 				umtxq_unlock(&uq->uq_key);
 				suword32(&cv->c_has_waiters, 0);
 				umtxq_lock(&uq->uq_key);
 			}
 		}
 		umtxq_unbusy(&uq->uq_key);
 		if (error == ERESTART)
 			error = EINTR;
 	}
 
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland condition variable.
  */
 static int
 do_cv_signal(struct thread *td, struct ucond *cv)
 {
 	struct umtx_key key;
 	int error, cnt, nwake;
 	uint32_t flags;
 
 	error = fueword32(&cv->c_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	nwake = umtxq_signal(&key, 1);
 	if (cnt <= nwake) {
 		umtxq_unlock(&key);
 		error = suword32(&cv->c_has_waiters, 0);
 		if (error == -1)
 			error = EFAULT;
 		umtxq_lock(&key);
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 static int
 do_cv_broadcast(struct thread *td, struct ucond *cv)
 {
 	struct umtx_key key;
 	int error;
 	uint32_t flags;
 
 	error = fueword32(&cv->c_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	umtxq_signal(&key, INT_MAX);
 	umtxq_unlock(&key);
 
 	error = suword32(&cv->c_has_waiters, 0);
 	if (error == -1)
 		error = EFAULT;
 
 	umtxq_unbusy_unlocked(&key);
 
 	umtx_key_release(&key);
 	return (error);
 }
 
 static int
 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, wrflags;
 	int32_t state, oldstate;
 	int32_t blocked_readers;
 	int error, rv;
 
 	uq = td->td_umtxq;
 	error = fueword32(&rwlock->rw_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	wrflags = URWLOCK_WRITE_OWNER;
 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
 		wrflags |= URWLOCK_WRITE_WAITERS;
 
 	for (;;) {
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1) {
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 
 		/* try to lock it */
 		while (!(state & wrflags)) {
 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
 				umtx_key_release(&uq->uq_key);
 				return (EAGAIN);
 			}
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state + 1);
 			if (rv == -1) {
 				umtx_key_release(&uq->uq_key);
 				return (EFAULT);
 			}
 			if (oldstate == state) {
 				umtx_key_release(&uq->uq_key);
 				return (0);
 			}
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 			state = oldstate;
 		}
 
 		if (error)
 			break;
 
 		/* grab monitor lock */
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * re-read the state, in case it changed between the try-lock above
 		 * and the check below
 		 */
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1)
 			error = EFAULT;
 
 		/* set read contention bit */
 		while (error == 0 && (state & wrflags) &&
 		    !(state & URWLOCK_READ_WAITERS)) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state | URWLOCK_READ_WAITERS);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (oldstate == state)
 				goto sleep;
 			state = oldstate;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 		if (error != 0) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			break;
 		}
 
 		/* state is changed while setting flags, restart */
 		if (!(state & wrflags)) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 			continue;
 		}
 
 sleep:
 		/* contention bit is set, before sleeping, increase read waiter count */
 		rv = fueword32(&rwlock->rw_blocked_readers,
 		    &blocked_readers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
 
 		while (state & wrflags) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_insert(uq);
 			umtxq_unbusy(&uq->uq_key);
 
 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
 			    NULL : &timo);
 
 			umtxq_busy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			if (error)
 				break;
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 		}
 
 		/* decrease read waiter count, and may clear read contention bit */
 		rv = fueword32(&rwlock->rw_blocked_readers,
 		    &blocked_readers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
 		if (blocked_readers == 1) {
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1)
 				error = EFAULT;
 			while (error == 0) {
 				rv = casueword32(&rwlock->rw_state, state,
 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
 				if (rv == -1) {
 					error = EFAULT;
 					break;
 				}
 				if (oldstate == state)
 					break;
 				state = oldstate;
 				error = umtxq_check_susp(td);
 			}
 		}
 
 		umtxq_unbusy_unlocked(&uq->uq_key);
 		if (error != 0)
 			break;
 	}
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 static int
 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags;
 	int32_t state, oldstate;
 	int32_t blocked_writers;
 	int32_t blocked_readers;
 	int error, rv;
 
 	uq = td->td_umtxq;
 	error = fueword32(&rwlock->rw_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	blocked_readers = 0;
 	for (;;) {
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1) {
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state | URWLOCK_WRITE_OWNER);
 			if (rv == -1) {
 				umtx_key_release(&uq->uq_key);
 				return (EFAULT);
 			}
 			if (oldstate == state) {
 				umtx_key_release(&uq->uq_key);
 				return (0);
 			}
 			state = oldstate;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 
 		if (error) {
 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
 			    blocked_readers != 0) {
 				umtxq_lock(&uq->uq_key);
 				umtxq_busy(&uq->uq_key);
 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
 				umtxq_unbusy(&uq->uq_key);
 				umtxq_unlock(&uq->uq_key);
 			}
 
 			break;
 		}
 
 		/* grab monitor lock */
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * re-read the state, in case it changed between the try-lock above
 		 * and the check below
 		 */
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1)
 			error = EFAULT;
 
 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
 		    URWLOCK_READER_COUNT(state) != 0) &&
 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (oldstate == state)
 				goto sleep;
 			state = oldstate;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 		if (error != 0) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			break;
 		}
 
 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 			continue;
 		}
 sleep:
 		rv = fueword32(&rwlock->rw_blocked_writers,
 		    &blocked_writers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
 
 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
 			umtxq_unbusy(&uq->uq_key);
 
 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
 			    NULL : &timo);
 
 			umtxq_busy(&uq->uq_key);
 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
 			umtxq_unlock(&uq->uq_key);
 			if (error)
 				break;
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 		}
 
 		rv = fueword32(&rwlock->rw_blocked_writers,
 		    &blocked_writers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
 		if (blocked_writers == 1) {
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				umtxq_unbusy_unlocked(&uq->uq_key);
 				error = EFAULT;
 				break;
 			}
 			for (;;) {
 				rv = casueword32(&rwlock->rw_state, state,
 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
 				if (rv == -1) {
 					error = EFAULT;
 					break;
 				}
 				if (oldstate == state)
 					break;
 				state = oldstate;
 				error = umtxq_check_susp(td);
 				/*
 				 * We are leaving the URWLOCK_WRITE_WAITERS
 				 * behind, but this should not harm the
 				 * correctness.
 				 */
 				if (error != 0)
 					break;
 			}
 			rv = fueword32(&rwlock->rw_blocked_readers,
 			    &blocked_readers);
 			if (rv == -1) {
 				umtxq_unbusy_unlocked(&uq->uq_key);
 				error = EFAULT;
 				break;
 			}
 		} else
 			blocked_readers = 0;
 
 		umtxq_unbusy_unlocked(&uq->uq_key);
 	}
 
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 static int
 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
 {
 	struct umtx_q *uq;
 	uint32_t flags;
 	int32_t state, oldstate;
 	int error, rv, q, count;
 
 	uq = td->td_umtxq;
 	error = fueword32(&rwlock->rw_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	error = fueword32(&rwlock->rw_state, &state);
 	if (error == -1) {
 		error = EFAULT;
 		goto out;
 	}
 	if (state & URWLOCK_WRITE_OWNER) {
 		for (;;) {
 			rv = casueword32(&rwlock->rw_state, state, 
 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
 			if (rv == -1) {
 				error = EFAULT;
 				goto out;
 			}
 			if (oldstate != state) {
 				state = oldstate;
 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
 					error = EPERM;
 					goto out;
 				}
 				error = umtxq_check_susp(td);
 				if (error != 0)
 					goto out;
 			} else
 				break;
 		}
 	} else if (URWLOCK_READER_COUNT(state) != 0) {
 		for (;;) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state - 1);
 			if (rv == -1) {
 				error = EFAULT;
 				goto out;
 			}
 			if (oldstate != state) {
 				state = oldstate;
 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
 					error = EPERM;
 					goto out;
 				}
 				error = umtxq_check_susp(td);
 				if (error != 0)
 					goto out;
 			} else
 				break;
 		}
 	} else {
 		error = EPERM;
 		goto out;
 	}
 
 	count = 0;
 
 	if (!(flags & URWLOCK_PREFER_READER)) {
 		if (state & URWLOCK_WRITE_WAITERS) {
 			count = 1;
 			q = UMTX_EXCLUSIVE_QUEUE;
 		} else if (state & URWLOCK_READ_WAITERS) {
 			count = INT_MAX;
 			q = UMTX_SHARED_QUEUE;
 		}
 	} else {
 		if (state & URWLOCK_READ_WAITERS) {
 			count = INT_MAX;
 			q = UMTX_SHARED_QUEUE;
 		} else if (state & URWLOCK_WRITE_WAITERS) {
 			count = 1;
 			q = UMTX_EXCLUSIVE_QUEUE;
 		}
 	}
 
 	if (count) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_signal_queue(&uq->uq_key, count, q);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 	}
 out:
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 static int
 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, count, count1;
 	int error, rv;
 
 	uq = td->td_umtxq;
 	error = fueword32(&sem->_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
 	if (rv == 0)
 		rv = fueword32(&sem->_count, &count);
 	if (rv == -1 || count != 0) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 		return (rv == -1 ? EFAULT : 0);
 	}
 	umtxq_lock(&uq->uq_key);
 	umtxq_unbusy(&uq->uq_key);
 
 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
 
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
 	else {
 		umtxq_remove(uq);
 		/* A relative timeout cannot be restarted. */
 		if (error == ERESTART && timeout != NULL &&
 		    (timeout->_flags & UMTX_ABSTIME) == 0)
 			error = EINTR;
 	}
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland semaphore.
  */
 static int
 do_sem_wake(struct thread *td, struct _usem *sem)
 {
 	struct umtx_key key;
 	int error, cnt;
 	uint32_t flags;
 
 	error = fueword32(&sem->_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	if (cnt > 0) {
 		umtxq_signal(&key, 1);
 		/*
 		 * Check if count is greater than 0, this means the memory is
 		 * still being referenced by user code, so we can safely
 		 * update _has_waiters flag.
 		 */
 		if (cnt == 1) {
 			umtxq_unlock(&key);
 			error = suword32(&sem->_has_waiters, 0);
 			umtxq_lock(&key);
 			if (error == -1)
 				error = EFAULT;
 		}
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 #endif
 
 static int
 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t count, flags;
 	int error, rv;
 
 	uq = td->td_umtxq;
 	flags = fuword32(&sem->_flags);
 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	rv = fueword32(&sem->_count, &count);
 	if (rv == -1) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 		return (EFAULT);
 	}
 	for (;;) {
 		if (USEM_COUNT(count) != 0) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (0);
 		}
 		if (count == USEM_HAS_WAITERS)
 			break;
 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
 		if (rv == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 		if (count == 0)
 			break;
 	}
 	umtxq_lock(&uq->uq_key);
 	umtxq_unbusy(&uq->uq_key);
 
 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
 
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
 	else {
 		umtxq_remove(uq);
 		/* A relative timeout cannot be restarted. */
 		if (error == ERESTART && timeout != NULL &&
 		    (timeout->_flags & UMTX_ABSTIME) == 0)
 			error = EINTR;
 	}
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland semaphore.
  */
 static int
 do_sem2_wake(struct thread *td, struct _usem2 *sem)
 {
 	struct umtx_key key;
 	int error, cnt, rv;
 	uint32_t count, flags;
 
 	rv = fueword32(&sem->_flags, &flags);
 	if (rv == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	if (cnt > 0) {
 		umtxq_signal(&key, 1);
 
 		/*
 		 * If this was the last sleeping thread, clear the waiters
 		 * flag in _count.
 		 */
 		if (cnt == 1) {
 			umtxq_unlock(&key);
 			rv = fueword32(&sem->_count, &count);
 			while (rv != -1 && count & USEM_HAS_WAITERS)
 				rv = casueword32(&sem->_count, count, &count,
 				    count & ~USEM_HAS_WAITERS);
 			if (rv == -1)
 				error = EFAULT;
 			umtxq_lock(&key);
 		}
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 inline int
 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
 {
 	int error;
 
 	error = copyin(addr, tsp, sizeof(struct timespec));
 	if (error == 0) {
 		if (tsp->tv_sec < 0 ||
 		    tsp->tv_nsec >= 1000000000 ||
 		    tsp->tv_nsec < 0)
 			error = EINVAL;
 	}
 	return (error);
 }
 
 static inline int
 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
 {
 	int error;
 	
 	if (size <= sizeof(struct timespec)) {
 		tp->_clockid = CLOCK_REALTIME;
 		tp->_flags = 0;
 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
 	} else 
 		error = copyin(addr, tp, sizeof(struct _umtx_time));
 	if (error != 0)
 		return (error);
 	if (tp->_timeout.tv_sec < 0 ||
 	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
 		return (EINVAL);
 	return (0);
 }
 
 static int
 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (EOPNOTSUPP);
 }
 
 static int
 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
+	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
 }
 
 static int
 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
+	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
 }
 
 static int
 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
+	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
 }
 
 static int
 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
 {
+
 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
 }
 
 #define BATCH_SIZE	128
 static int
 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
 {
-	int count = uap->val;
-	void *uaddrs[BATCH_SIZE];
-	char **upp = (char **)uap->obj;
-	int tocopy;
-	int error = 0;
-	int i, pos = 0;
+	char *uaddrs[BATCH_SIZE], **upp;
+	int count, error, i, pos, tocopy;
 
-	while (count > 0) {
-		tocopy = count;
-		if (tocopy > BATCH_SIZE)
-			tocopy = BATCH_SIZE;
-		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
+	upp = (char **)uap->obj;
+	error = 0;
+	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
+	    pos += tocopy) {
+		tocopy = MIN(count, BATCH_SIZE);
+		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
 		if (error != 0)
 			break;
 		for (i = 0; i < tocopy; ++i)
 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
-		count -= tocopy;
-		pos += tocopy;
+		maybe_yield();
 	}
 	return (error);
 }
 
 static int
 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
 {
+
 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
 }
 
 static int
 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
-	return do_lock_umutex(td, uap->obj, tm_p, 0);
+	return (do_lock_umutex(td, uap->obj, tm_p, 0));
 }
 
 static int
 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
-	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
+
+	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
 }
 
 static int
 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
-	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
+	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
 }
 
 static int
 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
-	return do_wake_umutex(td, uap->obj);
+
+	return (do_wake_umutex(td, uap->obj));
 }
 
 static int
 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
-	return do_unlock_umutex(td, uap->obj);
+
+	return (do_unlock_umutex(td, uap->obj, false));
 }
 
 static int
 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
 {
-	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
+
+	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
 }
 
 static int
 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		ts = &timeout;
 	}
 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
 }
 
 static int
 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
 {
-	return do_cv_signal(td, uap->obj);
+
+	return (do_cv_signal(td, uap->obj));
 }
 
 static int
 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
 {
-	return do_cv_broadcast(td, uap->obj);
+
+	return (do_cv_broadcast(td, uap->obj));
 }
 
 static int
 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
 	} else {
 		error = umtx_copyin_umtx_time(uap->uaddr2,
 		   (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_wrlock(td, uap->obj, 0);
 	} else {
 		error = umtx_copyin_umtx_time(uap->uaddr2, 
 		   (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 
 		error = do_rw_wrlock(td, uap->obj, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
 {
-	return do_rw_unlock(td, uap->obj);
+
+	return (do_rw_unlock(td, uap->obj));
 }
 
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 static int
 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_sem_wait(td, uap->obj, tm_p));
 }
 
 static int
 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_sem_wake(td, uap->obj));
 }
 #endif
 
 static int
 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_wake2_umutex(td, uap->obj, uap->val));
 }
 
 static int
 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_sem2_wait(td, uap->obj, tm_p));
 }
 
 static int
 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_sem2_wake(td, uap->obj));
 }
 
 #define	USHM_OBJ_UMTX(o)						\
     ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
 
 #define	USHMF_REG_LINKED	0x0001
 #define	USHMF_OBJ_LINKED	0x0002
 struct umtx_shm_reg {
 	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
 	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
 	struct umtx_key		ushm_key;
 	struct ucred		*ushm_cred;
 	struct shmfd		*ushm_obj;
 	u_int			ushm_refcnt;
 	u_int			ushm_flags;
 };
 
 LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
 TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
 
 static uma_zone_t umtx_shm_reg_zone;
 static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
 static struct mtx umtx_shm_lock;
 static struct umtx_shm_reg_head umtx_shm_reg_delfree =
     TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
 
 static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
 
 static void
 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
 {
 	struct umtx_shm_reg_head d;
 	struct umtx_shm_reg *reg, *reg1;
 
 	TAILQ_INIT(&d);
 	mtx_lock(&umtx_shm_lock);
 	TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link);
 	mtx_unlock(&umtx_shm_lock);
 	TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) {
 		TAILQ_REMOVE(&d, reg, ushm_reg_link);
 		umtx_shm_free_reg(reg);
 	}
 }
 
 static struct task umtx_shm_reg_delfree_task =
     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
 
 static struct umtx_shm_reg *
 umtx_shm_find_reg_locked(const struct umtx_key *key)
 {
 	struct umtx_shm_reg *reg;
 	struct umtx_shm_reg_head *reg_head;
 
 	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
 	mtx_assert(&umtx_shm_lock, MA_OWNED);
 	reg_head = &umtx_shm_registry[key->hash];
 	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
 		KASSERT(reg->ushm_key.shared,
 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
 		if (reg->ushm_key.info.shared.object ==
 		    key->info.shared.object &&
 		    reg->ushm_key.info.shared.offset ==
 		    key->info.shared.offset) {
 			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
 			KASSERT(reg->ushm_refcnt > 0,
 			    ("reg %p refcnt 0 onlist", reg));
 			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
 			    ("reg %p not linked", reg));
 			reg->ushm_refcnt++;
 			return (reg);
 		}
 	}
 	return (NULL);
 }
 
 static struct umtx_shm_reg *
 umtx_shm_find_reg(const struct umtx_key *key)
 {
 	struct umtx_shm_reg *reg;
 
 	mtx_lock(&umtx_shm_lock);
 	reg = umtx_shm_find_reg_locked(key);
 	mtx_unlock(&umtx_shm_lock);
 	return (reg);
 }
 
 static void
 umtx_shm_free_reg(struct umtx_shm_reg *reg)
 {
 
 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
 	crfree(reg->ushm_cred);
 	shm_drop(reg->ushm_obj);
 	uma_zfree(umtx_shm_reg_zone, reg);
 }
 
 static bool
 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
 {
 	bool res;
 
 	mtx_assert(&umtx_shm_lock, MA_OWNED);
 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
 	reg->ushm_refcnt--;
 	res = reg->ushm_refcnt == 0;
 	if (res || force) {
 		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
 			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
 			    reg, ushm_reg_link);
 			reg->ushm_flags &= ~USHMF_REG_LINKED;
 		}
 		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
 			LIST_REMOVE(reg, ushm_obj_link);
 			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
 		}
 	}
 	return (res);
 }
 
 static void
 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
 {
 	vm_object_t object;
 	bool dofree;
 
 	if (force) {
 		object = reg->ushm_obj->shm_object;
 		VM_OBJECT_WLOCK(object);
 		object->flags |= OBJ_UMTXDEAD;
 		VM_OBJECT_WUNLOCK(object);
 	}
 	mtx_lock(&umtx_shm_lock);
 	dofree = umtx_shm_unref_reg_locked(reg, force);
 	mtx_unlock(&umtx_shm_lock);
 	if (dofree)
 		umtx_shm_free_reg(reg);
 }
 
 void
 umtx_shm_object_init(vm_object_t object)
 {
 
 	LIST_INIT(USHM_OBJ_UMTX(object));
 }
 
 void
 umtx_shm_object_terminated(vm_object_t object)
 {
 	struct umtx_shm_reg *reg, *reg1;
 	bool dofree;
 
 	dofree = false;
 	mtx_lock(&umtx_shm_lock);
 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) {
 		if (umtx_shm_unref_reg_locked(reg, true)) {
 			TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg,
 			    ushm_reg_link);
 			dofree = true;
 		}
 	}
 	mtx_unlock(&umtx_shm_lock);
 	if (dofree)
 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
 }
 
 static int
 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
     struct umtx_shm_reg **res)
 {
 	struct umtx_shm_reg *reg, *reg1;
 	struct ucred *cred;
 	int error;
 
 	reg = umtx_shm_find_reg(key);
 	if (reg != NULL) {
 		*res = reg;
 		return (0);
 	}
 	cred = td->td_ucred;
 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
 		return (ENOMEM);
 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
 	reg->ushm_refcnt = 1;
 	bcopy(key, &reg->ushm_key, sizeof(*key));
 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
 	reg->ushm_cred = crhold(cred);
 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
 	if (error != 0) {
 		umtx_shm_free_reg(reg);
 		return (error);
 	}
 	mtx_lock(&umtx_shm_lock);
 	reg1 = umtx_shm_find_reg_locked(key);
 	if (reg1 != NULL) {
 		mtx_unlock(&umtx_shm_lock);
 		umtx_shm_free_reg(reg);
 		*res = reg1;
 		return (0);
 	}
 	reg->ushm_refcnt++;
 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
 	    ushm_obj_link);
 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
 	mtx_unlock(&umtx_shm_lock);
 	*res = reg;
 	return (0);
 }
 
 static int
 umtx_shm_alive(struct thread *td, void *addr)
 {
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_object_t object;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	int res, ret;
 	boolean_t wired;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
 	    &object, &pindex, &prot, &wired);
 	if (res != KERN_SUCCESS)
 		return (EFAULT);
 	if (object == NULL)
 		ret = EINVAL;
 	else
 		ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0;
 	vm_map_lookup_done(map, entry);
 	return (ret);
 }
 
 static void
 umtx_shm_init(void)
 {
 	int i;
 
 	umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
 	for (i = 0; i < nitems(umtx_shm_registry); i++)
 		TAILQ_INIT(&umtx_shm_registry[i]);
 }
 
 static int
 umtx_shm(struct thread *td, void *addr, u_int flags)
 {
 	struct umtx_key key;
 	struct umtx_shm_reg *reg;
 	struct file *fp;
 	int error, fd;
 
 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
 	    UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1)
 		return (EINVAL);
 	if ((flags & UMTX_SHM_ALIVE) != 0)
 		return (umtx_shm_alive(td, addr));
 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
 	if (error != 0)
 		return (error);
 	KASSERT(key.shared == 1, ("non-shared key"));
 	if ((flags & UMTX_SHM_CREAT) != 0) {
 		error = umtx_shm_create_reg(td, &key, &reg);
 	} else {
 		reg = umtx_shm_find_reg(&key);
 		if (reg == NULL)
 			error = ESRCH;
 	}
 	umtx_key_release(&key);
 	if (error != 0)
 		return (error);
 	KASSERT(reg != NULL, ("no reg"));
 	if ((flags & UMTX_SHM_DESTROY) != 0) {
 		umtx_shm_unref_reg(reg, true);
 	} else {
 #if 0
 #ifdef MAC
 		error = mac_posixshm_check_open(td->td_ucred,
 		    reg->ushm_obj, FFLAGS(O_RDWR));
 		if (error == 0)
 #endif
 			error = shm_access(reg->ushm_obj, td->td_ucred,
 			    FFLAGS(O_RDWR));
 		if (error == 0)
 #endif
 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
 		if (error == 0) {
 			shm_hold(reg->ushm_obj);
 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
 			    &shm_ops);
 			td->td_retval[0] = fd;
 			fdrop(fp, td);
 		}
 	}
 	umtx_shm_unref_reg(reg, false);
 	return (error);
 }
 
 static int
 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (umtx_shm(td, uap->uaddr1, uap->val));
 }
 
+static int
+umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp)
+{
+
+	td->td_rb_list = rbp->robust_list_offset;
+	td->td_rbp_list = rbp->robust_priv_list_offset;
+	td->td_rb_inact = rbp->robust_inact_offset;
+	return (0);
+}
+
+static int
+__umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct umtx_robust_lists_params rb;
+	int error;
+
+	if (uap->val > sizeof(rb))
+		return (EINVAL);
+	bzero(&rb, sizeof(rb));
+	error = copyin(uap->uaddr1, &rb, uap->val);
+	if (error != 0)
+		return (error);
+	return (umtx_robust_lists(td, &rb));
+}
+
 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
 
 static const _umtx_op_func op_table[] = {
 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
 	[UMTX_OP_WAIT]		= __umtx_op_wait,
 	[UMTX_OP_WAKE]		= __umtx_op_wake,
 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
 #else
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
 #endif
 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
 	[UMTX_OP_SHM]		= __umtx_op_shm,
+	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
 };
 
 int
 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	if ((unsigned)uap->op < nitems(op_table))
 		return (*op_table[uap->op])(td, uap);
 	return (EINVAL);
 }
 
 #ifdef COMPAT_FREEBSD32
 
 struct timespec32 {
 	int32_t tv_sec;
 	int32_t tv_nsec;
 };
 
 struct umtx_time32 {
 	struct	timespec32	timeout;
 	uint32_t		flags;
 	uint32_t		clockid;
 };
 
 static inline int
 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
 {
 	struct timespec32 ts32;
 	int error;
 
 	error = copyin(addr, &ts32, sizeof(struct timespec32));
 	if (error == 0) {
 		if (ts32.tv_sec < 0 ||
 		    ts32.tv_nsec >= 1000000000 ||
 		    ts32.tv_nsec < 0)
 			error = EINVAL;
 		else {
 			tsp->tv_sec = ts32.tv_sec;
 			tsp->tv_nsec = ts32.tv_nsec;
 		}
 	}
 	return (error);
 }
 
 static inline int
 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
 {
 	struct umtx_time32 t32;
 	int error;
 	
 	t32.clockid = CLOCK_REALTIME;
 	t32.flags   = 0;
 	if (size <= sizeof(struct timespec32))
 		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
 	else 
 		error = copyin(addr, &t32, sizeof(struct umtx_time32));
 	if (error != 0)
 		return (error);
 	if (t32.timeout.tv_sec < 0 ||
 	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
 		return (EINVAL);
 	tp->_timeout.tv_sec = t32.timeout.tv_sec;
 	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
 	tp->_flags = t32.flags;
 	tp->_clockid = t32.clockid;
 	return (0);
 }
 
 static int
 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 			(size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
+	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
 }
 
 static int
 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(uap->uaddr2,
 			    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
-	return do_lock_umutex(td, uap->obj, tm_p, 0);
+	return (do_lock_umutex(td, uap->obj, tm_p, 0));
 }
 
 static int
 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2, 
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
-	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
+	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
 }
 
 static int
 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		ts = &timeout;
 	}
 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
 }
 
 static int
 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
 	} else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_wrlock(td, uap->obj, 0);
 	} else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		error = do_rw_wrlock(td, uap->obj, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(
 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
+	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
 }
 
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 static int
 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_sem_wait(td, uap->obj, tm_p));
 }
 #endif
 
 static int
 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_sem2_wait(td, uap->obj, tm_p));
 }
 
 static int
 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
 {
-	int count = uap->val;
-	uint32_t uaddrs[BATCH_SIZE];
-	uint32_t **upp = (uint32_t **)uap->obj;
-	int tocopy;
-	int error = 0;
-	int i, pos = 0;
+	uint32_t uaddrs[BATCH_SIZE], **upp;
+	int count, error, i, pos, tocopy;
 
-	while (count > 0) {
-		tocopy = count;
-		if (tocopy > BATCH_SIZE)
-			tocopy = BATCH_SIZE;
-		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
+	upp = (uint32_t **)uap->obj;
+	error = 0;
+	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
+	    pos += tocopy) {
+		tocopy = MIN(count, BATCH_SIZE);
+		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
 		if (error != 0)
 			break;
 		for (i = 0; i < tocopy; ++i)
 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
-				INT_MAX, 1);
-		count -= tocopy;
-		pos += tocopy;
+			    INT_MAX, 1);
+		maybe_yield();
 	}
 	return (error);
 }
 
+struct umtx_robust_lists_params_compat32 {
+	uint32_t	robust_list_offset;
+	uint32_t	robust_priv_list_offset;
+	uint32_t	robust_inact_offset;
+};
+
+static int
+__umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct umtx_robust_lists_params rb;
+	struct umtx_robust_lists_params_compat32 rb32;
+	int error;
+
+	if (uap->val > sizeof(rb32))
+		return (EINVAL);
+	bzero(&rb, sizeof(rb));
+	bzero(&rb32, sizeof(rb32));
+	error = copyin(uap->uaddr1, &rb32, uap->val);
+	if (error != 0)
+		return (error);
+	rb.robust_list_offset = rb32.robust_list_offset;
+	rb.robust_priv_list_offset = rb32.robust_priv_list_offset;
+	rb.robust_inact_offset = rb32.robust_inact_offset;
+	return (umtx_robust_lists(td, &rb));
+}
+
 static const _umtx_op_func op_table_compat32[] = {
 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
-	[UMTX_OP_WAIT]	= __umtx_op_wait_compat32,
-	[UMTX_OP_WAKE]	= __umtx_op_wake,
+	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
+	[UMTX_OP_WAKE]		= __umtx_op_wake,
 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
 #else
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
 #endif
 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
 	[UMTX_OP_SHM]		= __umtx_op_shm,
+	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists_compat32,
 };
 
 int
 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
 {
 
 	if ((unsigned)uap->op < nitems(op_table_compat32)) {
 		return (*op_table_compat32[uap->op])(td,
 		    (struct _umtx_op_args *)uap);
 	}
 	return (EINVAL);
 }
 #endif
 
 void
 umtx_thread_init(struct thread *td)
 {
+
 	td->td_umtxq = umtxq_alloc();
 	td->td_umtxq->uq_thread = td;
 }
 
 void
 umtx_thread_fini(struct thread *td)
 {
+
 	umtxq_free(td->td_umtxq);
 }
 
 /*
  * It will be called when new thread is created, e.g fork().
  */
 void
 umtx_thread_alloc(struct thread *td)
 {
 	struct umtx_q *uq;
 
 	uq = td->td_umtxq;
 	uq->uq_inherited_pri = PRI_MAX;
 
 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
 }
 
 /*
  * exec() hook.
+ *
+ * Clean up the robust lists of every thread in the process here
+ * instead of deferring to the thread_exit hook, because the address
+ * space that holds those lists is being destroyed right now.
  */
 static void
-umtx_exec_hook(void *arg __unused, struct proc *p __unused,
-	struct image_params *imgp __unused)
+umtx_exec_hook(void *arg __unused, struct proc *p,
+    struct image_params *imgp __unused)
 {
-	umtx_thread_cleanup(curthread);
+	struct thread *td;
+
+	KASSERT(p == curproc, ("need curproc"));
+	PROC_LOCK(p);
+	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
+	    (p->p_flag & P_STOPPED_SINGLE) != 0,
+	    ("curproc must be single-threaded"));
+	FOREACH_THREAD_IN_PROC(p, td) {
+		KASSERT(td == curthread ||
+		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
+		    ("running thread %p %p", p, td));
+		PROC_UNLOCK(p);
+		umtx_thread_cleanup(td);
+		PROC_LOCK(p);
+		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
+	}
+	PROC_UNLOCK(p);
 }
 
 /*
  * thread_exit() hook.
  */
 void
 umtx_thread_exit(struct thread *td)
 {
+
 	umtx_thread_cleanup(td);
 }
 
+static int
+umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res)
+{
+	u_long res1;
+#ifdef COMPAT_FREEBSD32
+	uint32_t res32;
+#endif
+	int error;
+
+#ifdef COMPAT_FREEBSD32
+	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+		error = fueword32((void *)ptr, &res32);
+		if (error == 0)
+			res1 = res32;
+	} else
+#endif
+	{
+		error = fueword((void *)ptr, &res1);
+	}
+	if (error == 0)
+		*res = res1;
+	else
+		error = EFAULT;
+	return (error);
+}
+
+static void
+umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list)
+{
+#ifdef COMPAT_FREEBSD32
+	struct umutex32 m32;
+
+	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
+		memcpy(&m32, m, sizeof(m32));
+		*rb_list = m32.m_rb_lnk;
+	} else
+#endif
+		*rb_list = m->m_rb_lnk;
+}
+
+static int
+umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact)
+{
+	struct umutex m;
+	int error;
+
+	KASSERT(td->td_proc == curproc, ("need current vmspace"));
+	error = copyin((void *)rbp, &m, sizeof(m));
+	if (error != 0)
+		return (error);
+	if (rb_list != NULL)
+		umtx_read_rb_list(td, &m, rb_list);
+	if ((m.m_flags & UMUTEX_ROBUST) == 0)
+		return (EINVAL);
+	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
+		/* inact is cleared after unlock, allow the inconsistency */
+		return (inact ? 0 : EINVAL);
+	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
+}
+
+static void
+umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
+    const char *name)
+{
+	int error, i;
+	uintptr_t rbp;
+	bool inact;
+
+	if (rb_list == 0)
+		return;
+	error = umtx_read_uptr(td, rb_list, &rbp);
+	for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) {
+		if (rbp == *rb_inact) {
+			inact = true;
+			*rb_inact = 0;
+		} else
+			inact = false;
+		error = umtx_handle_rb(td, rbp, &rbp, inact);
+	}
+	if (i == umtx_max_rb && umtx_verbose_rb) {
+		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
+		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
+	}
+	if (error != 0 && umtx_verbose_rb) {
+		uprintf("comm %s pid %d: handling %srb error %d\n",
+		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
+	}
+}
+
 /*
- * clean up umtx data.
+ * Clean up umtx data.
  */
 static void
 umtx_thread_cleanup(struct thread *td)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi;
+	uintptr_t rb_inact;
 
-	if ((uq = td->td_umtxq) == NULL)
-		return;
-
-	mtx_lock(&umtx_lock);
-	uq->uq_inherited_pri = PRI_MAX;
-	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
-		pi->pi_owner = NULL;
-		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
+	/*
+	 * Disown pi mutexes.
+	 */
+	uq = td->td_umtxq;
+	if (uq != NULL) {
+		mtx_lock(&umtx_lock);
+		uq->uq_inherited_pri = PRI_MAX;
+		while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
+			pi->pi_owner = NULL;
+			TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
+		}
+		mtx_unlock(&umtx_lock);
+		thread_lock(td);
+		sched_lend_user_prio(td, PRI_MAX);
+		thread_unlock(td);
 	}
-	mtx_unlock(&umtx_lock);
-	thread_lock(td);
-	sched_lend_user_prio(td, PRI_MAX);
-	thread_unlock(td);
+
+	/*
+	 * Handle the robust mutexes left on the thread's robust lists.
+	 * This must happen after the pi mutexes are disowned above;
+	 * otherwise the unlock could see unowned entries.
+	 */
+	rb_inact = td->td_rb_inact;
+	if (rb_inact != 0)
+		(void)umtx_read_uptr(td, rb_inact, &rb_inact);
+	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "");
+	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ");
+	if (rb_inact != 0)
+		(void)umtx_handle_rb(td, rb_inact, NULL, true);
 }
Index: head/sys/sys/_umtx.h
===================================================================
--- head/sys/sys/_umtx.h	(revision 300042)
+++ head/sys/sys/_umtx.h	(revision 300043)
@@ -1,75 +1,79 @@
 /*-
  * Copyright (c) 2010, David Xu <davidxu@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #ifndef _SYS__UMTX_H_
 #define	_SYS__UMTX_H_
 
 #include <sys/_types.h>
 #include <sys/_timespec.h>
 
 struct umutex {
 	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
 	__uint32_t		m_flags;	/* Flags of the mutex */
 	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
-	__uint32_t		m_spare[4];
+	__uintptr_t		m_rb_lnk;	/* Robust linkage */
+#ifndef __LP64__
+	__uint32_t		m_pad;
+#endif
+	__uint32_t		m_spare[2];
 };
 
 struct ucond {
 	volatile __uint32_t	c_has_waiters;	/* Has waiters in kernel */
 	__uint32_t		c_flags;	/* Flags of the condition variable */
 	__uint32_t              c_clockid;	/* Clock id */
 	__uint32_t              c_spare[1];	/* Spare space */
 };
 
 struct urwlock {
 	volatile __int32_t	rw_state;
 	__uint32_t		rw_flags;
 	__uint32_t		rw_blocked_readers;
 	__uint32_t		rw_blocked_writers;
 	__uint32_t		rw_spare[4];
 };
 
 struct _usem {
 	volatile __uint32_t	_has_waiters;
 	volatile __uint32_t	_count;
 	__uint32_t		_flags;
 };
 
 struct _usem2 {
 	volatile __uint32_t	_count;		/* Waiters flag in high bit. */
 	__uint32_t		_flags;
 };
 
 struct _umtx_time {
 	struct timespec		_timeout;
 	__uint32_t		_flags;
 	__uint32_t		_clockid;
 };
 
 #endif /* !_SYS__UMTX_H_ */
Index: head/sys/sys/proc.h
===================================================================
--- head/sys/sys/proc.h	(revision 300042)
+++ head/sys/sys/proc.h	(revision 300043)
@@ -1,1067 +1,1070 @@
 /*-
  * Copyright (c) 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)proc.h	8.15 (Berkeley) 5/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PROC_H_
 #define	_SYS_PROC_H_
 
 #include <sys/callout.h>		/* For struct callout. */
 #include <sys/event.h>			/* For struct klist. */
 #include <sys/condvar.h>
 #ifndef _KERNEL
 #include <sys/filedesc.h>
 #endif
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/lock_profile.h>
 #include <sys/_mutex.h>
 #include <sys/osd.h>
 #include <sys/priority.h>
 #include <sys/rtprio.h>			/* XXX. */
 #include <sys/runq.h>
 #include <sys/resource.h>
 #include <sys/sigio.h>
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #ifndef _KERNEL
 #include <sys/time.h>			/* For structs itimerval, timeval. */
 #else
 #include <sys/pcpu.h>
 #endif
 #include <sys/ucontext.h>
 #include <sys/ucred.h>
 #include <sys/_vm_domain.h>
 #include <machine/proc.h>		/* Machine-dependent proc substruct. */
 
 /*
  * One structure allocated per session.
  *
  * List of locks
  * (m)		locked by s_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  */
 struct session {
 	u_int		s_count;	/* Ref cnt; pgrps in session - atomic. */
 	struct proc	*s_leader;	/* (m + e) Session leader. */
 	struct vnode	*s_ttyvp;	/* (m) Vnode of controlling tty. */
 	struct cdev_priv *s_ttydp;	/* (m) Device of controlling tty.  */
 	struct tty	*s_ttyp;	/* (e) Controlling tty. */
 	pid_t		s_sid;		/* (c) Session ID. */
 					/* (m) Setlogin() name: */
 	char		s_login[roundup(MAXLOGNAME, sizeof(long))];
 	struct mtx	s_mtx;		/* Mutex to protect members. */
 };
 
 /*
  * One structure allocated per process group.
  *
  * List of locks
  * (m)		locked by pg_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  */
 struct pgrp {
 	LIST_ENTRY(pgrp) pg_hash;	/* (e) Hash chain. */
 	LIST_HEAD(, proc) pg_members;	/* (m + e) Pointer to pgrp members. */
 	struct session	*pg_session;	/* (c) Pointer to session. */
 	struct sigiolst	pg_sigiolst;	/* (m) List of sigio sources. */
 	pid_t		pg_id;		/* (c) Process group id. */
 	int		pg_jobc;	/* (m) Job control process count. */
 	struct mtx	pg_mtx;		/* Mutex to protect members */
 };
 
 /*
  * pargs, used to hold a copy of the command line, if it had a sane length.
  */
 struct pargs {
 	u_int	ar_ref;		/* Reference count. */
 	u_int	ar_length;	/* Length. */
 	u_char	ar_args[1];	/* Arguments. */
 };
 
 /*-
  * Description of a process.
  *
  * This structure contains the information needed to manage a thread of
  * control, known in UN*X as a process; it has references to substructures
  * containing descriptions of things that the process uses, but may share
  * with related processes.  The process structure and the substructures
  * are always addressable except for those marked "(CPU)" below,
  * which might be addressable only on a processor on which the process
  * is running.
  *
  * Below is a key of locks used to protect each member of struct proc.  The
  * lock is indicated by a reference to a specific character in parens in the
  * associated comment.
  *      * - not yet protected
  *      a - only touched by curproc or parent during fork/wait
  *      b - created at fork, never changes
  *		(exception aiods switch vmspaces, but they are also
  *		marked 'P_SYSTEM' so hopefully it will be left alone)
  *      c - locked by proc mtx
  *      d - locked by allproc_lock lock
  *      e - locked by proctree_lock lock
  *      f - session mtx
  *      g - process group mtx
  *      h - callout_lock mtx
  *      i - by curproc or the master session mtx
  *      j - locked by proc slock
  *      k - only accessed by curthread
  *	k*- only accessed by curthread and from an interrupt
  *      l - the attaching proc or attaching proc parent
  *      m - Giant
  *      n - not locked, lazy
  *      o - ktrace lock
  *      q - td_contested lock
  *      r - p_peers lock
  *      t - thread lock
  *	u - process stat lock
  *	w - process timer lock
  *      x - created at fork, only changes during single threading in exec
  *      y - created at first aio, doesn't change until exit or exec at which
  *          point we are single-threaded and only curthread changes it
  *      z - zombie threads lock
  *
  * If the locking key specifies two identifiers (for example, p_pptr) then
  * either lock is sufficient for read access, but both locks must be held
  * for write access.
  */
 struct cpuset;
 struct filecaps;
 struct filemon;
 struct kaioinfo;
 struct kaudit_record;
 struct kdtrace_proc;
 struct kdtrace_thread;
 struct mqueue_notifier;
 struct nlminfo;
 struct p_sched;
 struct proc;
 struct procdesc;
 struct racct;
 struct sbuf;
 struct sleepqueue;
 struct syscall_args;
 struct td_sched;
 struct thread;
 struct trapframe;
 struct turnstile;
 
 /*
  * XXX: Does this belong in resource.h or resourcevar.h instead?
  * Resource usage extension.  The times in rusage structs in the kernel are
  * never up to date.  The actual times are kept as runtimes and tick counts
  * (with control info in the "previous" times), and are converted when
  * userland asks for rusage info.  Backwards compatibility prevents putting
  * this directly in the user-visible rusage struct.
  *
  * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux.
  * Locking for td_rux: (t) for all fields.
  */
 struct rusage_ext {
 	uint64_t	rux_runtime;    /* (cu) Real time. */
 	uint64_t	rux_uticks;     /* (cu) Statclock hits in user mode. */
 	uint64_t	rux_sticks;     /* (cu) Statclock hits in sys mode. */
 	uint64_t	rux_iticks;     /* (cu) Statclock hits in intr mode. */
 	uint64_t	rux_uu;         /* (c) Previous user time in usec. */
 	uint64_t	rux_su;         /* (c) Previous sys time in usec. */
 	uint64_t	rux_tu;         /* (c) Previous total time in usec. */
 };
 
 /*
  * Kernel runnable context (thread).
  * This is what is put to sleep and reactivated.
  * Thread context.  Processes may have multiple threads.
  */
 struct thread {
 	struct mtx	*volatile td_lock; /* replaces sched lock */
 	struct proc	*td_proc;	/* (*) Associated process. */
 	TAILQ_ENTRY(thread) td_plist;	/* (*) All threads in this proc. */
 	TAILQ_ENTRY(thread) td_runq;	/* (t) Run queue. */
 	TAILQ_ENTRY(thread) td_slpq;	/* (t) Sleep queue. */
 	TAILQ_ENTRY(thread) td_lockq;	/* (t) Lock queue. */
 	LIST_ENTRY(thread) td_hash;	/* (d) Hash chain. */
 	struct cpuset	*td_cpuset;	/* (t) CPU affinity mask. */
 	struct seltd	*td_sel;	/* Select queue/channel. */
 	struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
 	struct turnstile *td_turnstile;	/* (k) Associated turnstile. */
 	struct rl_q_entry *td_rlqe;	/* (k) Associated range lock entry. */
 	struct umtx_q   *td_umtxq;	/* (c?) Link for when we're blocked. */
 	struct vm_domain_policy td_vm_dom_policy;	/* (c) current numa domain policy */
 	lwpid_t		td_tid;		/* (b) Thread ID. */
 	sigqueue_t	td_sigqueue;	/* (c) Sigs arrived, not delivered. */
 #define	td_siglist	td_sigqueue.sq_signals
 	u_char		td_lend_user_pri; /* (t) Lend user pri. */
 
 /* Cleared during fork1() */
 #define	td_startzero td_flags
 	int		td_flags;	/* (t) TDF_* flags. */
 	int		td_inhibitors;	/* (t) Why can not run. */
 	int		td_pflags;	/* (k) Private thread (TDP_*) flags. */
 	int		td_dupfd;	/* (k) Ret value from fdopen. XXX */
 	int		td_sqqueue;	/* (t) Sleepqueue queue blocked on. */
 	void		*td_wchan;	/* (t) Sleep address. */
 	const char	*td_wmesg;	/* (t) Reason for sleep. */
 	volatile u_char td_owepreempt;  /* (k*) Preempt on last critical_exit */
 	u_char		td_tsqueue;	/* (t) Turnstile queue blocked on. */
 	short		td_locks;	/* (k) Debug: count of non-spin locks */
 	short		td_rw_rlocks;	/* (k) Count of rwlock read locks. */
 	short		td_lk_slocks;	/* (k) Count of lockmgr shared locks. */
 	short		td_stopsched;	/* (k) Scheduler stopped. */
 	struct turnstile *td_blocked;	/* (t) Lock thread is blocked on. */
 	const char	*td_lockname;	/* (t) Name of lock blocked on. */
 	LIST_HEAD(, turnstile) td_contested;	/* (q) Contested locks. */
 	struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */
 	int		td_intr_nesting_level; /* (k) Interrupt recursion. */
 	int		td_pinned;	/* (k) Temporary cpu pin count. */
 	struct ucred	*td_ucred;	/* (k) Reference to credentials. */
 	struct plimit	*td_limit;	/* (k) Resource limits. */
 	int		td_slptick;	/* (t) Time at sleep. */
 	int		td_blktick;	/* (t) Time spent blocked. */
 	int		td_swvoltick;	/* (t) Time at last SW_VOL switch. */
 	int		td_swinvoltick;	/* (t) Time at last SW_INVOL switch. */
 	u_int		td_cow;		/* (*) Number of copy-on-write faults */
 	struct rusage	td_ru;		/* (t) rusage information. */
 	struct rusage_ext td_rux;	/* (t) Internal rusage information. */
 	uint64_t	td_incruntime;	/* (t) Cpu ticks to transfer to proc. */
 	uint64_t	td_runtime;	/* (t) How many cpu ticks we've run. */
 	u_int 		td_pticks;	/* (t) Statclock hits for profiling */
 	u_int		td_sticks;	/* (t) Statclock hits in system mode. */
 	u_int		td_iticks;	/* (t) Statclock hits in intr mode. */
 	u_int		td_uticks;	/* (t) Statclock hits in user mode. */
 	int		td_intrval;	/* (t) Return value for sleepq. */
 	sigset_t	td_oldsigmask;	/* (k) Saved mask from pre sigpause. */
 	volatile u_int	td_generation;	/* (k) For detection of preemption */
 	stack_t		td_sigstk;	/* (k) Stack ptr and on-stack flag. */
 	int		td_xsig;	/* (c) Signal for ptrace */
 	u_long		td_profil_addr;	/* (k) Temporary addr until AST. */
 	u_int		td_profil_ticks; /* (k) Temporary ticks until AST. */
 	char		td_name[MAXCOMLEN + 1];	/* (*) Thread name. */
 	struct file	*td_fpop;	/* (k) file referencing cdev under op */
 	int		td_dbgflags;	/* (c) Userland debugger flags */
 	struct ksiginfo td_dbgksi;	/* (c) ksi reflected to debugger. */
 	int		td_ng_outbound;	/* (k) Thread entered ng from above. */
 	struct osd	td_osd;		/* (k) Object specific data. */
 	struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */
 	pid_t		td_dbg_forked;	/* (c) Child pid for debugger. */
 	u_int		td_vp_reserv;	/* (k) Count of reserved vnodes. */
 	int		td_no_sleeping;	/* (k) Sleeping disabled count. */
 	int		td_dom_rr_idx;	/* (k) RR Numa domain selection. */
 	void		*td_su;		/* (k) FFS SU private */
+	uintptr_t	td_rb_list;	/* (k) Robust list head. */
+	uintptr_t	td_rbp_list;	/* (k) Robust priv list head. */
+	uintptr_t	td_rb_inact;	/* (k) Current in-action mutex loc. */
 #define	td_endzero td_sigmask
 
 /* Copied during fork1() or create_thread(). */
 #define	td_startcopy td_endzero
 	sigset_t	td_sigmask;	/* (c) Current signal mask. */
 	u_char		td_rqindex;	/* (t) Run queue index. */
 	u_char		td_base_pri;	/* (t) Thread base kernel priority. */
 	u_char		td_priority;	/* (t) Thread active priority. */
 	u_char		td_pri_class;	/* (t) Scheduling class. */
 	u_char		td_user_pri;	/* (t) User pri from estcpu and nice. */
 	u_char		td_base_user_pri; /* (t) Base user pri */
 	u_int		td_dbg_sc_code;	/* (c) Syscall code to debugger. */
 	u_int		td_dbg_sc_narg;	/* (c) Syscall arg count to debugger.*/
 #define	td_endcopy td_pcb
 
 /*
  * Fields that must be manually set in fork1() or create_thread()
  * or already have been set in the allocator, constructor, etc.
  */
 	struct pcb	*td_pcb;	/* (k) Kernel VA of pcb and kstack. */
 	enum {
 		TDS_INACTIVE = 0x0,
 		TDS_INHIBITED,
 		TDS_CAN_RUN,
 		TDS_RUNQ,
 		TDS_RUNNING
 	} td_state;			/* (t) thread state */
 	union {
 		register_t	tdu_retval[2];
 		off_t		tdu_off;	
 	} td_uretoff;			/* (k) Syscall aux returns. */
 #define td_retval	td_uretoff.tdu_retval
 	u_int		td_cowgen;	/* (k) Generation of COW pointers. */
 	struct callout	td_slpcallout;	/* (h) Callout for sleep. */
 	struct trapframe *td_frame;	/* (k) */
 	struct vm_object *td_kstack_obj;/* (a) Kstack object. */
 	vm_offset_t	td_kstack;	/* (a) Kernel VA of kstack. */
 	int		td_kstack_pages; /* (a) Size of the kstack. */
 	volatile u_int	td_critnest;	/* (k*) Critical section nest level. */
 	struct mdthread td_md;		/* (k) Any machine-dependent fields. */
 	struct td_sched	*td_sched;	/* (*) Scheduler-specific data. */
 	struct kaudit_record	*td_ar;	/* (k) Active audit record, if any. */
 	struct lpohead	td_lprof[2];	/* (a) lock profiling objects. */
 	struct kdtrace_thread	*td_dtrace; /* (*) DTrace-specific data. */
 	int		td_errno;	/* Error returned by last syscall. */
 	struct vnet	*td_vnet;	/* (k) Effective vnet. */
 	const char	*td_vnet_lpush;	/* (k) Debugging vnet push / pop. */
 	struct trapframe *td_intr_frame;/* (k) Frame of the current irq */
 	struct proc	*td_rfppwait_p;	/* (k) The vforked child */
 	struct vm_page	**td_ma;	/* (k) uio pages held */
 	int		td_ma_cnt;	/* (k) size of *td_ma */
 	void		*td_emuldata;	/* Emulator state data */
 	int		td_lastcpu;	/* (t) Last cpu we were on. */
 	int		td_oncpu;	/* (t) Which cpu we are on. */
 };
 
 struct mtx *thread_lock_block(struct thread *);
 void thread_lock_unblock(struct thread *, struct mtx *);
 void thread_lock_set(struct thread *, struct mtx *);
 #define	THREAD_LOCK_ASSERT(td, type)					\
 do {									\
 	struct mtx *__m = (td)->td_lock;				\
 	if (__m != &blocked_lock)					\
 		mtx_assert(__m, (type));				\
 } while (0)
 
 #ifdef INVARIANTS
 #define	THREAD_LOCKPTR_ASSERT(td, lock)					\
 do {									\
 	struct mtx *__m = (td)->td_lock;				\
 	KASSERT((__m == &blocked_lock || __m == (lock)),		\
 	    ("Thread %p lock %p does not match %p", td, __m, (lock)));	\
 } while (0)
 
 #define	TD_LOCKS_INC(td)	((td)->td_locks++)
 #define	TD_LOCKS_DEC(td)	((td)->td_locks--)
 #else
 #define	THREAD_LOCKPTR_ASSERT(td, lock)
 
 #define	TD_LOCKS_INC(td)
 #define	TD_LOCKS_DEC(td)
 #endif
 
 /*
  * Flags kept in td_flags:
  * To change these you MUST have the scheduler lock.
  */
 #define	TDF_BORROWING	0x00000001 /* Thread is borrowing pri from another. */
 #define	TDF_INPANIC	0x00000002 /* Caused a panic, let it drive crashdump. */
 #define	TDF_INMEM	0x00000004 /* Thread's stack is in memory. */
 #define	TDF_SINTR	0x00000008 /* Sleep is interruptible. */
 #define	TDF_TIMEOUT	0x00000010 /* Timing out during sleep. */
 #define	TDF_IDLETD	0x00000020 /* This is a per-CPU idle thread. */
 #define	TDF_CANSWAP	0x00000040 /* Thread can be swapped. */
 #define	TDF_SLEEPABORT	0x00000080 /* sleepq_abort was called. */
 #define	TDF_KTH_SUSP	0x00000100 /* kthread is suspended */
 #define	TDF_ALLPROCSUSP	0x00000200 /* suspended by SINGLE_ALLPROC */
 #define	TDF_BOUNDARY	0x00000400 /* Thread suspended at user boundary */
 #define	TDF_ASTPENDING	0x00000800 /* Thread has some asynchronous events. */
 #define	TDF_TIMOFAIL	0x00001000 /* Timeout from sleep after we were awake. */
 #define	TDF_SBDRY	0x00002000 /* Stop only on usermode boundary. */
 #define	TDF_UPIBLOCKED	0x00004000 /* Thread blocked on user PI mutex. */
 #define	TDF_NEEDSUSPCHK	0x00008000 /* Thread may need to suspend. */
 #define	TDF_NEEDRESCHED	0x00010000 /* Thread needs to yield. */
 #define	TDF_NEEDSIGCHK	0x00020000 /* Thread may need signal delivery. */
 #define	TDF_NOLOAD	0x00040000 /* Ignore during load avg calculations. */
 #define	TDF_UNUSED19	0x00080000 /* --available-- */
 #define	TDF_THRWAKEUP	0x00100000 /* Libthr thread must not suspend itself. */
 #define	TDF_UNUSED21	0x00200000 /* --available-- */
 #define	TDF_SWAPINREQ	0x00400000 /* Swapin request due to wakeup. */
 #define	TDF_UNUSED23	0x00800000 /* --available-- */
 #define	TDF_SCHED0	0x01000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED1	0x02000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED2	0x04000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED3	0x08000000 /* Reserved for scheduler private use */
 #define	TDF_ALRMPEND	0x10000000 /* Pending SIGVTALRM needs to be posted. */
 #define	TDF_PROFPEND	0x20000000 /* Pending SIGPROF needs to be posted. */
 #define	TDF_MACPEND	0x40000000 /* AST-based MAC event pending. */
 
 /* Userland debug flags */
 #define	TDB_SUSPEND	0x00000001 /* Thread is suspended by debugger */
 #define	TDB_XSIG	0x00000002 /* Thread is exchanging signal under trace */
 #define	TDB_USERWR	0x00000004 /* Debugger modified memory or registers */
 #define	TDB_SCE		0x00000008 /* Thread performs syscall enter */
 #define	TDB_SCX		0x00000010 /* Thread performs syscall exit */
 #define	TDB_EXEC	0x00000020 /* TDB_SCX from exec(2) family */
 #define	TDB_FORK	0x00000040 /* TDB_SCX from fork(2) that created new
 				      process */
 #define	TDB_STOPATFORK	0x00000080 /* Stop at the return from fork (child
 				      only) */
 #define	TDB_CHILD	0x00000100 /* New child indicator for ptrace() */
 #define	TDB_BORN	0x00000200 /* New LWP indicator for ptrace() */
 #define	TDB_EXIT	0x00000400 /* Exiting LWP indicator for ptrace() */
 
 /*
  * "Private" flags kept in td_pflags:
  * These are only written by curthread and thus need no locking.
  */
 #define	TDP_OLDMASK	0x00000001 /* Need to restore mask after suspend. */
 #define	TDP_INKTR	0x00000002 /* Thread is currently in KTR code. */
 #define	TDP_INKTRACE	0x00000004 /* Thread is currently in KTRACE code. */
 #define	TDP_BUFNEED	0x00000008 /* Do not recurse into the buf flush */
 #define	TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */
 #define	TDP_ALTSTACK	0x00000020 /* Have alternate signal stack. */
 #define	TDP_DEADLKTREAT	0x00000040 /* Lock acquisition - deadlock treatment. */
 #define	TDP_NOFAULTING	0x00000080 /* Do not handle page faults. */
 #define	TDP_UNUSED9	0x00000100 /* --available-- */
 #define	TDP_OWEUPC	0x00000200 /* Call addupc() at next AST. */
 #define	TDP_ITHREAD	0x00000400 /* Thread is an interrupt thread. */
 #define	TDP_SYNCIO	0x00000800 /* Local override, disable async i/o. */
 #define	TDP_SCHED1	0x00001000 /* Reserved for scheduler private use */
 #define	TDP_SCHED2	0x00002000 /* Reserved for scheduler private use */
 #define	TDP_SCHED3	0x00004000 /* Reserved for scheduler private use */
 #define	TDP_SCHED4	0x00008000 /* Reserved for scheduler private use */
 #define	TDP_GEOM	0x00010000 /* Settle GEOM before finishing syscall */
 #define	TDP_SOFTDEP	0x00020000 /* Stuck processing softdep worklist */
 #define	TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */
 #define	TDP_WAKEUP	0x00080000 /* Don't sleep in umtx cond_wait */
 #define	TDP_INBDFLUSH	0x00100000 /* Already in BO_BDFLUSH, do not recurse */
 #define	TDP_KTHREAD	0x00200000 /* This is an official kernel thread */
 #define	TDP_CALLCHAIN	0x00400000 /* Capture thread's callchain */
 #define	TDP_IGNSUSP	0x00800000 /* Permission to ignore the MNTK_SUSPEND* */
 #define	TDP_AUDITREC	0x01000000 /* Audit record pending on thread */
 #define	TDP_RFPPWAIT	0x02000000 /* Handle RFPPWAIT on syscall exit */
 #define	TDP_RESETSPUR	0x04000000 /* Reset spurious page fault history. */
 #define	TDP_NERRNO	0x08000000 /* Last errno is already in td_errno */
 #define	TDP_UIOHELD	0x10000000 /* Current uio has pages held in td_ma */
 #define	TDP_FORKING	0x20000000 /* Thread is being created through fork() */
 #define	TDP_EXECVMSPC	0x40000000 /* Execve destroyed old vmspace */
 
 /*
  * Reasons that the current thread can not be run yet.
  * More than one may apply.
  */
 #define	TDI_SUSPENDED	0x0001	/* On suspension queue. */
 #define	TDI_SLEEPING	0x0002	/* Actually asleep! (tricky). */
 #define	TDI_SWAPPED	0x0004	/* Stack not in mem.  Bad juju if run. */
 #define	TDI_LOCK	0x0008	/* Stopped on a lock. */
 #define	TDI_IWAIT	0x0010	/* Awaiting interrupt. */
 
 #define	TD_IS_SLEEPING(td)	((td)->td_inhibitors & TDI_SLEEPING)
 #define	TD_ON_SLEEPQ(td)	((td)->td_wchan != NULL)
 #define	TD_IS_SUSPENDED(td)	((td)->td_inhibitors & TDI_SUSPENDED)
 #define	TD_IS_SWAPPED(td)	((td)->td_inhibitors & TDI_SWAPPED)
 #define	TD_ON_LOCK(td)		((td)->td_inhibitors & TDI_LOCK)
 #define	TD_AWAITING_INTR(td)	((td)->td_inhibitors & TDI_IWAIT)
 #define	TD_IS_RUNNING(td)	((td)->td_state == TDS_RUNNING)
 #define	TD_ON_RUNQ(td)		((td)->td_state == TDS_RUNQ)
 #define	TD_CAN_RUN(td)		((td)->td_state == TDS_CAN_RUN)
 #define	TD_IS_INHIBITED(td)	((td)->td_state == TDS_INHIBITED)
 #define	TD_ON_UPILOCK(td)	((td)->td_flags & TDF_UPIBLOCKED)
 #define TD_IS_IDLETHREAD(td)	((td)->td_flags & TDF_IDLETD)
 
 
 #define	TD_SET_INHIB(td, inhib) do {			\
 	(td)->td_state = TDS_INHIBITED;			\
 	(td)->td_inhibitors |= (inhib);			\
 } while (0)
 
 #define	TD_CLR_INHIB(td, inhib) do {			\
 	if (((td)->td_inhibitors & (inhib)) &&		\
 	    (((td)->td_inhibitors &= ~(inhib)) == 0))	\
 		(td)->td_state = TDS_CAN_RUN;		\
 } while (0)
 
 #define	TD_SET_SLEEPING(td)	TD_SET_INHIB((td), TDI_SLEEPING)
 #define	TD_SET_SWAPPED(td)	TD_SET_INHIB((td), TDI_SWAPPED)
 #define	TD_SET_LOCK(td)		TD_SET_INHIB((td), TDI_LOCK)
 #define	TD_SET_SUSPENDED(td)	TD_SET_INHIB((td), TDI_SUSPENDED)
 #define	TD_SET_IWAIT(td)	TD_SET_INHIB((td), TDI_IWAIT)
 #define	TD_SET_EXITING(td)	TD_SET_INHIB((td), TDI_EXITING)
 
 #define	TD_CLR_SLEEPING(td)	TD_CLR_INHIB((td), TDI_SLEEPING)
 #define	TD_CLR_SWAPPED(td)	TD_CLR_INHIB((td), TDI_SWAPPED)
 #define	TD_CLR_LOCK(td)		TD_CLR_INHIB((td), TDI_LOCK)
 #define	TD_CLR_SUSPENDED(td)	TD_CLR_INHIB((td), TDI_SUSPENDED)
 #define	TD_CLR_IWAIT(td)	TD_CLR_INHIB((td), TDI_IWAIT)
 
 #define	TD_SET_RUNNING(td)	(td)->td_state = TDS_RUNNING
 #define	TD_SET_RUNQ(td)		(td)->td_state = TDS_RUNQ
 #define	TD_SET_CAN_RUN(td)	(td)->td_state = TDS_CAN_RUN
 
 /*
  * Process structure.
  */
 struct proc {
 	LIST_ENTRY(proc) p_list;	/* (d) List of all processes. */
 	TAILQ_HEAD(, thread) p_threads;	/* (c) all threads. */
 	struct mtx	p_slock;	/* process spin lock */
 	struct ucred	*p_ucred;	/* (c) Process owner's identity. */
 	struct filedesc	*p_fd;		/* (b) Open files. */
 	struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */
 	struct pstats	*p_stats;	/* (b) Accounting/statistics (CPU). */
 	struct plimit	*p_limit;	/* (c) Resource limits. */
 	struct callout	p_limco;	/* (c) Limit callout handle */
 	struct sigacts	*p_sigacts;	/* (x) Signal actions, state (CPU). */
 
 	int		p_flag;		/* (c) P_* flags. */
 	int		p_flag2;	/* (c) P2_* flags. */
 	enum {
 		PRS_NEW = 0,		/* In creation */
 		PRS_NORMAL,		/* threads can be run. */
 		PRS_ZOMBIE
 	} p_state;			/* (j/c) Process status. */
 	pid_t		p_pid;		/* (b) Process identifier. */
 	LIST_ENTRY(proc) p_hash;	/* (d) Hash chain. */
 	LIST_ENTRY(proc) p_pglist;	/* (g + e) List of processes in pgrp. */
 	struct proc	*p_pptr;	/* (c + e) Pointer to parent process. */
 	LIST_ENTRY(proc) p_sibling;	/* (e) List of sibling processes. */
 	LIST_HEAD(, proc) p_children;	/* (e) Pointer to list of children. */
 	struct proc	*p_reaper;	/* (e) My reaper. */
 	LIST_HEAD(, proc) p_reaplist;	/* (e) List of my descendants
 					       (if I am reaper). */
 	LIST_ENTRY(proc) p_reapsibling;	/* (e) List of siblings - descendants of
 					       the same reaper. */
 	struct mtx	p_mtx;		/* (n) Lock for this struct. */
 	struct mtx	p_statmtx;	/* Lock for the stats */
 	struct mtx	p_itimmtx;	/* Lock for the virt/prof timers */
 	struct mtx	p_profmtx;	/* Lock for the profiling */
 	struct ksiginfo *p_ksi;	/* Locked by parent proc lock */
 	sigqueue_t	p_sigqueue;	/* (c) Sigs not delivered to a td. */
 #define p_siglist	p_sigqueue.sq_signals
 
 /* The following fields are all zeroed upon creation in fork. */
 #define	p_startzero	p_oppid
 	pid_t		p_oppid;	/* (c + e) Save ppid in ptrace. XXX */
 	struct vmspace	*p_vmspace;	/* (b) Address space. */
 	u_int		p_swtick;	/* (c) Tick when swapped in or out. */
 	u_int		p_cowgen;	/* (c) Generation of COW pointers. */
 	struct itimerval p_realtimer;	/* (c) Alarm timer. */
 	struct rusage	p_ru;		/* (a) Exit information. */
 	struct rusage_ext p_rux;	/* (cu) Internal resource usage. */
 	struct rusage_ext p_crux;	/* (c) Internal child resource usage. */
 	int		p_profthreads;	/* (c) Num threads in addupc_task. */
 	volatile int	p_exitthreads;	/* (j) Number of threads exiting */
 	int		p_traceflag;	/* (o) Kernel trace points. */
 	struct vnode	*p_tracevp;	/* (c + o) Trace to vnode. */
 	struct ucred	*p_tracecred;	/* (o) Credentials to trace with. */
 	struct vnode	*p_textvp;	/* (b) Vnode of executable. */
 	u_int		p_lock;		/* (c) Proclock (prevent swap) count. */
 	struct sigiolst	p_sigiolst;	/* (c) List of sigio sources. */
 	int		p_sigparent;	/* (c) Signal to parent on exit. */
 	int		p_sig;		/* (n) For core dump/debugger XXX. */
 	u_long		p_code;		/* (n) For core dump/debugger XXX. */
 	u_int		p_stops;	/* (c) Stop event bitmask. */
 	u_int		p_stype;	/* (c) Stop event type. */
 	char		p_step;		/* (c) Process is stopped. */
 	u_char		p_pfsflags;	/* (c) Procfs flags. */
 	struct nlminfo	*p_nlminfo;	/* (?) Only used by/for lockd. */
 	struct kaioinfo	*p_aioinfo;	/* (y) ASYNC I/O info. */
 	struct thread	*p_singlethread;/* (c + j) If single threading this is it */
 	int		p_suspcount;	/* (j) Num threads in suspended mode. */
 	struct thread	*p_xthread;	/* (c) Trap thread */
 	int		p_boundary_count;/* (j) Num threads at user boundary */
 	int		p_pendingcnt;	/* how many signals are pending */
 	struct itimers	*p_itimers;	/* (c) POSIX interval timers. */
 	struct procdesc	*p_procdesc;	/* (e) Process descriptor, if any. */
 	u_int		p_treeflag;	/* (e) P_TREE flags */
 	int		p_pendingexits; /* (c) Count of pending thread exits. */
 	struct filemon	*p_filemon;	/* (c) filemon-specific data. */
 /* End area that is zeroed on creation. */
 #define	p_endzero	p_magic
 
 /* The following fields are all copied upon creation in fork. */
 #define	p_startcopy	p_endzero
 	u_int		p_magic;	/* (b) Magic number. */
 	int		p_osrel;	/* (x) osreldate for the
 					       binary (from ELF note, if any) */
 	char		p_comm[MAXCOMLEN + 1];	/* (b) Process name. */
 	struct sysentvec *p_sysent;	/* (b) Syscall dispatch info. */
 	struct pargs	*p_args;	/* (c) Process arguments. */
 	rlim_t		p_cpulimit;	/* (c) Current CPU limit in seconds. */
 	signed char	p_nice;		/* (c) Process "nice" value. */
 	int		p_fibnum;	/* in this routing domain XXX MRT */
 	pid_t		p_reapsubtree;	/* (e) Pid of the direct child of the
 					       reaper which spawned
 					       our subtree. */
 	u_int		p_xexit;	/* (c) Exit code. */
 	u_int		p_xsig;		/* (c) Stop/kill sig. */
 /* End area that is copied on creation. */
 #define	p_endcopy	p_xsig
 	struct pgrp	*p_pgrp;	/* (c + e) Pointer to process group. */
 	struct knlist	p_klist;	/* (c) Knotes attached to this proc. */
 	int		p_numthreads;	/* (c) Number of threads. */
 	struct mdproc	p_md;		/* Any machine-dependent fields. */
 	struct callout	p_itcallout;	/* (h + c) Interval timer callout. */
 	u_short		p_acflag;	/* (c) Accounting flags. */
 	struct proc	*p_peers;	/* (r) */
 	struct proc	*p_leader;	/* (b) */
 	void		*p_emuldata;	/* (c) Emulator state data. */
 	struct label	*p_label;	/* (*) Proc (not subject) MAC label. */
 	struct p_sched	*p_sched;	/* (*) Scheduler-specific data. */
 	STAILQ_HEAD(, ktr_request)	p_ktr;	/* (o) KTR event queue. */
 	LIST_HEAD(, mqueue_notifier)	p_mqnotifier; /* (c) mqueue notifiers.*/
 	struct kdtrace_proc	*p_dtrace; /* (*) DTrace-specific data. */
 	struct cv	p_pwait;	/* (*) wait cv for exit/exec. */
 	struct cv	p_dbgwait;	/* (*) wait cv for debugger attach
 					   after fork. */
 	uint64_t	p_prev_runtime;	/* (c) Resource usage accounting. */
 	struct racct	*p_racct;	/* (b) Resource accounting. */
 	int		p_throttled;	/* (c) Flag for racct pcpu throttling */
 	struct vm_domain_policy p_vm_dom_policy;	/* (c) process default VM domain, or -1 */
 	/*
 	 * An orphan is the child that has beed re-parented to the
 	 * debugger as a result of attaching to it.  Need to keep
 	 * track of them for parent to be able to collect the exit
 	 * status of what used to be children.
 	 */
 	LIST_ENTRY(proc) p_orphan;	/* (e) List of orphan processes. */
 	LIST_HEAD(, proc) p_orphans;	/* (e) Pointer to list of orphans. */
 };
 
 #define	p_session	p_pgrp->pg_session
 #define	p_pgid		p_pgrp->pg_id
 
 #define	NOCPU		(-1)	/* For when we aren't on a CPU. */
 #define	NOCPU_OLD	(255)
 #define	MAXCPU_OLD	(254)
 
 #define	PROC_SLOCK(p)	mtx_lock_spin(&(p)->p_slock)
 #define	PROC_SUNLOCK(p)	mtx_unlock_spin(&(p)->p_slock)
 #define	PROC_SLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_slock, (type))
 
 #define	PROC_STATLOCK(p)	mtx_lock_spin(&(p)->p_statmtx)
 #define	PROC_STATUNLOCK(p)	mtx_unlock_spin(&(p)->p_statmtx)
 #define	PROC_STATLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_statmtx, (type))
 
 #define	PROC_ITIMLOCK(p)	mtx_lock_spin(&(p)->p_itimmtx)
 #define	PROC_ITIMUNLOCK(p)	mtx_unlock_spin(&(p)->p_itimmtx)
 #define	PROC_ITIMLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_itimmtx, (type))
 
 #define	PROC_PROFLOCK(p)	mtx_lock_spin(&(p)->p_profmtx)
 #define	PROC_PROFUNLOCK(p)	mtx_unlock_spin(&(p)->p_profmtx)
 #define	PROC_PROFLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_profmtx, (type))
 
 /* These flags are kept in p_flag. */
 #define	P_ADVLOCK	0x00001	/* Process may hold a POSIX advisory lock. */
 #define	P_CONTROLT	0x00002	/* Has a controlling terminal. */
 #define	P_KPROC		0x00004	/* Kernel process. */
 #define	P_FOLLOWFORK	0x00008	/* Attach parent debugger to children. */
 #define	P_PPWAIT	0x00010	/* Parent is waiting for child to exec/exit. */
 #define	P_PROFIL	0x00020	/* Has started profiling. */
 #define	P_STOPPROF	0x00040	/* Has thread requesting to stop profiling. */
 #define	P_HADTHREADS	0x00080	/* Has had threads (no cleanup shortcuts) */
 #define	P_SUGID		0x00100	/* Had set id privileges since last exec. */
 #define	P_SYSTEM	0x00200	/* System proc: no sigs, stats or swapping. */
 #define	P_SINGLE_EXIT	0x00400	/* Threads suspending should exit, not wait. */
 #define	P_TRACED	0x00800	/* Debugged process being traced. */
 #define	P_WAITED	0x01000	/* Someone is waiting for us. */
 #define	P_WEXIT		0x02000	/* Working on exiting. */
 #define	P_EXEC		0x04000	/* Process called exec. */
 #define	P_WKILLED	0x08000	/* Killed, go to kernel/user boundary ASAP. */
 #define	P_CONTINUED	0x10000	/* Proc has continued from a stopped state. */
 #define	P_STOPPED_SIG	0x20000	/* Stopped due to SIGSTOP/SIGTSTP. */
 #define	P_STOPPED_TRACE	0x40000	/* Stopped because of tracing. */
 #define	P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */
 #define	P_PROTECTED	0x100000 /* Do not kill on memory overcommit. */
 #define	P_SIGEVENT	0x200000 /* Process pending signals changed. */
 #define	P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */
 #define	P_HWPMC		0x800000 /* Process is using HWPMCs */
 #define	P_JAILED	0x1000000 /* Process is in jail. */
 #define	P_TOTAL_STOP	0x2000000 /* Stopped in stop_all_proc. */
 #define	P_INEXEC	0x4000000 /* Process is in execve(). */
 #define	P_STATCHILD	0x8000000 /* Child process stopped or exited. */
 #define	P_INMEM		0x10000000 /* Loaded into memory. */
 #define	P_SWAPPINGOUT	0x20000000 /* Process is being swapped out. */
 #define	P_SWAPPINGIN	0x40000000 /* Process is being swapped in. */
 #define	P_PPTRACE	0x80000000 /* PT_TRACEME by vforked child. */
 
 #define	P_STOPPED	(P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE)
 #define	P_SHOULDSTOP(p)	((p)->p_flag & P_STOPPED)
 #define	P_KILLED(p)	((p)->p_flag & P_WKILLED)
 
 /* These flags are kept in p_flag2. */
 #define	P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */
 #define	P2_NOTRACE	0x00000002	/* No ptrace(2) attach or coredumps. */
 #define	P2_NOTRACE_EXEC 0x00000004	/* Keep P2_NOTRACE on exec(2). */
 #define	P2_AST_SU	0x00000008	/* Handles SU ast for kthreads. */
 #define	P2_LWP_EVENTS	0x00000010	/* Report LWP events via ptrace(2). */
 
 /* Flags protected by proctree_lock, kept in p_treeflags. */
 #define	P_TREE_ORPHANED		0x00000001	/* Reparented, on orphan list */
 #define	P_TREE_FIRST_ORPHAN	0x00000002	/* First element of orphan
 						   list */
 #define	P_TREE_REAPER		0x00000004	/* Reaper of subtree */
 
 /*
  * These were process status values (p_stat), now they are only used in
  * legacy conversion code.
  */
 #define	SIDL	1		/* Process being created by fork. */
 #define	SRUN	2		/* Currently runnable. */
 #define	SSLEEP	3		/* Sleeping on an address. */
 #define	SSTOP	4		/* Process debugging or suspension. */
 #define	SZOMB	5		/* Awaiting collection by parent. */
 #define	SWAIT	6		/* Waiting for interrupt. */
 #define	SLOCK	7		/* Blocked on a lock. */
 
 #define	P_MAGIC		0xbeefface
 
 #ifdef _KERNEL
 
 /* Types and flags for mi_switch(). */
 #define	SW_TYPE_MASK		0xff	/* First 8 bits are switch type */
 #define	SWT_NONE		0	/* Unspecified switch. */
 #define	SWT_PREEMPT		1	/* Switching due to preemption. */
 #define	SWT_OWEPREEMPT		2	/* Switching due to owepreempt. */
 #define	SWT_TURNSTILE		3	/* Turnstile contention. */
 #define	SWT_SLEEPQ		4	/* Sleepq wait. */
 #define	SWT_SLEEPQTIMO		5	/* Sleepq timeout wait. */
 #define	SWT_RELINQUISH		6	/* yield call. */
 #define	SWT_NEEDRESCHED		7	/* NEEDRESCHED was set. */
 #define	SWT_IDLE		8	/* Switching from the idle thread. */
 #define	SWT_IWAIT		9	/* Waiting for interrupts. */
 #define	SWT_SUSPEND		10	/* Thread suspended. */
 #define	SWT_REMOTEPREEMPT	11	/* Remote processor preempted. */
 #define	SWT_REMOTEWAKEIDLE	12	/* Remote processor preempted idle. */
 #define	SWT_COUNT		13	/* Number of switch types. */
 /* Flags */
 #define	SW_VOL		0x0100		/* Voluntary switch. */
 #define	SW_INVOL	0x0200		/* Involuntary switch. */
 #define SW_PREEMPT	0x0400		/* The invol switch is a preemption */
 
 /* How values for thread_single(). */
 #define	SINGLE_NO_EXIT	0
 #define	SINGLE_EXIT	1
 #define	SINGLE_BOUNDARY	2
 #define	SINGLE_ALLPROC	3
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_PARGS);
 MALLOC_DECLARE(M_PGRP);
 MALLOC_DECLARE(M_SESSION);
 MALLOC_DECLARE(M_SUBPROC);
 #endif
 
 #define	FOREACH_PROC_IN_SYSTEM(p)					\
 	LIST_FOREACH((p), &allproc, p_list)
 #define	FOREACH_THREAD_IN_PROC(p, td)					\
 	TAILQ_FOREACH((td), &(p)->p_threads, td_plist)
 
 #define	FIRST_THREAD_IN_PROC(p)	TAILQ_FIRST(&(p)->p_threads)
 
 /*
  * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit
  * in a pid_t, as it is used to represent "no process group".
  */
 #define	PID_MAX		99999
 #define	NO_PID		100000
 extern pid_t pid_max;
 
 #define	SESS_LEADER(p)	((p)->p_session->s_leader == (p))
 
 
 #define	STOPEVENT(p, e, v) do {						\
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,			\
  	    "checking stopevent %d", (e));				\
 	if ((p)->p_stops & (e))	{					\
 		PROC_LOCK(p);						\
 		stopevent((p), (e), (v));				\
 		PROC_UNLOCK(p);						\
 	}								\
 } while (0)
 /*
  * Variant of STOPEVENT() for callers that already hold the process lock:
  * asserts that the lock is owned and, if event (e) is enabled in
  * (p)->p_stops, calls stopevent() without acquiring or dropping the lock.
  * Every use of the macro argument p is parenthesized so that non-trivial
  * argument expressions (e.g. a cast) expand correctly.
  */
 #define	_STOPEVENT(p, e, v) do {					\
 	PROC_LOCK_ASSERT(p, MA_OWNED);					\
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,			\
 	    &(p)->p_mtx.lock_object, "checking stopevent %d", (e));	\
 	if ((p)->p_stops & (e))						\
 		stopevent((p), (e), (v));				\
 } while (0)
 
 /* Lock and unlock a process. */
 #define	PROC_LOCK(p)	mtx_lock(&(p)->p_mtx)
 #define	PROC_TRYLOCK(p)	mtx_trylock(&(p)->p_mtx)
 #define	PROC_UNLOCK(p)	mtx_unlock(&(p)->p_mtx)
 #define	PROC_LOCKED(p)	mtx_owned(&(p)->p_mtx)
 #define	PROC_LOCK_ASSERT(p, type)	mtx_assert(&(p)->p_mtx, (type))
 
 /* Lock and unlock a process group. */
 #define	PGRP_LOCK(pg)	mtx_lock(&(pg)->pg_mtx)
 #define	PGRP_UNLOCK(pg)	mtx_unlock(&(pg)->pg_mtx)
 #define	PGRP_LOCKED(pg)	mtx_owned(&(pg)->pg_mtx)
 #define	PGRP_LOCK_ASSERT(pg, type)	mtx_assert(&(pg)->pg_mtx, (type))
 
 #define	PGRP_LOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_LOCK(pg);						\
 } while (0)
 #define	PGRP_UNLOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_UNLOCK(pg);					\
 } while (0)
 
 /* Lock and unlock a session. */
 #define	SESS_LOCK(s)	mtx_lock(&(s)->s_mtx)
 #define	SESS_UNLOCK(s)	mtx_unlock(&(s)->s_mtx)
 #define	SESS_LOCKED(s)	mtx_owned(&(s)->s_mtx)
 #define	SESS_LOCK_ASSERT(s, type)	mtx_assert(&(s)->s_mtx, (type))
 
 /* Hold process U-area in memory, normally for ptrace/procfs work. */
 #define	PHOLD(p) do {							\
 	PROC_LOCK(p);							\
 	_PHOLD(p);							\
 	PROC_UNLOCK(p);							\
 } while (0)
 #define	_PHOLD(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc,		\
 	    ("PHOLD of exiting process %p", p));			\
 	(p)->p_lock++;							\
 	if (((p)->p_flag & P_INMEM) == 0)				\
 		faultin((p));						\
 } while (0)
 #define	PROC_ASSERT_HELD(p) do {					\
 	KASSERT((p)->p_lock > 0, ("process %p not held", p));		\
 } while (0)
 
 #define	PRELE(p) do {							\
 	PROC_LOCK((p));							\
 	_PRELE((p));							\
 	PROC_UNLOCK((p));						\
 } while (0)
 #define	_PRELE(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	PROC_ASSERT_HELD(p);						\
 	(--(p)->p_lock);						\
 	if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0)		\
 		wakeup(&(p)->p_lock);					\
 } while (0)
 #define	PROC_ASSERT_NOT_HELD(p) do {					\
 	KASSERT((p)->p_lock == 0, ("process %p held", p));		\
 } while (0)
 
 #define	PROC_UPDATE_COW(p) do {						\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	(p)->p_cowgen++;						\
 } while (0)
 
 /* Check whether a thread is safe to be swapped out. */
 #define	thread_safetoswapout(td)	((td)->td_flags & TDF_CANSWAP)
 
 /* Control whether or not it is safe for curthread to sleep. */
 #define	THREAD_NO_SLEEPING()		((curthread)->td_no_sleeping++)
 
 #define	THREAD_SLEEPING_OK()		((curthread)->td_no_sleeping--)
 
 #define	THREAD_CAN_SLEEP()		((curthread)->td_no_sleeping == 0)
 
 #define	PIDHASH(pid)	(&pidhashtbl[(pid) & pidhash])
 extern LIST_HEAD(pidhashhead, proc) *pidhashtbl;
 extern u_long pidhash;
 #define	TIDHASH(tid)	(&tidhashtbl[(tid) & tidhash])
 extern LIST_HEAD(tidhashhead, thread) *tidhashtbl;
 extern u_long tidhash;
 extern struct rwlock tidhash_lock;
 
 #define	PGRPHASH(pgid)	(&pgrphashtbl[(pgid) & pgrphash])
 extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
 extern u_long pgrphash;
 
 extern struct sx allproc_lock;
 extern int allproc_gen;
 extern struct sx proctree_lock;
 extern struct mtx ppeers_lock;
 extern struct proc proc0;		/* Process slot for swapper. */
 extern struct thread thread0;		/* Primary thread in proc0. */
 extern struct vmspace vmspace0;		/* VM space for proc0. */
 extern int hogticks;			/* Limit on kernel cpu hogs. */
 extern int lastpid;
 extern int nprocs, maxproc;		/* Current and max number of procs. */
 extern int maxprocperuid;		/* Max procs per uid. */
 extern u_long ps_arg_cache_limit;
 
 LIST_HEAD(proclist, proc);
 TAILQ_HEAD(procqueue, proc);
 TAILQ_HEAD(threadqueue, thread);
 extern struct proclist allproc;		/* List of all processes. */
 extern struct proclist zombproc;	/* List of zombie processes. */
 extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
 
 extern struct uma_zone *proc_zone;
 
 struct	proc *pfind(pid_t);		/* Find process by id. */
 struct	proc *pfind_locked(pid_t pid);
 struct	pgrp *pgfind(pid_t);		/* Find process group by id. */
 struct	proc *zpfind(pid_t);		/* Find zombie process by id. */
 
 struct	fork_req {
 	int		fr_flags;
 	int		fr_pages;
 	int 		*fr_pidp;
 	struct proc 	**fr_procp;
 	int 		*fr_pd_fd;
 	int 		fr_pd_flags;
 	struct filecaps	*fr_pd_fcaps;
 };
 
 /*
  * pget() flags.
  */
 #define	PGET_HOLD	0x00001	/* Hold the process. */
 #define	PGET_CANSEE	0x00002	/* Check against p_cansee(). */
 #define	PGET_CANDEBUG	0x00004	/* Check against p_candebug(). */
 #define	PGET_ISCURRENT	0x00008	/* Check that the found process is current. */
 #define	PGET_NOTWEXIT	0x00010	/* Check that the process is not in P_WEXIT. */
 #define	PGET_NOTINEXEC	0x00020	/* Check that the process is not in P_INEXEC. */
 #define	PGET_NOTID	0x00040	/* Do not assume tid if pid > PID_MAX. */
 
 #define	PGET_WANTREAD	(PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT)
 
 int	pget(pid_t pid, int flags, struct proc **pp);
 
 void	ast(struct trapframe *framep);
 struct	thread *choosethread(void);
 int	cr_cansignal(struct ucred *cred, struct proc *proc, int signum);
 int	enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp,
 	    struct session *sess);
 int	enterthispgrp(struct proc *p, struct pgrp *pgrp);
 void	faultin(struct proc *p);
 void	fixjobc(struct proc *p, struct pgrp *pgrp, int entering);
 int	fork1(struct thread *, struct fork_req *);
 void	fork_exit(void (*)(void *, struct trapframe *), void *,
 	    struct trapframe *);
 void	fork_return(struct thread *, struct trapframe *);
 int	inferior(struct proc *p);
 void	kern_yield(int);
 void 	kick_proc0(void);
 void	killjobc(void);
 int	leavepgrp(struct proc *p);
 int	maybe_preempt(struct thread *td);
 void	maybe_yield(void);
 void	mi_switch(int flags, struct thread *newtd);
 int	p_candebug(struct thread *td, struct proc *p);
 int	p_cansee(struct thread *td, struct proc *p);
 int	p_cansched(struct thread *td, struct proc *p);
 int	p_cansignal(struct thread *td, struct proc *p, int signum);
 int	p_canwait(struct thread *td, struct proc *p);
 struct	pargs *pargs_alloc(int len);
 void	pargs_drop(struct pargs *pa);
 void	pargs_hold(struct pargs *pa);
 int	proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb);
 int	proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb);
 int	proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb);
 void	procinit(void);
 void	proc_linkup0(struct proc *p, struct thread *td);
 void	proc_linkup(struct proc *p, struct thread *td);
 struct proc *proc_realparent(struct proc *child);
 void	proc_reap(struct thread *td, struct proc *p, int *status, int options);
 void	proc_reparent(struct proc *child, struct proc *newparent);
 struct	pstats *pstats_alloc(void);
 void	pstats_fork(struct pstats *src, struct pstats *dst);
 void	pstats_free(struct pstats *ps);
 void	reaper_abandon_children(struct proc *p, bool exiting);
 int	securelevel_ge(struct ucred *cr, int level);
 int	securelevel_gt(struct ucred *cr, int level);
 void	sess_hold(struct session *);
 void	sess_release(struct session *);
 int	setrunnable(struct thread *);
 void	setsugid(struct proc *p);
 int	should_yield(void);
 int	sigonstack(size_t sp);
 void	stopevent(struct proc *, u_int, u_int);
 struct	thread *tdfind(lwpid_t, pid_t);
 void	threadinit(void);
 void	tidhash_add(struct thread *);
 void	tidhash_remove(struct thread *);
 void	cpu_idle(int);
 int	cpu_idle_wakeup(int);
 extern	void (*cpu_idle_hook)(sbintime_t);	/* Hook to machdep CPU idler. */
 void	cpu_switch(struct thread *, struct thread *, struct mtx *);
 void	cpu_throw(struct thread *, struct thread *) __dead2;
 void	unsleep(struct thread *);
 void	userret(struct thread *, struct trapframe *);
 
 void	cpu_exit(struct thread *);
 void	exit1(struct thread *, int, int) __dead2;
 int	cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa);
 void	cpu_fork(struct thread *, struct proc *, struct thread *, int);
 void	cpu_set_fork_handler(struct thread *, void (*)(void *), void *);
 void	cpu_set_syscall_retval(struct thread *, int);
 void	cpu_set_upcall(struct thread *td, struct thread *td0);
 void	cpu_set_upcall_kse(struct thread *, void (*)(void *), void *,
 	    stack_t *);
 int	cpu_set_user_tls(struct thread *, void *tls_base);
 void	cpu_thread_alloc(struct thread *);
 void	cpu_thread_clean(struct thread *);
 void	cpu_thread_exit(struct thread *);
 void	cpu_thread_free(struct thread *);
 void	cpu_thread_swapin(struct thread *);
 void	cpu_thread_swapout(struct thread *);
 struct	thread *thread_alloc(int pages);
 int	thread_alloc_stack(struct thread *, int pages);
 void	thread_cow_get_proc(struct thread *newtd, struct proc *p);
 void	thread_cow_get(struct thread *newtd, struct thread *td);
 void	thread_cow_free(struct thread *td);
 void	thread_cow_update(struct thread *td);
 int	thread_create(struct thread *td, struct rtprio *rtp,
 	    int (*initialize_thread)(struct thread *, void *), void *thunk);
 void	thread_exit(void) __dead2;
 void	thread_free(struct thread *td);
 void	thread_link(struct thread *td, struct proc *p);
 void	thread_reap(void);
 int	thread_single(struct proc *p, int how);
 void	thread_single_end(struct proc *p, int how);
 void	thread_stash(struct thread *td);
 void	thread_stopped(struct proc *p);
 void	childproc_stopped(struct proc *child, int reason);
 void	childproc_continued(struct proc *child);
 void	childproc_exited(struct proc *child);
 int	thread_suspend_check(int how);
 bool	thread_suspend_check_needed(void);
 void	thread_suspend_switch(struct thread *, struct proc *p);
 void	thread_suspend_one(struct thread *td);
 void	thread_unlink(struct thread *td);
 void	thread_unsuspend(struct proc *p);
 void	thread_wait(struct proc *p);
 struct thread	*thread_find(struct proc *p, lwpid_t tid);
 
 void	stop_all_proc(void);
 void	resume_all_proc(void);
 
 /*
  * Set the given bits in curthread's td_pflags.  The return value is a
  * mask suitable for curthread_pflags_restore(): it has every bit outside
  * "flags" set, plus those bits of "flags" that were already set on entry,
  * so AND-ing it into td_pflags clears only the bits this call turned on.
  */
 static __inline int
 curthread_pflags_set(int flags)
 {
 	struct thread *td;
 	int already;
 
 	td = curthread;
 	already = td->td_pflags & flags;
 	td->td_pflags |= flags;
 	return (~flags | already);
 }
 
 static __inline void
 curthread_pflags_restore(int save)
 {
 
 	curthread->td_pflags &= save;
 }
 
 #endif	/* _KERNEL */
 
 #endif	/* !_SYS_PROC_H_ */
Index: head/sys/sys/umtx.h
===================================================================
--- head/sys/sys/umtx.h	(revision 300042)
+++ head/sys/sys/umtx.h	(revision 300043)
@@ -1,174 +1,196 @@
 /*-
  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #ifndef _SYS_UMTX_H_
 #define	_SYS_UMTX_H_
 
 #include <sys/_umtx.h>
 
+/* Common lock flags */
 #define USYNC_PROCESS_SHARED	0x0001	/* Process shared sync objs */
 
-#define	UMUTEX_UNOWNED		0x0
-#define	UMUTEX_CONTESTED	0x80000000U
-
+/* umutex flags */
 #define	UMUTEX_PRIO_INHERIT	0x0004	/* Priority inherited mutex */
 #define	UMUTEX_PRIO_PROTECT	0x0008	/* Priority protect mutex */
+#define	UMUTEX_ROBUST		0x0010	/* Robust mutex */
+#define	UMUTEX_NONCONSISTENT	0x0020	/* Robust locked but not consistent */
 
+/*
+ * The umutex lock word values and bits.  The m_owner field is the word
+ * which serves as the lock.  Its high bit is the contention indicator
+ * and the remaining bits record the owner TID.  TID values start at
+ * PID_MAX + 2 and end at INT32_MAX.  The low range [1..PID_MAX] is
+ * guaranteed to be usable for special markers.
+ */
+#define	UMUTEX_UNOWNED		0x0
+#define	UMUTEX_CONTESTED	0x80000000U
+#define	UMUTEX_RB_OWNERDEAD	(UMUTEX_CONTESTED | 0x10)
+#define	UMUTEX_RB_NOTRECOV	(UMUTEX_CONTESTED | 0x11)
+
 /* urwlock flags */
 #define URWLOCK_PREFER_READER	0x0002
 
 #define URWLOCK_WRITE_OWNER	0x80000000U
 #define URWLOCK_WRITE_WAITERS	0x40000000U
 #define URWLOCK_READ_WAITERS	0x20000000U
 #define URWLOCK_MAX_READERS	0x1fffffffU
 #define URWLOCK_READER_COUNT(c)	((c) & URWLOCK_MAX_READERS)
 
 /* _usem flags */
 #define SEM_NAMED	0x0002
 
 /* _usem2 count field */
 #define	USEM_HAS_WAITERS	0x80000000U
 #define	USEM_MAX_COUNT		0x7fffffffU
 #define	USEM_COUNT(c)		((c) & USEM_MAX_COUNT)
 
 /* op code for _umtx_op */
 #define	UMTX_OP_RESERVED0	0
 #define	UMTX_OP_RESERVED1	1
 #define	UMTX_OP_WAIT		2
 #define	UMTX_OP_WAKE		3
 #define	UMTX_OP_MUTEX_TRYLOCK	4
 #define	UMTX_OP_MUTEX_LOCK	5
 #define	UMTX_OP_MUTEX_UNLOCK	6
 #define	UMTX_OP_SET_CEILING	7
 #define	UMTX_OP_CV_WAIT		8
 #define	UMTX_OP_CV_SIGNAL	9
 #define	UMTX_OP_CV_BROADCAST	10
 #define	UMTX_OP_WAIT_UINT	11
 #define	UMTX_OP_RW_RDLOCK	12
 #define	UMTX_OP_RW_WRLOCK	13
 #define	UMTX_OP_RW_UNLOCK	14
 #define	UMTX_OP_WAIT_UINT_PRIVATE	15
 #define	UMTX_OP_WAKE_PRIVATE	16
 #define	UMTX_OP_MUTEX_WAIT	17
 #define	UMTX_OP_MUTEX_WAKE	18	/* deprecated */
 #define	UMTX_OP_SEM_WAIT	19	/* deprecated */
 #define	UMTX_OP_SEM_WAKE	20	/* deprecated */
 #define	UMTX_OP_NWAKE_PRIVATE   21
 #define	UMTX_OP_MUTEX_WAKE2	22
 #define	UMTX_OP_SEM2_WAIT	23
 #define	UMTX_OP_SEM2_WAKE	24
 #define	UMTX_OP_SHM		25
+#define	UMTX_OP_ROBUST_LISTS	26
 
 /* Flags for UMTX_OP_CV_WAIT */
 #define	CVWAIT_CHECK_UNPARKING	0x01
 #define	CVWAIT_ABSTIME		0x02
 #define	CVWAIT_CLOCKID		0x04
 
 #define	UMTX_ABSTIME		0x01
 
 #define	UMTX_CHECK_UNPARKING	CVWAIT_CHECK_UNPARKING
 
 /* Flags for UMTX_OP_SHM */
 #define	UMTX_SHM_CREAT		0x0001
 #define	UMTX_SHM_LOOKUP		0x0002
 #define	UMTX_SHM_DESTROY	0x0004
 #define	UMTX_SHM_ALIVE		0x0008
 
+struct umtx_robust_lists_params {
+	uintptr_t	robust_list_offset;
+	uintptr_t	robust_priv_list_offset;
+	uintptr_t	robust_inact_offset;
+};
+
 #ifndef _KERNEL
 
 int _umtx_op(void *obj, int op, u_long val, void *uaddr, void *uaddr2);
 
 #else
 
 /*
  * The umtx_key structure is used by both the Linux futex code and the
  * umtx implementation to map userland addresses to unique keys.
  */
 
 enum {
 	TYPE_SIMPLE_WAIT,
 	TYPE_CV,
 	TYPE_SEM,
 	TYPE_SIMPLE_LOCK,
 	TYPE_NORMAL_UMUTEX,
 	TYPE_PI_UMUTEX,
 	TYPE_PP_UMUTEX,
 	TYPE_RWLOCK,
 	TYPE_FUTEX,
 	TYPE_SHM,
+	TYPE_PI_ROBUST_UMUTEX,
+	TYPE_PP_ROBUST_UMUTEX,
 };
 
 /* Key to represent a unique userland synchronization object */
 struct umtx_key {
 	int	hash;
 	int	type;
 	int	shared;
 	union {
 		struct {
 			struct vm_object *object;
 			uintptr_t	offset;
 		} shared;
 		struct {
 			struct vmspace	*vs;
 			uintptr_t	addr;
 		} private;
 		struct {
 			void		*a;
 			uintptr_t	b;
 		} both;
 	} info;
 };
 
 #define THREAD_SHARE		0
 #define PROCESS_SHARE		1
 #define AUTO_SHARE		2
 
 struct thread;
 
 /*
  * Return non-zero when two keys name the same object: the types must be
  * equal and both words of the "both" view of the info union must match.
  * The hash and shared fields are not compared.
  */
 static inline int
 umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
 {
 	if (k1->type != k2->type)
 		return (0);
 	if (k1->info.both.a != k2->info.both.a)
 		return (0);
 	return (k1->info.both.b == k2->info.both.b);
 }
 
 int umtx_copyin_timeout(const void *, struct timespec *);
 int umtx_key_get(const void *, int, int, struct umtx_key *);
 void umtx_key_release(struct umtx_key *);
 struct umtx_q *umtxq_alloc(void);
 void umtxq_free(struct umtx_q *);
 int kern_umtx_wake(struct thread *, void *, int, int);
 void umtx_pi_adjust(struct thread *, u_char);
 void umtx_thread_init(struct thread *);
 void umtx_thread_fini(struct thread *);
 void umtx_thread_alloc(struct thread *);
 void umtx_thread_exit(struct thread *);
 #endif /* !_KERNEL */
 #endif /* !_SYS_UMTX_H_ */
Index: head/sys/vm/vm_object.c
===================================================================
--- head/sys/vm/vm_object.c	(revision 300042)
+++ head/sys/vm/vm_object.c	(revision 300043)
@@ -1,2632 +1,2632 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory object module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>		/* for curproc, pageproc */
 #include <sys/socket.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 static int old_msync;
 SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0,
     "Use old (insecure) msync behavior");
 
 static int	vm_object_page_collect_flush(vm_object_t object, vm_page_t p,
 		    int pagerflags, int flags, boolean_t *clearobjflags,
 		    boolean_t *eio);
 static boolean_t vm_object_page_remove_write(vm_page_t p, int flags,
 		    boolean_t *clearobjflags);
 static void	vm_object_qcollapse(vm_object_t object);
 static void	vm_object_vndeallocate(vm_object_t object);
 
 /*
  *	Virtual memory objects maintain the actual data
  *	associated with allocated virtual memory.  A given
  *	page of memory exists within exactly one object.
  *
  *	An object is only deallocated when all "references"
  *	are given up.  Only one "reference" to a given
  *	region of an object should be writeable.
  *
  *	Associated with each object is a list of all resident
  *	memory pages belonging to that object; this list is
  *	maintained by the "vm_page" module, and locked by the object's
  *	lock.
  *
  *	Each object also records a "pager" routine which is
  *	used to retrieve (and store) pages to the proper backing
  *	storage.  In addition, objects may be backed by other
  *	objects from which they were virtual-copied.
  *
  *	The only items within the object structure which are
  *	modified after time of creation are:
  *		reference count		locked by object's lock
  *		pager routine		locked by object's lock
  *
  */
 
 struct object_q vm_object_list;
 struct mtx vm_object_list_mtx;	/* lock for object list and count */
 
 struct vm_object kernel_object_store;
 struct vm_object kmem_object_store;
 
 static SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0,
     "VM object stats");
 
 static long object_collapses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
     &object_collapses, 0, "VM object collapses");
 
 static long object_bypasses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
     &object_bypasses, 0, "VM object bypasses");
 
 static uma_zone_t obj_zone;
 
 static int vm_object_zinit(void *mem, int size, int flags);
 
 #ifdef INVARIANTS
 static void vm_object_zdtor(void *mem, int size, void *arg);
 
 /*
  * Consistency checks on an object; compiled only under INVARIANTS.
  * Passed to uma_zcreate() in vm_object_init() in the destructor position
  * for obj_zone.  Each KASSERT verifies that one piece of per-object state
  * (references, resident pages, reservations, cached pages, paging
  * activity, shadows, type) is in its torn-down state.  "mem" is the
  * object; "size" and "arg" are not used.
  */
 static void
 vm_object_zdtor(void *mem, int size, void *arg)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	KASSERT(object->ref_count == 0,
 	    ("object %p ref_count = %d", object, object->ref_count));
 	KASSERT(TAILQ_EMPTY(&object->memq),
 	    ("object %p has resident pages in its memq", object));
 	KASSERT(vm_radix_is_empty(&object->rtree),
 	    ("object %p has resident pages in its trie", object));
 #if VM_NRESERVLEVEL > 0
 	/* The reservation list only exists when reservations are enabled. */
 	KASSERT(LIST_EMPTY(&object->rvq),
 	    ("object %p has reservations",
 	    object));
 #endif
 	KASSERT(vm_object_cache_is_empty(object),
 	    ("object %p has cached pages",
 	    object));
 	KASSERT(object->paging_in_progress == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, object->paging_in_progress));
 	KASSERT(object->resident_page_count == 0,
 	    ("object %p resident_page_count = %d",
 	    object, object->resident_page_count));
 	KASSERT(object->shadow_count == 0,
 	    ("object %p shadow_count = %d",
 	    object, object->shadow_count));
 	KASSERT(object->type == OBJT_DEAD,
 	    ("object %p has non-dead type %d",
 	    object, object->type));
 }
 #endif
 
 /*
  * Init hook passed to uma_zcreate() for obj_zone.  Initializes the
  * object's lock, puts the fields checked by vm_object_zdtor() into the
  * state a freed object has, and appends the object to vm_object_list
  * under vm_object_list_mtx.  "size" and "flags" are not used.  Always
  * returns 0.
  */
 static int
 vm_object_zinit(void *mem, int size, int flags)
 {
 	vm_object_t obj;
 
 	obj = (vm_object_t)mem;
 	rw_init_flags(&obj->lock, "vm object", RW_DUPOK | RW_NEW);
 
 	/* Same state as any object that has been freed. */
 	obj->type = OBJT_DEAD;
 	obj->ref_count = 0;
 	obj->shadow_count = 0;
 	obj->paging_in_progress = 0;
 	obj->resident_page_count = 0;
 	obj->rtree.rt_root = 0;
 	obj->rtree.rt_flags = 0;
 	obj->cache.rt_root = 0;
 	obj->cache.rt_flags = 0;
 
 	/* Make the object visible on the global object list. */
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_INSERT_TAIL(&vm_object_list, obj, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 	return (0);
 }
 
 /*
  * Initialize the caller-supplied object as a new object of the given
  * type and size, holding a single reference.  The initial flags depend
  * only on the type.  Panics if the type is OBJT_DEAD or not one of the
  * types listed in the switch.
  */
 static void
 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
 {
 
 	TAILQ_INIT(&object->memq);
 	LIST_INIT(&object->shadow_head);
 
 	object->type = type;
 	/* Select the initial flags for this object type. */
 	switch (type) {
 	case OBJT_DEAD:
 		/* No break: panic() is not expected to return. */
 		panic("_vm_object_allocate: can't create OBJT_DEAD");
 	case OBJT_DEFAULT:
 	case OBJT_SWAP:
 		object->flags = OBJ_ONEMAPPING;
 		break;
 	case OBJT_DEVICE:
 	case OBJT_SG:
 		object->flags = OBJ_FICTITIOUS | OBJ_UNMANAGED;
 		break;
 	case OBJT_MGTDEVICE:
 		object->flags = OBJ_FICTITIOUS;
 		break;
 	case OBJT_PHYS:
 		object->flags = OBJ_UNMANAGED;
 		break;
 	case OBJT_VNODE:
 		object->flags = 0;
 		break;
 	default:
 		panic("_vm_object_allocate: type %d is undefined", type);
 	}
 	object->size = size;
 	object->generation = 1;
 	/* The new object starts out with one reference. */
 	object->ref_count = 1;
 	object->memattr = VM_MEMATTR_DEFAULT;
 	object->cred = NULL;
 	object->charge = 0;
 	object->handle = NULL;
 	object->backing_object = NULL;
 	object->backing_object_offset = (vm_ooffset_t) 0;
 #if VM_NRESERVLEVEL > 0
 	LIST_INIT(&object->rvq);
 #endif
 	umtx_shm_object_init(object);
 }
 
 /*
  *	vm_object_init:
  *
  *	Initialize the VM objects module: set up the global object list
  *	and its mutex, initialize kernel_object and kmem_object in place,
  *	create the UMA zone used by vm_object_allocate(), and call
  *	vm_radix_init().
  */
 void
 vm_object_init(void)
 {
 	TAILQ_INIT(&vm_object_list);
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
 	/*
 	 * kernel_object and kmem_object are initialized directly with
 	 * _vm_object_allocate(); obj_zone is only created further below.
 	 * Both are OBJT_PHYS objects sized to span the kernel address range.
 	 */
 	rw_init(&kernel_object->lock, "kernel vm object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kernel_object);
 #if VM_NRESERVLEVEL > 0
 	kernel_object->flags |= OBJ_COLORED;
 	kernel_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
 	rw_init(&kmem_object->lock, "kmem vm object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kmem_object);
 #if VM_NRESERVLEVEL > 0
 	kmem_object->flags |= OBJ_COLORED;
 	kmem_object->pg_color = (u_short)atop(VM_MIN_KERNEL_ADDRESS);
 #endif
 
 	/*
 	 * The lock portion of struct vm_object must be type stable due
 	 * to vm_pageout_fallback_object_lock locking a vm object
 	 * without holding any references to it.
 	 */
 	obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
 #ifdef INVARIANTS
 	    vm_object_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 
 	vm_radix_init();
 }
 
 /*
  * Clear the given bits in the object's flags word.  The object must be
  * write-locked by the caller (asserted).
  */
 void
 vm_object_clear_flag(vm_object_t object, u_short bits)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->flags = object->flags & ~bits;
 }
 
 /*
  *	Set the object's default memory attribute, i.e. the attribute that
  *	pages allocated to the object receive by default.
  *
  *	For now this must be done before any pages are allocated to the
  *	object; that restriction may eventually be relaxed for "default"
  *	and "swap" objects.
  *
  *	Returns KERN_SUCCESS on success, KERN_FAILURE if the object's memq
  *	is not empty, and KERN_INVALID_ARGUMENT for an OBJT_DEAD object.
  *	The object must be write-locked (asserted).
  */
 int
 vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	switch (object->type) {
 	case OBJT_DEAD:
 		return (KERN_INVALID_ARGUMENT);
 	case OBJT_DEFAULT:
 	case OBJT_DEVICE:
 	case OBJT_MGTDEVICE:
 	case OBJT_PHYS:
 	case OBJT_SG:
 	case OBJT_SWAP:
 	case OBJT_VNODE:
 		break;
 	default:
 		panic("vm_object_set_memattr: object %p is of undefined type",
 		    object);
 	}
 	/* Only an object without pages may have its attribute changed. */
 	if (!TAILQ_EMPTY(&object->memq))
 		return (KERN_FAILURE);
 	object->memattr = memattr;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_object_pip_add:
  *
  *	Raises the object's paging-in-progress count by "i".
  *
  *	The object must be write-locked.
  */
 void
 vm_object_pip_add(vm_object_t object, short i)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->paging_in_progress = object->paging_in_progress + i;
 }
 
 /*
  *	vm_object_pip_subtract:
  *
  *	Lowers the object's paging-in-progress count by "i".  Unlike the
  *	wakeup variants, no sleeping thread is notified.
  *
  *	The object must be write-locked.
  */
 void
 vm_object_pip_subtract(vm_object_t object, short i)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->paging_in_progress = object->paging_in_progress - i;
 }
 
 /*
  *	vm_object_pip_wakeup:
  *
  *	Drops one paging-in-progress reference.  If that was the last one
  *	and a waiter has flagged the object with OBJ_PIPWNT, the flag is
  *	cleared and threads sleeping on the object are woken.
  *
  *	The object must be write-locked.
  */
 void
 vm_object_pip_wakeup(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->paging_in_progress--;
 	if ((object->flags & OBJ_PIPWNT) == 0 ||
 	    object->paging_in_progress != 0)
 		return;
 	vm_object_clear_flag(object, OBJ_PIPWNT);
 	wakeup(object);
 }
 
 /*
  *	vm_object_pip_wakeupn:
  *
  *	Drops "i" paging-in-progress references at once ("i" may be zero,
  *	in which case only the wakeup check is performed).  When the count
  *	is zero and OBJ_PIPWNT is set, the flag is cleared and threads
  *	sleeping on the object are woken.
  *
  *	The object must be write-locked.
  */
 void
 vm_object_pip_wakeupn(vm_object_t object, short i)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (i != 0)
 		object->paging_in_progress -= i;
 	if ((object->flags & OBJ_PIPWNT) == 0 ||
 	    object->paging_in_progress != 0)
 		return;
 	vm_object_clear_flag(object, OBJ_PIPWNT);
 	wakeup(object);
 }
 
 /*
  *	vm_object_pip_wait:
  *
  *	Sleeps until the object's paging-in-progress count is zero.  Before
  *	each sleep OBJ_PIPWNT is set so that the pip wakeup routines know a
  *	waiter exists.  "waitid" is the wait message passed to the sleep.
  *
  *	The object must be write-locked.
  */
 void
 vm_object_pip_wait(vm_object_t object, char *waitid)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	while (object->paging_in_progress != 0) {
 		object->flags |= OBJ_PIPWNT;
 		VM_OBJECT_SLEEP(object, object, PVM, waitid, 0);
 	}
 }
 
 /*
  *	vm_object_allocate:
  *
  *	Obtains an object from the object zone (waiting for memory if
  *	necessary), initializes it with the requested type and size, and
  *	returns it.
  */
 vm_object_t
 vm_object_allocate(objtype_t type, vm_pindex_t size)
 {
 	vm_object_t new_object;
 
 	new_object = uma_zalloc(obj_zone, M_WAITOK);
 	_vm_object_allocate(type, size, new_object);
 	return (new_object);
 }
 
 
 /*
  *	vm_object_reference:
  *
  *	Takes an additional reference on the given object, acquiring and
  *	releasing the object's write lock around the update.  A NULL object
  *	is silently ignored.  Note: OBJ_DEAD objects can be referenced
  *	during final cleaning.
  */
 void
 vm_object_reference(vm_object_t object)
 {
 	if (object != NULL) {
 		VM_OBJECT_WLOCK(object);
 		vm_object_reference_locked(object);
 		VM_OBJECT_WUNLOCK(object);
 	}
 }
 
 /*
  *	vm_object_reference_locked:
  *
  *	Takes an additional reference on the given object.  For a
  *	vnode-backed object the vnode stored in the object's handle is
  *	referenced as well.
  *
  *	The object must be write-locked.
  */
 void
 vm_object_reference_locked(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	object->ref_count++;
 	if (object->type != OBJT_VNODE)
 		return;
 	vref((struct vnode *)object->handle);
 }
 
 /*
  * Handle deallocating an object of type OBJT_VNODE.
  *
  * Drops one reference on a vnode-backed object.  The object must be
  * write-locked on entry; every path through this function releases the
  * object lock before returning.  The vnode is taken from object->handle
  * and must not be NULL.
  */
 static void
 vm_object_vndeallocate(vm_object_t object)
 {
 	struct vnode *vp = (struct vnode *) object->handle;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_VNODE,
 	    ("vm_object_vndeallocate: not a vnode object"));
 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
 #ifdef INVARIANTS
 	if (object->ref_count == 0) {
 		vprint("vm_object_vndeallocate", vp);
 		panic("vm_object_vndeallocate: bad object reference count");
 	}
 #endif
 
 	/*
 	 * When the last reference is about to go away, notify the umtx
 	 * shared-memory code.  After this change the notification is
 	 * skipped when umtx_shm_vnobj_persistent is set.
 	 */
-	if (object->ref_count == 1)
+	if (!umtx_shm_vnobj_persistent && object->ref_count == 1)
 		umtx_shm_object_terminated(object);
 
 	/*
 	 * The test for text of vp vnode does not need a bypass to
 	 * reach right VV_TEXT there, since it is obtained from
 	 * object->handle.
 	 */
 	if (object->ref_count > 1 || (vp->v_vflag & VV_TEXT) == 0) {
 		/*
 		 * Common case: either other references remain or the vnode
 		 * is not marked VV_TEXT, so no text state needs updating.
 		 */
 		object->ref_count--;
 		VM_OBJECT_WUNLOCK(object);
 		/* vrele may need the vnode lock. */
 		vrele(vp);
 	} else {
 		/*
 		 * Last reference on a VV_TEXT vnode.  The vnode lock is
 		 * taken with the object unlocked; the hold taken by vhold()
 		 * spans the window in which neither lock is owned.
 		 */
 		vhold(vp);
 		VM_OBJECT_WUNLOCK(object);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		vdrop(vp);
 		VM_OBJECT_WLOCK(object);
 		object->ref_count--;
 		if (object->type == OBJT_DEAD) {
 			/* The object died while it was unlocked. */
 			VM_OBJECT_WUNLOCK(object);
 			VOP_UNLOCK(vp, 0);
 		} else {
 			/*
 			 * The count is rechecked because the object was
 			 * unlocked above and may have been referenced again.
 			 */
 			if (object->ref_count == 0)
 				VOP_UNSET_TEXT(vp);
 			VM_OBJECT_WUNLOCK(object);
 			vput(vp);
 		}
 	}
 }
 
 /*
  *	vm_object_deallocate:
  *
  *	Release a reference to the specified object,
  *	gained either through a vm_object_allocate
  *	or a vm_object_reference call.  When all references
  *	are gone, storage associated with this object
  *	may be relinquished.
  *
  *	The function loops: after an object is terminated, the loop
  *	continues with the object that was its backing object, so a
  *	whole chain may be released by one call.  Vnode-backed objects
  *	are handed to vm_object_vndeallocate().  A NULL object is a no-op.
  *
  *	No object may be locked.
  */
 void
 vm_object_deallocate(vm_object_t object)
 {
 	vm_object_t temp;
 	struct vnode *vp;
 
 	while (object != NULL) {
 		VM_OBJECT_WLOCK(object);
 		if (object->type == OBJT_VNODE) {
 			/* vm_object_vndeallocate() releases the object lock. */
 			vm_object_vndeallocate(object);
 			return;
 		}
 
 		KASSERT(object->ref_count != 0,
 			("vm_object_deallocate: object deallocated too many times: %d", object->type));
 
 		/*
 		 * If the reference count goes to 0 we start calling
 		 * vm_object_terminate() on the object chain.
 		 * A ref count of 1 may be a special case depending on the
 		 * shadow count being 0 or 1.
 		 */
 		object->ref_count--;
 		if (object->ref_count > 1) {
 			VM_OBJECT_WUNLOCK(object);
 			return;
 		} else if (object->ref_count == 1) {
 			if (object->type == OBJT_SWAP &&
 			    (object->flags & OBJ_TMPFS) != 0) {
 				/*
 				 * tmpfs-backed swap object: the tmpfs vnode
 				 * lock is taken with the object unlocked, so
 				 * the object's state is rechecked afterwards.
 				 */
 				vp = object->un_pager.swp.swp_tmpfs;
 				vhold(vp);
 				VM_OBJECT_WUNLOCK(object);
 				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 				VM_OBJECT_WLOCK(object);
 				if (object->type == OBJT_DEAD ||
 				    object->ref_count != 1) {
 					VM_OBJECT_WUNLOCK(object);
 					VOP_UNLOCK(vp, 0);
 					vdrop(vp);
 					return;
 				}
 				if ((object->flags & OBJ_TMPFS) != 0)
 					VOP_UNSET_TEXT(vp);
 				VOP_UNLOCK(vp, 0);
 				vdrop(vp);
 			}
 			if (object->shadow_count == 0 &&
 			    object->handle == NULL &&
 			    (object->type == OBJT_DEFAULT ||
 			    (object->type == OBJT_SWAP &&
 			    (object->flags & OBJ_TMPFS_NODE) == 0))) {
 				/*
 				 * One reference, no shadows, anonymous: mark
 				 * the object as having a single mapping.
 				 */
 				vm_object_set_flag(object, OBJ_ONEMAPPING);
 			} else if ((object->shadow_count == 1) &&
 			    (object->handle == NULL) &&
 			    (object->type == OBJT_DEFAULT ||
 			     object->type == OBJT_SWAP)) {
 				/*
 				 * The single remaining reference comes from
 				 * the one object shadowing this one (robject).
 				 */
 				vm_object_t robject;
 
 				robject = LIST_FIRST(&object->shadow_head);
 				KASSERT(robject != NULL,
 				    ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
 					 object->ref_count,
 					 object->shadow_count));
 				KASSERT((robject->flags & OBJ_TMPFS_NODE) == 0,
 				    ("shadowed tmpfs v_object %p", object));
 				if (!VM_OBJECT_TRYWLOCK(robject)) {
 					/*
 					 * Avoid a potential deadlock.
 					 */
 					/*
 					 * Undo the decrement above; the next
 					 * loop iteration redoes it.
 					 */
 					object->ref_count++;
 					VM_OBJECT_WUNLOCK(object);
 					/*
 					 * More likely than not the thread
 					 * holding robject's lock has lower
 					 * priority than the current thread.
 					 * Let the lower priority thread run.
 					 */
 					pause("vmo_de", 1);
 					continue;
 				}
 				/*
 				 * Collapse object into its shadow unless its
 				 * shadow is dead.  In that case, object will
 				 * be deallocated by the thread that is
 				 * deallocating its shadow.
 				 */
 				if ((robject->flags & OBJ_DEAD) == 0 &&
 				    (robject->handle == NULL) &&
 				    (robject->type == OBJT_DEFAULT ||
 				     robject->type == OBJT_SWAP)) {
 
 					/*
 					 * Temporary reference on robject; it
 					 * is released either just below
 					 * (when it is the only one) or on
 					 * the next loop iteration.
 					 */
 					robject->ref_count++;
 retry:
 					/*
 					 * Wait for paging activity on either
 					 * object to finish.  After a sleep,
 					 * retry only while robject is still
 					 * backed by object.
 					 */
 					if (robject->paging_in_progress) {
 						VM_OBJECT_WUNLOCK(object);
 						vm_object_pip_wait(robject,
 						    "objde1");
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_WLOCK(object);
 							goto retry;
 						}
 					} else if (object->paging_in_progress) {
 						VM_OBJECT_WUNLOCK(robject);
 						object->flags |= OBJ_PIPWNT;
 						VM_OBJECT_SLEEP(object, object,
 						    PDROP | PVM, "objde2", 0);
 						VM_OBJECT_WLOCK(robject);
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_WLOCK(object);
 							goto retry;
 						}
 					} else
 						VM_OBJECT_WUNLOCK(object);
 
 					/* Only robject is locked from here on. */
 					if (robject->ref_count == 1) {
 						robject->ref_count--;
 						object = robject;
 						goto doterm;
 					}
 					object = robject;
 					vm_object_collapse(object);
 					VM_OBJECT_WUNLOCK(object);
 					continue;
 				}
 				VM_OBJECT_WUNLOCK(robject);
 			}
 			VM_OBJECT_WUNLOCK(object);
 			return;
 		}
 doterm:
 		/*
 		 * No references remain (reached by falling through with
 		 * ref_count == 0, or by the goto above).  Unlink the object
 		 * from its backing object's shadow list and terminate it;
 		 * the loop then continues with that backing object.
 		 */
 		umtx_shm_object_terminated(object);
 		temp = object->backing_object;
 		if (temp != NULL) {
 			KASSERT((object->flags & OBJ_TMPFS_NODE) == 0,
 			    ("shadowed tmpfs v_object 2 %p", object));
 			VM_OBJECT_WLOCK(temp);
 			LIST_REMOVE(object, shadow_list);
 			temp->shadow_count--;
 			VM_OBJECT_WUNLOCK(temp);
 			object->backing_object = NULL;
 		}
 		/*
 		 * Don't double-terminate, we could be in a termination
 		 * recursion due to the terminate having to sync data
 		 * to disk.
 		 */
 		if ((object->flags & OBJ_DEAD) == 0)
 			vm_object_terminate(object);
 		else
 			VM_OBJECT_WUNLOCK(object);
 		object = temp;
 	}
 }
 
 /*
  *	vm_object_destroy:
  *
  *	Releases the swap charge accounted to the object's credential,
  *	if any, drops that credential, and returns the object's storage
  *	to the object zone.
  */
 void
 vm_object_destroy(vm_object_t object)
 {
 
 	/* Undo the allocation charge before giving up the credential. */
 	if (object->cred != NULL) {
 		swap_release_by_cred(object->charge, object->cred);
 		crfree(object->cred);
 		object->cred = NULL;
 		object->charge = 0;
 	}
 
 	/* Hand the object itself back to its zone. */
 	uma_zfree(obj_zone, object);
 }
 
 /*
  *	vm_object_terminate actually destroys the specified object, freeing
  *	up all previously used resources.
  *
  *	In order: the object is marked OBJ_DEAD, outstanding paging is
  *	waited for, dirty pages and buffers of a vnode-backed object are
  *	written out, the resident pages are freed (or merely detached if
  *	wired), the pager is told to deallocate, and finally the object is
  *	unlocked and passed to vm_object_destroy().
  *
  *	The object must be locked.
  *	This routine may block.
  */
 void
 vm_object_terminate(vm_object_t object)
 {
 	vm_page_t p, p_next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Make sure no one uses us.
 	 */
 	vm_object_set_flag(object, OBJ_DEAD);
 
 	/*
 	 * wait for the pageout daemon to be done with the object
 	 */
 	vm_object_pip_wait(object, "objtrm");
 
 	KASSERT(!object->paging_in_progress,
 		("vm_object_terminate: pageout in progress"));
 
 	/*
 	 * Clean and free the pages, as appropriate. All references to the
 	 * object are gone, so we don't need to lock it.
 	 */
 	if (object->type == OBJT_VNODE) {
 		struct vnode *vp = (struct vnode *)object->handle;
 
 		/*
 		 * Clean pages and flush buffers.
 		 */
 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 		/* The object lock is dropped around the buffer flush. */
 		VM_OBJECT_WUNLOCK(object);
 
 		vinvalbuf(vp, V_SAVE, 0, 0);
 
 		VM_OBJECT_WLOCK(object);
 	}
 
 	KASSERT(object->ref_count == 0, 
 		("vm_object_terminate: object with references, ref_count=%d",
 		object->ref_count));
 
 	/*
 	 * Free any remaining pageable pages.  This also removes them from the
 	 * paging queues.  However, don't free wired pages, just remove them
 	 * from the object.  Rather than incrementally removing each page from
 	 * the object, the page and object are reset to any empty state.
 	 */
 	TAILQ_FOREACH_SAFE(p, &object->memq, listq, p_next) {
 		vm_page_assert_unbusied(p);
 		vm_page_lock(p);
 		/*
 		 * Optimize the page's removal from the object by resetting
 		 * its "object" field.  Specifically, if the page is not
 		 * wired, then the effect of this assignment is that
 		 * vm_page_free()'s call to vm_page_remove() will return
 		 * immediately without modifying the page or the object.
 		 */
 		p->object = NULL;
 		if (p->wire_count == 0) {
 			vm_page_free(p);
 			PCPU_INC(cnt.v_pfree);
 		}
 		vm_page_unlock(p);
 	}
 	/*
 	 * If the object contained any pages, then reset it to an empty state.
 	 * None of the object's fields, including "resident_page_count", were
 	 * modified by the preceding loop.
 	 */
 	if (object->resident_page_count != 0) {
 		vm_radix_reclaim_allnodes(&object->rtree);
 		TAILQ_INIT(&object->memq);
 		object->resident_page_count = 0;
 		/*
 		 * NOTE(review): presumably this drops a vnode hold taken
 		 * when the object first gained resident pages -- confirm
 		 * against the page insertion code.
 		 */
 		if (object->type == OBJT_VNODE)
 			vdrop(object->handle);
 	}
 
 #if VM_NRESERVLEVEL > 0
 	/* Break up any reservations still attached to the object. */
 	if (__predict_false(!LIST_EMPTY(&object->rvq)))
 		vm_reserv_break_all(object);
 #endif
 	/* Release any cached pages still associated with the object. */
 	if (__predict_false(!vm_object_cache_is_empty(object)))
 		vm_page_cache_free(object, 0, 0);
 
 	KASSERT(object->cred == NULL || object->type == OBJT_DEFAULT ||
 	    object->type == OBJT_SWAP,
 	    ("%s: non-swap obj %p has cred", __func__, object));
 
 	/*
 	 * Let the pager know object is dead.
 	 */
 	vm_pager_deallocate(object);
 	VM_OBJECT_WUNLOCK(object);
 
 	vm_object_destroy(object);
 }
 
 /*
  * Prepare a page for flushing by revoking write access to it, which is what
  * allows the caller to clear the object's dirty flags afterwards.  A page
  * marked VPO_NOSYNC is left untouched when the caller passed OBJPC_NOSYNC;
  * in that case *clearobjflags is forced to FALSE because the object is
  * expected to remain dirty.  Returns TRUE if the page should be flushed
  * (it is dirty after write access was removed), and FALSE otherwise.
  */
 static boolean_t
 vm_object_page_remove_write(vm_page_t p, int flags, boolean_t *clearobjflags)
 {
 
 	/*
 	 * A nosync page that we were asked to skip: the object flags have
 	 * not been cleared at this point, so there is nothing to restore.
 	 */
 	if ((flags & OBJPC_NOSYNC) != 0 && (p->oflags & VPO_NOSYNC) != 0) {
 		*clearobjflags = FALSE;
 		return (FALSE);
 	}
 
 	pmap_remove_write(p);
 	return (p->dirty != 0);
 }
 
 /*
  *	vm_object_page_clean
  *
  *	Clean all dirty pages in the specified range of object.  Leaves page
  * 	on whatever queue it is currently on.   If NOSYNC is set then do not
  *	write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
  *	leaving the object dirty.
  *
  *	When stuffing pages asynchronously, allow clustering.  XXX we need a
  *	synchronous clustering mode implementation.
  *
  *	"start" and "end" are byte offsets into the object.  Odd semantics:
  *	if end == 0, the range extends to the end of the object, so passing
  *	start == end == 0 cleans everything.
  *
  *	"flags" is a mask of OBJPC_* values: OBJPC_SYNC and OBJPC_INVAL
  *	select synchronous page-outs, OBJPC_INVAL additionally requests
  *	invalidation from the pager, and OBJPC_NOSYNC skips VPO_NOSYNC pages.
  *
  *	The object must be locked.
  *
  *	Returns FALSE if some page from the range was not written, as
  *	reported by the pager, and TRUE otherwise.
  */
 boolean_t
 vm_object_page_clean(vm_object_t object, vm_ooffset_t start, vm_ooffset_t end,
     int flags)
 {
 	vm_page_t np, p;
 	vm_pindex_t pi, tend, tstart;
 	int curgeneration, n, pagerflags;
 	boolean_t clearobjflags, eio, res;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * The OBJ_MIGHTBEDIRTY flag is only set for OBJT_VNODE
 	 * objects.  The check below prevents the function from
 	 * operating on non-vnode objects.
 	 */
 	if ((object->flags & OBJ_MIGHTBEDIRTY) == 0 ||
 	    object->resident_page_count == 0)
 		return (TRUE);
 
 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) != 0 ?
 	    VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
 	pagerflags |= (flags & OBJPC_INVAL) != 0 ? VM_PAGER_PUT_INVAL : 0;
 
 	tstart = OFF_TO_IDX(start);
 	tend = (end == 0) ? object->size : OFF_TO_IDX(end + PAGE_MASK);
 	/*
 	 * OBJ_MIGHTBEDIRTY may only be cleared when the whole object is
 	 * covered; the flag is withdrawn below whenever a page is skipped
 	 * or a write fails.
 	 */
 	clearobjflags = tstart == 0 && tend >= object->size;
 	res = TRUE;
 
 rescan:
 	/* Snapshot the generation to detect changes made during the scan. */
 	curgeneration = object->generation;
 
 	for (p = vm_page_find_least(object, tstart); p != NULL; p = np) {
 		pi = p->pindex;
 		if (pi >= tend)
 			break;
 		np = TAILQ_NEXT(p, listq);
 		if (p->valid == 0)
 			continue;
 		if (vm_page_sleep_if_busy(p, "vpcwai")) {
 			/*
 			 * We slept on a busy page.  A synchronous clean
 			 * restarts if the object changed meanwhile; an
 			 * asynchronous one just gives up clearing the flag.
 			 * Either way the page at "pi" is looked up again.
 			 */
 			if (object->generation != curgeneration) {
 				if ((flags & OBJPC_SYNC) != 0)
 					goto rescan;
 				else
 					clearobjflags = FALSE;
 			}
 			np = vm_page_find_least(object, pi);
 			continue;
 		}
 		if (!vm_object_page_remove_write(p, flags, &clearobjflags))
 			continue;
 
 		/* "n" is the run length reported for the flush around p. */
 		n = vm_object_page_collect_flush(object, p, pagerflags,
 		    flags, &clearobjflags, &eio);
 		if (eio) {
 			res = FALSE;
 			clearobjflags = FALSE;
 		}
 		if (object->generation != curgeneration) {
 			if ((flags & OBJPC_SYNC) != 0)
 				goto rescan;
 			else
 				clearobjflags = FALSE;
 		}
 
 		/*
 		 * If the VOP_PUTPAGES() did a truncated write, so
 		 * that even the first page of the run is not fully
 		 * written, vm_pageout_flush() returns 0 as the run
 		 * length.  Since the condition that caused truncated
 		 * write may be permanent, e.g. exhausted free space,
 		 * accepting n == 0 would cause an infinite loop.
 		 *
 		 * Forwarding the iterator leaves the unwritten page
 		 * behind, but there is not much we can do there if
 		 * filesystem refuses to write it.
 		 */
 		if (n == 0) {
 			n = 1;
 			clearobjflags = FALSE;
 		}
 		np = vm_page_find_least(object, pi + n);
 	}
 #if 0
 	VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC) ? MNT_WAIT : 0);
 #endif
 
 	if (clearobjflags)
 		vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
 	return (res);
 }
 
 /*
  * Gather a run of flushable pages around "p" and hand the run to
  * vm_pageout_flush().  The run is first grown forward from "p" and then
  * backward, stopping in each direction at the first missing or busy page,
  * at the first page that vm_object_page_remove_write() declines, or when
  * vm_pageout_page_count pages have been collected.  "p" itself is assumed
  * to have been prepared by the caller.
  *
  * Returns the run length reported by vm_pageout_flush(); *eio is set by
  * that function as well.
  *
  * The object must be write-locked and the page lock of "p" not owned.
  */
 static int
 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int pagerflags,
     int flags, boolean_t *clearobjflags, boolean_t *eio)
 {
 	vm_page_t ma[vm_pageout_page_count], first, cur;
 	int idx, mreq, npages, runlen;
 
 	vm_page_lock_assert(p, MA_NOTOWNED);
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	npages = 1;
 	mreq = 0;
 
 	/* Grow the run towards higher page indices. */
 	cur = p;
 	while (npages < vm_pageout_page_count) {
 		cur = vm_page_next(cur);
 		if (cur == NULL || vm_page_busied(cur) ||
 		    !vm_object_page_remove_write(cur, flags, clearobjflags))
 			break;
 		npages++;
 	}
 
 	/*
 	 * Grow the run towards lower page indices.  "mreq" tracks the
 	 * position of "p" within the final array.
 	 */
 	first = p;
 	while (npages < vm_pageout_page_count) {
 		cur = vm_page_prev(first);
 		if (cur == NULL || vm_page_busied(cur) ||
 		    !vm_object_page_remove_write(cur, flags, clearobjflags))
 			break;
 		first = cur;
 		mreq++;
 		npages++;
 	}
 
 	/* Lay the run out in ascending order, starting at "first". */
 	cur = first;
 	for (idx = 0; idx < npages; idx++) {
 		ma[idx] = cur;
 		cur = TAILQ_NEXT(cur, listq);
 	}
 
 	vm_pageout_flush(ma, npages, pagerflags, mreq, &runlen, eio);
 	return (runlen);
 }
 
 /*
  *	vm_object_sync:
  *
  * Note that there is absolutely no sense in writing out
  * anonymous objects, so we track down the vnode object
  * to write out.
  * We invalidate (remove) all pages from the address space
  * for semantic correctness.
  *
  * If the backing object is a device object with unmanaged pages, then any
  * mappings to the specified range of pages must be removed before this
  * function is called.
  *
  * Note: certain anonymous maps, such as MAP_NOSYNC maps,
  * may start out with a NULL object.
  *
  * "offset" and "size" describe the byte range within "object"; they are
  * translated as the backing object chain is followed.  "syncio" requests
  * synchronous writes and "invalidate" requests that the pages be removed
  * afterwards.
  *
  * Returns TRUE for a NULL object, and otherwise FALSE only if
  * vm_object_page_clean() reported an unwritten page or VOP_FSYNC()
  * returned an error.
  */
 boolean_t
 vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
     boolean_t syncio, boolean_t invalidate)
 {
 	vm_object_t backing_object;
 	struct vnode *vp;
 	struct mount *mp;
 	int error, flags, fsync_after;
 	boolean_t res;
 
 	if (object == NULL)
 		return (TRUE);
 	res = TRUE;
 	error = 0;
 	VM_OBJECT_WLOCK(object);
 	/*
 	 * Descend to the bottom of the shadow chain.  Each backing object
 	 * is locked before its parent is unlocked.  The offset is rebased
 	 * and the size clipped to the backing object at each step.
 	 */
 	while ((backing_object = object->backing_object) != NULL) {
 		VM_OBJECT_WLOCK(backing_object);
 		offset += object->backing_object_offset;
 		VM_OBJECT_WUNLOCK(object);
 		object = backing_object;
 		if (object->size < OFF_TO_IDX(offset + size))
 			size = IDX_TO_OFF(object->size) - offset;
 	}
 	/*
 	 * Flush pages if writing is allowed, invalidate them
 	 * if invalidation requested.  Pages undergoing I/O
 	 * will be ignored by vm_object_page_remove().
 	 *
 	 * We cannot lock the vnode and then wait for paging
 	 * to complete without deadlocking against vm_fault.
 	 * Instead we simply call vm_object_page_remove() and
 	 * allow it to block internally on a page-by-page
 	 * basis when it encounters pages undergoing async
 	 * I/O.
 	 */
 	if (object->type == OBJT_VNODE &&
 	    (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		vp = object->handle;
 		/* The vnode lock is acquired with the object unlocked. */
 		VM_OBJECT_WUNLOCK(object);
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 		if (syncio && !invalidate && offset == 0 &&
 		    OFF_TO_IDX(size) == object->size) {
 			/*
 			 * If syncing the whole mapping of the file,
 			 * it is faster to schedule all the writes in
 			 * async mode, also allowing the clustering,
 			 * and then wait for i/o to complete.
 			 */
 			flags = 0;
 			fsync_after = TRUE;
 		} else {
 			flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
 			flags |= invalidate ? (OBJPC_SYNC | OBJPC_INVAL) : 0;
 			fsync_after = FALSE;
 		}
 		VM_OBJECT_WLOCK(object);
 		res = vm_object_page_clean(object, offset, offset + size,
 		    flags);
 		VM_OBJECT_WUNLOCK(object);
 		if (fsync_after)
 			error = VOP_FSYNC(vp, MNT_WAIT, curthread);
 		VOP_UNLOCK(vp, 0);
 		vn_finished_write(mp);
 		if (error != 0)
 			res = FALSE;
 		/* Relock for the invalidation step and the final unlock. */
 		VM_OBJECT_WLOCK(object);
 	}
 	if ((object->type == OBJT_VNODE ||
 	     object->type == OBJT_DEVICE) && invalidate) {
 		if (object->type == OBJT_DEVICE)
 			/*
 			 * The option OBJPR_NOTMAPPED must be passed here
 			 * because vm_object_page_remove() cannot remove
 			 * unmanaged mappings.
 			 */
 			flags = OBJPR_NOTMAPPED;
 		else if (old_msync)
 			flags = 0;
 		else
 			flags = OBJPR_CLEANONLY;
 		vm_object_page_remove(object, OFF_TO_IDX(offset),
 		    OFF_TO_IDX(offset + size + PAGE_MASK), flags);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (res);
 }
 
 /*
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
  *
  *	MADV_WILLNEED	(any object)
  *
  *	    Activate the specified pages if they are resident.
  *
  *	MADV_DONTNEED	(any object)
  *
  *	    Deactivate the specified pages if they are resident.
  *
  *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects,
  *			 OBJ_ONEMAPPING only)
  *
  *	    Deactivate and clean the specified pages if they are
  *	    resident.  This permits the process to reuse the pages
  *	    without faulting or the kernel to reclaim the pages
  *	    without I/O.
  *
  *	The page indices in [pindex, end) are processed one at a time.
  *	For each index the shadow chain is searched downwards until a
  *	resident page is found.  "object" stays write-locked for the
  *	whole walk, except while sleeping on a busy page; a NULL object
  *	is ignored.
  */
 void
 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, vm_pindex_t end,
     int advise)
 {
 	vm_pindex_t tpindex;
 	vm_object_t backing_object, tobject;
 	vm_page_t m;
 
 	if (object == NULL)
 		return;
 	VM_OBJECT_WLOCK(object);
 	/*
 	 * Locate and adjust resident pages
 	 */
 	for (; pindex < end; pindex += 1) {
 relookup:
 		/* (Re)start the search for this index at the top object. */
 		tobject = object;
 		tpindex = pindex;
 shadowlookup:
 		/*
 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
 		 * and those pages must be OBJ_ONEMAPPING.
 		 */
 		if (advise == MADV_FREE) {
 			if ((tobject->type != OBJT_DEFAULT &&
 			     tobject->type != OBJT_SWAP) ||
 			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
 				goto unlock_tobject;
 			}
 		} else if ((tobject->flags & OBJ_UNMANAGED) != 0)
 			goto unlock_tobject;
 		m = vm_page_lookup(tobject, tpindex);
 		if (m == NULL && advise == MADV_WILLNEED) {
 			/*
 			 * If the page is cached, reactivate it.
 			 */
 			m = vm_page_alloc(tobject, tpindex, VM_ALLOC_IFCACHED |
 			    VM_ALLOC_NOBUSY);
 		}
 		if (m == NULL) {
 			/*
 			 * There may be swap even if there is no backing page
 			 */
 			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 				swap_pager_freespace(tobject, tpindex, 1);
 			/*
 			 * next object
 			 */
 			backing_object = tobject->backing_object;
 			if (backing_object == NULL)
 				goto unlock_tobject;
 			/*
 			 * Lock the backing object before releasing the
 			 * intermediate one; the top-level object is never
 			 * unlocked here.
 			 */
 			VM_OBJECT_WLOCK(backing_object);
 			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
 			if (tobject != object)
 				VM_OBJECT_WUNLOCK(tobject);
 			tobject = backing_object;
 			goto shadowlookup;
 		} else if (m->valid != VM_PAGE_BITS_ALL)
 			goto unlock_tobject;
 		/*
 		 * If the page is not in a normal state, skip it.
 		 */
 		vm_page_lock(m);
 		if (m->hold_count != 0 || m->wire_count != 0) {
 			vm_page_unlock(m);
 			goto unlock_tobject;
 		}
 		KASSERT((m->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_madvise: page %p is fictitious", m));
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 		    ("vm_object_madvise: page %p is not managed", m));
 		if (vm_page_busied(m)) {
 			if (advise == MADV_WILLNEED) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			}
 			/*
 			 * Drop every object lock held, sleep until the page
 			 * is no longer busy, then redo the lookup for this
 			 * index from the top-level object.
 			 */
 			if (object != tobject)
 				VM_OBJECT_WUNLOCK(object);
 			VM_OBJECT_WUNLOCK(tobject);
 			vm_page_busy_sleep(m, "madvpo");
 			VM_OBJECT_WLOCK(object);
   			goto relookup;
 		}
 		if (advise == MADV_WILLNEED) {
 			vm_page_activate(m);
 		} else {
 			vm_page_advise(m, advise);
 		}
 		vm_page_unlock(m);
 		if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 			swap_pager_freespace(tobject, tpindex, 1);
 unlock_tobject:
 		/* Done with this index; keep only the top-level lock. */
 		if (tobject != object)
 			VM_OBJECT_WUNLOCK(tobject);
 	}	
 	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
  *	vm_object_shadow:
  *
  *	Place a new OBJT_DEFAULT object of "length" bytes in front of the
  *	object referenced by *object, so that the new object is backed by
  *	the existing one starting at *offset.
  *
  *	No new object is created when the existing object is an
  *	unshared anonymous object (a single reference, no handle, and of
  *	type OBJT_DEFAULT or OBJT_SWAP); *object and *offset are then
  *	left untouched.
  *
  *	Otherwise the new object and a zero offset are returned through
  *	the IN/OUT parameters.  The caller's reference to the old object
  *	becomes the new object's reference to its backing object, so the
  *	old object's reference count does not change.
  */
 void
 vm_object_shadow(
 	vm_object_t *object,	/* IN/OUT */
 	vm_ooffset_t *offset,	/* IN/OUT */
 	vm_size_t length)
 {
 	vm_object_t backing, shadow;
 	bool unshared;
 
 	backing = *object;
 
 	/*
 	 * Shadowing an object nobody else can see would be pointless.
 	 */
 	if (backing != NULL) {
 		VM_OBJECT_WLOCK(backing);
 		unshared = backing->ref_count == 1 &&
 		    backing->handle == NULL &&
 		    (backing->type == OBJT_DEFAULT ||
 		    backing->type == OBJT_SWAP);
 		VM_OBJECT_WUNLOCK(backing);
 		if (unshared)
 			return;
 	}
 
 	/*
 	 * Create the shadow and point it at the old object, remembering
 	 * where in the old object the shadowed range begins.
 	 */
 	shadow = vm_object_allocate(OBJT_DEFAULT, atop(length));
 	shadow->backing_object = backing;
 	shadow->backing_object_offset = *offset;
 
 	if (backing != NULL) {
 		VM_OBJECT_WLOCK(backing);
 		LIST_INSERT_HEAD(&backing->shadow_head, shadow, shadow_list);
 		backing->shadow_count++;
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * Derive the shadow's page color from the backing object
 		 * so that coloring stays consistent across the pair.
 		 */
 		shadow->flags |= backing->flags & OBJ_COLORED;
 		shadow->pg_color = (backing->pg_color + OFF_TO_IDX(*offset)) &
 		    ((1 << (VM_NFREEORDER - 1)) - 1);
 #endif
 		VM_OBJECT_WUNLOCK(backing);
 	}
 
 	/*
 	 * Hand the results back to the caller.
 	 */
 	*offset = 0;
 	*object = shadow;
 }
 
 /*
  *	vm_object_split:
  *
  * Split the pages in a map entry into a new object.  This affords
  * easier removal of unused pages, and keeps object inheritance from
  * being a negative impact on memory usage.
  *
  * Only OBJT_DEFAULT and OBJT_SWAP objects with more than one reference
  * are split.  On success the entry is switched to the new object with
  * offset 0 and the entry's reference to the original object is dropped.
  *
  * Locking, as visible here: the entry's object is unlocked near the top
  * without having been locked in this function, so the caller is expected
  * to hold its write lock.  The function returns with the entry's current
  * object (the new one on success, the original otherwise) write-locked.
  */
 void
 vm_object_split(vm_map_entry_t entry)
 {
 	vm_page_t m, m_next;
 	vm_object_t orig_object, new_object, source;
 	vm_pindex_t idx, offidxstart;
 	vm_size_t size;
 
 	orig_object = entry->object.vm_object;
 	if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
 		return;
 	if (orig_object->ref_count <= 1)
 		return;
 	VM_OBJECT_WUNLOCK(orig_object);
 
 	/* Range of the original object covered by the entry, in pages. */
 	offidxstart = OFF_TO_IDX(entry->offset);
 	size = atop(entry->end - entry->start);
 
 	/*
 	 * If swap_pager_copy() is later called, it will convert new_object
 	 * into a swap object.
 	 */
 	new_object = vm_object_allocate(OBJT_DEFAULT, size);
 
 	/*
 	 * At this point, the new object is still private, so the order in
 	 * which the original and new objects are locked does not matter.
 	 */
 	VM_OBJECT_WLOCK(new_object);
 	VM_OBJECT_WLOCK(orig_object);
 	source = orig_object->backing_object;
 	if (source != NULL) {
 		VM_OBJECT_WLOCK(source);
 		if ((source->flags & OBJ_DEAD) != 0) {
 			/*
 			 * The backing object is being torn down: abandon the
 			 * split and return with orig_object locked again.
 			 */
 			VM_OBJECT_WUNLOCK(source);
 			VM_OBJECT_WUNLOCK(orig_object);
 			VM_OBJECT_WUNLOCK(new_object);
 			vm_object_deallocate(new_object);
 			VM_OBJECT_WLOCK(orig_object);
 			return;
 		}
 		/* Make new_object a second shadow of the same backing object. */
 		LIST_INSERT_HEAD(&source->shadow_head,
 				  new_object, shadow_list);
 		source->shadow_count++;
 		vm_object_reference_locked(source);	/* for new_object */
 		vm_object_clear_flag(source, OBJ_ONEMAPPING);
 		VM_OBJECT_WUNLOCK(source);
 		new_object->backing_object_offset = 
 			orig_object->backing_object_offset + entry->offset;
 		new_object->backing_object = source;
 	}
 	if (orig_object->cred != NULL) {
 		/*
 		 * Move the swap charge for the split range, together with a
 		 * reference to the credential, over to the new object.
 		 */
 		new_object->cred = orig_object->cred;
 		crhold(orig_object->cred);
 		new_object->charge = ptoa(size);
 		KASSERT(orig_object->charge >= ptoa(size),
 		    ("orig_object->charge < 0"));
 		orig_object->charge -= ptoa(size);
 	}
 retry:
 	/*
 	 * Move every resident page in the entry's range to the new object.
 	 * The scan restarts from the beginning of the range whenever the
 	 * object locks had to be dropped.
 	 */
 	m = vm_page_find_least(orig_object, offidxstart);
 	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
 	    m = m_next) {
 		m_next = TAILQ_NEXT(m, listq);
 
 		/*
 		 * We must wait for pending I/O to complete before we can
 		 * rename the page.
 		 *
 		 * We do not have to VM_PROT_NONE the page as mappings should
 		 * not be changed by this operation.
 		 */
 		if (vm_page_busied(m)) {
 			VM_OBJECT_WUNLOCK(new_object);
 			vm_page_lock(m);
 			VM_OBJECT_WUNLOCK(orig_object);
 			vm_page_busy_sleep(m, "spltwt");
 			VM_OBJECT_WLOCK(orig_object);
 			VM_OBJECT_WLOCK(new_object);
 			goto retry;
 		}
 
 		/* vm_page_rename() will handle dirty and cache. */
 		if (vm_page_rename(m, new_object, idx)) {
 			/*
 			 * The rename failed; wait with both objects
 			 * unlocked and start over.
 			 */
 			VM_OBJECT_WUNLOCK(new_object);
 			VM_OBJECT_WUNLOCK(orig_object);
 			VM_WAIT;
 			VM_OBJECT_WLOCK(orig_object);
 			VM_OBJECT_WLOCK(new_object);
 			goto retry;
 		}
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * If some of the reservation's allocated pages remain with
 		 * the original object, then transferring the reservation to
 		 * the new object is neither particularly beneficial nor
 		 * particularly harmful as compared to leaving the reservation
 		 * with the original object.  If, however, all of the
 		 * reservation's allocated pages are transferred to the new
 		 * object, then transferring the reservation is typically
 		 * beneficial.  Determining which of these two cases applies
 		 * would be more costly than unconditionally renaming the
 		 * reservation.
 		 */
 		vm_reserv_rename(m, new_object, orig_object, offidxstart);
 #endif
 		/*
 		 * Pages moved from a swap object stay exclusively busied
 		 * until swap_pager_copy() below has finished.
 		 */
 		if (orig_object->type == OBJT_SWAP)
 			vm_page_xbusy(m);
 	}
 	if (orig_object->type == OBJT_SWAP) {
 		/*
 		 * swap_pager_copy() can sleep, in which case the orig_object's
 		 * and new_object's locks are released and reacquired.
 		 */
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
 		TAILQ_FOREACH(m, &new_object->memq, listq)
 			vm_page_xunbusy(m);
 
 		/*
 		 * Transfer any cached pages from orig_object to new_object.
 		 * If swap_pager_copy() found swapped out pages within the
 		 * specified range of orig_object, then it changed
 		 * new_object's type to OBJT_SWAP when it transferred those
 		 * pages to new_object.  Otherwise, new_object's type
 		 * should still be OBJT_DEFAULT and orig_object should not
 		 * contain any cached pages within the specified range.
 		 */
 		if (__predict_false(!vm_object_cache_is_empty(orig_object)))
 			vm_page_cache_transfer(orig_object, offidxstart,
 			    new_object);
 	}
 	VM_OBJECT_WUNLOCK(orig_object);
 	VM_OBJECT_WUNLOCK(new_object);
 	/* Switch the entry over, then drop its reference to the original. */
 	entry->object.vm_object = new_object;
 	entry->offset = 0LL;
 	vm_object_deallocate(orig_object);
 	VM_OBJECT_WLOCK(new_object);
 }
 
 /*
  * Operation flags for the collapse scan: NOWAIT never sleeps on a busy
  * page, WAIT does.
  */
 #define	OBSC_COLLAPSE_NOWAIT	0x0002
 #define	OBSC_COLLAPSE_WAIT	0x0004
 
 /*
  * Called by the collapse scan when it cannot make progress on page "p"
  * (a busy page of either object), or with p == NULL to wait for memory.
  *
  * With OBSC_COLLAPSE_NOWAIT set in "op" nothing is waited for and "next"
  * is returned so the scan simply moves on.  Otherwise both object locks
  * are dropped while sleeping and reacquired afterwards, and the first
  * page of the backing object is returned so the scan starts over.
  *
  * Both the object and its backing object must be write-locked.
  */
 static vm_page_t
 vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next,
     int op)
 {
 	vm_object_t backing_object;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	backing_object = object->backing_object;
 	VM_OBJECT_ASSERT_WLOCKED(backing_object);
 
 	KASSERT(p == NULL || vm_page_busied(p), ("unbusy page %p", p));
 	KASSERT(p == NULL || p->object == object || p->object == backing_object,
 	    ("invalid ownership %p %p %p", p, object, backing_object));
 
 	/* Non-blocking scan: skip ahead without sleeping. */
 	if ((op & OBSC_COLLAPSE_NOWAIT) != 0)
 		return (next);
 
 	if (p == NULL) {
 		/* Nothing to sleep on: wait for free memory instead. */
 		VM_OBJECT_WUNLOCK(object);
 		VM_OBJECT_WUNLOCK(backing_object);
 		VM_WAIT;
 	} else {
 		/* The page lock is taken before the object locks go away. */
 		vm_page_lock(p);
 		VM_OBJECT_WUNLOCK(object);
 		VM_OBJECT_WUNLOCK(backing_object);
 		vm_page_busy_sleep(p, "vmocol");
 	}
 
 	VM_OBJECT_WLOCK(object);
 	VM_OBJECT_WLOCK(backing_object);
 	return (TAILQ_FIRST(&backing_object->memq));
 }
 
 /*
  * Determine whether every resident page of the backing object that lies
  * within the range visible through "object" is shadowed, i.e. "object"
  * either holds a valid page at the corresponding index or its pager has
  * one.  Returns true in that case, and false as soon as one unshadowed
  * page is found.  Backing objects that are not OBJT_DEFAULT are never
  * reported as fully shadowed.
  *
  * Both the object and its backing object must be write-locked.
  */
 static bool
 vm_object_scan_all_shadowed(vm_object_t object)
 {
 	vm_object_t backing_object;
 	vm_page_t p, pp;
 	vm_pindex_t backing_offset_index, new_pindex;
 
 	backing_object = object->backing_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(backing_object);
 
 	/*
 	 * Only plain default objects are examined, so that cache or swap
 	 * pages in the backing object need not be considered.  XXX but
 	 * with the new swapper this would be pretty easy to do.
 	 */
 	if (backing_object->type != OBJT_DEFAULT)
 		return (false);
 
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	TAILQ_FOREACH(p, &backing_object->memq, listq) {
 		/*
 		 * Pages that fall before or beyond the window the parent
 		 * maps onto the backing object are irrelevant.
 		 */
 		if (p->pindex < backing_offset_index)
 			continue;
 		new_pindex = p->pindex - backing_offset_index;
 		if (new_pindex >= object->size)
 			continue;
 
 		/*
 		 * A valid page in the parent shadows this one.  Failing
 		 * that (no page, or a page with no valid data), the
 		 * parent's pager must have the page; if it does not, the
 		 * parent does not completely shadow the backing object
 		 * and we might as well give up now.
 		 */
 		pp = vm_page_lookup(object, new_pindex);
 		if (pp != NULL && pp->valid != 0)
 			continue;
 		if (!vm_pager_has_page(object, new_pindex, NULL, NULL))
 			return (false);
 	}
 	return (true);
 }
 
 /*
  * Walk the backing object's resident pages and dispose of each one: a page
  * outside the parent's window, or one the parent already shadows, is freed
  * (or only removed from its object when it is wired); any other page is
  * renamed into the parent at its translated index.
  *
  * "op" carries OBSC_COLLAPSE_NOWAIT or OBSC_COLLAPSE_WAIT.  With
  * OBSC_COLLAPSE_WAIT the backing object is first flagged OBJ_DEAD.  What
  * happens on a busy page or a failed rename in each mode is decided by
  * vm_object_collapse_scan_wait().
  *
  * Both the object and its backing object must be write-locked.  Always
  * returns true.
  */
 static bool
 vm_object_collapse_scan(vm_object_t object, int op)
 {
 	vm_object_t backing_object;
 	vm_page_t next, p, pp;
 	vm_pindex_t backing_offset_index, new_pindex;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(object->backing_object);
 
 	backing_object = object->backing_object;
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	/*
 	 * Initial conditions
 	 */
 	if ((op & OBSC_COLLAPSE_WAIT) != 0)
 		vm_object_set_flag(backing_object, OBJ_DEAD);
 
 	/*
 	 * Our scan
 	 */
 	for (p = TAILQ_FIRST(&backing_object->memq); p != NULL; p = next) {
 		/* Fetch the successor first: "p" may leave the memq below. */
 		next = TAILQ_NEXT(p, listq);
 		new_pindex = p->pindex - backing_offset_index;
 
 		/*
 		 * Check for busy page
 		 */
 		if (vm_page_busied(p)) {
 			next = vm_object_collapse_scan_wait(object, p, next, op);
 			continue;
 		}
 
 		KASSERT(p->object == backing_object,
 		    ("vm_object_collapse_scan: object mismatch"));
 
 		if (p->pindex < backing_offset_index ||
 		    new_pindex >= object->size) {
 			if (backing_object->type == OBJT_SWAP)
 				swap_pager_freespace(backing_object, p->pindex,
 				    1);
 
 			/*
 			 * Page is out of the parent object's range, we can
 			 * simply destroy it.
 			 */
 			vm_page_lock(p);
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
 			/* A wired page is only detached from its object. */
 			if (p->wire_count == 0)
 				vm_page_free(p);
 			else
 				vm_page_remove(p);
 			vm_page_unlock(p);
 			continue;
 		}
 
 		pp = vm_page_lookup(object, new_pindex);
 		if (pp != NULL && vm_page_busied(pp)) {
 			/*
 			 * The page in the parent is busy and possibly not
 			 * (yet) valid.  Until its state is finalized by the
 			 * busy bit owner, we can't tell whether it shadows the
 			 * original page.  Therefore, we must either skip it
 			 * and the original (backing_object) page or wait for
 			 * its state to be finalized.
 			 *
 			 * This is due to a race with vm_fault() where we must
 			 * unbusy the original (backing_obj) page before we can
 			 * (re)lock the parent.  Hence we can get here.
 			 */
 			next = vm_object_collapse_scan_wait(object, pp, next,
 			    op);
 			continue;
 		}
 
 		KASSERT(pp == NULL || pp->valid != 0,
 		    ("unbusy invalid page %p", pp));
 
 		if (pp != NULL || vm_pager_has_page(object, new_pindex, NULL,
 			NULL)) {
 			/*
 			 * The page already exists in the parent OR swap exists
 			 * for this location in the parent.  Leave the parent's
 			 * page alone.  Destroy the original page from the
 			 * backing object.
 			 */
 			if (backing_object->type == OBJT_SWAP)
 				swap_pager_freespace(backing_object, p->pindex,
 				    1);
 			vm_page_lock(p);
 			KASSERT(!pmap_page_is_mapped(p),
 			    ("freeing mapped page %p", p));
 			/* A wired page is only detached from its object. */
 			if (p->wire_count == 0)
 				vm_page_free(p);
 			else
 				vm_page_remove(p);
 			vm_page_unlock(p);
 			continue;
 		}
 
 		/*
 		 * Page does not exist in parent, rename the page from the
 		 * backing object to the main object.
 		 *
 		 * If the page was mapped to a process, it can remain mapped
 		 * through the rename.  vm_page_rename() will handle dirty and
 		 * cache.
 		 */
 		if (vm_page_rename(p, object, new_pindex)) {
 			/*
 			 * A nonzero return means the rename did not happen;
 			 * passing NULL makes the helper use VM_WAIT rather
 			 * than sleeping on a page.
 			 */
 			next = vm_object_collapse_scan_wait(object, NULL, next,
 			    op);
 			continue;
 		}
 
 		/* Use the old pindex to free the right page. */
 		if (backing_object->type == OBJT_SWAP)
 			swap_pager_freespace(backing_object,
 			    new_pindex + backing_offset_index, 1);
 
 #if VM_NRESERVLEVEL > 0
 		/*
 		 * Rename the reservation.
 		 */
 		vm_reserv_rename(p, object, backing_object,
 		    backing_offset_index);
 #endif
 	}
 	return (true);
 }
 
 
 /*
  * Quick, partial collapse.  vm_object_collapse() uses it when paging is in
  * progress on the object or its backing object: the backing object's pages
  * are scanned with OBSC_COLLAPSE_NOWAIT, and the backing object itself is
  * left in place.  Nothing is done unless the parent holds the only
  * reference to the backing object.
  *
  * Both the object and its backing object must be write-locked.
  */
 static void
 vm_object_qcollapse(vm_object_t object)
 {
 	vm_object_t backing;
 
 	backing = object->backing_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	VM_OBJECT_ASSERT_WLOCKED(backing);
 
 	/* The scan's result is not needed here. */
 	if (backing->ref_count == 1)
 		vm_object_collapse_scan(object, OBSC_COLLAPSE_NOWAIT);
 }
 
 /*
  *	vm_object_collapse:
  *
  *	Collapse an object with the object backing it.
  *	Pages in the backing object are moved into the
  *	parent, and the backing object is deallocated.
  */
 void
 vm_object_collapse(vm_object_t object)
 {
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	
 	/*
 	 * Each iteration either removes one level of the shadow chain below
 	 * "object" (by collapse or by bypass) and tries again with the new
 	 * backing object, or breaks out.  The backing object's lock taken
 	 * inside the loop is released on every path before the iteration
 	 * ends.
 	 */
 	while (TRUE) {
 		vm_object_t backing_object;
 
 		/*
 		 * Verify that the conditions are right for collapse:
 		 *
 		 * The object exists and the backing object exists.
 		 */
 		if ((backing_object = object->backing_object) == NULL)
 			break;
 
 		/*
 		 * we check the backing object first, because it is most likely
 		 * not collapsible.
 		 */
 		VM_OBJECT_WLOCK(backing_object);
 		if (backing_object->handle != NULL ||
 		    (backing_object->type != OBJT_DEFAULT &&
 		     backing_object->type != OBJT_SWAP) ||
 		    (backing_object->flags & OBJ_DEAD) ||
 		    object->handle != NULL ||
 		    (object->type != OBJT_DEFAULT &&
 		     object->type != OBJT_SWAP) ||
 		    (object->flags & OBJ_DEAD)) {
 			VM_OBJECT_WUNLOCK(backing_object);
 			break;
 		}
 
 		/*
 		 * With paging in progress on either object only the quick,
 		 * non-blocking scan is attempted, and then we stop.
 		 */
 		if (
 		    object->paging_in_progress != 0 ||
 		    backing_object->paging_in_progress != 0
 		) {
 			vm_object_qcollapse(object);
 			VM_OBJECT_WUNLOCK(backing_object);
 			break;
 		}
 		/*
 		 * We know that we can either collapse the backing object (if
 		 * the parent is the only reference to it) or (perhaps) have
 		 * the parent bypass the object if the parent happens to shadow
 		 * all the resident pages in the entire backing object.
 		 *
 		 * This is ignoring pager-backed pages such as swap pages.
 		 * vm_object_scan_all_shadowed fails the shadowing test in this
 		 * case.
 		 */
 		if (backing_object->ref_count == 1) {
 			/*
 			 * If there is exactly one reference to the backing
 			 * object, we can collapse it into the parent.
 			 */
 			vm_object_collapse_scan(object, OBSC_COLLAPSE_WAIT);
 
 #if VM_NRESERVLEVEL > 0
 			/*
 			 * Break any reservations from backing_object.
 			 */
 			if (__predict_false(!LIST_EMPTY(&backing_object->rvq)))
 				vm_reserv_break_all(backing_object);
 #endif
 
 			/*
 			 * Move the pager from backing_object to object.
 			 */
 			if (backing_object->type == OBJT_SWAP) {
 				/*
 				 * swap_pager_copy() can sleep, in which case
 				 * the backing_object's and object's locks are
 				 * released and reacquired.
 				 * Since swap_pager_copy() is being asked to
 				 * destroy the source, it will change the
 				 * backing_object's type to OBJT_DEFAULT.
 				 */
 				swap_pager_copy(
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
 
 				/*
 				 * Free any cached pages from backing_object.
 				 */
 				if (__predict_false(
 				    !vm_object_cache_is_empty(backing_object)))
 					vm_page_cache_free(backing_object, 0, 0);
 			}
 			/*
 			 * Object now shadows whatever backing_object did.
 			 * Note that the reference to
 			 * backing_object->backing_object moves from within
 			 * backing_object to within object.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 			if (backing_object->backing_object) {
 				VM_OBJECT_WLOCK(backing_object->backing_object);
 				LIST_REMOVE(backing_object, shadow_list);
 				LIST_INSERT_HEAD(
 				    &backing_object->backing_object->shadow_head,
 				    object, shadow_list);
 				/*
 				 * The shadow_count has not changed.
 				 */
 				VM_OBJECT_WUNLOCK(backing_object->backing_object);
 			}
 			object->backing_object = backing_object->backing_object;
 			object->backing_object_offset +=
 			    backing_object->backing_object_offset;
 
 			/*
 			 * Discard backing_object.
 			 *
 			 * Since the backing object has no pages, no pager left,
 			 * and no object references within it, all that is
 			 * necessary is to dispose of it.
 			 */
 			KASSERT(backing_object->ref_count == 1, (
 "backing_object %p was somehow re-referenced during collapse!",
 			    backing_object));
 			backing_object->type = OBJT_DEAD;
 			backing_object->ref_count = 0;
 			VM_OBJECT_WUNLOCK(backing_object);
 			vm_object_destroy(backing_object);
 
 			object_collapses++;
 		} else {
 			vm_object_t new_backing_object;
 
 			/*
 			 * If we do not entirely shadow the backing object,
 			 * there is nothing we can do so we give up.
 			 *
 			 * The page-by-page scan is skipped when the parent
 			 * has a resident page at every one of its indices.
 			 */
 			if (object->resident_page_count != object->size &&
 			    !vm_object_scan_all_shadowed(object)) {
 				VM_OBJECT_WUNLOCK(backing_object);
 				break;
 			}
 
 			/*
 			 * Make the parent shadow the next object in the
 			 * chain.  Deallocating backing_object will not remove
 			 * it, since its reference count is at least 2.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 
 			new_backing_object = backing_object->backing_object;
 			if ((object->backing_object = new_backing_object) != NULL) {
 				VM_OBJECT_WLOCK(new_backing_object);
 				LIST_INSERT_HEAD(
 				    &new_backing_object->shadow_head,
 				    object,
 				    shadow_list
 				);
 				new_backing_object->shadow_count++;
 				vm_object_reference_locked(new_backing_object);
 				VM_OBJECT_WUNLOCK(new_backing_object);
 				object->backing_object_offset +=
 					backing_object->backing_object_offset;
 			}
 
 			/*
 			 * Drop the reference count on backing_object. Since
 			 * its ref_count was at least 2, it will not vanish.
 			 */
 			backing_object->ref_count--;
 			VM_OBJECT_WUNLOCK(backing_object);
 			object_bypasses++;
 		}
 
 		/*
 		 * Try again with this object's new backing object.
 		 */
 	}
 }
 
 /*
  *	vm_object_page_remove:
  *
  *	For the given object, either frees or invalidates each of the
  *	specified pages.  In general, a page is freed.  However, if a page is
  *	wired for any reason other than the existence of a managed, wired
  *	mapping, then it may be invalidated but not removed from the object.
  *	Pages are specified by the given range ["start", "end") and the option
  *	OBJPR_CLEANONLY.  As a special case, if "end" is zero, then the range
  *	extends from "start" to the end of the object.  If the option
  *	OBJPR_CLEANONLY is specified, then only the non-dirty pages within the
  *	specified range are affected.  If the option OBJPR_NOTMAPPED is
  *	specified, then the pages within the specified range must have no
  *	mappings.  Otherwise, if this option is not specified, any mappings to
  *	the specified pages are removed before the pages are freed or
  *	invalidated.
  *
  *	In general, this operation should only be performed on objects that
  *	contain managed pages.  There are, however, two exceptions.  First, it
  *	is performed on the kernel and kmem objects by vm_map_entry_delete().
  *	Second, it is used by msync(..., MS_INVALIDATE) to invalidate device-
  *	backed pages.  In both of these cases, the option OBJPR_CLEANONLY must
  *	not be specified and the option OBJPR_NOTMAPPED must be specified.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
     int options)
 {
 	vm_page_t p, next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	/* Unmanaged objects accept exactly one option set: OBJPR_NOTMAPPED. */
 	KASSERT((object->flags & OBJ_UNMANAGED) == 0 ||
 	    (options & (OBJPR_CLEANONLY | OBJPR_NOTMAPPED)) == OBJPR_NOTMAPPED,
 	    ("vm_object_page_remove: illegal options for object %p", object));
 	/* No resident pages: only cached pages, if any, are left to free. */
 	if (object->resident_page_count == 0)
 		goto skipmemq;
 	/*
 	 * Matched by the vm_object_pip_wakeup() after the loop.
 	 * NOTE(review): presumably this marks paging in progress so that the
 	 * object is left alone while its lock is dropped below -- confirm.
 	 */
 	vm_object_pip_add(object, 1);
 	/*
 	 * Restart point after sleeping on a busy page: the object lock was
 	 * dropped in the meantime, so the lookup is redone from "start".
 	 */
 again:
 	p = vm_page_find_least(object, start);
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
 	 * greater than or equal to the parameter "start" or (2) NULL.
 	 */
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		/* Fetch the successor first: "p" may be freed below. */
 		next = TAILQ_NEXT(p, listq);
 
 		/*
 		 * If the page is wired for any reason besides the existence
 		 * of managed, wired mappings, then it cannot be freed.  For
 		 * example, fictitious pages, which represent device memory,
 		 * are inherently wired and cannot be freed.  They can,
 		 * however, be invalidated if the option OBJPR_CLEANONLY is
 		 * not specified.
 		 */
 		vm_page_lock(p);
 		/*
 		 * No vm_page_unlock() follows vm_page_busy_sleep() on the two
 		 * sleep paths below.
 		 * NOTE(review): vm_page_busy_sleep() is presumably what
 		 * releases the page lock taken above -- confirm.
 		 */
 		if (vm_page_xbusied(p)) {
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_busy_sleep(p, "vmopax");
 			VM_OBJECT_WLOCK(object);
 			goto again;
 		}
 		if (p->wire_count != 0) {
 			if ((options & OBJPR_NOTMAPPED) == 0)
 				pmap_remove_all(p);
 			if ((options & OBJPR_CLEANONLY) == 0) {
 				p->valid = 0;
 				vm_page_undirty(p);
 			}
 			goto next;
 		}
 		if (vm_page_busied(p)) {
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_busy_sleep(p, "vmopar");
 			VM_OBJECT_WLOCK(object);
 			goto again;
 		}
 		KASSERT((p->flags & PG_FICTITIOUS) == 0,
 		    ("vm_object_page_remove: page %p is fictitious", p));
 		/*
 		 * With OBJPR_CLEANONLY, write access is revoked before the
 		 * dirty field is tested, and a dirty page is kept.
 		 */
 		if ((options & OBJPR_CLEANONLY) != 0 && p->valid != 0) {
 			if ((options & OBJPR_NOTMAPPED) == 0)
 				pmap_remove_write(p);
 			if (p->dirty)
 				goto next;
 		}
 		if ((options & OBJPR_NOTMAPPED) == 0)
 			pmap_remove_all(p);
 		vm_page_free(p);
 next:
 		vm_page_unlock(p);
 	}
 	vm_object_pip_wakeup(object);
 skipmemq:
 	if (__predict_false(!vm_object_cache_is_empty(object)))
 		vm_page_cache_free(object, start, end);
 }
 
 /*
  *	vm_object_page_noreuse:
  *
  *	For the given object, attempt to move the specified pages to
  *	the head of the inactive queue.  This bypasses regular LRU
  *	operation and allows the pages to be reused quickly under memory
  *	pressure.  If a page is wired for any reason, then it will not
  *	be queued.  Pages are specified by the range ["start", "end").
  *	As a special case, if "end" is zero, then the range extends from
  *	"start" to the end of the object.
  *
  *	This operation should only be performed on objects that
  *	contain non-fictitious, managed pages.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	/* "mtx" is the page lock currently held, or NULL when none is. */
 	struct mtx *mtx, *new_mtx;
 	vm_page_t p, next;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0,
 	    ("vm_object_page_noreuse: illegal object %p", object));
 	if (object->resident_page_count == 0)
 		return;
 	p = vm_page_find_least(object, start);
 
 	/*
 	 * Here, the variable "p" is either (1) the page with the least pindex
 	 * greater than or equal to the parameter "start" or (2) NULL.
 	 */
 	mtx = NULL;
 	for (; p != NULL && (p->pindex < end || end == 0); p = next) {
 		next = TAILQ_NEXT(p, listq);
 
 		/*
 		 * Avoid releasing and reacquiring the same page lock.
 		 *
 		 * The lock pointer of the current page is compared with the
 		 * one already held; the held lock is swapped only when the
 		 * two differ.
 		 */
 		new_mtx = vm_page_lockptr(p);
 		if (mtx != new_mtx) {
 			if (mtx != NULL)
 				mtx_unlock(mtx);
 			mtx = new_mtx;
 			mtx_lock(mtx);
 		}
 		vm_page_deactivate_noreuse(p);
 	}
 	/* Release whichever page lock is still held after the last page. */
 	if (mtx != NULL)
 		mtx_unlock(mtx);
 }
 
 /*
  *	Populate the specified range of the object with valid pages.  Returns
  *	TRUE if the range is successfully populated and FALSE otherwise.
  *
  *	Note: This function should be optimized to pass a larger array of
  *	pages to vm_pager_get_pages() before it is applied to a non-
  *	OBJT_DEVICE object.
  *
  *	The object must be locked.
  */
 boolean_t
 vm_object_populate(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_page_t m;
 	vm_pindex_t pindex;
 	int rv;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	for (pindex = start; pindex < end; pindex++) {
 		m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 		/* Only pages that are not fully valid are read from the pager. */
 		if (m->valid != VM_PAGE_BITS_ALL) {
 			rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
 			if (rv != VM_PAGER_OK) {
 				/*
 				 * The pager failed: free the page and stop,
 				 * leaving "pindex" at the failing index.
 				 */
 				vm_page_lock(m);
 				vm_page_free(m);
 				vm_page_unlock(m);
 				break;
 			}
 		}
 		/*
 		 * Keep "m" busy because a subsequent iteration may unlock
 		 * the object.
 		 */
 	}
 	/*
 	 * Unbusy every resident page in ["start", "pindex"), i.e. the pages
 	 * that the loop above left busy.  The walk begins at the page found
 	 * at "start" and follows the object's page list.
 	 */
 	if (pindex > start) {
 		m = vm_page_lookup(object, start);
 		while (m != NULL && m->pindex < pindex) {
 			vm_page_xunbusy(m);
 			m = TAILQ_NEXT(m, listq);
 		}
 	}
 	/* Success only if the loop ran to completion without a pager error. */
 	return (pindex == end);
 }
 
 /*
  *	Routine:	vm_object_coalesce
  *	Function:	Coalesces two objects backing up adjoining
  *			regions of memory into a single object.
  *
  *	returns TRUE if objects were combined.
  *
  *	NOTE:	Only works at the moment if the second object is NULL -
  *		if it's not, which object do we lock first?
  *
  *	Parameters:
  *		prev_object	First object to coalesce
  *		prev_offset	Offset into prev_object
  *		prev_size	Size of reference to prev_object
  *		next_size	Size of reference to the second object
  *		reserved	Indicator that extension region has
  *				swap accounted for
  *
  *	Conditions:
  *	The object must *not* be locked.
  */
 boolean_t
 vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
     vm_size_t prev_size, vm_size_t next_size, boolean_t reserved)
 {
 	vm_pindex_t next_pindex;
 
 	/* With no object there is nothing to extend; report success. */
 	if (prev_object == NULL)
 		return (TRUE);
 	VM_OBJECT_WLOCK(prev_object);
 	/* Only default and swap objects that are not tmpfs nodes qualify. */
 	if ((prev_object->type != OBJT_DEFAULT &&
 	    prev_object->type != OBJT_SWAP) ||
 	    (prev_object->flags & OBJ_TMPFS_NODE) != 0) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	/*
 	 * Try to collapse the object first
 	 */
 	vm_object_collapse(prev_object);
 
 	/*
 	 * Can't coalesce if: . more than one reference . paged out . shadows
 	 * another object . has a copy elsewhere (any of which mean that the
 	 * pages not mapped to prev_entry may be in use anyway)
 	 */
 	if (prev_object->backing_object != NULL) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	/* From here on the sizes are counted in pages, not bytes. */
 	prev_size >>= PAGE_SHIFT;
 	next_size >>= PAGE_SHIFT;
 	next_pindex = OFF_TO_IDX(prev_offset) + prev_size;
 
 	/*
 	 * A shared object may only be extended at its very end.
 	 */
 	if ((prev_object->ref_count > 1) &&
 	    (prev_object->size != next_pindex)) {
 		VM_OBJECT_WUNLOCK(prev_object);
 		return (FALSE);
 	}
 
 	/*
 	 * Account for the charge.
 	 */
 	if (prev_object->cred != NULL) {
 
 		/*
 		 * If prev_object was charged, then this mapping,
 		 * although not charged now, may become writable
 		 * later. Non-NULL cred in the object would prevent
 		 * swap reservation during enabling of the write
 		 * access, so reserve swap now. Failed reservation
 		 * cause allocation of the separate object for the map
 		 * entry, and swap reservation for this entry is
 		 * managed in appropriate time.
 		 */
 		if (!reserved && !swap_reserve_by_cred(ptoa(next_size),
 		    prev_object->cred)) {
 			/*
 			 * The object must not stay locked on this failure
 			 * path: the caller passed it in unlocked and every
 			 * other exit releases the lock.
 			 */
 			VM_OBJECT_WUNLOCK(prev_object);
 			return (FALSE);
 		}
 		prev_object->charge += ptoa(next_size);
 	}
 
 	/*
 	 * Remove any pages that may still be in the object from a previous
 	 * deallocation.
 	 */
 	if (next_pindex < prev_object->size) {
 		vm_object_page_remove(prev_object, next_pindex, next_pindex +
 		    next_size, 0);
 		if (prev_object->type == OBJT_SWAP)
 			swap_pager_freespace(prev_object,
 					     next_pindex, next_size);
 #if 0
 		if (prev_object->cred != NULL) {
 			KASSERT(prev_object->charge >=
 			    ptoa(prev_object->size - next_pindex),
 			    ("object %p overcharged 1 %jx %jx", prev_object,
 				(uintmax_t)next_pindex, (uintmax_t)next_size));
 			prev_object->charge -= ptoa(prev_object->size -
 			    next_pindex);
 		}
 #endif
 	}
 
 	/*
 	 * Extend the object if necessary.
 	 */
 	if (next_pindex + next_size > prev_object->size)
 		prev_object->size = next_pindex + next_size;
 
 	VM_OBJECT_WUNLOCK(prev_object);
 	return (TRUE);
 }
 
 /*
  * Note that the object may have been written to.
  *
  * For a vnode object the generation count is advanced and OBJ_MIGHTBEDIRTY
  * is set unless it already is.  For any other object the only effect is
  * that a tmpfs node object (which must be of type OBJT_SWAP) gets
  * OBJ_TMPFS_DIRTY.
  *
  * The object must be write-locked.
  */
 void
 vm_object_set_writeable_dirty(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type == OBJT_VNODE) {
 		object->generation++;
 		if ((object->flags & OBJ_MIGHTBEDIRTY) == 0)
 			vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
 		return;
 	}
 	if ((object->flags & OBJ_TMPFS_NODE) == 0)
 		return;
 	KASSERT(object->type == OBJT_SWAP, ("non-swap tmpfs"));
 	vm_object_set_flag(object, OBJ_TMPFS_DIRTY);
 }
 
 /*
  *	vm_object_unwire:
  *
  *	For each page offset within the specified range of the given object,
  *	find the highest-level page in the shadow chain and unwire it.  A page
  *	must exist at every page offset, and the highest-level page must be
  *	wired.
  */
 void
 vm_object_unwire(vm_object_t object, vm_ooffset_t offset, vm_size_t length,
     uint8_t queue)
 {
 	vm_object_t tobject;
 	vm_page_t m, tm;
 	vm_pindex_t end_pindex, pindex, tpindex;
 	/*
 	 * "locked_depth" is the number of objects, counted from "object" down
 	 * the shadow chain, that are currently read-locked; "depth" is how far
 	 * down the chain the search for the current page has gone.
 	 */
 	int depth, locked_depth;
 
 	KASSERT((offset & PAGE_MASK) == 0,
 	    ("vm_object_unwire: offset is not page aligned"));
 	KASSERT((length & PAGE_MASK) == 0,
 	    ("vm_object_unwire: length is not a multiple of PAGE_SIZE"));
 	/* The wired count of a fictitious page never changes. */
 	if ((object->flags & OBJ_FICTITIOUS) != 0)
 		return;
 	pindex = OFF_TO_IDX(offset);
 	end_pindex = pindex + atop(length);
 	locked_depth = 1;
 	VM_OBJECT_RLOCK(object);
 	/* "m" tracks the next resident page of "object" at or after pindex. */
 	m = vm_page_find_least(object, pindex);
 	while (pindex < end_pindex) {
 		if (m == NULL || pindex < m->pindex) {
 			/*
 			 * The first object in the shadow chain doesn't
 			 * contain a page at the current index.  Therefore,
 			 * the page must exist in a backing object.
 			 */
 			tobject = object;
 			tpindex = pindex;
 			depth = 0;
 			do {
 				/* Translate the index into the next object. */
 				tpindex +=
 				    OFF_TO_IDX(tobject->backing_object_offset);
 				tobject = tobject->backing_object;
 				KASSERT(tobject != NULL,
 				    ("vm_object_unwire: missing page"));
 				if ((tobject->flags & OBJ_FICTITIOUS) != 0)
 					goto next_page;
 				depth++;
 				/*
 				 * Lock each backing object only the first time
 				 * the search reaches its depth; the lock is
 				 * then kept for the remaining indices.
 				 */
 				if (depth == locked_depth) {
 					locked_depth++;
 					VM_OBJECT_RLOCK(tobject);
 				}
 			} while ((tm = vm_page_lookup(tobject, tpindex)) ==
 			    NULL);
 		} else {
 			/* The top object has the page; advance its cursor. */
 			tm = m;
 			m = TAILQ_NEXT(m, listq);
 		}
 		vm_page_lock(tm);
 		vm_page_unwire(tm, queue);
 		vm_page_unlock(tm);
 next_page:
 		pindex++;
 	}
 	/* Release the accumulated object locks. */
 	for (depth = 0; depth < locked_depth; depth++) {
 		/* Read the link before unlocking the object that holds it. */
 		tobject = object->backing_object;
 		VM_OBJECT_RUNLOCK(object);
 		object = tobject;
 	}
 }
 
 /*
  * Return the vnode associated with the object: the handle of a vnode
  * object, or the tmpfs vnode recorded in a swap object that has OBJ_TMPFS
  * set.  Any other object yields NULL.
  *
  * The object must be locked.
  */
 struct vnode *
 vm_object_vnode(vm_object_t object)
 {
 	struct vnode *vp;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	switch (object->type) {
 	case OBJT_VNODE:
 		vp = object->handle;
 		break;
 	case OBJT_SWAP:
 		if ((object->flags & OBJ_TMPFS) != 0)
 			vp = object->un_pager.swp.swp_tmpfs;
 		else
 			vp = NULL;
 		break;
 	default:
 		vp = NULL;
 		break;
 	}
 	return (vp);
 }
 
 /*
  * Handler for the vm.objects sysctl: emits one packed struct
  * kinfo_vmobject record for every VM object on vm_object_list that is not
  * of type OBJT_DEAD.
  *
  * When the request carries no output buffer (req->oldptr == NULL), only a
  * size estimate is reported: the space for the current number of live
  * objects plus 10% of slack.
  *
  * Returns 0 on success or the first error returned by SYSCTL_OUT().
  */
 static int
 sysctl_vm_object_list(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_vmobject kvo;
 	char *fullpath, *freepath;
 	struct vnode *vp;
 	struct vattr va;
 	vm_object_t obj;
 	vm_page_t m;
 	int count, error;
 
 	if (req->oldptr == NULL) {
 		/*
 		 * If an old buffer has not been provided, generate an
 		 * estimate of the space needed for a subsequent call.
 		 */
 		mtx_lock(&vm_object_list_mtx);
 		count = 0;
 		TAILQ_FOREACH(obj, &vm_object_list, object_list) {
 			if (obj->type == OBJT_DEAD)
 				continue;
 			count++;
 		}
 		mtx_unlock(&vm_object_list_mtx);
 		return (SYSCTL_OUT(req, NULL, sizeof(struct kinfo_vmobject) *
 		    count * 11 / 10));
 	}
 
 	error = 0;
 
 	/*
 	 * The record lives on the kernel stack and is copied out to the
 	 * requester.  Not every field is assigned below, and the copied
 	 * length is rounded up past the path's terminating NUL, so clear
 	 * the whole structure once to avoid disclosing stale stack bytes.
 	 */
 	memset(&kvo, 0, sizeof(kvo));
 
 	/*
 	 * VM objects are type stable and are never removed from the
 	 * list once added.  This allows us to safely read obj->object_list
 	 * after reacquiring the object list mutex.
 	 */
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(obj, &vm_object_list, object_list) {
 		/* Cheap unlocked test first, repeated under the object lock. */
 		if (obj->type == OBJT_DEAD)
 			continue;
 		VM_OBJECT_RLOCK(obj);
 		if (obj->type == OBJT_DEAD) {
 			VM_OBJECT_RUNLOCK(obj);
 			continue;
 		}
 		/*
 		 * The list mutex is dropped while the record is built and
 		 * copied out; it is retaken before the iteration advances.
 		 */
 		mtx_unlock(&vm_object_list_mtx);
 		kvo.kvo_size = ptoa(obj->size);
 		kvo.kvo_resident = obj->resident_page_count;
 		kvo.kvo_ref_count = obj->ref_count;
 		kvo.kvo_shadow_count = obj->shadow_count;
 		kvo.kvo_memattr = obj->memattr;
 		kvo.kvo_active = 0;
 		kvo.kvo_inactive = 0;
 		TAILQ_FOREACH(m, &obj->memq, listq) {
 			/*
 			 * A page may belong to the object but be
 			 * dequeued and set to PQ_NONE while the
 			 * object lock is not held.  This makes the
 			 * reads of m->queue below racy, and we do not
 			 * count pages set to PQ_NONE.  However, this
 			 * sysctl is only meant to give an
 			 * approximation of the system anyway.
 			 */
 			if (m->queue == PQ_ACTIVE)
 				kvo.kvo_active++;
 			else if (m->queue == PQ_INACTIVE)
 				kvo.kvo_inactive++;
 		}
 
 		kvo.kvo_vn_fileid = 0;
 		kvo.kvo_vn_fsid = 0;
 		freepath = NULL;
 		fullpath = "";
 		vp = NULL;
 		switch (obj->type) {
 		case OBJT_DEFAULT:
 			kvo.kvo_type = KVME_TYPE_DEFAULT;
 			break;
 		case OBJT_VNODE:
 			kvo.kvo_type = KVME_TYPE_VNODE;
 			/*
 			 * Take a vnode reference while the object is still
 			 * locked; the vnode is used after the unlock below.
 			 */
 			vp = obj->handle;
 			vref(vp);
 			break;
 		case OBJT_SWAP:
 			kvo.kvo_type = KVME_TYPE_SWAP;
 			break;
 		case OBJT_DEVICE:
 			kvo.kvo_type = KVME_TYPE_DEVICE;
 			break;
 		case OBJT_PHYS:
 			kvo.kvo_type = KVME_TYPE_PHYS;
 			break;
 		case OBJT_DEAD:
 			kvo.kvo_type = KVME_TYPE_DEAD;
 			break;
 		case OBJT_SG:
 			kvo.kvo_type = KVME_TYPE_SG;
 			break;
 		case OBJT_MGTDEVICE:
 			kvo.kvo_type = KVME_TYPE_MGTDEVICE;
 			break;
 		default:
 			kvo.kvo_type = KVME_TYPE_UNKNOWN;
 			break;
 		}
 		VM_OBJECT_RUNLOCK(obj);
 		if (vp != NULL) {
 			vn_fullpath(curthread, vp, &fullpath, &freepath);
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			if (VOP_GETATTR(vp, &va, curthread->td_ucred) == 0) {
 				kvo.kvo_vn_fileid = va.va_fileid;
 				kvo.kvo_vn_fsid = va.va_fsid;
 			}
 			/* Drops both the vnode lock and the vref() reference. */
 			vput(vp);
 		}
 
 		strlcpy(kvo.kvo_path, fullpath, sizeof(kvo.kvo_path));
 		if (freepath != NULL)
 			free(freepath, M_TEMP);
 
 		/* Pack record size down */
 		kvo.kvo_structsize = offsetof(struct kinfo_vmobject, kvo_path) +
 		    strlen(kvo.kvo_path) + 1;
 		kvo.kvo_structsize = roundup(kvo.kvo_structsize,
 		    sizeof(uint64_t));
 		error = SYSCTL_OUT(req, &kvo, kvo.kvo_structsize);
 		/* Retake the list mutex before leaving or advancing the loop. */
 		mtx_lock(&vm_object_list_mtx);
 		if (error)
 			break;
 	}
 	mtx_unlock(&vm_object_list_mtx);
 	return (error);
 }
 SYSCTL_PROC(_vm, OID_AUTO, objects, CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_SKIP |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_object_list, "S,kinfo_vmobject",
     "List of VM objects");
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <sys/cons.h>
 
 #include <ddb/ddb.h>
 
 /*
  * Debugger helper: return 1 if "object" is reachable from "map", else 0.
  *
  * With a NULL "entry" every entry of "map" is tested.  A submap entry is
  * tested by descending into each entry of the submap.  An ordinary entry
  * matches when "object" is the entry's VM object or appears further down
  * that object's backing_object chain.
  */
 static int
 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
 {
 	vm_map_t submap;
 	vm_map_entry_t cur;
 	vm_object_t shadow;
 	int remaining;
 
 	if (map == NULL)
 		return (0);
 
 	if (entry == NULL) {
 		/*
 		 * Walk the whole map.  The walk is bounded by the entry
 		 * count as well as by reaching the list header again.
 		 */
 		remaining = map->nentries;
 		for (cur = map->header.next;
 		    remaining-- && cur != &map->header; cur = cur->next) {
 			if (_vm_object_in_map(map, object, cur))
 				return (1);
 		}
 		return (0);
 	}
 
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 		/* Same bounded walk, over the entries of the submap. */
 		submap = entry->object.sub_map;
 		remaining = submap->nentries;
 		for (cur = submap->header.next;
 		    remaining-- && cur != &submap->header; cur = cur->next) {
 			if (_vm_object_in_map(submap, object, cur))
 				return (1);
 		}
 		return (0);
 	}
 
 	/* Ordinary entry: follow the shadow chain of its object. */
 	for (shadow = entry->object.vm_object; shadow != NULL;
 	    shadow = shadow->backing_object) {
 		if (shadow == object)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Debugger helper: return 1 if "object" is reachable from the address
  * space of any process in the system or from kernel_map, else 0.
  *
  * The process list is walked without taking allproc_lock.
  */
 static int
 vm_object_in_map(vm_object_t object)
 {
 	struct proc *p;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/* Processes without a vmspace have no map to search. */
 		if (p->p_vmspace &&
 		    _vm_object_in_map(&p->p_vmspace->vm_map, object, 0))
 			return (1);
 	}
 	return (_vm_object_in_map(kernel_map, object, 0) ? 1 : 0);
 }
 
 /*
  * Debugger command "vmochk": walk vm_object_list and report every
  * internal object (no handle, type OBJT_DEFAULT or OBJT_SWAP) that has a
  * zero reference count or that cannot be found in any map.
  */
 DB_SHOW_COMMAND(vmochk, vm_object_check)
 {
 	vm_object_t object;
 
 	/*
 	 * make sure that internal objs are in a map somewhere
 	 * and none have zero ref counts.
 	 */
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		if (object->handle == NULL &&
 		    (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) {
 			if (object->ref_count == 0) {
 				db_printf("vmochk: internal obj has zero ref count: %ld\n",
 					(long)object->size);
 			}
 			/* The size is printed twice: in decimal and in hex. */
 			if (!vm_object_in_map(object)) {
 				db_printf(
 			"vmochk: internal obj is not in a map: "
 			"ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
 				    object->ref_count, (u_long)object->size, 
 				    (u_long)object->size,
 				    (void *)object->backing_object);
 			}
 		}
 	}
 }
 
 /*
  *	vm_object_print:	[ debug ]
  */
 DB_SHOW_COMMAND(object, vm_object_print_static)
 {
 	/* XXX convert args. */
 	vm_object_t object = (vm_object_t)addr;
 	/* The per-page listing is printed only when an address was given. */
 	boolean_t full = have_addr;
 
 	vm_page_t p;
 
 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
 #define	count	was_count
 
 	/* Number of pages printed so far on the current output line. */
 	int count;
 
 	if (object == NULL)
 		return;
 
 	/* Summary: the object itself, then its shadow and backing linkage. */
 	db_iprintf(
 	    "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x ruid %d charge %jx\n",
 	    object, (int)object->type, (uintmax_t)object->size,
 	    object->resident_page_count, object->ref_count, object->flags,
 	    object->cred ? object->cred->cr_ruid : -1, (uintmax_t)object->charge);
 	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n",
 	    object->shadow_count, 
 	    object->backing_object ? object->backing_object->ref_count : 0,
 	    object->backing_object, (uintmax_t)object->backing_object_offset);
 
 	if (!full)
 		return;
 
 	/* List the resident pages, six to a line, indented under the summary. */
 	db_indent += 2;
 	count = 0;
 	TAILQ_FOREACH(p, &object->memq, listq) {
 		if (count == 0)
 			db_iprintf("memory:=");
 		else if (count == 6) {
 			db_printf("\n");
 			db_iprintf(" ...");
 			count = 0;
 		} else
 			db_printf(",");
 		count++;
 
 		db_printf("(off=0x%jx,page=0x%jx)",
 		    (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
 	}
 	/* Terminate the last, partially filled line. */
 	if (count != 0)
 		db_printf("\n");
 	db_indent -= 2;
 }
 
 /* XXX. */
 #undef count
 
 /* XXX need this non-static entry for calling from vm_map_print. */
 /*
  * Externally visible wrapper that forwards all four arguments unchanged to
  * vm_object_print_static(), the handler defined by DB_SHOW_COMMAND(object).
  */
 void
 vm_object_print(
         /* db_expr_t */ long addr,
 	boolean_t have_addr,
 	/* db_expr_t */ long count,
 	char *modif)
 {
 	vm_object_print_static(addr, have_addr, count, modif);
 }
 
 /*
  * Debugger command "vmopag": for every object on vm_object_list, print the
  * runs of physically contiguous resident pages found among the pages whose
  * pindex does not exceed 128.  Each run is printed as its first pindex, its
  * length in pages, and its starting physical address.
  *
  * Output is paged: once more than 18 lines have been printed, a character
  * is read from the console, and anything other than a space ends the
  * command.
  */
 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
 {
 	vm_object_t object;
 	vm_pindex_t fidx;
 	vm_paddr_t pa;
 	vm_page_t m, prev_m;
 	/*
 	 * rcount: length of the current run (0 when no run is open);
 	 * fidx/pa: pindex and physical address at which the run starts;
 	 * nl: lines printed since the last pause.
 	 */
 	int rcount, nl, c;
 
 	nl = 0;
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		db_printf("new object: %p\n", (void *)object);
 		if (nl > 18) {
 			c = cngetc();
 			if (c != ' ')
 				return;
 			nl = 0;
 		}
 		nl++;
 		rcount = 0;
 		fidx = 0;
 		pa = -1;
 		TAILQ_FOREACH(m, &object->memq, listq) {
 			if (m->pindex > 128)
 				break;
 			/* A gap in the page indices closes the open run. */
 			if ((prev_m = TAILQ_PREV(m, pglist, listq)) != NULL &&
 			    prev_m->pindex + 1 != m->pindex) {
 				if (rcount) {
 					db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 						(long)fidx, rcount, (long)pa);
 					if (nl > 18) {
 						c = cngetc();
 						if (c != ' ')
 							return;
 						nl = 0;
 					}
 					nl++;
 					rcount = 0;
 				}
 			}				
 			/* Physically adjacent to the open run: extend it. */
 			if (rcount &&
 				(VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) {
 				++rcount;
 				continue;
 			}
 			/* Not adjacent: print the open run, then start anew. */
 			if (rcount) {
 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 					(long)fidx, rcount, (long)pa);
 				if (nl > 18) {
 					c = cngetc();
 					if (c != ' ')
 						return;
 					nl = 0;
 				}
 				nl++;
 			}
 			fidx = m->pindex;
 			pa = VM_PAGE_TO_PHYS(m);
 			rcount = 1;
 		}
 		/* Print the run still open when the page list ends. */
 		if (rcount) {
 			db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 				(long)fidx, rcount, (long)pa);
 			if (nl > 18) {
 				c = cngetc();
 				if (c != ' ')
 					return;
 				nl = 0;
 			}
 			nl++;
 		}
 	}
 }
 #endif /* DDB */
Index: head/sys/vm/vm_object.h
===================================================================
--- head/sys/vm/vm_object.h	(revision 300042)
+++ head/sys/vm/vm_object.h	(revision 300043)
@@ -1,334 +1,335 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_object.h	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
  * $FreeBSD$
  */
 
 /*
  *	Virtual memory object module definitions.
  */
 
 #ifndef	_VM_OBJECT_
 #define	_VM_OBJECT_
 
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/_rwlock.h>
 
 #include <vm/_vm_radix.h>
 
 /*
  *	Types defined:
  *
  *	vm_object_t		Virtual memory object.
  *
  *	The root of cached pages pool is protected by both the per-object lock
  *	and the free pages queue mutex.
  *	On insert in the cache radix trie, the per-object lock is expected
  *	to be already held and the free pages queue mutex will be
  *	acquired during the operation too.
  *	On remove and lookup from the cache radix trie, only the free
  *	pages queue mutex is expected to be locked.
  *	These rules allow for reliably checking for the presence of cached
  *	pages with only the per-object lock held, thereby reducing contention
  *	for the free pages queue mutex.
  *
  * List of locks
  *	(c)	const until freed
  *	(o)	per-object lock 
  *	(f)	free pages queue mutex
  *
  */
 
 /*
  * A virtual memory object.  Lock annotations such as (c), (o) and (f)
  * in the field comments refer to the "List of locks" above.
  */
 struct vm_object {
 	struct rwlock lock;		/* per-object lock used by the VM_OBJECT_* macros */
 	TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
 	LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
 	LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
 	TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */
 	struct vm_radix rtree;		/* root of the resident page radix trie*/
 	vm_pindex_t size;		/* Object size */
 	int generation;			/* generation ID */
 	int ref_count;			/* How many refs?? */
 	int shadow_count;		/* how many objects that this is a shadow for */
 	vm_memattr_t memattr;		/* default memory attribute for pages */
 	objtype_t type;			/* type of pager */
 	u_short flags;			/* see below */
 	u_short pg_color;		/* (c) color of first page in obj */
 	u_int paging_in_progress;	/* Paging (in or out) so don't collapse or destroy */
 	int resident_page_count;	/* number of resident pages */
 	struct vm_object *backing_object; /* object that I'm a shadow of */
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
 	TAILQ_ENTRY(vm_object) pager_object_list; /* list of all objects of this pager type */
 	LIST_HEAD(, vm_reserv) rvq;	/* list of reservations */
 	struct vm_radix cache;		/* (o + f) root of the cache page radix trie */
 	void *handle;			/* pager handle; the vnode pager stores its vnode here */
 	union {
 		/*
 		 * VNode pager
 		 *
 		 *	vnp_size - current size of file
 		 *	writemappings - non-zero while the object holds a
 		 *		     write count on the vnode (released in
 		 *		     vnode_pager_dealloc()); exact units are
 		 *		     not visible here -- TODO confirm
 		 */
 		struct {
 			off_t vnp_size;
 			vm_ooffset_t writemappings;
 		} vnp;
 
 		/*
 		 * Device pager
 		 *
 		 *	devp_pglist - list of allocated pages
 		 */
 		struct {
 			TAILQ_HEAD(, vm_page) devp_pglist;
 			struct cdev_pager_ops *ops;
 			struct cdev *dev;
 		} devp;
 
 		/*
 		 * SG pager
 		 *
 		 *	sgp_pglist - list of allocated pages
 		 */
 		struct {
 			TAILQ_HEAD(, vm_page) sgp_pglist;
 		} sgp;
 
 		/*
 		 * Swap pager
 		 *
 		 *	swp_tmpfs - back-pointer to the tmpfs vnode,
 		 *		     if any, which uses the vm object
 		 *		     as backing store.  The handle
 		 *		     cannot be reused for linking,
 		 *		     because the vnode can be
 		 *		     reclaimed and recreated, making
 		 *		     the handle changed and hash-chain
 		 *		     invalid.
 		 *
 		 *	swp_bcount - number of swap 'swblock' metablocks, each
 		 *		     contains up to 16 swapblk assignments.
 		 *		     see vm/swap_pager.h
 		 */
 		struct {
 			void *swp_tmpfs;
 			int swp_bcount;
 		} swp;
 	} un_pager;
 	struct ucred *cred;		/* NOTE(review): presumably the credential charged for the object -- confirm */
 	vm_ooffset_t charge;		/* NOTE(review): presumably the amount charged to cred -- confirm */
 	void *umtx_data;		/* NOTE(review): presumably private to the umtx_shm_object_*() code -- confirm */
 };
 
 /*
  * Flags (values stored in the u_short vm_object.flags field).
  * Bits 0x0080 and 0x0800 are not assigned in this list.
  */
 #define	OBJ_FICTITIOUS	0x0001		/* (c) contains fictitious pages */
 #define	OBJ_UNMANAGED	0x0002		/* (c) contains unmanaged pages */
 #define OBJ_ACTIVE	0x0004		/* active objects */
 #define OBJ_DEAD	0x0008		/* dead objects (during rundown) */
 #define	OBJ_NOSPLIT	0x0010		/* don't split this object */
 #define	OBJ_UMTXDEAD	0x0020		/* umtx pshared was terminated */
 #define OBJ_PIPWNT	0x0040		/* paging in progress wanted */
 #define OBJ_MIGHTBEDIRTY 0x0100		/* object might be dirty, only for vnode */
 #define	OBJ_TMPFS_NODE	0x0200		/* object belongs to tmpfs VREG node */
 #define	OBJ_TMPFS_DIRTY	0x0400		/* dirty tmpfs obj */
 #define	OBJ_COLORED	0x1000		/* pg_color is defined */
 #define	OBJ_ONEMAPPING	0x2000		/* One USE (a single, non-forked) mapping flag */
 #define	OBJ_DISCONNECTWNT 0x4000	/* disconnect from vnode wanted */
 #define	OBJ_TMPFS	0x8000		/* has tmpfs vnode allocated */
 
 /*
  * Convert between a page index and a byte offset within an object by
  * shifting by PAGE_SHIFT.  The value is widened to vm_ooffset_t before
  * shifting; OFF_TO_IDX() discards the offset within the page.
  */
 #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
 #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
 
 #ifdef	_KERNEL
 
 #define OBJPC_SYNC	0x1			/* sync I/O */
 #define OBJPC_INVAL	0x2			/* invalidate */
 #define OBJPC_NOSYNC	0x4			/* skip if VPO_NOSYNC */
 
 /*
  * The following options are supported by vm_object_page_remove().
  */
 #define	OBJPR_CLEANONLY	0x1		/* Don't remove dirty pages. */
 #define	OBJPR_NOTMAPPED	0x2		/* Don't unmap pages. */
 
 TAILQ_HEAD(object_q, vm_object);
 
 extern struct object_q vm_object_list;	/* list of allocated objects */
 extern struct mtx vm_object_list_mtx;	/* lock for object list and count */
 
 extern struct vm_object kernel_object_store;
 extern struct vm_object kmem_object_store;
 
 #define	kernel_object	(&kernel_object_store)
 #define	kmem_object	(&kmem_object_store)
 
 /*
  * Per-object locking.  Each macro applies the corresponding rw_*()
  * primitive to the object's "lock" field, so callers never name the
  * field directly.  VM_OBJECT_SLEEP() sleeps on "wchan" using the
  * object lock as the interlock handed to rw_sleep().
  */
 #define	VM_OBJECT_ASSERT_LOCKED(object)					\
 	rw_assert(&(object)->lock, RA_LOCKED)
 #define	VM_OBJECT_ASSERT_RLOCKED(object)				\
 	rw_assert(&(object)->lock, RA_RLOCKED)
 #define	VM_OBJECT_ASSERT_WLOCKED(object)				\
 	rw_assert(&(object)->lock, RA_WLOCKED)
 #define	VM_OBJECT_ASSERT_UNLOCKED(object)				\
 	rw_assert(&(object)->lock, RA_UNLOCKED)
 #define	VM_OBJECT_LOCK_DOWNGRADE(object)				\
 	rw_downgrade(&(object)->lock)
 #define	VM_OBJECT_RLOCK(object)						\
 	rw_rlock(&(object)->lock)
 #define	VM_OBJECT_RUNLOCK(object)					\
 	rw_runlock(&(object)->lock)
 #define	VM_OBJECT_SLEEP(object, wchan, pri, wmesg, timo)		\
 	rw_sleep((wchan), &(object)->lock, (pri), (wmesg), (timo))
 #define	VM_OBJECT_TRYRLOCK(object)					\
 	rw_try_rlock(&(object)->lock)
 #define	VM_OBJECT_TRYWLOCK(object)					\
 	rw_try_wlock(&(object)->lock)
 #define	VM_OBJECT_TRYUPGRADE(object)					\
 	rw_try_upgrade(&(object)->lock)
 #define	VM_OBJECT_WLOCK(object)						\
 	rw_wlock(&(object)->lock)
 #define	VM_OBJECT_WOWNED(object)					\
 	rw_wowned(&(object)->lock)
 #define	VM_OBJECT_WUNLOCK(object)					\
 	rw_wunlock(&(object)->lock)
 
 /*
  *	OR the given bits into the object's flag word.
  *
  *	The object must be locked or thread private.
  */
 static __inline void
 vm_object_set_flag(vm_object_t object, u_short bits)
 {
 
 	object->flags = object->flags | bits;
 }
 
 /*
  *	Conditionally set the object's color, which (1) enables the allocation
  *	of physical memory reservations for anonymous objects and larger-than-
  *	superpage-sized named objects and (2) determines the first page offset
  *	within the object at which a reservation may be allocated.  In other
  *	words, the color determines the alignment of the object with respect
  *	to the largest superpage boundary.  When mapping named objects, like
  *	files or POSIX shared memory objects, the color should be set to zero
  *	before a virtual address is selected for the mapping.  In contrast,
  *	for anonymous objects, the color may be set after the virtual address
  *	is selected.
  *
  *	The object must be locked.
  */
 static __inline void
 vm_object_color(vm_object_t object, u_short color)
 {
 
 	/* A color that has already been assigned is never overwritten. */
 	if ((object->flags & OBJ_COLORED) != 0)
 		return;
 	object->pg_color = color;
 	object->flags |= OBJ_COLORED;
 }
 
 void vm_object_clear_flag(vm_object_t object, u_short bits);
 void vm_object_pip_add(vm_object_t object, short i);
 void vm_object_pip_subtract(vm_object_t object, short i);
 void vm_object_pip_wakeup(vm_object_t object);
 void vm_object_pip_wakeupn(vm_object_t object, short i);
 void vm_object_pip_wait(vm_object_t object, char *waitid);
 
 /*
  *	Report whether the object's cache page radix trie is empty, as
  *	determined by vm_radix_is_empty().
  */
 static __inline boolean_t
 vm_object_cache_is_empty(vm_object_t object)
 {
 	boolean_t empty;
 
 	empty = vm_radix_is_empty(&object->cache);
 	return (empty);
 }
 
 void umtx_shm_object_init(vm_object_t object);
 void umtx_shm_object_terminated(vm_object_t object);
+extern int umtx_shm_vnobj_persistent;
 
 vm_object_t vm_object_allocate (objtype_t, vm_pindex_t);
 boolean_t vm_object_coalesce(vm_object_t, vm_ooffset_t, vm_size_t, vm_size_t,
    boolean_t);
 void vm_object_collapse (vm_object_t);
 void vm_object_deallocate (vm_object_t);
 void vm_object_destroy (vm_object_t);
 void vm_object_terminate (vm_object_t);
 void vm_object_set_writeable_dirty (vm_object_t);
 void vm_object_init (void);
 void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int);
 boolean_t vm_object_page_clean(vm_object_t object, vm_ooffset_t start,
     vm_ooffset_t end, int flags);
 void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start,
     vm_pindex_t end);
 void vm_object_page_remove(vm_object_t object, vm_pindex_t start,
     vm_pindex_t end, int options);
 boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t);
 void vm_object_print(long addr, boolean_t have_addr, long count, char *modif);
 void vm_object_reference (vm_object_t);
 void vm_object_reference_locked(vm_object_t);
 int  vm_object_set_memattr(vm_object_t object, vm_memattr_t memattr);
 void vm_object_shadow (vm_object_t *, vm_ooffset_t *, vm_size_t);
 void vm_object_split(vm_map_entry_t);
 boolean_t vm_object_sync(vm_object_t, vm_ooffset_t, vm_size_t, boolean_t,
     boolean_t);
 void vm_object_unwire(vm_object_t object, vm_ooffset_t offset,
     vm_size_t length, uint8_t queue);
 struct vnode *vm_object_vnode(vm_object_t object);
 #endif				/* _KERNEL */
 
 #endif				/* _VM_OBJECT_ */
Index: head/sys/vm/vnode_pager.c
===================================================================
--- head/sys/vm/vnode_pager.c	(revision 300042)
+++ head/sys/vm/vnode_pager.c	(revision 300043)
@@ -1,1371 +1,1372 @@
 /*-
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1993, 1994 John S. Dyson
  * Copyright (c) 1995, David Greenman
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
  */
 
 /*
  * Page to/from files (vnodes).
  */
 
 /*
  * TODO:
  *	Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
  *	greatly re-simplify the vnode_pager.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/vmmeter.h>
 #include <sys/limits.h>
 #include <sys/conf.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_extern.h>
 
 static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
     daddr_t *rtaddress, int *run);
 static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
 static void vnode_pager_dealloc(vm_object_t);
 static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
     int *, vop_getpages_iodone_t, void *);
 static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *cred);
 static int vnode_pager_generic_getpages_done(struct buf *);
 static void vnode_pager_generic_getpages_done_async(struct buf *);
 
 /*
  * Pager operations implemented by this file for vnode-backed objects.
  * Every entry points at one of the vnode_pager_*() functions declared
  * above.
  */
 struct pagerops vnodepagerops = {
 	.pgo_alloc =	vnode_pager_alloc,
 	.pgo_dealloc =	vnode_pager_dealloc,
 	.pgo_getpages =	vnode_pager_getpages,
 	.pgo_getpages_async = vnode_pager_getpages_async,
 	.pgo_putpages =	vnode_pager_putpages,
 	.pgo_haspage =	vnode_pager_haspage,
 };
 
 /*
  * Free pbuf counters handed to getpbuf()/relpbuf().  Synchronous and
  * asynchronous paging use separate counters; see
  * vnode_pager_generic_getpages().
  */
 int vnode_pbuf_freecnt;
 int vnode_async_pbuf_freecnt;
 
 /*
  * Create the VM system backing object for this vnode.
  *
  *	vp	vnode to attach an object to.
  *	isize	size for the object; when 0 the size is derived below
  *		(IDX_TO_OFF(INT_MAX) for disks, va_size from VOP_GETATTR()
  *		otherwise).
  *	td	thread whose credentials are used for VOP_GETATTR() and
  *		passed on to vnode_pager_alloc().
  *
  * Always returns 0, including the cases where no object is created.
  */
 int
 vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td)
 {
 	vm_object_t object;
 	vm_ooffset_t size = isize;
 	struct vattr va;
 
 	/* Only disks and vnodes accepted by vn_canvmio() get an object. */
 	if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
 		return (0);
 
 	/*
 	 * An object is already attached.  A live one is simply kept.  For
 	 * a dead one, drop the vnode lock, ask to be woken through
 	 * OBJ_DISCONNECTWNT (vnode_pager_dealloc() issues the wakeup) and
 	 * sleep on the object, then relock the vnode and look again.
 	 */
 	while ((object = vp->v_object) != NULL) {
 		VM_OBJECT_WLOCK(object);
 		if (!(object->flags & OBJ_DEAD)) {
 			VM_OBJECT_WUNLOCK(object);
 			return (0);
 		}
 		VOP_UNLOCK(vp, 0);
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
 		VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vodead", 0);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	/* No size supplied by the caller: work one out. */
 	if (size == 0) {
 		if (vn_isdisk(vp, NULL)) {
 			size = IDX_TO_OFF(INT_MAX);
 		} else {
 			/* A failed VOP_GETATTR() is not reported as an error. */
 			if (VOP_GETATTR(vp, &va, td->td_ucred))
 				return (0);
 			size = va.va_size;
 		}
 	}
 
 	object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred);
 	/*
 	 * Dereference the reference we just created.  This assumes
 	 * that the object is associated with the vp.
 	 */
 	VM_OBJECT_WLOCK(object);
 	object->ref_count--;
 	VM_OBJECT_WUNLOCK(object);
 	/* Undo the vref() taken by vnode_pager_alloc(). */
 	vrele(vp);
 
 	KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));
 
 	return (0);
 }
 
 /*
  * Tear down the VM object attached to a vnode, if there is one, and
  * clear vp->v_object.  The vnode must be exclusively locked (asserted
  * below).
  */
 void
 vnode_destroy_vobject(struct vnode *vp)
 {
 	struct vm_object *obj;
 
 	obj = vp->v_object;
 	if (obj == NULL)
 		return;
 	ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
 	VM_OBJECT_WLOCK(obj);
+	umtx_shm_object_terminated(obj);
 	if (obj->ref_count == 0) {
 		/*
 		 * don't double-terminate the object
 		 */
 		if ((obj->flags & OBJ_DEAD) == 0)
 			vm_object_terminate(obj);
 		else
 			VM_OBJECT_WUNLOCK(obj);
 	} else {
 		/*
 		 * Woe to the process that tries to page now :-).
 		 */
 		/* Still referenced: only detach the pager from the object. */
 		vm_pager_deallocate(obj);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	vp->v_object = NULL;
 }
 
 
 /*
  * Allocate (or lookup) pager for a vnode.
  * Handle is a vnode pointer.
  *
  * Returns NULL when handle is NULL.  Otherwise returns the vnode's
  * object, newly created if the vnode had none, with one additional
  * object reference and one additional vnode reference (vref()) taken.
  * The size, rounded up to a whole page, is used only when a new
  * object is created; prot, offset and cred are not used here.
  *
  * MPSAFE
  */
 vm_object_t
 vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t offset, struct ucred *cred)
 {
 	vm_object_t object;
 	struct vnode *vp;
 
 	/*
 	 * Pageout to vnode, no can do yet.
 	 */
 	if (handle == NULL)
 		return (NULL);
 
 	vp = (struct vnode *) handle;
 
 	/*
 	 * If the object is being terminated, wait for it to
 	 * go away.
 	 */
 retry:
 	/* On loop exit, object is either NULL or live and write-locked. */
 	while ((object = vp->v_object) != NULL) {
 		VM_OBJECT_WLOCK(object);
 		if ((object->flags & OBJ_DEAD) == 0)
 			break;
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
 		VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vadead", 0);
 	}
 
 	KASSERT(vp->v_usecount != 0, ("vnode_pager_alloc: no vnode reference"));
 
 	if (object == NULL) {
 		/*
 		 * Add an object of the appropriate size
 		 */
 		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
 
 		object->un_pager.vnp.vnp_size = size;
 		object->un_pager.vnp.writemappings = 0;
 
 		object->handle = handle;
 		/* Publish the object under the vnode interlock. */
 		VI_LOCK(vp);
 		if (vp->v_object != NULL) {
 			/*
 			 * Object has been created while we were sleeping
 			 */
 			/* Discard our unpublished object and start over. */
 			VI_UNLOCK(vp);
 			VM_OBJECT_WLOCK(object);
 			KASSERT(object->ref_count == 1,
 			    ("leaked ref %p %d", object, object->ref_count));
 			object->type = OBJT_DEAD;
 			object->ref_count = 0;
 			VM_OBJECT_WUNLOCK(object);
 			vm_object_destroy(object);
 			goto retry;
 		}
 		vp->v_object = object;
 		VI_UNLOCK(vp);
 	} else {
 		/* Existing live object: take a reference and release its lock. */
 		object->ref_count++;
 #if VM_NRESERVLEVEL > 0
 		vm_object_color(object, 0);
 #endif
 		VM_OBJECT_WUNLOCK(object);
 	}
 	vref(vp);
 	return (object);
 }
 
 /*
  *	Detach a vnode-backed object from its vnode.
  *
  *	Waits for paging in progress to finish, clears the object's
  *	handle, marks it OBJT_DEAD, wakes any thread that requested it
  *	through OBJ_DISCONNECTWNT, gives back the vnode write count if
  *	writemappings was positive, clears vp->v_object and calls
  *	vunref() once for every object reference counted at entry.
  *	Panics if the handle is already NULL.
  *
  *	The object must be locked.  The lock is dropped around the
  *	vunref() calls and reacquired before returning.  The vnode must
  *	be exclusively locked (asserted below).
  */
 static void
 vnode_pager_dealloc(vm_object_t object)
 {
 	struct vnode *vp;
 	int refs;
 
 	vp = object->handle;
 	if (vp == NULL)
 		panic("vnode_pager_dealloc: pager already dealloced");
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	vm_object_pip_wait(object, "vnpdea");
 	/* Sampled after the wait; one vunref() is issued per reference. */
 	refs = object->ref_count;
 
 	object->handle = NULL;
 	object->type = OBJT_DEAD;
 	/* Wake threads sleeping in vnode_create_vobject()/vnode_pager_alloc(). */
 	if (object->flags & OBJ_DISCONNECTWNT) {
 		vm_object_clear_flag(object, OBJ_DISCONNECTWNT);
 		wakeup(object);
 	}
 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
 	if (object->un_pager.vnp.writemappings > 0) {
 		object->un_pager.vnp.writemappings = 0;
 		VOP_ADD_WRITECOUNT(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	vp->v_object = NULL;
 	VOP_UNSET_TEXT(vp);
 	VM_OBJECT_WUNLOCK(object);
 	while (refs-- > 0)
 		vunref(vp);
 	VM_OBJECT_WLOCK(object);
 }
 
 /*
  * Report whether the file backing "object" has the page at "pindex".
  *
  * Returns FALSE when there is no vnode, the vnode is doomed, the page
  * starts at or beyond vnp_size, or VOP_BMAP() maps the block to -1.
  * Returns TRUE otherwise, including when VOP_BMAP() itself fails.
  * After a successful mapping, *before and *after (each only if the
  * pointer is non-NULL) are converted from the filesystem-block counts
  * produced by VOP_BMAP() into page counts.
  *
  * The object must be write-locked; the lock is dropped around the
  * VOP_BMAP() call and reacquired afterwards.
  */
 static boolean_t
 vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
     int *after)
 {
 	struct vnode *vp = object->handle;
 	daddr_t bn;
 	int err;
 	daddr_t reqblock;
 	int poff;
 	int bsize;
 	int pagesperblock, blocksperpage;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	/*
 	 * If there is no vp or the vp is doomed, we do not have the page.
 	 */
 	if (vp == NULL || vp->v_iflag & VI_DOOMED)
 		return FALSE;
 	/*
 	 * If the offset is beyond end of file we do
 	 * not have the page.
 	 */
 	if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size)
 		return FALSE;
 
 	/*
 	 * Find the filesystem block holding the page.  pagesperblock is 0
 	 * when the block size is smaller than a page, in which case the
 	 * page covers blocksperpage blocks.
 	 */
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	pagesperblock = bsize / PAGE_SIZE;
 	blocksperpage = 0;
 	if (pagesperblock > 0) {
 		reqblock = pindex / pagesperblock;
 	} else {
 		blocksperpage = (PAGE_SIZE / bsize);
 		reqblock = pindex * blocksperpage;
 	}
 	VM_OBJECT_WUNLOCK(object);
 	err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
 	VM_OBJECT_WLOCK(object);
 	/* A VOP_BMAP() failure is reported as "page present". */
 	if (err)
 		return TRUE;
 	if (bn == -1)
 		return FALSE;
 	if (pagesperblock > 0) {
 		/* poff is the page's position within its block. */
 		poff = pindex - (reqblock * pagesperblock);
 		if (before) {
 			*before *= pagesperblock;
 			*before += poff;
 		}
 		if (after) {
 			/*
 			 * The BMAP vop can report a partial block in the
 			 * 'after', but must not report blocks after EOF.
 			 * Assert the latter, and truncate 'after' in case
 			 * of the former.
 			 */
 			KASSERT((reqblock + *after) * pagesperblock <
 			    roundup2(object->size, pagesperblock),
 			    ("%s: reqblock %jd after %d size %ju", __func__,
 			    (intmax_t )reqblock, *after,
 			    (uintmax_t )object->size));
 			*after *= pagesperblock;
 			*after += pagesperblock - (poff + 1);
 			if (pindex + *after >= object->size)
 				*after = object->size - 1 - pindex;
 		}
 	} else {
 		/* Several blocks per page: scale the block counts down. */
 		if (before) {
 			*before /= blocksperpage;
 		}
 
 		if (after) {
 			*after /= blocksperpage;
 		}
 	}
 	return TRUE;
 }
 
 /*
  * Lets the VM system know about a change in size for a file.
  * We adjust our own internal size and flush any cached pages in
  * the associated object that are affected by the size change.
  *
  *	vp	vnode whose file size changed; nothing is done when it
  *		has no VM object or the object is already OBJT_DEAD.
  *	nsize	new file size in bytes.
  *
  * Note: this routine may be invoked as a result of a pager put
  * operation (possibly at object termination time), so we must be careful.
  */
 void
 vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
 {
 	vm_object_t object;
 	vm_page_t m;
 	vm_pindex_t nobjsize;
 
 	if ((object = vp->v_object) == NULL)
 		return;
 /* 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */
 	VM_OBJECT_WLOCK(object);
 	if (object->type == OBJT_DEAD) {
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	KASSERT(object->type == OBJT_VNODE,
 	    ("not vnode-backed object %p", object));
 	if (nsize == object->un_pager.vnp.vnp_size) {
 		/*
 		 * Hasn't changed size
 		 */
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	/* New object size in pages, rounding a partial last page up. */
 	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
 	if (nsize < object->un_pager.vnp.vnp_size) {
 		/*
 		 * File has shrunk. Toss any cached pages beyond the new EOF.
 		 */
 		if (nobjsize < object->size)
 			vm_object_page_remove(object, nobjsize, object->size,
 			    0);
 		/*
 		 * this gets rid of garbage at the end of a page that is now
 		 * only partially backed by the vnode.
 		 *
 		 * XXX for some reason (I don't know yet), if we take a
 		 * completely invalid page and mark it partially valid
 		 * it can screw up NFS reads, so we don't allow the case.
 		 */
 		if ((nsize & PAGE_MASK) &&
 		    (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL &&
 		    m->valid != 0) {
 			/* base: first byte past the new EOF within the page. */
 			int base = (int)nsize & PAGE_MASK;
 			int size = PAGE_SIZE - base;
 
 			/*
 			 * Clear out partial-page garbage in case
 			 * the page has been mapped.
 			 */
 			pmap_zero_page_area(m, base, size);
 
 			/*
 			 * Update the valid bits to reflect the blocks that
 			 * have been zeroed.  Some of these valid bits may
 			 * have already been set.
 			 */
 			vm_page_set_valid_range(m, base, size);
 
 			/*
 			 * Round "base" to the next block boundary so that the
 			 * dirty bit for a partially zeroed block is not
 			 * cleared.
 			 */
 			base = roundup2(base, DEV_BSIZE);
 
 			/*
 			 * Clear out partial-page dirty bits.
 			 *
 			 * note that we do not clear out the valid
 			 * bits.  This would prevent bogus_page
 			 * replacement from working properly.
 			 */
 			vm_page_clear_dirty(m, base, PAGE_SIZE - base);
 		} else if ((nsize & PAGE_MASK) &&
 		    vm_page_is_cached(object, OFF_TO_IDX(nsize))) {
 			/* The partial last page is a cached page: free it. */
 			vm_page_cache_free(object, OFF_TO_IDX(nsize),
 			    nobjsize);
 		}
 	}
 	/* Record the new size both in bytes and in pages. */
 	object->un_pager.vnp.vnp_size = nsize;
 	object->size = nobjsize;
 	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
  * Translate a byte offset within a file into a device block address.
  *
  * The offset is split into a filesystem block number (f_iosize units)
  * and an offset inside that block.  VOP_BMAP() maps the block; on
  * success the value it stored in *rtaddress, unless that is -1, is
  * advanced by the in-block offset expressed in DEV_BSIZE units.  When
  * "run" is non-NULL the count returned by VOP_BMAP() is incremented,
  * scaled by the number of pages per block and reduced by the pages
  * that precede "address" within its block.
  *
  * Returns -1 for a negative offset or a doomed vnode, otherwise the
  * VOP_BMAP() status.
  */
 static int
 vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress,
     int *run)
 {
 	daddr_t blkno, blkoff;
 	int error, iosize;
 
 	if (address < 0 || (vp->v_iflag & VI_DOOMED) != 0)
 		return (-1);
 
 	iosize = vp->v_mount->mnt_stat.f_iosize;
 	blkno = address / iosize;
 	blkoff = address % iosize;
 
 	error = VOP_BMAP(vp, blkno, NULL, rtaddress, run, NULL);
 	if (error != 0)
 		return (error);
 
 	if (*rtaddress != -1)
 		*rtaddress += blkoff / DEV_BSIZE;
 	if (run != NULL) {
 		*run += 1;
 		*run *= iosize / PAGE_SIZE;
 		*run -= blkoff / PAGE_SIZE;
 	}
 	return (error);
 }
 
 /*
  * small block filesystem vnode pager input
  *
  * Fills page "m" one filesystem block (f_iosize bytes) at a time.
  * Blocks that already have a valid bit set are skipped, blocks that
  * start at or beyond vnp_size or that map to -1 are zero-filled, and
  * every other block is read through a pbuf, waiting for each read to
  * complete before moving on.  The valid bits of each block are set as
  * soon as that block has been filled.
  *
  * Returns VM_PAGER_BAD if the vnode is doomed, VM_PAGER_ERROR if block
  * mapping or the read failed, VM_PAGER_OK otherwise.
  */
 static int
 vnode_pager_input_smlfs(vm_object_t object, vm_page_t m)
 {
 	struct vnode *vp;
 	struct bufobj *bo;
 	struct buf *bp;
 	struct sf_buf *sf;
 	daddr_t fileaddr;
 	vm_offset_t bsize;
 	vm_page_bits_t bits;
 	int error, i;
 
 	error = 0;
 	vp = object->handle;
 	if (vp->v_iflag & VI_DOOMED)
 		return VM_PAGER_BAD;
 
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 
 	/* Only the bufobj is wanted here; the return value is not checked. */
 	VOP_BMAP(vp, 0, &bo, 0, NULL, NULL);
 
 	/* Map the page into kernel virtual address space for the transfer. */
 	sf = sf_buf_alloc(m, 0);
 
 	for (i = 0; i < PAGE_SIZE / bsize; i++) {
 		vm_ooffset_t address;
 
 		bits = vm_page_bits(i * bsize, bsize);
 		if (m->valid & bits)
 			continue;
 
 		address = IDX_TO_OFF(m->pindex) + i * bsize;
 		if (address >= object->un_pager.vnp.vnp_size) {
 			/* Past EOF: treat like a hole and zero-fill below. */
 			fileaddr = -1;
 		} else {
 			error = vnode_pager_addr(vp, address, &fileaddr, NULL);
 			if (error)
 				break;
 		}
 		if (fileaddr != -1) {
 			bp = getpbuf(&vnode_pbuf_freecnt);
 
 			/* build a minimal buffer header */
 			bp->b_iocmd = BIO_READ;
 			bp->b_iodone = bdone;
 			KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 			KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 			bp->b_rcred = crhold(curthread->td_ucred);
 			bp->b_wcred = crhold(curthread->td_ucred);
 			bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize;
 			bp->b_blkno = fileaddr;
 			pbgetbo(bo, bp);
 			bp->b_vp = vp;
 			bp->b_bcount = bsize;
 			bp->b_bufsize = bsize;
 			bp->b_runningbufspace = bp->b_bufsize;
 			atomic_add_long(&runningbufspace, bp->b_runningbufspace);
 
 			/* do the input */
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
 
 			bwait(bp, PVM, "vnsrd");
 
 			if ((bp->b_ioflags & BIO_ERROR) != 0)
 				error = EIO;
 
 			/*
 			 * free the buffer header back to the swap buffer pool
 			 */
 			bp->b_vp = NULL;
 			pbrelbo(bp);
 			relpbuf(bp, &vnode_pbuf_freecnt);
 			if (error)
 				break;
 		} else
 			bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize);
 		KASSERT((m->dirty & bits) == 0,
 		    ("vnode_pager_input_smlfs: page %p is dirty", m));
 		/* The object lock is taken only to update the valid bits. */
 		VM_OBJECT_WLOCK(object);
 		m->valid |= bits;
 		VM_OBJECT_WUNLOCK(object);
 	}
 	sf_buf_free(sf);
 	if (error) {
 		return VM_PAGER_ERROR;
 	}
 	return VM_PAGER_OK;
 }
 
 /*
  * old style vnode pager input routine
  *
  * Reads the page through VOP_READ() into a temporary kernel mapping of
  * the page.  A page that starts at or beyond vnp_size yields
  * VM_PAGER_BAD.  A short read is padded with zeroes up to PAGE_SIZE; a
  * read that transfers nothing is turned into EINVAL.  On success the
  * whole page is marked valid.
  *
  * The object must be write-locked on entry; the lock is dropped for
  * the duration of the read and is held again on return.
  */
 static int
 vnode_pager_input_old(vm_object_t object, vm_page_t m)
 {
 	struct iovec iov;
 	struct uio io;
 	struct sf_buf *sf;
 	struct vnode *vp;
 	vm_ooffset_t pgoff;
 	int error, got, size;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Return failure if beyond current EOF
 	 */
 	pgoff = IDX_TO_OFF(m->pindex);
 	if (pgoff >= object->un_pager.vnp.vnp_size)
 		return (VM_PAGER_BAD);
 
 	/* Clip the transfer to the end of the file. */
 	size = PAGE_SIZE;
 	if (pgoff + size > object->un_pager.vnp.vnp_size)
 		size = object->un_pager.vnp.vnp_size - pgoff;
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 
 	/*
 	 * Allocate a kernel virtual address and initialize so that
 	 * we can use VOP_READ/WRITE routines.
 	 */
 	sf = sf_buf_alloc(m, 0);
 
 	iov.iov_base = (caddr_t)sf_buf_kva(sf);
 	iov.iov_len = size;
 	io.uio_iov = &iov;
 	io.uio_iovcnt = 1;
 	io.uio_offset = IDX_TO_OFF(m->pindex);
 	io.uio_segflg = UIO_SYSSPACE;
 	io.uio_rw = UIO_READ;
 	io.uio_resid = size;
 	io.uio_td = curthread;
 
 	error = VOP_READ(vp, &io, 0, curthread->td_ucred);
 	if (error == 0) {
 		got = size - io.uio_resid;
 		if (got == 0)
 			error = EINVAL;
 		else if (got != PAGE_SIZE)
 			bzero((caddr_t)sf_buf_kva(sf) + got, PAGE_SIZE - got);
 	}
 	sf_buf_free(sf);
 
 	VM_OBJECT_WLOCK(object);
 
 	KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m));
 	if (error != 0)
 		return (VM_PAGER_ERROR);
 	m->valid = VM_PAGE_BITS_ALL;
 	return (VM_PAGER_OK);
 }
 
 /*
  * generic vnode pager input routine
  */
 
 /*
  * Local media VFS's that do not implement their own VOP_GETPAGES
  * should have their VOP_GETPAGES call to vnode_pager_generic_getpages()
  * to implement the previous behaviour.
  *
  * All other FS's should use the bypass to get to the local media
  * backing vp's VOP_GETPAGES.
  */
 static int
 vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
     int *rahead)
 {
 	struct vnode *backing_vp;
 	int status;
 
 	/*
 	 * Pick up the vnode while the object is still locked, then call
 	 * into the filesystem with the object lock released.
 	 */
 	backing_vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 
 	status = VOP_GETPAGES(backing_vp, m, count, rbehind, rahead);
 	KASSERT(status != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages not implemented\n"));
 
 	VM_OBJECT_WLOCK(object);
 	return (status);
 }
 
 /*
  * Asynchronous counterpart of vnode_pager_getpages(): hands the
  * request, together with the completion callback and its argument, to
  * VOP_GETPAGES_ASYNC() with the object lock released, and relocks the
  * object before returning the VOP's status.
  */
 static int
 vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count,
     int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg)
 {
 	struct vnode *backing_vp;
 	int status;
 
 	backing_vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 
 	status = VOP_GETPAGES_ASYNC(backing_vp, m, count, rbehind, rahead,
 	    iodone, arg);
 	KASSERT(status != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages_async not implemented\n"));
 
 	VM_OBJECT_WLOCK(object);
 	return (status);
 }
 
 /*
  * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for
  * local filesystems, where partially valid pages can only occur at
  * the end of file.
  */
 int
 vnode_pager_local_getpages(struct vop_getpages_args *ap)
 {
 	int rv;
 
 	/* No completion callback is supplied: iodone and arg are NULL. */
 	rv = vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, NULL, NULL);
 	return (rv);
 }
 
 int
 vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap)
 {
 	int rv;
 
 	/* Same as the synchronous variant, plus the caller's iodone/arg. */
 	rv = vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
 	return (rv);
 }
 
 /*
  * Generic page-in routine.  This is called from local media filesystems
  * to operate against their own vnodes if they do not implement
  * VOP_GETPAGES themselves.
  *
  *	vp		vnode to read from; character and block devices are
  *			not supported.
  *	m, count	array of requested pages and its length.
  *	a_rbehind,	optional (may be NULL) in/out values.  On entry the
  *	a_rahead	number of read-behind / read-ahead pages wanted; when
  *			the clustered read below is issued they are updated to
  *			the number of such pages actually added to the I/O.
  *	iodone, arg	if iodone is not NULL the read is started
  *			asynchronously and iodone(arg, ...) is run on
  *			completion; otherwise the function waits for the I/O.
  *
  * Returns a VM_PAGER_* status (the small block size path returns whatever
  * vnode_pager_input_smlfs() reports).
  */
 int
 vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,
     int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg)
 {
 	vm_object_t object;
 	struct bufobj *bo;
 	struct buf *bp;
 	off_t foff;
 	int bsize, pagesperblock, *freecnt;
 	int error, before, after, rbehind, rahead, poff, i;
 	int bytecount, secmask;
 
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("%s does not support devices", __func__));
 
 	if (vp->v_iflag & VI_DOOMED)
 		return (VM_PAGER_BAD);
 
 	object = vp->v_object;
 	foff = IDX_TO_OFF(m[0]->pindex);
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	pagesperblock = bsize / PAGE_SIZE;
 
 	KASSERT(foff < object->un_pager.vnp.vnp_size,
 	    ("%s: page %p offset beyond vp %p size", __func__, m[0], vp));
 	/*
 	 * Bound the request by the number of slots in b_pages[], not by
 	 * the size of the array in bytes.  sizeof is not evaluated, so it
 	 * is fine that bp is not assigned yet.
 	 */
 	KASSERT(count <= sizeof(bp->b_pages) / sizeof(bp->b_pages[0]),
 	    ("%s: requested %d pages", __func__, count));
 
 	/*
 	 * The last page has valid blocks.  Invalid part can only
 	 * exist at the end of file, and the page is made fully valid
 	 * by zeroing in vm_pager_get_pages().
 	 */
 	if (m[count - 1]->valid != 0 && --count == 0) {
 		/* Nothing left to read; complete the request right away. */
 		if (iodone != NULL)
 			iodone(arg, m, 1, 0);
 		return (VM_PAGER_OK);
 	}
 
 	/*
 	 * Synchronous and asynchronous paging operations use different
 	 * free pbuf counters.  This is done to avoid asynchronous requests
 	 * to consume all pbufs.
 	 * Allocate the pbuf at the very beginning of the function, so that
 	 * if we are low on certain kind of pbufs don't even proceed to BMAP,
 	 * but sleep.
 	 */
 	freecnt = iodone != NULL ?
 	    &vnode_async_pbuf_freecnt : &vnode_pbuf_freecnt;
 	bp = getpbuf(freecnt);
 
 	/*
 	 * Get the underlying device blocks for the file with VOP_BMAP().
 	 * If the file system doesn't support VOP_BMAP, use old way of
 	 * getting pages via VOP_READ.
 	 */
 	error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before);
 	if (error == EOPNOTSUPP) {
 		relpbuf(bp, freecnt);
 		VM_OBJECT_WLOCK(object);
 		for (i = 0; i < count; i++) {
 			PCPU_INC(cnt.v_vnodein);
 			PCPU_INC(cnt.v_vnodepgsin);
 			error = vnode_pager_input_old(object, m[i]);
 			if (error)
 				break;
 		}
 		VM_OBJECT_WUNLOCK(object);
 		return (error);
 	} else if (error != 0) {
 		relpbuf(bp, freecnt);
 		return (VM_PAGER_ERROR);
 	}
 
 	/*
 	 * If the file system supports BMAP, but blocksize is smaller
 	 * than a page size, then use special small filesystem code.
 	 */
 	if (pagesperblock == 0) {
 		relpbuf(bp, freecnt);
 		for (i = 0; i < count; i++) {
 			PCPU_INC(cnt.v_vnodein);
 			PCPU_INC(cnt.v_vnodepgsin);
 			error = vnode_pager_input_smlfs(object, m[i]);
 			if (error)
 				break;
 		}
 		return (error);
 	}
 
 	/*
 	 * A sparse file can be encountered only for a single page request,
 	 * which may not be preceded by call to vm_pager_haspage().
 	 */
 	if (bp->b_blkno == -1) {
 		KASSERT(count == 1,
 		    ("%s: array[%d] request to a sparse file %p", __func__,
 		    count, vp));
 		relpbuf(bp, freecnt);
 		/* A hole reads as zeroes; no I/O is needed. */
 		pmap_zero_page(m[0]);
 		KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty",
 		    __func__, m[0]));
 		VM_OBJECT_WLOCK(object);
 		m[0]->valid = VM_PAGE_BITS_ALL;
 		VM_OBJECT_WUNLOCK(object);
 		return (VM_PAGER_OK);
 	}
 
 	/* Advance from the start of the block to the first requested page. */
 	bp->b_blkno += (foff % bsize) / DEV_BSIZE;
 
 	/* Recalculate blocks available after/before to pages. */
 	poff = (foff % bsize) / PAGE_SIZE;
 	before *= pagesperblock;
 	before += poff;
 	after *= pagesperblock;
 	after += pagesperblock - (poff + 1);
 	if (m[0]->pindex + after >= object->size)
 		after = object->size - 1 - m[0]->pindex;
 	KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d",
 	    __func__, count, after + 1));
 	after -= count - 1;
 
 	/* Trim requested rbehind/rahead to possible values. */
 	rbehind = a_rbehind ? *a_rbehind : 0;
 	rahead = a_rahead ? *a_rahead : 0;
 	rbehind = min(rbehind, before);
 	rbehind = min(rbehind, m[0]->pindex);
 	rahead = min(rahead, after);
 	rahead = min(rahead, object->size - m[count - 1]->pindex);
 	/* As above: compare against the element count of b_pages[]. */
 	KASSERT(rbehind + rahead + count <=
 	    sizeof(bp->b_pages) / sizeof(bp->b_pages[0]),
 	    ("%s: behind %d ahead %d count %d", __func__,
 	    rbehind, rahead, count));
 
 	/*
 	 * Fill in the bp->b_pages[] array with requested and optional
 	 * read behind or read ahead pages.  Read behind pages are looked
 	 * up in a backward direction, down to a first cached page.  Same
 	 * for read ahead pages, but there is no need to shift the array
 	 * in case of encountering a cached page.
 	 */
 	i = bp->b_npages = 0;
 	if (rbehind) {
 		vm_pindex_t startpindex, tpindex;
 		vm_page_t p;
 
 		VM_OBJECT_WLOCK(object);
 		startpindex = m[0]->pindex - rbehind;
 		/* Do not reach back past a page already in the object. */
 		if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL &&
 		    p->pindex >= startpindex)
 			startpindex = p->pindex + 1;
 
 		/* tpindex is unsigned; beware of numeric underflow. */
 		for (tpindex = m[0]->pindex - 1;
 		    tpindex >= startpindex && tpindex < m[0]->pindex;
 		    tpindex--, i++) {
 			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
 			    VM_ALLOC_IFNOTCACHED);
 			if (p == NULL) {
 				/* Shift the array. */
 				for (int j = 0; j < i; j++)
 					bp->b_pages[j] = bp->b_pages[j +
 					    tpindex + 1 - startpindex];
 				break;
 			}
 			bp->b_pages[tpindex - startpindex] = p;
 		}
 
 		bp->b_pgbefore = i;
 		bp->b_npages += i;
 		/* The I/O now starts i pages earlier on the device. */
 		bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE;
 	} else
 		bp->b_pgbefore = 0;
 
 	/* Requested pages. */
 	for (int j = 0; j < count; j++, i++)
 		bp->b_pages[i] = m[j];
 	bp->b_npages += count;
 
 	if (rahead) {
 		vm_pindex_t endpindex, tpindex;
 		vm_page_t p;
 
 		/* The lock is already held if read behind was processed. */
 		if (!VM_OBJECT_WOWNED(object))
 			VM_OBJECT_WLOCK(object);
 		endpindex = m[count - 1]->pindex + rahead + 1;
 		if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL &&
 		    p->pindex < endpindex)
 			endpindex = p->pindex;
 		if (endpindex > object->size)
 			endpindex = object->size;
 
 		for (tpindex = m[count - 1]->pindex + 1;
 		    tpindex < endpindex; i++, tpindex++) {
 			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL |
 			    VM_ALLOC_IFNOTCACHED);
 			if (p == NULL)
 				break;
 			bp->b_pages[i] = p;
 		}
 
 		bp->b_pgafter = i - bp->b_npages;
 		bp->b_npages = i;
 	} else
 		bp->b_pgafter = 0;
 
 	if (VM_OBJECT_WOWNED(object))
 		VM_OBJECT_WUNLOCK(object);
 
 	/* Report back actual behind/ahead read. */
 	if (a_rbehind)
 		*a_rbehind = bp->b_pgbefore;
 	if (a_rahead)
 		*a_rahead = bp->b_pgafter;
 
 	/* Element count, not byte size, is the capacity of b_pages[]. */
 	KASSERT(bp->b_npages <= sizeof(bp->b_pages) / sizeof(bp->b_pages[0]),
 	    ("%s: buf %p overflowed", __func__, bp));
 
 	/*
 	 * Recalculate first offset and bytecount with regards to read behind.
 	 * Truncate bytecount to vnode real size and round up physical size
 	 * for real devices.
 	 */
 	foff = IDX_TO_OFF(bp->b_pages[0]->pindex);
 	bytecount = bp->b_npages << PAGE_SHIFT;
 	if ((foff + bytecount) > object->un_pager.vnp.vnp_size)
 		bytecount = object->un_pager.vnp.vnp_size - foff;
 	secmask = bo->bo_bsize - 1;
 	KASSERT(secmask < PAGE_SIZE && secmask > 0,
 	    ("%s: sector size %d too large", __func__, secmask + 1));
 	bytecount = (bytecount + secmask) & ~secmask;
 
 	/*
 	 * And map the pages to be read into the kva, if the filesystem
 	 * requires mapped buffers.
 	 */
 	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
 	    unmapped_buf_allowed) {
 		bp->b_data = unmapped_buf;
 		bp->b_offset = 0;
 	} else {
 		bp->b_data = bp->b_kvabase;
 		pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 	}
 
 	/* Build a minimal buffer header. */
 	bp->b_iocmd = BIO_READ;
 	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 	bp->b_rcred = crhold(curthread->td_ucred);
 	bp->b_wcred = crhold(curthread->td_ucred);
 	pbgetbo(bo, bp);
 	bp->b_vp = vp;
 	bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 
 	atomic_add_long(&runningbufspace, bp->b_runningbufspace);
 	PCPU_INC(cnt.v_vnodein);
 	PCPU_ADD(cnt.v_vnodepgsin, bp->b_npages);
 
 	if (iodone != NULL) { /* async */
 		/*
 		 * Completion processing and release of the pbuf happen in
 		 * vnode_pager_generic_getpages_done_async().
 		 */
 		bp->b_pgiodone = iodone;
 		bp->b_caller1 = arg;
 		bp->b_iodone = vnode_pager_generic_getpages_done_async;
 		bp->b_flags |= B_ASYNC;
 		BUF_KERNPROC(bp);
 		bstrategy(bp);
 		return (VM_PAGER_OK);
 	} else {
 		bp->b_iodone = bdone;
 		bstrategy(bp);
 		bwait(bp, PVM, "vnread");
 		error = vnode_pager_generic_getpages_done(bp);
 		/* Scrub the page array before the pbuf is given back. */
 		for (i = 0; i < bp->b_npages; i++)
 			bp->b_pages[i] = NULL;
 		bp->b_vp = NULL;
 		pbrelbo(bp);
 		relpbuf(bp, &vnode_pbuf_freecnt);
 		return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 	}
 }
 
 /*
  * b_iodone handler for asynchronous generic getpages requests.
  *
  * Runs the common completion processing, reports the outcome for the
  * originally requested pages (the ones between the read-behind and the
  * read-ahead pages in b_pages[]) through the saved callback, and finally
  * scrubs and releases the pbuf.
  */
 static void
 vnode_pager_generic_getpages_done_async(struct buf *bp)
 {
 	int idx, status;
 
 	status = vnode_pager_generic_getpages_done(bp);
 
 	/* Run the iodone upon the requested range. */
 	bp->b_pgiodone(bp->b_caller1, &bp->b_pages[bp->b_pgbefore],
 	    bp->b_npages - bp->b_pgbefore - bp->b_pgafter, status);
 
 	idx = 0;
 	while (idx < bp->b_npages) {
 		bp->b_pages[idx] = NULL;
 		idx++;
 	}
 	bp->b_vp = NULL;
 	pbrelbo(bp);
 	relpbuf(bp, &vnode_async_pbuf_freecnt);
 }
 
 static int
 vnode_pager_generic_getpages_done(struct buf *bp)
 {
 	vm_object_t object;
 	off_t tfoff, nextoff;
 	int i, error;
 
 	error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0;
 	object = bp->b_vp->v_object;
 
 	if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) {
 		if (!buf_mapped(bp)) {
 			bp->b_data = bp->b_kvabase;
 			pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages,
 			    bp->b_npages);
 		}
 		bzero(bp->b_data + bp->b_bcount,
 		    PAGE_SIZE * bp->b_npages - bp->b_bcount);
 	}
 	if (buf_mapped(bp)) {
 		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 		bp->b_data = unmapped_buf;
 	}
 
 	VM_OBJECT_WLOCK(object);
 	for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex);
 	    i < bp->b_npages; i++, tfoff = nextoff) {
 		vm_page_t mt;
 
 		nextoff = tfoff + PAGE_SIZE;
 		mt = bp->b_pages[i];
 
 		if (nextoff <= object->un_pager.vnp.vnp_size) {
 			/*
 			 * Read filled up entire page.
 			 */
 			mt->valid = VM_PAGE_BITS_ALL;
 			KASSERT(mt->dirty == 0,
 			    ("%s: page %p is dirty", __func__, mt));
 			KASSERT(!pmap_page_is_mapped(mt),
 			    ("%s: page %p is mapped", __func__, mt));
 		} else {
 			/*
 			 * Read did not fill up entire page.
 			 *
 			 * Currently we do not set the entire page valid,
 			 * we just try to clear the piece that we couldn't
 			 * read.
 			 */
 			vm_page_set_valid_range(mt, 0,
 			    object->un_pager.vnp.vnp_size - tfoff);
 			KASSERT((mt->dirty & vm_page_bits(0,
 			    object->un_pager.vnp.vnp_size - tfoff)) == 0,
 			    ("%s: page %p is dirty", __func__, mt));
 		}
 
 		if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter)
 			vm_page_readahead_finish(mt);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	if (error != 0)
 		printf("%s: I/O read error %d\n", __func__, error);
 
 	return (error);
 }
 
 /*
  * Pager putpages entry point: hand the pages to the filesystem's
  * VOP_PUTPAGES.
  *
  * EOPNOTSUPP is no longer a legal result.  Local media filesystems that
  * do not implement their own VOP_PUTPAGES should have it call
  * vnode_pager_generic_putpages() to get the previous behaviour; all other
  * filesystems should use the bypass to reach the VOP_PUTPAGES of the
  * local media backing vnode.
  *
  * The object write lock is dropped around the VOP call and re-taken
  * before returning.
  */
 static void
 vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
     int flags, int *rtvals)
 {
 	struct vnode *vp;
 	int bytes, rtval;
 
 	bytes = count * PAGE_SIZE;
 
 	/*
 	 * When extremely low on memory, force synchronous operation to
 	 * prevent a low-memory deadlock.  VOP operations often need to
 	 * allocate more memory to initiate the I/O (e.g. to do a BMAP
 	 * operation).  The swapper handles the case by limiting the amount
 	 * of asynchronous I/O, but that sort of solution doesn't scale well
 	 * for the vnode pager without a lot of work.
 	 *
 	 * Also, the backing vnode's iodone routine may not wake the pageout
 	 * daemon up.  This should probably be addressed XXX.
 	 */
 	if (vm_cnt.v_free_count + vm_cnt.v_cache_count <
 	    vm_cnt.v_pageout_free_min)
 		flags |= VM_PAGER_PUT_SYNC;
 
 	/* Call the filesystem-specific putpages function. */
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals);
 	KASSERT(rtval != EOPNOTSUPP,
 	    ("vnode_pager: stale FS putpages\n"));
 	VM_OBJECT_WLOCK(object);
 }
 
 
 /*
  * Generic page-out routine.  This is called from local media filesystems
  * to operate against their own vnodes if they do not implement
  * VOP_PUTPAGES themselves.
  *
  * This is typically called indirectly via the pageout daemon and
  * clustering has already typically occurred, so in general we ask the
  * underlying filesystem to write the data out asynchronously rather
  * than delayed.
  *
  * Per-page results are stored in rtvals[]; the value returned is
  * rtvals[0], or VM_PAGER_BAD for a negative first page index.
  */
 int
 vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
     int flags, int *rtvals)
 {
 	static struct timeval lastfail;
 	static int curfail;
 	struct iovec aiov;
 	struct uio auio;
 	vm_object_t object;
 	vm_ooffset_t poffset;
 	vm_page_t tailpg;
 	int count, error, idx, ioflags, maxsize, ncount, pgoff, ppscheck;
 
 	object = vp->v_object;
 	count = bytecount / PAGE_SIZE;
 
 	/* Assume failure for every page until shown otherwise. */
 	for (idx = 0; idx < count; idx++)
 		rtvals[idx] = VM_PAGER_ERROR;
 
 	if ((int64_t)ma[0]->pindex < 0) {
 		printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n",
 		    (long)ma[0]->pindex, (u_long)ma[0]->dirty);
 		rtvals[0] = VM_PAGER_BAD;
 		return (VM_PAGER_BAD);
 	}
 
 	poffset = IDX_TO_OFF(ma[0]->pindex);
 	maxsize = count * PAGE_SIZE;
 	ncount = count;
 
 	/*
 	 * If the page-aligned write is larger than the actual file we
 	 * have to invalidate pages occurring beyond the file EOF.  However,
 	 * there is an edge case where a file may not be page-aligned where
 	 * the last page is partially invalid.  In this case the filesystem
 	 * may not properly clear the dirty bits for the entire page (which
 	 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
 	 * With the page locked we are free to fix-up the dirty bits here.
 	 *
 	 * We do not under any circumstances truncate the valid bits, as
 	 * this will screw up bogus page replacement.
 	 */
 	VM_OBJECT_WLOCK(object);
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size <= poffset) {
 			/* The whole range lies beyond the end of file. */
 			maxsize = 0;
 			ncount = 0;
 		} else {
 			maxsize = object->un_pager.vnp.vnp_size - poffset;
 			ncount = btoc(maxsize);
 			pgoff = (int)maxsize & PAGE_MASK;
 			if (pgoff != 0) {
 				/*
 				 * If the object is locked and the following
 				 * conditions hold, then the page's dirty
 				 * field cannot be concurrently changed by a
 				 * pmap operation.
 				 */
 				tailpg = ma[ncount - 1];
 				vm_page_assert_sbusied(tailpg);
 				KASSERT(!pmap_page_is_write_mapped(tailpg),
 				    ("vnode_pager_generic_putpages: page %p is not read-only",
 				    tailpg));
 				vm_page_clear_dirty(tailpg, pgoff,
 				    PAGE_SIZE - pgoff);
 			}
 		}
 		/* Pages entirely past the end of file are not written. */
 		for (idx = ncount; idx < count; idx++)
 			rtvals[idx] = VM_PAGER_BAD;
 	}
 	VM_OBJECT_WUNLOCK(object);
 
 	/*
 	 * Pageouts are already clustered, use IO_ASYNC to force a bawrite()
 	 * rather than a bdwrite() to prevent paging I/O from saturating
 	 * the buffer cache.  Dummy-up the sequential heuristic to cause
 	 * large ranges to cluster.  If neither IO_SYNC or IO_ASYNC is set,
 	 * the system decides how to cluster.
 	 */
 	ioflags = IO_VMIO | (IO_SEQMAX << IO_SEQSHIFT);
 	if ((flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0)
 		ioflags |= IO_SYNC;
 	else if ((flags & VM_PAGER_CLUSTER_OK) == 0)
 		ioflags |= IO_ASYNC;
 	if ((flags & VM_PAGER_PUT_INVAL) != 0)
 		ioflags |= IO_INVAL;
 
 	/* Describe the write; UIO_NOCOPY means no source buffer is given. */
 	aiov.iov_base = NULL;
 	aiov.iov_len = maxsize;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = poffset;
 	auio.uio_resid = maxsize;
 	auio.uio_segflg = UIO_NOCOPY;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = NULL;
 	error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred);
 	PCPU_INC(cnt.v_vnodeout);
 	PCPU_ADD(cnt.v_vnodepgsout, ncount);
 
 	/* Failure reports go through ppsratecheck() to limit their rate. */
 	ppscheck = 0;
 	if (error) {
 		ppscheck = ppsratecheck(&lastfail, &curfail, 1);
 		if (ppscheck)
 			printf("vnode_pager_putpages: I/O error %d\n", error);
 	}
 	if (auio.uio_resid) {
 		if (ppscheck || ppsratecheck(&lastfail, &curfail, 1))
 			printf("vnode_pager_putpages: residual I/O %zd at %lu\n",
 			    auio.uio_resid, (u_long)ma[0]->pindex);
 	}
 
 	/* NOTE(review): pages are reported OK even when VOP_WRITE failed. */
 	for (idx = 0; idx < ncount; idx++)
 		rtvals[idx] = VM_PAGER_OK;
 
 	return (rtvals[0]);
 }
 
 /*
  * Update the page-out status of the pages in ma[] after 'written' bytes,
  * counted from the start of ma[0], have been written.
  *
  * Pages covered completely are marked VM_PAGER_OK and undirtied.  A
  * trailing page that was covered only in part is marked VM_PAGER_AGAIN
  * and has just its written range cleaned.  Nothing is done when written
  * is zero.
  */
 void
 vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written)
 {
 	vm_object_t obj;
 	int idx, off;
 
 	if (written == 0)
 		return;
 
 	obj = ma[0]->object;
 	VM_OBJECT_WLOCK(obj);
 	idx = 0;
 	for (off = 0; off < written; off += PAGE_SIZE) {
 		if (off >= trunc_page(written)) {
 			/* Partially written page. */
 			rtvals[idx] = VM_PAGER_AGAIN;
 			vm_page_clear_dirty(ma[idx], 0, written & PAGE_MASK);
 		} else {
 			rtvals[idx] = VM_PAGER_OK;
 			vm_page_undirty(ma[idx]);
 		}
 		idx++;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * Adjust the object's count of write-mapped bytes by (end - start) and
  * keep the vnode's write count in step with it: the vnode count is
  * incremented when writemappings leaves zero and decremented when it
  * returns to zero.  Passing start and end swapped yields a decrement.
  *
  * Nothing is done if the object is not of type OBJT_VNODE.
  */
 void
 vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end)
 {
 	struct vnode *vp;
 	vm_ooffset_t prev;
 
 	VM_OBJECT_WLOCK(object);
 	if (object->type == OBJT_VNODE) {
 		prev = object->un_pager.vnp.writemappings;
 		object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start;
 		vp = object->handle;
 		if (prev == 0 && object->un_pager.vnp.writemappings != 0) {
 			/* First writable mapping appeared. */
 			ASSERT_VOP_ELOCKED(vp, "v_writecount inc");
 			VOP_ADD_WRITECOUNT(vp, 1);
 			CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
 			    __func__, vp, vp->v_writecount);
 		} else if (prev != 0 &&
 		    object->un_pager.vnp.writemappings == 0) {
 			/* Last writable mapping went away. */
 			ASSERT_VOP_ELOCKED(vp, "v_writecount dec");
 			VOP_ADD_WRITECOUNT(vp, -1);
 			CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 			    __func__, vp, vp->v_writecount);
 		}
 	}
 	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
  * Drop (end - start) bytes from the object's count of write-mapped bytes.
  *
  * While the count stays non-zero this is a plain subtraction under the
  * object lock.  When the count would reach zero, the vnode is held and
  * exclusively locked and the final decrement is delegated to
  * vnode_pager_update_writecount().
  */
 void
 vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	vm_offset_t len;
 
 	len = end - start;
 
 	VM_OBJECT_WLOCK(object);
 
 	/*
 	 * The vnode may have been reclaimed in the meantime, so the
 	 * object type has to be rechecked under the lock.
 	 */
 	if (object->type != OBJT_VNODE) {
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 
 	/* Fast path: writemappings is not going to zero. */
 	if (object->un_pager.vnp.writemappings != len) {
 		object->un_pager.vnp.writemappings -= len;
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 
 	/* Keep the vnode from going away while the object lock is dropped. */
 	vp = object->handle;
 	vhold(vp);
 	VM_OBJECT_WUNLOCK(object);
 
 	mp = NULL;
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 
 	/*
 	 * Swapping the start and end arguments makes
 	 * vnode_pager_update_writecount() decrement the object's
 	 * writemappings.  Unless there was a race with vnode reclamation,
 	 * the vnode's v_writecount is decremented as well.
 	 */
 	vnode_pager_update_writecount(object, end, start);
 
 	VOP_UNLOCK(vp, 0);
 	vdrop(vp);
 	if (mp != NULL)
 		vn_finished_write(mp);
 }