Index: include/Makefile
===================================================================
--- include/Makefile
+++ include/Makefile
@@ -18,9 +18,9 @@
 	inttypes.h iso646.h kenv.h langinfo.h libgen.h limits.h link.h \
 	locale.h malloc.h malloc_np.h memory.h monetary.h mpool.h mqueue.h \
 	ndbm.h netconfig.h \
-	netdb.h nl_types.h nlist.h nss.h nsswitch.h paths.h \
-	printf.h proc_service.h pthread.h \
-	pthread_np.h pwd.h ranlib.h readpassphrase.h regex.h \
+	netdb.h nl_types.h nlist.h nss.h nsswitch.h paths.h printf.h \
+	proc_service.h pthread.h pthread_np.h pthread_workqueue.h \
+	pwd.h ranlib.h readpassphrase.h regex.h \
 	res_update.h resolv.h runetype.h search.h semaphore.h setjmp.h \
 	signal.h spawn.h stab.h stdalign.h stdbool.h stddef.h \
 	stdnoreturn.h stdio.h stdlib.h string.h stringlist.h \
Index: lib/libc/sys/Symbol.map
===================================================================
--- lib/libc/sys/Symbol.map
+++ lib/libc/sys/Symbol.map
@@ -399,7 +399,9 @@
 };
 
 FBSD_1.5 {
-	 fdatasync;
+	fdatasync;
+	thr_stack;
+	thr_workq;
 };
 
 FBSDprivate_1.0 {
@@ -1001,10 +1003,14 @@
 	__sys_thr_self;
 	_thr_set_name;
 	__sys_thr_set_name;
+	_thr_stack;
+	__sys_thr_stack;
 	_thr_suspend;
 	__sys_thr_suspend;
 	_thr_wake;
 	__sys_thr_wake;
+	_thr_workq;
+	__sys_thr_workq;
 	_ktimer_create;
 	__sys_ktimer_create;
 	_ktimer_delete;
Index: lib/libthr/pthread.map
===================================================================
--- lib/libthr/pthread.map
+++ lib/libthr/pthread.map
@@ -316,6 +316,39 @@
 	pthread_getthreadid_np;
 };
 
+
+FBSD_1.3 {
+  _pthread_workqueue_additem_np;
+  _pthread_workqueue_atfork_child;
+  _pthread_workqueue_atfork_parent;
+  _pthread_workqueue_atfork_prepare;
+  _pthread_workqueue_attr_destroy_np;
+  _pthread_workqueue_attr_getovercommit_np;
+  _pthread_workqueue_attr_getqueuepriority_np;
+  _pthread_workqueue_attr_init_np;
+  _pthread_workqueue_attr_setovercommit_np;
+  _pthread_workqueue_attr_setqueuepriority_np;
+  _pthread_workqueue_create_np;
+  _pthread_workqueue_getovercommit_np;
+  _pthread_workqueue_init_np;
+  _pthread_workqueue_requestconcurrency_np;
+
+  pthread_workqueue_additem_np;
+  pthread_workqueue_atfork_child;
+  pthread_workqueue_atfork_parent;
+  pthread_workqueue_atfork_prepare;
+  pthread_workqueue_attr_destroy_np;
+  pthread_workqueue_attr_getovercommit_np;
+  pthread_workqueue_attr_getqueuepriority_np;
+  pthread_workqueue_attr_init_np;
+  pthread_workqueue_attr_setovercommit_np;
+  pthread_workqueue_attr_setqueuepriority_np;
+  pthread_workqueue_create_np;
+  pthread_workqueue_getovercommit_np;
+  pthread_workqueue_init_np;
+  pthread_workqueue_requestconcurrency_np;
+};
+
 FBSD_1.4 {
 	 pthread_mutex_consistent;
 	 pthread_mutexattr_getrobust;
Index: lib/libthr/thread/Makefile.inc
===================================================================
--- lib/libthr/thread/Makefile.inc
+++ lib/libthr/thread/Makefile.inc
@@ -56,5 +56,6 @@
 	thr_suspend_np.c \
 	thr_switch_np.c \
 	thr_symbols.c \
+	thr_workq.c \
 	thr_umtx.c \
 	thr_yield.c
Index: lib/libthr/thread/thr_init.c
===================================================================
--- lib/libthr/thread/thr_init.c
+++ lib/libthr/thread/thr_init.c
@@ -59,7 +59,7 @@
 #include "libc_private.h"
 #include "thr_private.h"
 
-char		*_usrstack;
+static char		*_usrstack;
 struct pthread	*_thr_initial;
 int		_libthr_debug;
 int		_thread_event_mask;
@@ -113,8 +113,6 @@
 
 int		_thr_is_smp = 0;
 size_t		_thr_guard_default;
-size_t		_thr_stack_default = THR_STACK_DEFAULT;
-size_t		_thr_stack_initial = THR_STACK_INITIAL;
 int		_thr_page_size;
 int		_thr_spinloops;
 int		_thr_yieldloops;
@@ -387,9 +385,8 @@
 	 * resource limits, so this stack needs an explicitly mapped
 	 * red zone to protect the thread stack that is just beyond.
 	 */
-	if (mmap(_usrstack - _thr_stack_initial -
-	    _thr_guard_default, _thr_guard_default, 0, MAP_ANON,
-	    -1, 0) == MAP_FAILED)
+	if (mmap(_usrstack - THR_STACK_INITIAL - _thr_guard_default,
+		_thr_guard_default, PROT_NONE, MAP_ANON, -1, 0) == MAP_FAILED)
 		PANIC("Cannot allocate red zone for initial thread");
 
 	/*
@@ -401,8 +398,8 @@
 	 *       actually free() it; it just puts it in the free
 	 *       stack queue for later reuse.
 	 */
-	thread->attr.stackaddr_attr = _usrstack - _thr_stack_initial;
-	thread->attr.stacksize_attr = _thr_stack_initial;
+	thread->attr.stackaddr_attr = _usrstack - THR_STACK_INITIAL;
+	thread->attr.stacksize_attr = THR_STACK_INITIAL;
 	thread->attr.guardsize_attr = _thr_guard_default;
 	thread->attr.flags |= THR_STACK_USER;
 
@@ -471,7 +468,7 @@
 		if (env_bigstack != NULL || env_splitstack == NULL) {
 			if (getrlimit(RLIMIT_STACK, &rlim) == -1)
 				PANIC("Cannot get stack rlimit");
-			_thr_stack_initial = rlim.rlim_cur;
+			// XXX - _thr_stack_initial = rlim.rlim_cur;
 		}
 		len = sizeof(_thr_is_smp);
 		sysctlbyname("kern.smp.cpus", &_thr_is_smp, &len, NULL, 0);
@@ -479,7 +476,7 @@
 		_thr_page_size = getpagesize();
 		_thr_guard_default = _thr_page_size;
 		_pthread_attr_default.guardsize_attr = _thr_guard_default;
-		_pthread_attr_default.stacksize_attr = _thr_stack_default;
+		_pthread_attr_default.stacksize_attr = THR_STACK_DEFAULT;
 		env = getenv("LIBPTHREAD_SPINLOOPS");
 		if (env)
 			_thr_spinloops = atoi(env);
Index: lib/libthr/thread/thr_private.h
===================================================================
--- lib/libthr/thread/thr_private.h
+++ lib/libthr/thread/thr_private.h
@@ -705,7 +705,6 @@
  * Global variables for the pthread kernel.
  */
 
-extern char		*_usrstack __hidden;
 extern struct pthread	*_thr_initial __hidden;
 
 /* For debugger */
@@ -738,8 +737,6 @@
 extern int	_thr_is_smp __hidden;
 
 extern size_t	_thr_guard_default __hidden;
-extern size_t	_thr_stack_default __hidden;
-extern size_t	_thr_stack_initial __hidden;
 extern int	_thr_page_size __hidden;
 extern int	_thr_spinloops __hidden;
 extern int	_thr_yieldloops __hidden;
Index: lib/libthr/thread/thr_stack.c
===================================================================
--- lib/libthr/thread/thr_stack.c
+++ lib/libthr/thread/thr_stack.c
@@ -84,7 +84,7 @@
  *    |       Red Zone (guard page)       | red zone for 2nd thread
  *    |                                   |
  *    +-----------------------------------+
- *    |  stack 2 - _thr_stack_default     | top of 2nd thread stack
+ *    |  stack 2 - THR_STACK_DEFAULT      | top of 2nd thread stack
  *    |                                   |
  *    |                                   |
  *    |                                   |
@@ -95,7 +95,7 @@
  *    |       Red Zone                    | red zone for 1st thread
  *    |                                   |
  *    +-----------------------------------+
- *    |  stack 1 - _thr_stack_default     | top of 1st thread stack
+ *    |  stack 1 - THR_STACK_DEFAULT      | top of 1st thread stack
  *    |                                   |
  *    |                                   |
  *    |                                   |
@@ -106,7 +106,7 @@
  *    |       Red Zone                    |
  *    |                                   | red zone for main thread
  *    +-----------------------------------+
- *    | USRSTACK - _thr_stack_initial     | top of main thread stack
+ *    | KERN_USRSTACK - THR_STACK_INITIAL | top of main thread stack
  *    |                                   | ^
  *    |                                   | |
  *    |                                   | |
@@ -117,7 +117,6 @@
  * high memory
  *
  */
-static char *last_stack = NULL;
 
 /*
  * Round size up to the nearest multiple of
@@ -194,11 +193,10 @@
 	struct stack *spare_stack;
 	size_t stacksize;
 	size_t guardsize;
-	char *stackaddr;
 
 	/*
 	 * Round up stack size to nearest multiple of _thr_page_size so
-	 * that mmap() * will work.  If the stack size is not an even
+	 * that thr_stack() will work.  If the stack size is not an even
 	 * multiple, we end up initializing things such that there is
 	 * unused space above the beginning of the stack, so the stack
 	 * sits snugly against its guard.
@@ -211,7 +209,7 @@
 
 	/*
 	 * Use the garbage collector lock for synchronization of the
-	 * spare stack lists and allocations from usrstack.
+	 * spare stack lists.
 	 */
 	THREAD_LIST_WRLOCK(curthread);
 	/*
@@ -246,43 +244,10 @@
 		THREAD_LIST_UNLOCK(curthread);
 	}
 	else {
-		/*
-		 * Allocate a stack from or below usrstack, depending
-		 * on the LIBPTHREAD_BIGSTACK_MAIN env variable.
-		 */
-		if (last_stack == NULL)
-			last_stack = _usrstack - _thr_stack_initial -
-			    _thr_guard_default;
-
-		/* Allocate a new stack. */
-		stackaddr = last_stack - stacksize - guardsize;
-
-		/*
-		 * Even if stack allocation fails, we don't want to try to
-		 * use this location again, so unconditionally decrement
-		 * last_stack.  Under normal operating conditions, the most
-		 * likely reason for an mmap() error is a stack overflow of
-		 * the adjacent thread stack.
-		 */
-		last_stack -= (stacksize + guardsize);
-
-		/* Release the lock before mmap'ing it. */
+		/* thr_stack() can block so release the lock */
 		THREAD_LIST_UNLOCK(curthread);
 
-		/* Map the stack and guard page together, and split guard
-		   page from allocated space: */
-		if ((stackaddr = mmap(stackaddr, stacksize + guardsize,
-		     _rtld_get_stack_prot(), MAP_STACK,
-		     -1, 0)) != MAP_FAILED &&
-		    (guardsize == 0 ||
-		     mprotect(stackaddr, guardsize, PROT_NONE) == 0)) {
-			stackaddr += guardsize;
-		} else {
-			if (stackaddr != MAP_FAILED)
-				munmap(stackaddr, stacksize + guardsize);
-			stackaddr = NULL;
-		}
-		attr->stackaddr_attr = stackaddr;
+		attr->stackaddr_attr = thr_stack(stacksize, guardsize);
 	}
 	if (attr->stackaddr_attr != NULL)
 		return (0);
Index: share/man/man3/Makefile
===================================================================
--- share/man/man3/Makefile
+++ share/man/man3/Makefile
@@ -273,6 +273,7 @@
 		pthread_suspend_np.3 \
 		pthread_switch_add_np.3 \
 		pthread_testcancel.3 \
+		pthread_workqueue_np.3 \
 		pthread_yield.3
 
 PTHREAD_MLINKS=	pthread_affinity_np.3 pthread_getaffinity_np.3 \
@@ -335,6 +336,15 @@
 PTHREAD_MLINKS+=pthread_testcancel.3 pthread_setcancelstate.3 \
 		pthread_testcancel.3 pthread_setcanceltype.3
 PTHREAD_MLINKS+=pthread_join.3 pthread_timedjoin_np.3
+PTHREAD_MLINKS+=pthread_workqueue_np.3 pthread_workqueue_init_np.3 \
+	pthread_workqueue_np.3 pthread_workqueue_create_np.3 \
+	pthread_workqueue_np.3 pthread_workqueue_additem_np.3 \
+	pthread_workqueue_np.3 pthread_workqueue_attr_init_np.3 \
+	pthread_workqueue_np.3 pthread_workqueue_attr_destroy_np.3 \
+	pthread_workqueue_np.3 pthread_workqueue_attr_getqueuepriority_np.3 \
+	pthread_workqueue_np.3 pthread_workqueue_attr_setqueuepriority_np.3 \
+	pthread_workqueue_np.3 pthread_workqueue_attr_getovercommit_np.3 \
+	pthread_workqueue_np.3 pthread_workqueue_attr_setovercommit_np.3
 .endif
 
 .include <bsd.prog.mk>
Index: sys/conf/NOTES
===================================================================
--- sys/conf/NOTES
+++ sys/conf/NOTES
@@ -1183,6 +1183,9 @@
 
 # POSIX message queue
 options 	P1003_1B_MQUEUE
+
+# PThread WorkQueue support
+options		THRWORKQ
 
 #####################################################################
 # SECURITY POLICY PARAMETERS
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3320,6 +3320,7 @@
 kern/kern_tc.c			standard
 kern/kern_thr.c			standard
 kern/kern_thread.c		standard
+kern/kern_thrworkq.c		standard
 kern/kern_time.c		standard
 kern/kern_timeout.c		standard
 kern/kern_umtx.c		standard
Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -226,6 +226,9 @@
 P1003_1B_SEMAPHORES		opt_posix.h
 _KPOSIX_PRIORITY_SCHEDULING	opt_posix.h
 
+# PThread WorkQueue Option
+THRWORKQ	opt_thrworkq.h
+
 # Do we want the config file compiled into the kernel?
 INCLUDE_CONFIG_FILE	opt_config.h
 
Index: sys/i386/include/atomic.h
===================================================================
--- sys/i386/include/atomic.h
+++ sys/i386/include/atomic.h
@@ -214,6 +214,40 @@
 	return (res);
 }
 
+/*
+ * Atomic compare and set of a 64-bit value, using cmpxchg8b.
+ *
+ * if (*dst == exp) *dst = src (all 64 bits)
+ *
+ * Returns 0 on failure, non-zero on success.
+ */
+static __inline int
+atomic_cmpset_64(volatile uint64_t *dst, uint64_t exp, uint64_t src)
+{
+	uint64_t res = exp;
+
+	/*
+	 * cmpxchg8b compares %edx:%eax with the memory operand and, when
+	 * they match, stores %ecx:%ebx there.  setz/movzbl then turn ZF
+	 * into a 0/1 value in %eax, the low half of res.
+	 */
+	__asm __volatile (
+	"       " MPLOCKED "            "
+	"       cmpxchg8b %2 ;          "
+	"       setz    %%al ;          "
+	"       movzbl  %%al,%0 ;       "
+	"# atomic_cmpset_64"
+	: "+A" (res),			/* 0 (result) */
+	  "=m" (*dst)			/* 1 */
+	: "m" (*dst),			/* 2 */
+	  "b" ((uint32_t)src),
+	  "c" ((uint32_t)(src >> 32))
+	: "memory", "cc");
+
+	/* Only the low 32 bits carry the result; %edx is left over. */
+	return ((int)(uint32_t)res);
+}
+
 #endif /* CPU_DISABLE_CMPXCHG */
 
 /*
Index: sys/kern/init_sysent.c
===================================================================
--- sys/kern/init_sysent.c
+++ sys/kern/init_sysent.c
@@ -513,8 +513,8 @@
 	{ AS(thr_set_name_args), (sy_call_t *)sys_thr_set_name, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 464 = thr_set_name */
 	{ AS(aio_fsync_args), (sy_call_t *)sys_aio_fsync, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 465 = aio_fsync */
 	{ AS(rtprio_thread_args), (sy_call_t *)sys_rtprio_thread, AUE_RTPRIO, NULL, 0, 0, SYF_CAPENABLED, SY_THR_STATIC },	/* 466 = rtprio_thread */
-	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },			/* 467 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },			/* 468 = nosys */
+	{ AS(thr_stack_args), (sy_call_t *)sys_thr_stack, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 467 = thr_stack */
+	{ AS(thr_workq_args), (sy_call_t *)sys_thr_workq, AUE_NULL, NULL, 0, 0, 0, SY_THR_STATIC },	/* 468 = thr_workq */
 	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },			/* 469 = __getpath_fromfd */
 	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },			/* 470 = __getpath_fromaddr */
 	{ AS(sctp_peeloff_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0, SYF_CAPENABLED, SY_THR_ABSENT },	/* 471 = sctp_peeloff */
Index: sys/kern/kern_exec.c
===================================================================
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -30,6 +30,7 @@
 #include "opt_capsicum.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
+#include "opt_thrworkq.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
@@ -64,6 +65,9 @@
 #include <sys/sysent.h>
 #include <sys/shm.h>
 #include <sys/sysctl.h>
+#ifdef THRWORKQ
+#include <sys/thrworkq.h>
+#endif
 #include <sys/vnode.h>
 #include <sys/stat.h>
 #ifdef KTRACE
@@ -211,6 +215,7 @@
 	int error;
 
 	error = pre_execve(td, &oldvmspace);
+
 	if (error != 0)
 		return (error);
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
@@ -294,6 +299,10 @@
 			error = ERESTART;
 		PROC_UNLOCK(p);
 	}
+#ifdef THRWORKQ
+	if (error == 0)
+		thrworkq_exit(p);
+#endif
 	KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0,
 	    ("nested execve"));
 	*oldvmspace = p->p_vmspace;
Index: sys/kern/kern_exit.c
===================================================================
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -39,6 +39,7 @@
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
+#include "opt_thrworkq.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -72,6 +73,9 @@
 #include <sys/shm.h>
 #include <sys/sem.h>
 #include <sys/umtx.h>
+#ifdef THRWORKQ
+#include <sys/thrworkq.h>
+#endif
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
@@ -204,6 +208,13 @@
 		panic("Going nowhere without my init!");
 	}
 
+#ifdef THRWORKQ
+	/*
+	 * Check if this process has a thread workqueue.
+	 */
+	thrworkq_exit(p);
+#endif
+
 	/*
 	 * Deref SU mp, since the thread does not return to userspace.
 	 */
Index: sys/kern/kern_mutex.c
===================================================================
--- sys/kern/kern_mutex.c
+++ sys/kern/kern_mutex.c
@@ -40,6 +40,7 @@
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
+#include "opt_thrworkq.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -1032,6 +1033,10 @@
 	blocked_lock.mtx_lock = 0xdeadc0de;	/* Always blocked. */
 	mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
 	mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN);
+#ifdef THRWORKQ
+	mtx_init(&proc0.p_twqlock, "thr workq lock", NULL, MTX_DEF | MTX_DUPOK);
+	proc0.p_twq = NULL;
+#endif /* THRWORKQ */
 	mtx_init(&proc0.p_statmtx, "pstatl", NULL, MTX_SPIN);
 	mtx_init(&proc0.p_itimmtx, "pitiml", NULL, MTX_SPIN);
 	mtx_init(&proc0.p_profmtx, "pprofl", NULL, MTX_SPIN);
Index: sys/kern/kern_proc.c
===================================================================
--- sys/kern/kern_proc.c
+++ sys/kern/kern_proc.c
@@ -37,6 +37,7 @@
 #include "opt_ktrace.h"
 #include "opt_kstack_pages.h"
 #include "opt_stack.h"
+#include "opt_thrworkq.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -245,6 +246,10 @@
 	cv_init(&p->p_pwait, "ppwait");
 	cv_init(&p->p_dbgwait, "dbgwait");
 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
+#ifdef THRWORKQ
+	mtx_init(&p->p_twqlock, "thr workq lock", NULL, MTX_DEF | MTX_DUPOK);
+	p->p_twq = NULL;
+#endif /* THRWORKQ */
 	EVENTHANDLER_INVOKE(process_init, p);
 	p->p_stats = pstats_alloc();
 	p->p_pgrp = NULL;
Index: sys/kern/kern_synch.c
===================================================================
--- sys/kern/kern_synch.c
+++ sys/kern/kern_synch.c
@@ -429,6 +429,11 @@
 	SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
 #endif
 	/*
+	 * Do the context switch callback before blocking.
+	 */
+	if (td->td_cswitchcb != NULL)
+		(*td->td_cswitchcb)(SWCB_BLOCK, td);
+	/*
 	 * Compute the amount of time during which the current
 	 * thread was running, and add that to its total so far.
 	 */
@@ -459,6 +464,11 @@
 	CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
 	    td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name);
 
+	/*
+	 * Do the context switch callback for unblocking.
+	 */
+	if (td->td_cswitchcb != NULL)
+		(*td->td_cswitchcb)(SWCB_UNBLOCK, td);
 	/* 
 	 * If the last thread was exiting, finish cleaning it up.
 	 */
Index: sys/kern/kern_thr.c
===================================================================
--- sys/kern/kern_thr.c
+++ sys/kern/kern_thr.c
@@ -29,9 +29,12 @@
 
 #include "opt_compat.h"
 #include "opt_posix.h"
+#include "opt_thrworkq.h"
+
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
@@ -52,6 +55,7 @@
 #include <sys/ucontext.h>
 #include <sys/thr.h>
 #include <sys/rtprio.h>
+#include <sys/thrworkq.h>
 #include <sys/umtx.h>
 #include <sys/limits.h>
 
@@ -59,8 +63,25 @@
 
 #include <machine/frame.h>
 
+#include <vm/pmap.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+
 #include <security/audit/audit.h>
 
+/*
+ * Default stack guard size for thread.  If set to zero then no
+ * guard page.
+ */
+#define        THR_GUARD_DEFAULT       PAGE_SIZE
+
+/*
+ * XXX - These should most likely be sysctl parameters.
+ */
+static vm_size_t thr_stack_default = THR_STACK_DEFAULT;
+static vm_size_t thr_stack_initial = THR_STACK_INITIAL;
+
 static SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0,
     "thread allocation");
 
@@ -312,8 +333,15 @@
 
 	umtx_thread_exit(td);
 
-	/* Signal userland that it can free the stack. */
+#ifdef THRWORKQ
+	if (td->td_reuse_stack != NULL) {
+		thrworkq_reusestack(td->td_proc, td->td_reuse_stack);
+		td->td_reuse_stack = NULL;
+	}
+#endif
+
 	if ((void *)uap->state != NULL) {
+		/* Signal userland that it can free the stack. */
 		suword_lwpid(uap->state, 1);
 		kern_umtx_wake(td, uap->state, INT_MAX, 0);
 	}
@@ -596,6 +624,175 @@
 }
 
 int
+sys_thr_stack(struct thread *td, struct thr_stack_args *uap)
+{
+	vm_size_t stacksz, guardsz;
+	void *addr;
+	int error;
+
+	/*
+	 * System call entry point: map a new thread stack in the calling
+	 * process and hand its address back through td_retval[0].
+	 */
+
+	/* Round up to the nearest page size. */
+	stacksz = (vm_size_t)round_page(uap->stacksize);
+	guardsz = (vm_size_t)round_page(uap->guardsize);
+
+	/* Reject requests so large that the rounding wrapped around. */
+	if (stacksz < (vm_size_t)uap->stacksize ||
+	    guardsz < (vm_size_t)uap->guardsize)
+		return (EINVAL);
+
+	/* A stack size of zero selects the default thread stack size. */
+	if (stacksz == 0)
+		stacksz = thr_stack_default;
+
+	error = kern_thr_stack(td->td_proc, &addr, stacksz, guardsz);
+
+	/* kern_thr_stack() leaves addr NULL whenever it fails. */
+	td->td_retval[0] = (register_t) addr;
+
+	return (error);
+}
+
+/*
+ * kern_thr_stack() maps a new thread stack in the process.  It returns
+ * the stack address in the 'addr' arg.
+ *
+ * 'stacksz' must be non-zero; 'guardsz' may be zero, in which case no
+ * red zone is created.  Returns 0 on success or an errno value on
+ * failure, in which case '*addr' is set to NULL.
+ *
+ * p->p_thrstack holds the base address of the last stack allocated
+ * (including its red zone, if there is one).  Stacks are allocated
+ * contiguously, starting beyond the top of the main stack.  When a new
+ * stack is created, a red zone is typically created (actually, the red
+ * zone is mapped with PROT_NONE) above the top of the stack, such that
+ * the stack will not be able to grow all the way to the bottom of the
+ * next stack.  This isn't fool-proof.  It is possible for a stack to grow
+ * by a large amount, such that it grows into the next stack, and as long
+ * as the memory within the red zone is never accessed, nothing will
+ * prevent one thread stack from trouncing all over the next.
+ *
+ * low memory
+ *     . . . . . . . . . . . . . . . . . .
+ *    |                                   |
+ *    |             stack 3               | start of 3rd thread stack
+ *    +-----------------------------------+
+ *    |                                   |
+ *    |       Red Zone (guard page)       | red zone for 2nd thread
+ *    |                                   |
+ *    +-----------------------------------+
+ *    |  stack 2 - thr_stack_default      | top of 2nd thread stack
+ *    |                                   |
+ *    |                                   |
+ *    |                                   |
+ *    |                                   |
+ *    |             stack 2               |
+ *    +-----------------------------------+ <-- start of 2nd thread stack
+ *    |                                   |
+ *    |       Red Zone (guard page)       | red zone for 1st thread
+ *    |                                   |
+ *    +-----------------------------------+
+ *    |  stack 1 - thr_stack_default      | top of 1st thread stack
+ *    |                                   |
+ *    |                                   |
+ *    |                                   |
+ *    |                                   |
+ *    |             stack 1               |
+ *    +-----------------------------------+ <-- start of 1st thread stack
+ *    |                                   |   (initial value of p->p_thrstack)
+ *    |       Red Zone (guard page)       |
+ *    |                                   | red zone for main thread
+ *    +-----------------------------------+
+ *    | ->sv_usrstack - thr_stack_initial | top of main thread stack
+ *    |                                   | ^
+ *    |                                   | |
+ *    |                                   | |
+ *    |                                   | | stack growth
+ *    |                                   |
+ *    +-----------------------------------+ <-- start of main thread stack
+ *                                              (p->p_sysent->sv_usrstack)
+ * high memory
+ *
+ * XXX - This code assumes that the stack always grows down in address space.
+ */
+int
+kern_thr_stack(struct proc *p, void **addr, vm_size_t stacksz,
+    vm_size_t guardsz)
+{
+	vm_offset_t stackaddr;
+	vm_size_t totalsz;
+	vm_map_t map;
+	int error, rv;
+
+	KASSERT(stacksz != 0, ("[%s: %d] stacksz = 0", __FILE__, __LINE__));
+
+	*addr = NULL;
+
+	/* Refuse requests whose combined size does not fit in vm_size_t. */
+	totalsz = stacksz + guardsz;
+	if (totalsz < stacksz)
+		return (EINVAL);
+
+	PROC_LOCK(p);
+	if (p->p_thrstack == 0)  {
+		/* Compute the start of the first thread stack. */
+		p->p_thrstack = p->p_sysent->sv_usrstack -
+		    (vm_offset_t)(thr_stack_initial + THR_GUARD_DEFAULT);
+	}
+
+	/*
+	 * Don't let the allocation cursor wrap below address zero (zero
+	 * also doubles as the "not yet initialized" marker above).
+	 */
+	if (p->p_thrstack <= (vm_offset_t)totalsz) {
+		PROC_UNLOCK(p);
+		return (ENOMEM);
+	}
+
+	stackaddr = p->p_thrstack - (vm_offset_t)totalsz;
+
+	/*
+	 * Compute the next stack location unconditionally.  Under normal
+	 * operating conditions, the most likely reason for not being able
+	 * to map the thread stack is a stack overflow of the adjacent
+	 * thread stack.
+	 */
+	p->p_thrstack = stackaddr;
+	PROC_UNLOCK(p);
+
+	/*
+	 * Map the stack and its guard together, readable and writable, and
+	 * allow the protection to be raised later up to VM_PROT_ALL.
+	 */
+	map = &p->p_vmspace->vm_map;
+	error = vm_mmap(map, &stackaddr, totalsz,
+	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_ALL, MAP_STACK,
+	    OBJT_DEFAULT, NULL, 0);
+	if (error)
+		return (error);
+
+	if (guardsz != 0) {
+		/* Turn the lowest guardsz bytes into the red zone. */
+		rv = vm_map_protect(map, stackaddr, stackaddr + guardsz,
+		    VM_PROT_NONE, 0);
+		if (rv != 0) {
+			/* unmap memory */
+			(void) vm_map_remove(map, stackaddr, stackaddr +
+			    totalsz);
+
+			/* vm_map_protect() returns KERN_*, not an errno. */
+			return (vm_mmap_to_errno(rv));
+		}
+	}
+
+	*addr = (void *)(stackaddr + guardsz);
+	return (0);
+}
+
+int
 kern_thr_alloc(struct proc *p, int pages, struct thread **ntd)
 {
 
Index: sys/kern/kern_thread.c
===================================================================
--- sys/kern/kern_thread.c
+++ sys/kern/kern_thread.c
@@ -142,6 +142,9 @@
 
 	td->td_tid = tid_alloc();
 
+	td->td_cswitchcb = NULL;
+	td->td_threadlist = NULL;
+	td->td_reuse_stack = NULL;
 	/*
 	 * Note that td_critnest begins life as 1 because the thread is not
 	 * running and is thereby implicitly waiting to be on the receiving
@@ -769,6 +772,12 @@
 			if (td2 == td)
 				continue;
 			thread_lock(td2);
+			/* a workq thread may not actually be runnable */
+			if (td2->td_state == TDS_INACTIVE && (td2->td_flags & TDF_WORKQ)) {
+				thread_unlock(td2);
+				thread_stopped(p);
+				continue;
+			}
 			td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 			if (TD_IS_INHIBITED(td2)) {
 				wakeup_swapper |= weed_inhib(mode, td2, p);
Index: sys/kern/kern_thrworkq.c
===================================================================
--- /dev/null
+++ sys/kern/kern_thrworkq.c
@@ -0,0 +1,1863 @@
+/*-
+ * Copyright (c) 2009-2014, Stacey Son <sson@FreeBSD.org>
+ * Copyright (c) 2000-2009, Apple, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#include "opt_thrworkq.h"
+
+#ifdef THRWORKQ
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+
+#include <sys/condvar.h>
+#include <sys/cpuset.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/thr.h>
+#include <sys/thrworkq.h>
+#include <sys/time.h>
+#include <sys/unistd.h>
+
+#include <machine/atomic.h>
+
+#if !(defined(__i386__) || defined(__x86_64__) || defined(__sparc64__) || \
+      defined(__sparc__) || defined(__ia64__))
+/*
+ * XXX atomic.h for each arch that doesn't have atomic_*_64() should maybe
+ * have something like the following.
+ */
+static struct mtx atomic_mtx;
+MTX_SYSINIT(atomic, &atomic_mtx, "atomic_mtx", MTX_DEF);
+
+static __inline u_int32_t
+atomic_cmpset_64(volatile u_int64_t *p, volatile u_int64_t cmpval,
+    volatile u_int64_t newval)
+{
+	int ret;
+
+	mtx_lock(&atomic_mtx);
+	if (*p == cmpval) {
+		*p = newval;
+		ret = 1;
+	} else {
+		ret = 0;
+	}
+	mtx_unlock(&atomic_mtx);
+
+	return (ret);
+}
+#endif
+
+struct threadlist {
+	TAILQ_ENTRY(threadlist)  th_entry;
+	struct thread	 	*th_thread;
+	int			 th_flags;
+	uint16_t 		 th_affinity_tag;
+	uint16_t		 th_priority;
+	struct thrworkq		*th_workq;
+	stack_t			 th_stack;
+};
+
+/*
+ * threadlist flags.
+ */
+#define	TH_LIST_INITED		0x01
+#define	TH_LIST_RUNNING		0x02
+#define	TH_LIST_BLOCKED		0x04
+#define	TH_LIST_UNSCHEDULED	0x08
+#define	TH_LIST_BUSY		0x10
+#define	TH_LIST_SBUSY		0x20
+
+struct workitem {
+	TAILQ_ENTRY(workitem)	 wi_entry;
+	void			*wi_item;
+	uint32_t		 wi_affinity;
+};
+
+struct workitemlist {
+	TAILQ_HEAD(, workitem)	wl_itemlist;
+	TAILQ_HEAD(, workitem)	wl_freelist;
+};
+
+/* Per-process thread workqueue state; a process's instance is p->p_twq. */
+struct thrworkq {
+	void 		*wq_workqfunc;	/* user workq function */
+	void		*wq_newtdfunc;	/* function called on new td start up */
+	void		*wq_exitfunc;	/* function called on td shutdown */
+	char		*wq_ptlsbase;	/* parent TLS base */
+	struct thread	*wq_pthread;	/* parent thread */
+	size_t	 	 wq_stacksize;	/* stack size of each worker thread */
+	size_t	 	 wq_guardsize;	/* stack guard size.  Usually a page. */
+	struct workitem	 wq_array[WORKQ_OS_ELEM_MAX  * WORKQ_OS_NUMPRIOS];
+	struct proc	*wq_proc;		/* owning process */
+	struct proc	*wq_atimer_thread;	/* timer kernel thread */
+	struct cv	 wq_atimer_cv;		/* timer condition var */
+	struct callout	*wq_atimer_call;	/* timer callout */
+	int		 wq_flags;		/* WQ_* flags */
+	int		 wq_itemcount;		/* queued work items */
+	uint64_t	 wq_thread_yielded_timestamp;
+	uint32_t	 wq_thread_yielded_count;
+	uint32_t	 wq_timer_interval;	/* in usecs */
+	uint32_t	 wq_affinity_max;	/* smp_cpus at init */
+	uint32_t	 wq_threads_scheduled;
+	uint32_t	 wq_nthreads;		/* num of thread in workq */
+	uint32_t	 wq_thidlecount;	/* idle threads waiting */
+			 /* requested concurrency for each priority level */
+	uint32_t	 wq_reqconc[WORKQ_OS_NUMPRIOS];
+			 /* priority based item list */
+	struct workitemlist wq_list[WORKQ_OS_NUMPRIOS];
+	uint32_t	 wq_list_bitmap;	/* bit n: wq_list[n] has items */
+	TAILQ_HEAD(, threadlist) wq_thrunlist;	/* workq threads working. */
+	TAILQ_HEAD(, threadlist) wq_thidlelist;	/* workq threads idle. */
+	void		**wq_stacklist;	/* recycled stacks LIFO. */
+	uint32_t	 wq_stacktop;		/* next free wq_stacklist slot. */
+	uint32_t	 wq_maxthreads;		/* max threads for this workq. */
+	uint32_t	*wq_thactive_count[WORKQ_OS_NUMPRIOS];
+	uint32_t	*wq_thscheduled_count[WORKQ_OS_NUMPRIOS];
+	uint64_t	*wq_lastblocked_ts[WORKQ_OS_NUMPRIOS];
+};
+
+/*
+ * Workqueue flags (wq_flags).
+ */
+#define	WQ_LIST_INITED		0x01
+#define	WQ_ATIMER_RUNNING	0x02
+#define	WQ_EXITING		0x04
+
+/*
+ * Upcall types for twq_set_upcall().
+ */
+#define	WQ_UPCALL_NEWTD		1
+#define	WQ_UPCALL_WORKQ		2
+#define	WQ_UPCALL_EXIT		3
+
+#define WORKQUEUE_LOCK(p)		mtx_lock(&(p)->p_twqlock)
+#define WORKQUEUE_UNLOCK(p)		mtx_unlock(&(p)->p_twqlock)
+#define WORKQUEUE_ASSERT_LOCKED(p)	mtx_assert(&(p)->p_twqlock, MA_OWNED)
+
+#define WQ_TIMER_NEEDED(wq, start_timer) do { 				\
+	int oldflags = wq->wq_flags;					\
+									\
+	if ( !(oldflags & (WQ_EXITING | WQ_ATIMER_RUNNING))) {		\
+	    if (atomic_cmpset_32(&wq->wq_flags, oldflags, 		\
+		    oldflags | WQ_ATIMER_RUNNING))			\
+		start_timer = 1;					\
+	}								\
+} while (0)
+
+static MALLOC_DEFINE(M_THRWORKQ, "thr_workq", "Thread Workqueue");
+
+static int 	twq_additem(struct thrworkq *wq, int prio, void *item,
+		    int affinity);
+static int 	twq_removeitem(struct thrworkq *wq, int prio, void *item);
+static int	twq_run_nextitem(struct proc *p, struct thrworkq *wq,
+		    struct thread *td, void *oc_item, int oc_prio,
+		    int oc_affinity);
+static void	twq_runitem(struct proc *p, void *item, struct thread *td,
+		    struct threadlist *tl, int wake_thread);
+static int	twq_unpark(struct thread *td, int timedout);
+static int 	twq_addnewthread(struct thrworkq *wq);
+static void	twq_removethread(struct threadlist *tl);
+static void	twq_add_timer(void *arg);
+static void	twq_interval_timer_start(struct thrworkq *wq);
+static void	twq_callback(int type, struct thread *td);
+static int	twq_thr_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp);
+static int	twq_init_workqueue(struct proc *p, struct twq_param *arg);
+static int	twq_timer_work(struct proc *p, struct thrworkq *wq,
+		    int *start_timer);
+
+/*
+ * Thread work queue tunable parameter defaults.
+ */
+#define	WORKQUEUE_MAXTHREADS		512	/* Max num of threads / workQ */
+#define	WQ_YIELDED_THRESHOLD		2000	/* Max num of threads to yield
+						   in window */
+#define	WQ_YIELDED_WINDOW_USECS		30000	/* Yield window interval size */
+#define	WQ_STALLED_WINDOW_USECS		200	/* Useconds until thread is
+						   considered stalled */
+#define	WQ_REDUCE_POOL_WINDOW_USECS	5000000 /* Useconds until idle thread
+						   is removed */
+#define	WQ_MAX_TIMER_INTERVAL_USECS	50000	/* Useconds to wait to check for
+						   stalled or idle threads */
+
+/*
+ * Thread work queue tunable parameters.
+ */
+static uint32_t wq_yielded_threshold		= WQ_YIELDED_THRESHOLD;
+static uint32_t wq_yielded_window_usecs	= WQ_YIELDED_WINDOW_USECS;
+static uint32_t wq_stalled_window_usecs	= WQ_STALLED_WINDOW_USECS;
+static uint32_t wq_reduce_pool_window_usecs	= WQ_REDUCE_POOL_WINDOW_USECS;
+static uint32_t wq_max_timer_interval_usecs	= WQ_MAX_TIMER_INTERVAL_USECS;
+static uint32_t wq_max_threads			= WORKQUEUE_MAXTHREADS;
+extern int max_threads_per_proc;
+
+SYSCTL_INT(_kern, OID_AUTO, wq_yielded_threshold, CTLFLAG_RW,
+    &wq_yielded_threshold, 0, "Max number of threads to yield in window");
+SYSCTL_INT(_kern, OID_AUTO, wq_yielded_window_usecs, CTLFLAG_RW,
+    &wq_yielded_window_usecs, 0, "Size of yielded window in useconds");
+SYSCTL_INT(_kern, OID_AUTO, wq_stalled_window_usecs, CTLFLAG_RW,
+    &wq_stalled_window_usecs, 0, "Useconds until thread is stalled");
+SYSCTL_INT(_kern, OID_AUTO, wq_reduce_pool_window_usecs, CTLFLAG_RW,
+    &wq_reduce_pool_window_usecs, 0, "Useconds until idle thread is removed");
+SYSCTL_INT(_kern, OID_AUTO, wq_max_timer_interval_usecs, CTLFLAG_RW,
+    &wq_max_timer_interval_usecs, 0,
+    "Useconds between stalled/idle thread checks");
+SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW,
+     &wq_max_threads, 0, "Max num of threads per workq");
+
+/*
+ * Set up callback from mi_switch().
+ */
+static void
+twq_set_schedcallback(struct thread *td, mi_switchcb_t cswitchcb)
+{
+
+	td->td_cswitchcb = cswitchcb;
+}
+
+static void
+twq_set_upcall(struct threadlist *tl, int which, void *item)
+{
+	struct thrworkq *wq = tl->th_workq;
+	void *func;
+
+	/* XXX should thread sched lock be held?? */
+
+	KASSERT(wq != NULL, ("[%s: %d] twq_set_upcall: wq == NULL", __FILE__,
+		__LINE__));
+
+	switch (which) {
+
+	case WQ_UPCALL_NEWTD:
+		func = wq->wq_newtdfunc;
+		break;
+
+	case WQ_UPCALL_WORKQ:
+		func = wq->wq_workqfunc;
+		break;
+
+	case WQ_UPCALL_EXIT:
+		func = wq->wq_exitfunc;
+		break;
+
+	default:
+		panic("twq_set_upcall: unknown upcall type");
+	}
+
+	cpu_set_upcall_kse(tl->th_thread, func, item, &tl->th_stack);
+}
+
+static void
+twq_schedthr(struct thread *newtd)
+{
+
+	thread_lock(newtd);
+	TD_SET_CAN_RUN(newtd);
+	sched_add(newtd, SRQ_BORING);
+	thread_unlock(newtd);
+}
+
+
+static uint64_t
+twq_microuptime(void)
+{
+	struct timeval t;
+
+	microuptime(&t);
+	return ((u_int64_t)t.tv_sec * 1000000 + (u_int64_t)t.tv_usec);
+}
+
+static uint32_t
+twq_usecstoticks(uint32_t usec)
+{
+	struct timeval tv;
+	uint32_t tticks;
+
+	tv.tv_sec = usec / 1000000;
+	tv.tv_usec = usec - (tv.tv_sec * 1000000);
+	tticks = tvtohz(&tv);
+
+	return (tticks);
+}
+
+static void
+twq_interval_timer_start(struct thrworkq *wq)
+{
+	uint32_t deadline;
+
+	if (wq->wq_timer_interval == 0)
+		wq->wq_timer_interval = wq_stalled_window_usecs;
+	else {
+
+		wq->wq_timer_interval = wq->wq_timer_interval * 2;
+		if (wq->wq_timer_interval > wq_max_timer_interval_usecs)
+			wq->wq_timer_interval = wq_max_timer_interval_usecs;
+	}
+
+	deadline = twq_usecstoticks(wq->wq_timer_interval);
+	callout_reset_curcpu(wq->wq_atimer_call, deadline, twq_add_timer,
+	    wq);
+}
+
+static int
+twq_thr_busy(uint64_t cur_ts, uint64_t *lastblocked_tsp)
+{
+	uint64_t lastblocked_ts;
+	uint64_t elapsed;
+
+	/*
+	 * The timestamp is updated atomically w/o holding the workqueue
+	 * lock so we need to do an atomic read of the 64 bits so that
+	 * we don't see a mismatched pair of 32 bit reads.  We accomplish
+	 * this in an architecturally independent fashion by using
+	 * atomic_cmpset_64 to write back the value we grabbed.  If it
+	 * succeeds then we have a good timestamp to evaluate.  If it fails
+	 * we straddled grabbing the timestamp while it was being updated.
+	 * Treat a failed update as a busy thread since it implies we are
+	 * about to see a really fresh timestamp anyway.
+	 *
+	 */
+	lastblocked_ts = *lastblocked_tsp;
+
+	if (!atomic_cmpset_64(lastblocked_tsp, lastblocked_ts, lastblocked_ts))
+		return (1);
+
+	if (lastblocked_ts >= cur_ts) {
+		/*
+		 * Because the update of the timestamp when a thread blocks
+		 * isn't serialized against us looking at it (i.e. we don't
+		 * hold the workq lock) it's possible to have a timestamp that
+		 * matches the current time or that even looks to be in the
+		 * future relative to when we grabbed the current time.
+		 * Just treat this as a busy thread since it must have just
+		 * blocked.
+		 */
+		return (1);
+	}
+
+	/* Timestamps are in usecs. */
+	elapsed = cur_ts - lastblocked_ts;
+
+	if (elapsed < (uint64_t)wq_stalled_window_usecs)
+		return (1);
+
+	return (0);
+}
+
+static void
+twq_add_timer(void *arg)
+{
+	struct thrworkq *wq = (struct thrworkq *)arg;
+
+
+	cv_signal(&wq->wq_atimer_cv);
+}
+
+static int
+twq_timer_work(struct proc *p, struct thrworkq *wq, int *start_timer)
+{
+	int retval;
+	int add_thread;
+	uint32_t busycount;
+	uint32_t priority;
+	uint32_t affinity_tag;
+	uint32_t i;
+	uint64_t curtime;
+
+
+	WORKQUEUE_ASSERT_LOCKED(p);
+
+	retval = 1;
+	add_thread = 0;
+
+	/*
+	 * Check to see if the stall frequency was beyond our tolerance
+	 * or we have work on the queue, but haven't scheduled any new
+	 * work within our acceptable time interval because there were
+	 * no idle threads left to schedule.
+	 */
+	if (wq->wq_itemcount) {
+
+		for (priority = 0; priority < WORKQ_OS_NUMPRIOS; priority++) {
+			if (wq->wq_list_bitmap & (1 << priority))
+				break;
+		}
+
+		KASSERT(priority < WORKQ_OS_NUMPRIOS,
+		    ("[%s: %d] priority >= WORKQ_OS_NUMPRIOS", __FILE__,
+		     __LINE__));
+
+		curtime = twq_microuptime();
+		busycount = 0;
+
+		for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority];
+		    affinity_tag++) {
+			/*
+			 * if we have no idle threads, we can try to
+			 * add them if needed.
+			 */
+			if (wq->wq_thidlecount == 0)
+				add_thread = 1;
+
+			/*
+			 * Look for first affinity group that is
+			 * currently not active.  i.e. no active
+			 * threads at this priority level or higher
+			 * and has not been active recently at this
+			 * priority level or higher.
+			 */
+			for (i = 0; i <= priority; i++) {
+				if (wq->wq_thactive_count[i][affinity_tag]) {
+					add_thread = 0;
+					break;
+				}
+				if (wq->wq_thscheduled_count[i][affinity_tag]) {
+					if (twq_thr_busy(curtime,
+						&wq->wq_lastblocked_ts[i]
+						[affinity_tag])) {
+							 add_thread = 0;
+							 busycount++;
+							 break;
+					}
+				}
+			}
+			if (add_thread) {
+				retval = twq_addnewthread(wq);
+				break;
+			}
+		}
+		if (wq->wq_itemcount) {
+			/*
+			 * As long as we have threads to schedule, and
+			 * we successfully scheduled new work, keep
+			 * trying.
+			 */
+			while (wq->wq_thidlecount &&
+			    !(wq->wq_flags & WQ_EXITING)) {
+				/*
+				 * twq_run_nextitem is
+				 * responsible for dropping the
+				 * workqueue lock in all cases.
+				 */
+				retval = twq_run_nextitem(p, wq, NULL, 0, 0, 0);
+				WORKQUEUE_LOCK(p);
+
+				if (retval == 0)
+					break;
+			}
+			if ( !(wq->wq_flags & WQ_EXITING) && wq->wq_itemcount) {
+				if (wq->wq_thidlecount == 0 && retval &&
+				    add_thread)
+					return (1);
+
+				if (wq->wq_thidlecount == 0 || busycount)
+					WQ_TIMER_NEEDED(wq, *start_timer);
+			}
+		}
+	}
+
+	return (0);
+}
+
+
+static void
+twq_timer_kthread(void *arg)
+{
+	struct thrworkq *wq = (struct thrworkq *)arg;
+	struct proc	*p;
+	int start_timer;
+
+	p = wq->wq_proc;
+
+	while (1) {
+
+		WORKQUEUE_LOCK(p);
+
+		cv_wait(&wq->wq_atimer_cv, &p->p_twqlock);
+
+		start_timer = 0;
+
+		/*
+		 * The workq lock will protect us from seeing WQ_EXITING change
+		 * state, but we still need to update this atomically in case
+		 * someone else tries to start the timer just as we're
+		 * releasing it.
+		 */
+		while ( !(atomic_cmpset_32(&wq->wq_flags, wq->wq_flags,
+			    (wq->wq_flags & ~WQ_ATIMER_RUNNING))));
+
+		while (!(wq->wq_flags & WQ_EXITING) &&
+		    twq_timer_work(p, wq, &start_timer));
+
+		if ( !(wq->wq_flags & WQ_ATIMER_RUNNING))
+			wq->wq_timer_interval = 0;
+
+		if (wq->wq_flags & WQ_EXITING)
+			break;
+
+		WORKQUEUE_UNLOCK(p);
+
+		if (start_timer)
+			twq_interval_timer_start(wq);
+
+	}
+
+	wq->wq_atimer_thread = NULL;
+	WORKQUEUE_UNLOCK(p);
+	kproc_exit(0);
+}
+
+/*
+ * thrworkq_thread_yielded is called when a user thread voluntarily yields.
+ */
+void
+thrworkq_thread_yielded(void)
+{
+	struct thrworkq *wq;
+	struct proc *p = curproc;
+
+	if ((wq = p->p_twq) == NULL || wq->wq_itemcount == 0)
+		return;
+
+	WORKQUEUE_LOCK(p);
+
+	if (wq->wq_itemcount) {
+		uint64_t        curtime;
+		uint64_t        elapsed;
+
+		if (wq->wq_thread_yielded_count++ == 0)
+			wq->wq_thread_yielded_timestamp = twq_microuptime();
+
+		if (wq->wq_thread_yielded_count < wq_yielded_threshold) {
+			WORKQUEUE_UNLOCK(p);
+			return;
+		}
+
+		wq->wq_thread_yielded_count = 0;
+
+		curtime = twq_microuptime();
+		elapsed = curtime - wq->wq_thread_yielded_timestamp;
+
+		if (elapsed < wq_yielded_window_usecs) {
+
+			/*
+			 * We have 'wq_yielded_threshold' or more threads
+			 * yielding within a 'wq_yielded_window_usecs' period
+			 * of time.  Let's see if we need to add a thread or
+			 * assign some work.
+			 */
+
+			if (wq->wq_thidlecount == 0) {
+				(void) twq_addnewthread(wq);
+				/*
+				 * 'twq_addnewthread' drops the workqueue
+				 * lock when creating the new thread and then
+				 * retakes it before returning. This window
+				 * allows other threads to process work on the
+				 * queue, so we need to recheck for available
+				 * work if none found, we just return.  The
+				 * newly created thread will eventually get
+				 * used (if it hasn't already).
+				 */
+				if (wq->wq_itemcount == 0) {
+					WORKQUEUE_UNLOCK(p);
+					return;
+				}
+			}
+			if (wq->wq_thidlecount) {
+				uint32_t        priority;
+				uint32_t        affinity = -1;
+				void		*item;
+				struct workitem *witem = NULL;
+				struct workitemlist *wl = NULL;
+				struct thread	*td;
+				struct threadlist *tl;
+
+				/*
+				 * We have an idle thread.  Let's assign some
+				 * work.
+				 */
+
+				td = curthread;
+				if ((tl = td->td_threadlist))
+					affinity = tl->th_affinity_tag;
+
+				for (priority = 0;
+				    priority < WORKQ_OS_NUMPRIOS; priority++) {
+					if (wq->wq_list_bitmap &
+					    (1 << priority)) {
+
+						 wl = (struct workitemlist *)
+						     &wq->wq_list[priority];
+						 break;
+					}
+				}
+				KASSERT(wl != NULL, ("[%s: %d] wl == NULL",
+					__FILE__, __LINE__));
+				KASSERT(!(TAILQ_EMPTY(&wl->wl_itemlist)),
+				    ("[%s: %d] wl_itemlist not empty",
+				     __FILE__, __LINE__));
+
+				witem = TAILQ_FIRST(&wl->wl_itemlist);
+				TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);
+
+				if (TAILQ_EMPTY(&wl->wl_itemlist))
+					wq->wq_list_bitmap &= ~(1 << priority);
+				wq->wq_itemcount--;
+
+				item = witem->wi_item;
+				witem->wi_item = (void *)0;
+				witem->wi_affinity = 0;
+
+				TAILQ_INSERT_HEAD(&wl->wl_freelist, witem,
+				    wi_entry);
+
+				(void)twq_run_nextitem(p, wq,
+				    NULL, item, priority, affinity);
+				/*
+				 * twq_run_nextitem is responsible for
+				 * dropping the workqueue lock in all cases.
+				 */
+
+				return;
+			}
+		}
+	}
+	WORKQUEUE_UNLOCK(p);
+}
+
+/*
+ * Callback for mi_switch().  It is called before and after a context switch.
+ */
+static void
+twq_callback(int type, struct thread *td)
+{
+	struct threadlist *tl;
+	struct thrworkq  *wq;
+
+	tl = td->td_threadlist;
+	wq = tl->th_workq;
+
+	switch (type) {
+
+	case SWCB_BLOCK:
+		{
+			uint32_t        old_activecount;
+
+			 old_activecount = atomic_fetchadd_32(
+			     &wq->wq_thactive_count[tl->th_priority]
+			         [tl->th_affinity_tag], -1);
+			 if (old_activecount == 1) {
+				 int		 start_timer = 0;
+				 uint64_t        curtime;
+				 uint64_t	*lastblocked_ptr;
+
+				 /*
+				  * We were the last active thread on this
+				  * affinity set and we've got work to do.
+				  */
+				 lastblocked_ptr =
+				     &wq->wq_lastblocked_ts[tl->th_priority]
+					[tl->th_affinity_tag];
+				 curtime = twq_microuptime();
+
+				 /*
+				  * If we collide with another thread trying
+				  * to update the last_blocked (really
+				  * unlikely since another thread would have to
+				  * get scheduled and then block after we start
+				  * down this path), it's not a problem.  Either
+				  * timestamp is adequate, so no need to retry.
+				  */
+				 (void)atomic_cmpset_64(lastblocked_ptr,
+				     *lastblocked_ptr, curtime);
+				 if (wq->wq_itemcount)
+					 WQ_TIMER_NEEDED(wq, start_timer);
+
+				 if (start_timer)
+					 twq_interval_timer_start(wq);
+			 }
+		}
+		break;
+
+	case SWCB_UNBLOCK:
+		/*
+		 * We cannot take the workqueue_lock here.  An UNBLOCK can occur
+		 * from a timer event which is run from an interrupt context.
+		 * If the workqueue_lock is already held by this processor,
+		 * we'll deadlock.  The thread lock for this thread being
+		 * UNBLOCKED is also held.
+		 */
+		atomic_add_32(&wq->wq_thactive_count[tl->th_priority]
+		    [tl->th_affinity_tag], 1);
+		break;
+	}
+}
+
+static void
+twq_removethread(struct threadlist *tl)
+{
+	struct thrworkq *wq;
+	struct thread *td;
+
+	wq = tl->th_workq;
+
+	TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
+
+	wq->wq_nthreads--;
+	wq->wq_thidlecount--;
+
+
+	WORKQUEUE_UNLOCK(curproc);
+
+	td = tl->th_thread;
+
+	thread_lock(td);
+	/*
+	 * Recycle this thread's stack.  Done when the thread exits.
+	 */
+	td->td_reuse_stack = tl->th_stack.ss_sp;
+
+	twq_set_schedcallback(td, NULL);
+
+	/*
+	 * Clear the threadlist pointer so blocked thread on wakeup for
+	 * termination will not access the thread list as it is going to
+	 * be freed.
+	 */
+	td->td_threadlist = NULL;
+
+	/*
+	 * Set to call the exit upcall to clean up and exit.
+	 */
+	twq_set_upcall(tl, WQ_UPCALL_EXIT, NULL);
+
+	thread_unlock(td);
+
+	free(tl, M_THRWORKQ);
+}
+
+/*
+ * Create a worker thread and put it on the idle list.  Returns 1 if a thread
+ * was added, else 0.  Called, and returns, with the workqueue lock held.
+ */
+static int
+twq_addnewthread(struct thrworkq *wq)
+{
+	int error;
+	struct threadlist *tl;
+	void *stackaddr = NULL;
+	struct thread *newtd, *td;
+	struct proc *p = wq->wq_proc;
+	int try;
+
+	WORKQUEUE_ASSERT_LOCKED(p);
+
+	if (wq->wq_nthreads >= wq->wq_maxthreads
+	    /* || wq->wq_nthreads >= (max_threads_per_proc - 20) */)
+		return (0);
+	wq->wq_nthreads++;
+
+	td = wq->wq_pthread;
+
+	/*
+	 * See if we have a stack we can reuse.
+	 */
+	if (wq->wq_stacktop >  0) {
+		wq->wq_stacktop--;
+		stackaddr = wq->wq_stacklist[wq->wq_stacktop];
+		KASSERT(stackaddr != NULL, ("[%s:%d] stackaddr = NULL",
+			__FILE__, __LINE__));
+		wq->wq_stacklist[wq->wq_stacktop] = NULL;
+	}
+	WORKQUEUE_UNLOCK(p);
+
+	/*
+	 * If needed, map a new stack and guard page; ENOMEM is tried 3 times.
+	 */
+	if (stackaddr == NULL)
+		for (try = 0; ; try++) {
+			error = kern_thr_stack(p, &stackaddr, wq->wq_stacksize,
+			    wq->wq_guardsize);
+			if (error == 0)
+				break;
+			if (error != ENOMEM || try >= 2)
+				goto failed;
+		}
+
+	newtd = thread_alloc(0);
+	if (newtd == NULL) {
+		/* Save the stack address so we can reuse it. */
+		thrworkq_reusestack(p, stackaddr);
+		goto failed;
+	}
+
+	bzero(&newtd->td_startzero,
+	    __rangeof(struct thread, td_startzero, td_endzero));
+	bcopy(&td->td_startcopy, &newtd->td_startcopy,
+	    __rangeof(struct thread, td_startcopy, td_endcopy));
+	newtd->td_proc = p;
+	newtd->td_ucred = crhold(td->td_ucred);
+
+	cpu_set_upcall(newtd, td);
+
+	/*
+	 * Allocate thread list and init.
+	 */
+	tl = (struct threadlist *) malloc(sizeof(struct threadlist),
+	    M_THRWORKQ, M_WAITOK | M_ZERO);
+
+	tl->th_thread = newtd;
+	tl->th_workq = wq;
+
+	tl->th_affinity_tag = -1;
+	tl->th_priority = WORKQ_OS_NUMPRIOS;
+
+	tl->th_stack.ss_sp = stackaddr;
+	tl->th_stack.ss_size = wq->wq_stacksize;
+
+	tl->th_flags = TH_LIST_INITED | TH_LIST_UNSCHEDULED;
+
+	newtd->td_threadlist = (void *)tl;
+
+	PROC_LOCK(p);
+	p->p_flag |= P_HADTHREADS;
+	newtd->td_sigmask = td->td_sigmask;
+	thread_link(newtd, p);
+	bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
+	thread_lock(td);
+	sched_fork_thread(td, newtd);
+	thread_unlock(td);
+	/*
+	 * tell suspend handling code that if this thread is inactive
+	 * to simply skip it
+	 */
+	newtd->td_flags |= TDF_WORKQ;
+	if (P_SHOULDSTOP(p))
+		newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
+	PROC_UNLOCK(p);
+
+	tidhash_add(newtd);
+
+	/*
+	 * We don't add the new thread to the scheduler yet until we find some
+	 * work for it to do.
+	 */
+
+	WORKQUEUE_LOCK(p);
+	TAILQ_INSERT_TAIL(&wq->wq_thidlelist, tl, th_entry);
+	wq->wq_thidlecount++;
+
+	return (1);
+
+failed:
+	WORKQUEUE_LOCK(p);
+	wq->wq_nthreads--;
+
+	return (0);
+}
+
+static int
+twq_init_workqueue(struct proc *p, struct twq_param *arg)
+{
+	struct thrworkq *wq;
+	uint32_t  i, j;
+	size_t wq_size;
+	char *ptr, *nptr;
+	struct workitem *witem;
+	struct workitemlist *wl;
+	struct callout *calloutp;
+	int error;
+	void *ssptr;
+	uint32_t maxthreads;
+
+	/* 'smp_cpus' is the number of cpus running. */
+	wq_size = sizeof(struct thrworkq) +
+	    (smp_cpus * WORKQ_OS_NUMPRIOS * sizeof(uint32_t)) +
+	    (smp_cpus * WORKQ_OS_NUMPRIOS * sizeof(uint32_t)) +
+	    (smp_cpus * WORKQ_OS_NUMPRIOS * sizeof(uint64_t)) +
+	    sizeof(uint64_t);
+
+	ptr = malloc(wq_size, M_THRWORKQ, M_WAITOK | M_ZERO);
+	maxthreads = wq_max_threads;
+	ssptr = malloc(sizeof(void *) * maxthreads, M_THRWORKQ,
+	    M_WAITOK | M_ZERO);
+	calloutp = (struct callout *)malloc(sizeof(struct callout), M_THRWORKQ,
+	    M_WAITOK | M_ZERO);
+
+	WORKQUEUE_LOCK(p);
+	if (p->p_twq != NULL) {
+		WORKQUEUE_UNLOCK(p);
+		free(ptr, M_THRWORKQ);
+		free(ssptr, M_THRWORKQ);
+		free(calloutp, M_THRWORKQ);
+		return (EINVAL);
+	}
+
+	/*
+	 * Initialize workqueue information.
+	 */
+	wq = (struct thrworkq *)ptr;
+	wq->wq_flags = WQ_LIST_INITED;
+	wq->wq_proc = p;
+	wq->wq_affinity_max = smp_cpus;
+	wq->wq_workqfunc = arg->twq_workqfunc;
+	wq->wq_newtdfunc = arg->twq_newtdfunc;
+	wq->wq_exitfunc = arg->twq_exitfunc;
+	if (arg->twq_stacksize == 0)
+		wq->wq_stacksize = THR_STACK_DEFAULT;
+	else
+		wq->wq_stacksize = round_page(arg->twq_stacksize);
+	wq->wq_guardsize = round_page(arg->twq_guardsize);
+	wq->wq_pthread = curthread;
+
+	wq->wq_stacklist = ssptr;
+	wq->wq_stacktop = 0;
+	wq->wq_maxthreads = maxthreads;
+
+	for (i = 0; i < WORKQ_OS_NUMPRIOS; i++) {
+		wl = (struct workitemlist *)&wq->wq_list[i];
+		TAILQ_INIT(&wl->wl_itemlist);
+		TAILQ_INIT(&wl->wl_freelist);
+
+		for (j = 0; j < WORKQ_OS_ELEM_MAX; j++) {
+			witem = &wq->wq_array[(i * WORKQ_OS_ELEM_MAX) + j];
+			TAILQ_INSERT_TAIL(&wl->wl_freelist, witem, wi_entry);
+		}
+		wq->wq_reqconc[i] = wq->wq_affinity_max;
+	}
+	nptr = ptr + sizeof(struct thrworkq);
+
+	for (i = 0; i < WORKQ_OS_NUMPRIOS; i++) {
+		wq->wq_thactive_count[i] = (uint32_t *)nptr;
+		nptr += (smp_cpus * sizeof(uint32_t));
+	}
+	for (i = 0; i < WORKQ_OS_NUMPRIOS; i++) {
+		wq->wq_thscheduled_count[i] = (uint32_t *)nptr;
+		nptr += (smp_cpus * sizeof(uint32_t));
+	}
+
+	/*
+	 * Align nptr on a 64 bit boundary so we can do atomic
+	 * operations on the timestamps.  (We allocated an extra
+	 * uint64_t space above so we have room for this adjustment.)
+	 */
+	nptr += (sizeof(uint64_t) - 1);
+	nptr = (char *)((long)nptr & ~(sizeof(uint64_t) - 1));
+
+	for (i = 0; i < WORKQ_OS_NUMPRIOS; i++) {
+		wq->wq_lastblocked_ts[i] = (uint64_t *)nptr;
+		nptr += (smp_cpus * sizeof(uint64_t));
+	}
+	TAILQ_INIT(&wq->wq_thrunlist);
+	TAILQ_INIT(&wq->wq_thidlelist);
+
+	cv_init(&wq->wq_atimer_cv, "twq_atimer_cv");
+	wq->wq_atimer_call = calloutp;
+	callout_init(wq->wq_atimer_call, CALLOUT_MPSAFE);
+
+	PROC_LOCK(p);
+	p->p_twq = wq;
+	PROC_UNLOCK(p);
+	WORKQUEUE_UNLOCK(p);
+
+	error = kproc_create(twq_timer_kthread, (void *)wq,
+	    &wq->wq_atimer_thread, RFHIGHPID, 0, "twq %d", p->p_pid);
+	if (error)
+		panic("twq_init_workqueue: kproc_create returned %d", error);
+
+	return (0);
+}
+
+/*
+ * thr_workq() system call.  Dispatches on uap->cmd (WQOPS_*); 0 or errno.
+ */
+int
+sys_thr_workq(struct thread *td, struct thr_workq_args  *uap)
+{
+	struct twq_param arg;
+	struct proc *p = td->td_proc;
+	int cmd = uap->cmd;
+	int prio, reqconc, affinity;
+	int error = 0;
+	void *oc_item = NULL;
+	struct thrworkq *wq;
+
+	error = copyin(uap->args, &arg, sizeof(arg));
+	if (error)
+		return (error);
+
+	/*
+	 * Affinity is not used yet.
+	 */
+	affinity = -1;
+
+	switch (cmd) {
+	case WQOPS_INIT:
+		/*
+		 * Return the PID for the handle for now.  If we decide to
+		 * support multiple workqueues per process then we will need
+		 * to do something different.
+		 */
+		/* suword() returns -1 on a fault, which is not an errno. */
+		if (suword(arg.twq_retid, p->p_pid) != 0)
+			return (EFAULT);
+		return (twq_init_workqueue(p, &arg));
+
+	case WQOPS_QUEUE_ADD:
+		WORKQUEUE_LOCK(p);
+		if ((wq = p->p_twq) == NULL || arg.twq_id != p->p_pid) {
+			WORKQUEUE_UNLOCK(p);
+			return (EINVAL);
+		}
+
+		prio = arg.twq_add_prio;
+		/* affinity = arg.twq_add_affin;  XXX Not yet used. */
+
+		/*
+		 * Add item to workqueue.  If the WORKQUEUE_OVERCOMMIT flag
+		 * is set we want to commit the item to a thread even if we
+		 * have to start a new one.
+		 */
+		if (prio & WORKQUEUE_OVERCOMMIT) {
+			prio &= ~WORKQUEUE_OVERCOMMIT;
+			oc_item = arg.twq_add_item;
+		}
+		if ((prio < 0) || (prio >= WORKQ_OS_NUMPRIOS)) {
+			WORKQUEUE_UNLOCK(p);
+			return (EINVAL);
+		}
+
+		if (wq->wq_thidlecount == 0 &&
+		    (oc_item || (wq->wq_nthreads < wq->wq_affinity_max))) {
+			(void) twq_addnewthread(wq);
+
+			/*
+			 * If we couldn't start a new thread then this work
+			 * item will have to wait on the queue.
+			 */
+			if (wq->wq_thidlecount == 0)
+				oc_item = NULL;
+		}
+		if (oc_item == NULL)
+			error = twq_additem(wq, prio, arg.twq_add_item,
+			    affinity);
+
+		/* twq_run_nextitem() drops the workqueue lock.  */
+		(void)twq_run_nextitem(p, wq, NULL, oc_item, prio, affinity);
+
+		return (error);
+
+	case WQOPS_QUEUE_REMOVE:
+		WORKQUEUE_LOCK(p);
+		if ((wq = p->p_twq) == NULL || arg.twq_id != p->p_pid) {
+			WORKQUEUE_UNLOCK(p);
+			return (EINVAL);
+		}
+
+		prio = arg.twq_rm_prio;
+		/* affinity = arg.twq_add_affin;  Not yet used. */
+
+		/*
+		 * Remove item from workqueue.
+		 */
+		if ((prio < 0) || (prio >= WORKQ_OS_NUMPRIOS)) {
+			WORKQUEUE_UNLOCK(p);
+			return (EINVAL);
+		}
+
+		error = twq_removeitem(wq, prio, arg.twq_rm_item);
+
+		/*
+		 * twq_run_nextitem() drops the workqueue lock.  See if
+		 * we can assign a work item to an idle thread.
+		 */
+		(void)twq_run_nextitem(p, wq, NULL, NULL, prio, affinity);
+
+		return (error);
+
+	case WQOPS_THREAD_RETURN:
+		WORKQUEUE_LOCK(p);
+		if ((wq = p->p_twq) == NULL || arg.twq_id != p->p_pid) {
+			WORKQUEUE_UNLOCK(p);
+			return (EINVAL);
+		}
+
+		if (td->td_threadlist == NULL) {
+			WORKQUEUE_UNLOCK(p);
+			return (EINVAL);
+		}
+
+		/*
+		 * twq_run_nextitem() drops the workqueue lock.  Assign
+		 * any pending work to this (or other idle) thread.
+		 */
+		(void)twq_run_nextitem(p, wq, td, NULL, 0, -1);
+
+		return (0);
+
+	case WQOPS_THREAD_SETCONC:
+		WORKQUEUE_LOCK(p);
+		if ((wq = p->p_twq) == NULL || arg.twq_id != p->p_pid) {
+			WORKQUEUE_UNLOCK(p);
+			return (EINVAL);
+		}
+
+		prio = arg.twq_setconc_prio;
+		if ((prio < 0) || (prio > WORKQ_OS_NUMPRIOS)) {
+			WORKQUEUE_UNLOCK(p);
+			return (EINVAL);
+		}
+
+		reqconc = arg.twq_setconc_conc;
+		/* Concurrency indexes per-CPU arrays; keep it in range. */
+		if (reqconc < 0 || (uint32_t)reqconc > wq->wq_affinity_max)
+			reqconc = wq->wq_affinity_max;
+
+		if (prio < WORKQ_OS_NUMPRIOS)
+			wq->wq_reqconc[prio] = reqconc;
+		else {
+			for (prio = 0; prio < WORKQ_OS_NUMPRIOS; prio++)
+				wq->wq_reqconc[prio] = reqconc;
+		}
+
+		/*
+		 * twq_run_nextitem() drops the workqueue lock. See if
+		 * we can assign a work item to an idle thread.
+		 */
+		(void)twq_run_nextitem(p, wq, NULL, NULL, 0, -1);
+
+		return (0);
+
+	default:
+		return (EINVAL);
+	}
+}
+
+/*
+ * Push a worker thread's stack onto the process's recycle list (if any).
+ */
+void
+thrworkq_reusestack(struct proc *p, void *stackaddr)
+{
+	struct thrworkq *wq;
+
+	WORKQUEUE_LOCK(p);
+	/* Recycle its stack; never write past the wq_maxthreads-entry list. */
+	if ((wq = p->p_twq) != NULL && wq->wq_stacktop < wq->wq_maxthreads)
+		wq->wq_stacklist[wq->wq_stacktop++] = stackaddr;
+	WORKQUEUE_UNLOCK(p);
+}
+
+/*
+ * thrworkq_exit is called when a process is about to exit (or has exec'ed).
+ * It detaches p->p_twq, stops the timer thread, and frees the workqueue.
+ */
+void
+thrworkq_exit(struct proc *p)
+{
+	struct thrworkq  *wq;
+	struct threadlist  *tl, *tlist;
+	struct thread *td;
+
+	KASSERT(p != 0, ("[%s: %d] thrworkq_exit: p = NULL",
+		__FILE__, __LINE__));
+
+	WORKQUEUE_LOCK(p);
+	if ((wq = p->p_twq) == NULL) {
+		WORKQUEUE_UNLOCK(p);
+		return;
+	}
+	p->p_twq = NULL;
+
+	/*
+	 * We now arm the timer in the callback function w/o
+	 * holding the workq lock.  WQ_ATIMER_RUNNING is set via
+	 * atomic_cmpset in order to ensure only a single timer is
+	 * running and to notice that WQ_EXITING has been set
+	 * (we don't want to start a timer once WQ_EXITING is
+	 * posted).
+	 *
+	 * So once we have successfully set WQ_EXITING, we cannot fire
+	 * up a new timer.  Therefore no need to clear the timer state
+	 * atomically from the flags.
+	 *
+	 * Since we always hold the workq_lock when dropping
+	 * WQ_ATIMER_RUNNING the check for and sleep until clear is
+	 * protected.
+	 */
+
+	while ( !(atomic_cmpset_32(&wq->wq_flags, wq->wq_flags,
+		    (wq->wq_flags | WQ_EXITING))));
+
+	if (wq->wq_flags & WQ_ATIMER_RUNNING) {
+		if (callout_stop(wq->wq_atimer_call) != 0)
+			wq->wq_flags &= ~WQ_ATIMER_RUNNING;
+	}
+
+	/* Wait for timer thread to die. */
+	if (wq->wq_atimer_thread) {
+		cv_signal(&wq->wq_atimer_cv);
+		if (msleep(wq->wq_atimer_thread, &p->p_twqlock, PWAIT,
+		    "twq_atimer", 60 * hz))
+			printf("thr workq timer thread didn't die.");
+		else
+			cv_destroy(&wq->wq_atimer_cv);
+	}
+	WORKQUEUE_UNLOCK(p);
+
+	TAILQ_FOREACH_SAFE(tl, &wq->wq_thrunlist, th_entry, tlist) {
+		td = tl->th_thread;
+
+		thread_lock(td);
+		twq_set_schedcallback(td, NULL);
+		td->td_threadlist = NULL;
+		thread_unlock(td);
+
+		TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
+
+		free(tl, M_THRWORKQ);
+	}
+	TAILQ_FOREACH_SAFE(tl, &wq->wq_thidlelist, th_entry, tlist) {
+		td = tl->th_thread;
+
+		thread_lock(td);
+		twq_set_schedcallback(td, NULL);
+		td->td_threadlist = NULL;
+		twq_set_upcall(tl, WQ_UPCALL_EXIT, NULL);
+		thread_unlock(td);
+
+		if (tl->th_flags & TH_LIST_UNSCHEDULED) {
+			/*
+			 * Schedule the unscheduled thread so it can exit.
+			 */
+			tl->th_flags &= ~TH_LIST_UNSCHEDULED;
+			twq_schedthr(td);
+		}
+
+		/* tl is on the idle list, so unlink it via that list's head. */
+		TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
+
+		free(tl, M_THRWORKQ);
+	}
+
+	callout_drain(wq->wq_atimer_call);
+	free(wq->wq_atimer_call, M_THRWORKQ);
+
+	free(wq->wq_stacklist, M_THRWORKQ);
+
+	free(wq, M_THRWORKQ);
+}
+
+/*
+ * Queue 'item' at priority 'prio'.  Workqueue lock must be held.
+ */
+static int
+twq_additem(struct thrworkq *wq, int prio, void *item, int affinity)
+{
+	struct workitemlist *wl;
+	struct workitem *witem;
+
+	WORKQUEUE_ASSERT_LOCKED(wq->wq_proc);
+
+	wl = (struct workitemlist *)&wq->wq_list[prio];
+
+	/* Grab a free element; ENOMEM if this priority has none left. */
+	witem = (struct workitem *)TAILQ_FIRST(&wl->wl_freelist);
+	if (witem == NULL)
+		return (ENOMEM);
+	TAILQ_REMOVE(&wl->wl_freelist, witem, wi_entry);
+
+	witem->wi_affinity = affinity;
+	witem->wi_item = item;
+	TAILQ_INSERT_TAIL(&wl->wl_itemlist, witem, wi_entry);
+
+	/* Flag this priority as non-empty and count the new item. */
+	wq->wq_itemcount++;
+	wq->wq_list_bitmap |= (1 << prio);
+
+	return (0);
+}
+
+/*
+ * Dequeue 'item' from priority 'prio'; ESRCH if absent.  Lock must be held.
+ */
+static int
+twq_removeitem(struct thrworkq *wq, int prio, void *item)
+{
+	struct workitemlist *wl;
+	struct workitem *witem;
+
+	WORKQUEUE_ASSERT_LOCKED(wq->wq_proc);
+
+	wl = (struct workitemlist *)&wq->wq_list[prio];
+
+	TAILQ_FOREACH(witem, &wl->wl_itemlist, wi_entry) {
+		if (witem->wi_item == item)
+			break;
+	}
+	if (witem == NULL)
+		return (ESRCH);
+
+	TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);
+	wq->wq_itemcount--;
+	if (TAILQ_EMPTY(&wl->wl_itemlist))
+		wq->wq_list_bitmap &= ~(1 << prio);
+
+	/* Scrub the element and hand it back to the free list. */
+	witem->wi_item = NULL;
+	witem->wi_affinity = 0;
+	TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);
+
+	return (0);
+}
+
+/*
+ * twq_run_nextitem is called with the workqueue lock held and must drop
+ * it in all cases.  Returns 1 if an item was handed to a thread, else 0.
+ */
+static int
+twq_run_nextitem(struct proc *p, struct thrworkq *wq,
+    struct thread *thread, void * oc_item, int oc_prio, int oc_affinity)
+{
+	struct 	 	 workitem *witem = NULL;
+	void 		*item = 0;
+	struct thread 	*th_to_run = NULL;
+	struct thread 	*th_to_park = NULL;
+	int 		 wake_thread = 0;
+	uint32_t 	 priority, orig_priority;
+	uint32_t 	 affinity_tag, orig_affinity_tag;
+	uint32_t 	 i;
+	uint32_t	 activecount, busycount;
+	uint32_t	 us_to_wait;
+	int 		 start_timer = 0;
+	int		 adjust_counters = 1;
+	uint64_t	 curtime;
+	int		 error;
+	struct threadlist *tl = NULL;
+	struct threadlist *ttl = NULL;
+	struct workitemlist *wl = NULL;
+
+	WORKQUEUE_ASSERT_LOCKED(p);
+	/*
+	 * From here until we drop the workq lock we can't be pre-empted.
+	 * This is important since we have to independently update the priority
+	 * and affinity that the thread is associated with and these values are
+	 * used to index the multi-dimensional counter arrays in
+	 * 'twq_callback'.
+	 */
+	if (oc_item) {
+		uint32_t min_scheduled = 0;
+		uint32_t scheduled_count;
+		uint32_t active_count;
+		uint32_t t_affinity = 0;
+
+		priority = oc_prio;
+		item = oc_item;
+
+		if ((affinity_tag = oc_affinity) == (uint32_t)-1) {
+			/*
+			 * CPU affinity is not assigned yet.
+			 */
+			for (affinity_tag = 0;
+			    affinity_tag < wq->wq_reqconc[priority];
+			    affinity_tag++) {
+				/*
+				 * Look for the affinity group with the
+				 * least number of threads.
+				 */
+				scheduled_count = 0;
+				active_count = 0;
+
+				for (i = 0; i <= priority; i++) {
+				    scheduled_count +=
+				      wq->wq_thscheduled_count[i][affinity_tag];
+				    active_count +=
+				      wq->wq_thactive_count[i][affinity_tag];
+				}
+				if (active_count == 0) {
+					t_affinity = affinity_tag;
+					break;
+				}
+				if (affinity_tag == 0 ||
+				    scheduled_count < min_scheduled) {
+					min_scheduled = scheduled_count;
+					t_affinity = affinity_tag;
+				}
+			}
+			affinity_tag = t_affinity;
+		}
+		goto grab_idle_thread;
+	}
+	if (wq->wq_itemcount == 0) {
+		if ((th_to_park = thread) == NULL)
+			goto out_of_work;
+		goto parkit;
+	}
+	for (priority = 0; priority < WORKQ_OS_NUMPRIOS; priority++) {
+		if (wq->wq_list_bitmap & (1 << priority)) {
+			wl = (struct workitemlist *)&wq->wq_list[priority];
+			break;
+		}
+	}
+	KASSERT(wl != NULL, ("[%s:%d] workq list is NULL", __FILE__, __LINE__));
+	KASSERT(!(TAILQ_EMPTY(&wl->wl_itemlist)),
+	    ("[%s:%d] workq list is empty",  __FILE__, __LINE__));
+
+	curtime = twq_microuptime();
+
+	if (thread != NULL) {
+		tl = thread->td_threadlist;
+		KASSERT(tl != NULL, ("[%s:%d] tl = NULL", __FILE__, __LINE__));
+		affinity_tag = tl->th_affinity_tag;
+
+		/*
+		 * Check to see if the affinity group this thread is
+		 * associated with is still within the bounds of the
+		 * specified concurrency for the priority level we're
+		 * considering running work for.
+		 */
+		if (affinity_tag < wq->wq_reqconc[priority]) {
+			/*
+			 * We're a worker thread from the pool.  Currently
+			 * we are considered 'active' which means we're counted
+			 * in 'wq_thactive_count'.  Add up the active counts
+			 * of all the priority levels up to and including
+			 * the one we want to schedule.
+			 */
+			for (activecount = 0, i = 0; i <= priority; i++) {
+				uint32_t  acount;
+
+				acount =
+				    wq->wq_thactive_count[i][affinity_tag];
+				 if (acount == 0 &&
+				     wq->wq_thscheduled_count[i][affinity_tag]){
+					 if (twq_thr_busy(curtime,
+					     &wq->wq_lastblocked_ts[i]
+					     [affinity_tag]))
+						acount = 1;
+				 }
+				 activecount += acount;
+			}
+			if (activecount == 1) {
+				/*
+				 * We're the only active thread associated
+				 * with our affinity group at this priority
+				 * level and higher so pick up some work and
+				 * keep going.
+				 */
+				th_to_run = thread;
+				goto pick_up_work;
+			}
+		}
+
+		/*
+		 * There's more than one thread running in this affinity group
+		 * or the concurrency level has been cut back for this priority.
+		 * Let's continue on and look for an 'empty' group to run this
+		 * work item in.
+		 */
+	}
+	busycount = 0;
+
+	for (affinity_tag = 0; affinity_tag < wq->wq_reqconc[priority];
+	    affinity_tag++) {
+		/*
+		 * Look for first affinity group that is currently not active
+		 * (i.e. no active threads at this priority level or higher
+		 * and no threads that have run recently).
+		 */
+		for (activecount = 0, i = 0; i <= priority; i++) {
+			 if ((activecount =
+				 wq->wq_thactive_count[i][affinity_tag]) != 0)
+				 break;
+
+			 if (wq->wq_thscheduled_count[i][affinity_tag] != 0) {
+				 if (twq_thr_busy(curtime,
+				     &wq->wq_lastblocked_ts[i][affinity_tag])) {
+					 busycount++;
+					 break;
+				 }
+			 }
+		}
+		if (activecount == 0 && busycount == 0)
+			break;
+	}
+	if (affinity_tag >= wq->wq_reqconc[priority]) {
+		/*
+		 * We've already got at least 1 thread per affinity group in
+		 * the active state.
+		 */
+		if (busycount) {
+			/*
+			 * We found at least 1 thread in the 'busy' state.
+			 * Make sure we start the timer because if they are the
+			 * threads keeping us from scheduling this workitem then
+			 * we won't get a callback to kick off the timer.  We
+			 * need to start it now.
+			 */
+			WQ_TIMER_NEEDED(wq, start_timer);
+		}
+
+		if (thread != NULL) {
+			/*
+			 * Go park this one for later.
+			 */
+			th_to_park = thread;
+			goto parkit;
+		}
+		goto out_of_work;
+	}
+	if (thread != NULL) {
+		/*
+		 * We're overbooked on the affinity group this thread is
+		 * currently associated with but we have work to do and
+		 * at least 1 idle processor.  Therefore, we'll just
+		 * retarget this thread to the new affinity group.
+		 */
+		th_to_run = thread;
+		goto pick_up_work;
+	}
+	if (wq->wq_thidlecount == 0) {
+		/*
+		 * We don't have a thread to schedule but we have work to
+		 * do and at least 1 affinity group doesn't currently have
+		 * an active thread.
+		 */
+		WQ_TIMER_NEEDED(wq, start_timer);
+		goto no_thread_to_run;
+	}
+
+grab_idle_thread:
+	/*
+	 * We've got a candidate (affinity group with no currently active
+	 * threads) to start a new thread on.  We already know there is both
+	 * work available and an idle thread, so activate a thread and then
+	 * fall into the code that pulls a new workitem (pick_up_work).
+	 */
+	TAILQ_FOREACH(ttl, &wq->wq_thidlelist, th_entry) {
+		if (ttl->th_affinity_tag == affinity_tag ||
+		    ttl->th_affinity_tag == (uint16_t)-1) {
+			TAILQ_REMOVE(&wq->wq_thidlelist, ttl, th_entry);
+			tl = ttl;
+
+			break;
+		}
+	}
+	if (tl == NULL) {
+		tl = TAILQ_FIRST(&wq->wq_thidlelist);
+		TAILQ_REMOVE(&wq->wq_thidlelist, tl, th_entry);
+	}
+	wq->wq_thidlecount--;
+
+	TAILQ_INSERT_TAIL(&wq->wq_thrunlist, tl, th_entry);
+
+	if ((tl->th_flags & TH_LIST_UNSCHEDULED) == TH_LIST_UNSCHEDULED) {
+		tl->th_flags &= ~TH_LIST_UNSCHEDULED;
+		tl->th_flags |= TH_LIST_SBUSY;
+
+		thread_lock(tl->th_thread);
+		twq_set_schedcallback(tl->th_thread, twq_callback);
+		thread_unlock(tl->th_thread);
+
+	} else if ((tl->th_flags & TH_LIST_BLOCKED) == TH_LIST_BLOCKED) {
+		tl->th_flags &= ~TH_LIST_BLOCKED;
+		tl->th_flags |= TH_LIST_BUSY;
+		wake_thread = 1;
+	}
+	tl->th_flags |= TH_LIST_RUNNING;
+
+	wq->wq_threads_scheduled++;
+	wq->wq_thscheduled_count[priority][affinity_tag]++;
+	atomic_add_32(&wq->wq_thactive_count[priority][affinity_tag], 1);
+
+	adjust_counters = 0;
+	th_to_run = tl->th_thread;
+
+pick_up_work:
+	if (item == 0) {
+		witem = TAILQ_FIRST(&wl->wl_itemlist);
+		TAILQ_REMOVE(&wl->wl_itemlist, witem, wi_entry);
+
+		if (TAILQ_EMPTY(&wl->wl_itemlist))
+			wq->wq_list_bitmap &= ~(1 << priority);
+		wq->wq_itemcount--;
+
+		item = witem->wi_item;
+		witem->wi_item = (void *)0;
+		witem->wi_affinity = 0;
+		TAILQ_INSERT_HEAD(&wl->wl_freelist, witem, wi_entry);
+	}
+	orig_priority = tl->th_priority;
+	orig_affinity_tag = tl->th_affinity_tag;
+
+	tl->th_priority = priority;
+	tl->th_affinity_tag = affinity_tag;
+
+	if (adjust_counters &&
+	    (orig_priority != priority || orig_affinity_tag != affinity_tag)) {
+		/*
+		 * We need to adjust these counters based on this thread's
+		 * new disposition w/r to affinity and priority.
+		 */
+		atomic_add_int(&wq->wq_thactive_count[orig_priority]
+		    [orig_affinity_tag], -1);
+		atomic_add_int(&wq->wq_thactive_count[priority][affinity_tag],
+		    1);
+		wq->wq_thscheduled_count[orig_priority][orig_affinity_tag]--;
+		wq->wq_thscheduled_count[priority][affinity_tag]++;
+	}
+	wq->wq_thread_yielded_count = 0;
+
+	WORKQUEUE_UNLOCK(p);
+
+	if (orig_affinity_tag != affinity_tag) {
+		/*
+		 * This thread's affinity does not match the affinity group
+		 * it's being placed on (it's either a brand new thread or
+		 * we're retargeting an existing thread to a new group).
+		 * An affinity tag of 0 means no affinity but we want our
+		 * tags to be 0 based because they are used to index arrays
+		 * so keep it 0 based internally and bump by 1 when
+		 * calling out to set it.
+		 */
+
+		/* XXX - Not used yet. */
+#if 0
+		CPU_ZERO(&mask);
+		CPU_SET(affinity_tag, &mask);
+		cpuset_setthread(th_to_run->td_tid, &mask);
+#endif
+		;
+	}
+	if (orig_priority != priority) {
+		/*
+		 * XXX Set thread priority.
+		 *
+		 * Can't simply just set thread priority here since:
+		 *
+		 * (1) The thread priority of TIMESHARE priority class is
+		 * adjusted by the scheduler and there doesn't seem to be
+		 * a per-thread 'nice' parameter.
+		 *
+		 * (2) We really shouldn't use the REALTIME class since
+		 * thread workqueues don't require the process to have
+		 * privilege.
+		 *
+		 * (3) Could maybe use IDLE priority class for
+		 * WORKQ_LOW_PRIOQUEUE.
+		 *
+		 * Need to figure out something here.
+		 */
+		;
+	}
+	twq_runitem(p, item, th_to_run, tl, wake_thread);
+
+	return (1);
+
+out_of_work:
+	/*
+	 * We have no work to do or we are fully booked w/r to running threads.
+	 */
+no_thread_to_run:
+	WORKQUEUE_UNLOCK(p);
+
+	if (start_timer)
+		twq_interval_timer_start(wq);
+
+	return (0);
+
+parkit:
+	/*
+	 * This is a workqueue thread with no more work to do.
+	 * Park it for now.
+	 */
+
+	KASSERT(th_to_park == curthread,
+	    ("[%s, %d] twq_run_nextitem: th_to_park is not current thread",
+	     __FILE__, __LINE__));
+
+	tl = th_to_park->td_threadlist;
+	if (tl == 0)
+		panic("wq thread with no threadlist ");
+
+	TAILQ_REMOVE(&wq->wq_thrunlist, tl, th_entry);
+	tl->th_flags &= ~TH_LIST_RUNNING;
+
+	tl->th_flags |= TH_LIST_BLOCKED;
+	TAILQ_INSERT_HEAD(&wq->wq_thidlelist, tl, th_entry);
+
+	thread_lock(th_to_park);
+	twq_set_schedcallback(th_to_park, NULL);
+	thread_unlock(th_to_park);
+
+	atomic_add_32(&wq->wq_thactive_count[tl->th_priority]
+	    [tl->th_affinity_tag], -1);
+	wq->wq_thscheduled_count[tl->th_priority][tl->th_affinity_tag]--;
+	wq->wq_threads_scheduled--;
+
+	if (wq->wq_thidlecount < 100)
+		us_to_wait = wq_reduce_pool_window_usecs -
+		    (wq->wq_thidlecount * (wq_reduce_pool_window_usecs / 100));
+	else
+		us_to_wait = wq_reduce_pool_window_usecs / 100;
+
+	wq->wq_thidlecount++;
+
+	if (start_timer)
+		twq_interval_timer_start(wq);
+
+	/*
+	 * XXX  I may be imagining things but it seems that only one
+	 * thread will get unparked when a bunch are parked.  Need
+	 * to look at this closer.
+	 */
+sleep_again:
+	error = msleep(tl, &p->p_twqlock, PCATCH, "twq_thpark",
+	    twq_usecstoticks(us_to_wait));
+	if (error == 0 || error == EWOULDBLOCK) {
+		if (twq_unpark(th_to_park, error == EWOULDBLOCK) != 0)
+			goto sleep_again;
+	} else
+		WORKQUEUE_UNLOCK(p);
+
+	return (0);	/* returning to system call handler */
+}
+
+/*
+ * Called by a parked workqueue thread once its msleep() in
+ * twq_run_nextitem() returns; 'timedout' is set if the sleep timed out.
+ * Entered with the workq lock held.  Returns 1 with the lock still held
+ * if the caller must sleep again, otherwise 0 with the lock released.
+ */
+static int
+twq_unpark(struct thread *td, int timedout)
+{
+	struct proc *p = curproc;
+	struct thrworkq *wq = p->p_twq;
+	struct threadlist *tl;
+
+	KASSERT(td == curthread, ("[%s: %d] twq_unpark: td != curthread",
+	    __FILE__, __LINE__));
+	WORKQUEUE_ASSERT_LOCKED(p);
+
+	/* The workqueue or our threadlist entry may have been torn down. */
+	if (wq == NULL || (tl = td->td_threadlist) == NULL) {
+		WORKQUEUE_UNLOCK(p);
+		return (0);
+	}
+
+	if (timedout && (tl->th_flags & TH_LIST_RUNNING) == 0) {
+		/*
+		 * The timer popped us out and we've not been moved off
+		 * of the idle list so we should now self-destruct.
+		 *
+		 * twq_removethread() consumes the workq lock.
+		 */
+		twq_removethread(tl);
+		return (0);
+	}
+
+	if (timedout && (tl->th_flags & TH_LIST_BUSY) != 0) {
+		/*
+		 * The timer woke us up, but the process of making this a
+		 * runnable thread has started and not yet finished, so
+		 * wait for the normal wakeup.  Keep the workq lock held
+		 * and have the caller sleep again.
+		 */
+		return (1);
+	}
+
+	KASSERT(((tl->th_flags & (TH_LIST_RUNNING | TH_LIST_BUSY)) ==
+	    TH_LIST_RUNNING), ("[%s: %d] twq_unpark: !TH_LIST_RUNNING",
+	    __FILE__, __LINE__));
+
+	/*
+	 * A normal wakeup of this thread occurred.  No need for any
+	 * synchronization with the timer and twq_runitem.
+	 */
+	thread_lock(td);
+	twq_set_schedcallback(td, twq_callback);
+	thread_unlock(td);
+
+	WORKQUEUE_UNLOCK(p);
+	return (0);
+}
+
+/* Deliver 'item' to 'td' through an upcall, then wake or schedule it. */
+static void
+twq_runitem(struct proc *p, void *item, struct thread *td,
+    struct threadlist *tl, int wake_thread)
+{
+
+	KASSERT(p->p_twq != NULL, ("[%s: %d] twq_runitem: wq = NULL",
+	    __FILE__, __LINE__));
+
+	if (!wake_thread) {
+		twq_set_upcall(tl, WQ_UPCALL_NEWTD, item);
+		WORKQUEUE_LOCK(p);
+		if ((tl->th_flags & TH_LIST_SBUSY) != 0) {
+			tl->th_flags &= ~TH_LIST_SBUSY;
+			twq_schedthr(td);
+		}
+	} else {
+		twq_set_upcall(tl, WQ_UPCALL_WORKQ, item);
+		WORKQUEUE_LOCK(p);
+		tl->th_flags &= ~TH_LIST_BUSY;
+		wakeup(tl);
+	}
+	WORKQUEUE_UNLOCK(p);
+}
+
+#else /* ! THRWORKQ */
+
+#include <sys/sysproto.h>
+
+int
+sys_thr_workq(struct thread *td, struct thr_workq_args *uap)
+{
+	/* Kernel built without THRWORKQ: the system call is unavailable. */
+	return (ENOSYS);
+}
+
+#endif /* ! THRWORKQ */
Index: sys/kern/p1003_1b.c
===================================================================
--- sys/kern/p1003_1b.c
+++ sys/kern/p1003_1b.c
@@ -37,6 +37,7 @@
 __FBSDID("$FreeBSD$");
 
 #include "opt_posix.h"
+#include "opt_thrworkq.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -52,6 +53,9 @@
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
+#ifdef THRWORKQ
+#include <sys/thrworkq.h>
+#endif
 
 MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
 
@@ -292,6 +296,9 @@
 sys_sched_yield(struct thread *td, struct sched_yield_args *uap)
 {
 
+#ifdef THRWORKQ
+	thrworkq_thread_yielded();
+#endif
 	sched_relinquish(curthread);
 	return 0;
 }
Index: sys/kern/syscalls.c
===================================================================
--- sys/kern/syscalls.c
+++ sys/kern/syscalls.c
@@ -474,8 +474,8 @@
 	"thr_set_name",			/* 464 = thr_set_name */
 	"aio_fsync",			/* 465 = aio_fsync */
 	"rtprio_thread",			/* 466 = rtprio_thread */
-	"#467",			/* 467 = nosys */
-	"#468",			/* 468 = nosys */
+	"thr_stack",			/* 467 = thr_stack */
+	"thr_workq",			/* 468 = thr_workq */
 	"#469",			/* 469 = __getpath_fromfd */
 	"#470",			/* 470 = __getpath_fromaddr */
 	"sctp_peeloff",			/* 471 = sctp_peeloff */
Index: sys/kern/systrace_args.c
===================================================================
--- sys/kern/systrace_args.c
+++ sys/kern/systrace_args.c
@@ -2615,6 +2615,22 @@
 		*n_args = 3;
 		break;
 	}
+	/* thr_stack */
+	case 467: {
+		struct thr_stack_args *p = params;
+		uarg[0] = p->stacksize; /* size_t */
+		uarg[1] = p->guardsize; /* size_t */
+		*n_args = 2;
+		break;
+	}
+	/* thr_workq */
+	case 468: {
+		struct thr_workq_args *p = params;
+		iarg[0] = p->cmd; /* int */
+		uarg[1] = (intptr_t) p->args; /* struct twq_param * */
+		*n_args = 2;
+		break;
+	}
 	/* sctp_peeloff */
 	case 471: {
 		struct sctp_peeloff_args *p = params;
@@ -7584,6 +7600,32 @@
 			break;
 		};
 		break;
+	/* thr_stack */
+	case 467:
+		switch(ndx) {
+		case 0:
+			p = "size_t";
+			break;
+		case 1:
+			p = "size_t";
+			break;
+		default:
+			break;
+		};
+		break;
+	/* thr_workq */
+	case 468:
+		switch(ndx) {
+		case 0:
+			p = "int";
+			break;
+		case 1:
+			p = "struct twq_param *";
+			break;
+		default:
+			break;
+		};
+		break;
 	/* sctp_peeloff */
 	case 471:
 		switch(ndx) {
@@ -10413,6 +10455,16 @@
 		if (ndx == 0 || ndx == 1)
 			p = "int";
 		break;
+	/* thr_stack */
+	case 467:
+		if (ndx == 0 || ndx == 1)
+			p = "caddr_t";
+		break;
+	/* thr_workq */
+	case 468:
+		if (ndx == 0 || ndx == 1)
+			p = "int";
+		break;
 	/* sctp_peeloff */
 	case 471:
 		if (ndx == 0 || ndx == 1)
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -180,6 +180,10 @@
 struct thread;
 struct trapframe;
 struct turnstile;
+struct thrworkq;
+struct threadlist;
+
+typedef void (*mi_switchcb_t)(int, struct thread *);
 
 /*
  * XXX: Does this belong in resource.h or resourcevar.h instead?
@@ -336,7 +340,11 @@
 	struct proc	*td_rfppwait_p;	/* (k) The vforked child */
 	struct vm_page	**td_ma;	/* (k) uio pages held */
 	int		td_ma_cnt;	/* (k) size of *td_ma */
+	mi_switchcb_t   td_cswitchcb;   /* (k) context switch callback. */
+	struct threadlist *td_threadlist; /* (?) thread workq thread list. */
+	void            *td_reuse_stack;  /* (?) reuse workq thread stack.  */
 	void		*td_emuldata;	/* Emulator state data */
+	void		*td_machdata;	/* (k) mach state. */
 	int		td_lastcpu;	/* (t) Last cpu we were on. */
 	int		td_oncpu;	/* (t) Which cpu we are on. */
 };
@@ -400,7 +408,7 @@
 #define	TDF_THRWAKEUP	0x00100000 /* Libthr thread must not suspend itself. */
 #define	TDF_SEINTR	0x00200000 /* EINTR on stop attempts. */
 #define	TDF_SWAPINREQ	0x00400000 /* Swapin request due to wakeup. */
-#define	TDF_UNUSED23	0x00800000 /* --available-- */
+#define	TDF_WORKQ	0x00800000 /* a workq thread */ 
 #define	TDF_SCHED0	0x01000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED1	0x02000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED2	0x04000000 /* Reserved for scheduler private use */
@@ -461,6 +469,7 @@
 #define	TDP_UIOHELD	0x10000000 /* Current uio has pages held in td_ma */
 #define	TDP_FORKING	0x20000000 /* Thread is being created through fork() */
 #define	TDP_EXECVMSPC	0x40000000 /* Execve destroyed old vmspace */
+#define	TDP_UNUSUED32	0x80000000 /* Mach initialization done */
 
 /*
  * Reasons that the current thread can not be run yet.
@@ -647,6 +656,10 @@
 	 */
 	LIST_ENTRY(proc) p_orphan;	/* (e) List of orphan processes. */
 	LIST_HEAD(, proc) p_orphans;	/* (e) Pointer to list of orphans. */
+	vm_offset_t	p_thrstack;	/* ( ) next addr for thread stack */
+	struct mtx	p_twqlock;	/* (n) thread workqueue lock. */
+	struct thrworkq *p_twq;		/* (^) thread workqueue. */
+	void		*p_machdata;	/* (c) Mach state data. */
 };
 
 #define	p_session	p_pgrp->pg_session
@@ -759,6 +772,9 @@
 #define	SW_VOL		0x0100		/* Voluntary switch. */
 #define	SW_INVOL	0x0200		/* Involuntary switch. */
 #define SW_PREEMPT	0x0400		/* The invol switch is a preemption */
+/* Callback type. */
+#define	SWCB_BLOCK		1	/* Thread is about to block. */
+#define	SWCB_UNBLOCK		2	/* Thread was just unblocked. */
 
 /* How values for thread_single(). */
 #define	SINGLE_NO_EXIT	0
Index: sys/sys/syscall.h
===================================================================
--- sys/sys/syscall.h
+++ sys/sys/syscall.h
@@ -389,6 +389,8 @@
 #define	SYS_thr_set_name	464
 #define	SYS_aio_fsync	465
 #define	SYS_rtprio_thread	466
+#define	SYS_thr_stack	467
+#define	SYS_thr_workq	468
 #define	SYS_sctp_peeloff	471
 #define	SYS_sctp_generic_sendmsg	472
 #define	SYS_sctp_generic_sendmsg_iov	473
Index: sys/sys/syscall.mk
===================================================================
--- sys/sys/syscall.mk
+++ sys/sys/syscall.mk
@@ -319,6 +319,8 @@
 	thr_set_name.o \
 	aio_fsync.o \
 	rtprio_thread.o \
+	thr_stack.o \
+	thr_workq.o \
 	sctp_peeloff.o \
 	sctp_generic_sendmsg.o \
 	sctp_generic_sendmsg_iov.o \
Index: sys/sys/syscallsubr.h
===================================================================
--- sys/sys/syscallsubr.h
+++ sys/sys/syscallsubr.h
@@ -241,6 +241,8 @@
 int	kern_thr_alloc(struct proc *, int pages, struct thread **);
 int	kern_thr_exit(struct thread *td);
 int	kern_thr_new(struct thread *td, struct thr_param *param);
+int	kern_thr_stack(struct proc *p, void **addr, vm_size_t stacksz,
+	    vm_size_t guardsz);
 int	kern_thr_suspend(struct thread *td, struct timespec *tsp);
 int	kern_truncate(struct thread *td, char *path, enum uio_seg pathseg,
 	    off_t length);
Index: sys/sys/sysproto.h
===================================================================
--- sys/sys/sysproto.h
+++ sys/sys/sysproto.h
@@ -1383,6 +1383,14 @@
 	char lwpid_l_[PADL_(lwpid_t)]; lwpid_t lwpid; char lwpid_r_[PADR_(lwpid_t)];
 	char rtp_l_[PADL_(struct rtprio *)]; struct rtprio * rtp; char rtp_r_[PADR_(struct rtprio *)];
 };
+struct thr_stack_args {
+	char stacksize_l_[PADL_(size_t)]; size_t stacksize; char stacksize_r_[PADR_(size_t)];
+	char guardsize_l_[PADL_(size_t)]; size_t guardsize; char guardsize_r_[PADR_(size_t)];
+};
+struct thr_workq_args {
+	char cmd_l_[PADL_(int)]; int cmd; char cmd_r_[PADR_(int)];
+	char args_l_[PADL_(struct twq_param *)]; struct twq_param * args; char args_r_[PADR_(struct twq_param *)];
+};
 struct sctp_peeloff_args {
 	char sd_l_[PADL_(int)]; int sd; char sd_r_[PADR_(int)];
 	char name_l_[PADL_(uint32_t)]; uint32_t name; char name_r_[PADR_(uint32_t)];
@@ -2100,6 +2108,8 @@
 int	sys_thr_set_name(struct thread *, struct thr_set_name_args *);
 int	sys_aio_fsync(struct thread *, struct aio_fsync_args *);
 int	sys_rtprio_thread(struct thread *, struct rtprio_thread_args *);
+int	sys_thr_stack(struct thread *, struct thr_stack_args *);
+int	sys_thr_workq(struct thread *, struct thr_workq_args *);
 int	sys_sctp_peeloff(struct thread *, struct sctp_peeloff_args *);
 int	sys_sctp_generic_sendmsg(struct thread *, struct sctp_generic_sendmsg_args *);
 int	sys_sctp_generic_sendmsg_iov(struct thread *, struct sctp_generic_sendmsg_iov_args *);
@@ -2878,6 +2888,8 @@
 #define	SYS_AUE_thr_set_name	AUE_NULL
 #define	SYS_AUE_aio_fsync	AUE_NULL
 #define	SYS_AUE_rtprio_thread	AUE_RTPRIO
+#define	SYS_AUE_thr_stack	AUE_NULL
+#define	SYS_AUE_thr_workq	AUE_NULL
 #define	SYS_AUE_sctp_peeloff	AUE_NULL
 #define	SYS_AUE_sctp_generic_sendmsg	AUE_NULL
 #define	SYS_AUE_sctp_generic_sendmsg_iov	AUE_NULL
Index: sys/sys/thr.h
===================================================================
--- sys/sys/thr.h
+++ sys/sys/thr.h
@@ -44,6 +44,11 @@
 /* Create the system scope thread. */
 #define	THR_SYSTEM_SCOPE	0x0002
 
+/* Default thread stack size. */
+#define	THR_STACK_DEFAULT	(sizeof(void *) / 4 * 1024 * 1024)
+/* Initial (main) thread's stack size. */
+#define	THR_STACK_INITIAL	(THR_STACK_DEFAULT * 2)
+
 struct thr_param {
     void	(*start_func)(void *);	/* thread entry function. */
     void	*arg;			/* argument for entry function. */
@@ -79,6 +84,7 @@
 int thr_suspend(const struct timespec *timeout);
 int thr_wake(long id);
 int thr_set_name(long id, const char *name);
+void *thr_stack(size_t stacksize, size_t guardsize);
 __END_DECLS
 #endif /* !_KERNEL */
 
Index: sys/sys/thrworkq.h
===================================================================
--- /dev/null
+++ sys/sys/thrworkq.h
@@ -0,0 +1,124 @@
+/*-
+ * Copyright (c) 2009-2014 Stacey Son <sson@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef	_SYS_THRWORKQ_H_
+#define	_SYS_THRWORKQ_H_
+
+/*
+ * thr_workq() system call commands (cmd argument; see struct twq_param).
+ */
+#define	WQOPS_INIT		1	/* args: struct wqa_init */
+#define	WQOPS_QUEUE_ADD		2	/* args: struct wqa_qadd */
+#define	WQOPS_QUEUE_REMOVE	3	/* args: struct wqa_qrm */
+#define	WQOPS_THREAD_RETURN	4	/* args: twq_id only */
+#define	WQOPS_THREAD_SETCONC	5	/* args: struct wqa_setconc */
+
+/*
+ * Workqueue priority flags.
+ */
+#define	WORKQUEUE_OVERCOMMIT	0x10000	/* Attempt to start new thread if
+					   can't run immediately even if
+					   it requires overcommitting
+					   resources. */
+
+/*
+ * Kernel workqueue limits and sizing defaults.
+ */
+#define	WORKQ_OS_ELEM_MAX	64	/* Max number of work items pending in
+					   workq. */
+#define	WORKQ_OS_NUMPRIOS	 3	/* Number of workq priority levels. */
+
+struct wqa_init {
+	int	*retid;			/* workqueue ID is returned here */
+	void	(*workqfunc)(void *);	/* workq entry function */
+	void	(*newtdfunc)(void *);	/* new thread startup function */
+	void	(*exitfunc)(void *);	/* thread shutdown function */
+	size_t	stacksize;		/* per worker thread stack size */
+	size_t	guardsize;		/* per worker thread stack guard size */
+};
+
+struct wqa_qadd {
+	void	*item;			/* work item (arg to workq func) */
+	int	prio;			/* priority to queue the item at */
+	int	affin;			/* item CPU affinity */
+};
+
+struct wqa_qrm {
+	void	*item;			/* work item to remove */
+	int	prio;			/* priority it was queued at */
+};
+
+struct wqa_setconc {
+	int	prio;			/* priority; WORKQ_OS_NUMPRIOS = all */
+	int	conc;			/* requested concurrency */
+};
+
+struct twq_param {
+	int	twq_id;			/* workqueue ID (must match pid) */
+	union {
+		struct 	wqa_init	init;
+		struct  wqa_qadd 	qadd;
+		struct	wqa_qrm		qrm;
+		struct  wqa_setconc	setconc;
+	} a;				/* per-command arguments */
+
+#define	twq_retid		a.init.retid
+#define	twq_workqfunc		a.init.workqfunc
+#define	twq_newtdfunc		a.init.newtdfunc
+#define	twq_exitfunc		a.init.exitfunc
+#define	twq_stacksize		a.init.stacksize
+#define	twq_guardsize		a.init.guardsize
+
+#define	twq_add_item		a.qadd.item
+#define	twq_add_prio		a.qadd.prio
+#define	twq_add_affin		a.qadd.affin
+
+#define	twq_rm_item		a.qrm.item
+#define	twq_rm_prio		a.qrm.prio
+
+#define	twq_setconc_prio	a.setconc.prio
+#define	twq_setconc_conc	a.setconc.conc
+};
+
+#ifdef _KERNEL
+#include <sys/proc.h>
+
+extern void thrworkq_exit(struct proc *p);
+extern int thrworkq_newthread(struct thread *td, void *func, void *arg,
+    stack_t *stack);
+extern void thrworkq_reusestack(struct proc *p, void *stackaddr);
+extern void thrworkq_thread_yielded(void);
+
+#else
+
+int thr_workq(int cmd, struct twq_param *args);
+
+#endif /* _KERNEL */
+
+#endif /* ! _SYS_THRWORKQ_H_ */