Index: head/sys/kern/kern_cpuset.c
===================================================================
--- head/sys/kern/kern_cpuset.c	(revision 317755)
+++ head/sys/kern/kern_cpuset.c	(revision 317756)
@@ -1,1321 +1,1329 @@
 /*-
  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  * 
  * Copyright (c) 2008 Nokia Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/cpuset.h>
 #include <sys/sx.h>
 #include <sys/queue.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif /* DDB */
 
 /*
  * cpusets provide a mechanism for creating and manipulating sets of
  * processors for the purpose of constraining the scheduling of threads to
  * specific processors.
  *
  * Each process belongs to an identified set, by default this is set 1.  Each
  * thread may further restrict the cpus it may run on to a subset of this
  * named set.  This creates an anonymous set which other threads and processes
  * may not join by number.
  *
  * The named set is referred to herein as the 'base' set to avoid ambiguity.
  * This set is usually a child of a 'root' set while the anonymous set may
  * simply be referred to as a mask.  In the syscall api these are referred to
  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
  *
  * Threads inherit their set from their creator whether it be anonymous or
  * not.  This means that anonymous sets are immutable because they may be
  * shared.  To modify an anonymous set a new set is created with the desired
  * mask and the same parent as the existing anonymous set.  This gives the
  * illusion of each thread having a private mask.
  *
  * Via the syscall apis a user may ask to retrieve or modify the root, base,
  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
  * modifies all numbered and anonymous child sets to comply with the new mask.
  * Modifying a pid or tid's mask applies only to that tid but must still
  * exist within the assigned parent set.
  *
  * A thread may not be assigned to a group separate from other threads in
  * the process.  This is to remove ambiguity when the setid is queried with
  * a pid argument.  There is no other technical limitation.
  *
  * This somewhat complex arrangement is intended to make it easy for
  * applications to query available processors and bind their threads to
  * specific processors while also allowing administrators to dynamically
  * reprovision by changing sets which apply to groups of processes.
  *
  * A simple application should not concern itself with sets at all and
  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
  * meaning 'curthread'.  It may query available cpus for that tid with a
  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  */
 /* UMA zone from which every struct cpuset in this file is allocated. */
 static uma_zone_t cpuset_zone;
 /* Spin mutex taken around set list/tree manipulation and mask updates. */
 static struct mtx cpuset_lock;
 /* All numbered (non-anonymous) sets, linked through cs_link. */
 static struct setlist cpuset_ids;
 /* Unit-number allocator that hands out set ids (created in cpuset_thread0). */
 static struct unrhdr *cpuset_unr;
 /* The root set (id 0) and the default set (id 1); see cpuset_thread0(). */
 static struct cpuset *cpuset_zero, *cpuset_default;
 
 /* Return the size of cpuset_t at the kernel level */
 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
 
 /* Points at the mask of the root set; assigned in cpuset_thread0(). */
 cpuset_t *cpuset_root;
 /* Per-memory-domain CPU masks, indexed by domain number. */
 cpuset_t cpuset_domain[MAXMEMDOM];
 
 /*
  * Take one more reference on 'set' and hand the same pointer back so the
  * call can be used inline.  Every stored pointer to a cpuset must be backed
  * by a reference.
  */
 struct cpuset *
 cpuset_ref(struct cpuset *set)
 {
 	struct cpuset *held;
 
 	held = set;
 	refcount_acquire(&held->cs_ref);
 	return (held);
 }
 
 /*
  * Climb from 'set' towards the top of the tree and stop at the first set
  * flagged CPU_SET_ROOT, or at the set that has no parent.  The set found is
  * returned with a new reference.
  */
 static struct cpuset *
 cpuset_refroot(struct cpuset *set)
 {
 	struct cpuset *cur;
 
 	cur = set;
 	while (cur->cs_parent != NULL && (cur->cs_flags & CPU_SET_ROOT) == 0)
 		cur = cur->cs_parent;
 
 	return (cpuset_ref(cur));
 }
 
 /*
  * Return the numbered ("base") set for 'set' with a new reference: an
  * anonymous set (cs_id == CPUSET_INVALID) yields its parent, any other set
  * yields itself.
  */
 static struct cpuset *
 cpuset_refbase(struct cpuset *set)
 {
 	struct cpuset *base;
 
 	base = (set->cs_id == CPUSET_INVALID) ? set->cs_parent : set;
 
 	return (cpuset_ref(base));
 }
 
 /*
  * Drop a reference in a context where it is safe to allocate and free.
  * When the last reference goes away the set is unlinked, the reference it
  * held on its parent is dropped, and the memory and id are returned.
  */
 void
 cpuset_rel(struct cpuset *set)
 {
 	cpusetid_t setid;
 	int numbered;
 
 	if (!refcount_release(&set->cs_ref))
 		return;
 
 	/* Last reference: detach from the tree and from the id list. */
 	mtx_lock_spin(&cpuset_lock);
 	LIST_REMOVE(set, cs_siblings);
 	setid = set->cs_id;
 	numbered = (setid != CPUSET_INVALID);
 	if (numbered)
 		LIST_REMOVE(set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 
 	/* The parent must be released before 'set' itself is freed. */
 	cpuset_rel(set->cs_parent);
 	uma_zfree(cpuset_zone, set);
 	if (numbered)
 		free_unr(cpuset_unr, setid);
 }
 
 /*
  * Drop a reference from a context that must not allocate or free.  A set
  * whose last reference goes away is unlinked and parked on 'head' (through
  * cs_link) so that cpuset_rel_complete() can finish the job later.
  */
 static void
 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
 {
 
 	if (!refcount_release(&set->cs_ref))
 		return;
 
 	mtx_lock_spin(&cpuset_lock);
 	LIST_REMOVE(set, cs_siblings);
 	/* cs_link is reused for the deferred list, so leave the id list first. */
 	if (set->cs_id != CPUSET_INVALID)
 		LIST_REMOVE(set, cs_link);
 	LIST_INSERT_HEAD(head, set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 }
 
 /*
  * Finish a release started by cpuset_rel_defer(): take the set off the
  * deferred list, drop its reference on the parent and free the memory.
  */
 static void
 cpuset_rel_complete(struct cpuset *set)
 {
 	struct cpuset *parent;
 
 	LIST_REMOVE(set, cs_link);
 	parent = set->cs_parent;
 	cpuset_rel(parent);
 	uma_zfree(cpuset_zone, set);
 }
 
 /*
  * Look up a numbered set by id and return it with a new reference, or NULL
  * when no such set exists.  For a jailed 'td' the set is only returned if
  * the jail's own cpuset is the set itself or one of its ancestors.
  */
 static struct cpuset *
 cpuset_lookup(cpusetid_t setid, struct thread *td)
 {
 	struct cpuset *found, *jailset, *walk;
 
 	if (setid == CPUSET_INVALID)
 		return (NULL);
 
 	mtx_lock_spin(&cpuset_lock);
 	LIST_FOREACH(found, &cpuset_ids, cs_link) {
 		if (found->cs_id == setid) {
 			cpuset_ref(found);
 			break;
 		}
 	}
 	mtx_unlock_spin(&cpuset_lock);
 
 	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
 	if (found == NULL || !jailed(td->td_ucred))
 		return (found);
 
 	/* Jailed caller: the set must live underneath the jail's cpuset. */
 	jailset = td->td_ucred->cr_prison->pr_cpuset;
 	walk = found;
 	while (walk != NULL && walk != jailset)
 		walk = walk->cs_parent;
 	if (walk == NULL) {
 		cpuset_rel(found);
 		found = NULL;
 	}
 
 	return (found);
 }
 
 /*
  * Initialise the caller-supplied storage 'set' as a child of 'parent' with
  * id 'id' and a mask equal to 'mask' restricted to the parent's mask.  The
  * new set starts with one reference and takes a reference on the parent.
  * Returns EDEADLK when 'mask' shares no cpu with the parent.
  */
 static int
 _cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
     cpusetid_t id)
 {
 
 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
 		return (EDEADLK);
 
 	/* Private initialisation, done before the set is linked anywhere. */
 	set->cs_flags = 0;
 	refcount_init(&set->cs_ref, 1);
 	LIST_INIT(&set->cs_children);
 	CPU_COPY(mask, &set->cs_mask);
 
 	/* Link into the tree and, for numbered sets, into the id list. */
 	mtx_lock_spin(&cpuset_lock);
 	CPU_AND(&set->cs_mask, &parent->cs_mask);
 	set->cs_id = id;
 	set->cs_parent = cpuset_ref(parent);
 	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
 	if (id != CPUSET_INVALID)
 		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 
 	return (0);
 }
 
 /*
  * Create a new numbered set under 'parent' with the requested mask.
  *
  * On success the set is stored in *setp holding a single reference and 0 is
  * returned.  On failure *setp is left untouched, so the caller never sees a
  * pointer to the storage that was freed here.  Fails with ENFILE when no id
  * can be allocated, or with the error from _cpuset_create() (EDEADLK when
  * the mask does not overlap the parent).
  */
 static int
 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
 {
 	struct cpuset *set;
 	cpusetid_t id;
 	int error;
 
 	id = alloc_unr(cpuset_unr);
 	if (id == -1)
 		return (ENFILE);
 	set = uma_zalloc(cpuset_zone, M_WAITOK);
 	error = _cpuset_create(set, parent, mask, id);
 	if (error != 0) {
 		/* Undo both allocations; nothing was linked on failure. */
 		free_unr(cpuset_unr, id);
 		uma_zfree(cpuset_zone, set);
 		return (error);
 	}
 	*setp = set;
 
 	return (0);
 }
 
 /*
  * Dry run for cpuset_update(): walk the tree rooted at 'set' and report the
  * first problem that applying 'mask' would cause.  Returns EPERM for a set
  * flagged CPU_SET_RDONLY and EDEADLK for a set that would be left without
  * any cpu.  With check_mask == 0 the mask is taken as the new mask of 'set'
  * itself; children are always checked against the intersection.
  * Must be called with cpuset_lock held.
  */
 static int
 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
 {
 	struct cpuset *child;
 	cpuset_t effective;
 	int error;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	if ((set->cs_flags & CPU_SET_RDONLY) != 0)
 		return (EPERM);
 
 	if (!check_mask) {
 		CPU_COPY(mask, &effective);
 	} else {
 		if (!CPU_OVERLAP(&set->cs_mask, mask))
 			return (EDEADLK);
 		CPU_COPY(&set->cs_mask, &effective);
 		CPU_AND(&effective, mask);
 	}
 
 	LIST_FOREACH(child, &set->cs_children, cs_siblings) {
 		error = cpuset_testupdate(child, &effective, 1);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Restrict 'set' to 'mask' and push the resulting mask down to every
  * descendant.  No emptiness or permission checks are made here; callers
  * run cpuset_testupdate() first.  Must be called with cpuset_lock held.
  */
 static void
 cpuset_update(struct cpuset *set, cpuset_t *mask)
 {
 	struct cpuset *child;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	CPU_AND(&set->cs_mask, mask);
 	LIST_FOREACH(child, &set->cs_children, cs_siblings) {
 		cpuset_update(child, &set->cs_mask);
 	}
 }
 
 /*
  * Replace the mask of 'set' with a copy of 'mask' and restrict every set
  * below it accordingly.  The whole subtree is validated first, so either
  * all sets are updated or none is.
  *
  * Returns the priv_check() error when the caller lacks PRIV_SCHED_CPUSET,
  * EPERM when a jailed caller targets a root set (or a read-only set is
  * hit), EINVAL when 'mask' is not contained in the parent's mask, and
  * EDEADLK when a descendant would end up without a cpu.
  */
 static int
 cpuset_modify(struct cpuset *set, cpuset_t *mask)
 {
 	struct cpuset *parent;
 	int error;
 
 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * A jailed caller may change child sets but never the root set
 	 * dedicated to its jail.
 	 */
 	if (jailed(curthread->td_ucred) &&
 	    (set->cs_flags & CPU_SET_ROOT) != 0)
 		return (EPERM);
 
 	/* The new mask has to stay within what the parent offers. */
 	parent = set->cs_parent;
 	if (parent != NULL && !CPU_SUBSET(&parent->cs_mask, mask))
 		return (EINVAL);
 
 	mtx_lock_spin(&cpuset_lock);
 	error = cpuset_testupdate(set, mask, 0);
 	if (error == 0) {
 		CPU_COPY(mask, &set->cs_mask);
 		cpuset_update(set, mask);
 	}
 	mtx_unlock_spin(&cpuset_lock);
 
 	return (error);
 }
 
 /*
  * Resolve the (which, id) pair used by the cpuset system calls.
  *
  * CPU_WHICH_PID / CPU_WHICH_TID: on success *pp is the process, returned
  * locked, and *tdp is the thread (the first thread of the process for
  * WHICH_PID).  Permission is checked with p_cansched().
  *
  * CPU_WHICH_CPUSET / CPU_WHICH_JAIL: on success *setp is the set, returned
  * with a new reference.
  *
  * CPU_WHICH_IRQ / CPU_WHICH_DOMAIN: nothing is looked up; 0 is returned with
  * all three outputs NULL.
  *
  * An id of -1 selects the current process, the current thread, or the base
  * set of the current thread.  Fails with ESRCH when the object does not
  * exist, EINVAL for any other 'which' value, or the p_cansched() error.
  *
  * NOTE(review): CPU_WHICH_INTRHANDLER and CPU_WHICH_ITHREAD are not listed
  * here and therefore get EINVAL, which makes the cases for them in
  * kern_cpuset_getaffinity() unreachable.  Accepting them here would also
  * require kern_cpuset_getid() to reject them -- confirm the intent.
  */
 int
 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
     struct cpuset **setp)
 {
 	struct cpuset *found;
 	struct thread *target;
 	struct prison *pr;
 	struct proc *proc;
 	int error;
 
 	proc = NULL;
 	target = NULL;
 	found = NULL;
 	*pp = NULL;
 	*tdp = NULL;
 	*setp = NULL;
 
 	switch (which) {
 	case CPU_WHICH_PID:
 		if (id != -1) {
 			proc = pfind(id);
 			if (proc == NULL)
 				return (ESRCH);
 		} else {
 			PROC_LOCK(curproc);
 			proc = curproc;
 		}
 		break;
 	case CPU_WHICH_TID:
 		if (id != -1) {
 			target = tdfind(id, -1);
 			if (target == NULL)
 				return (ESRCH);
 			proc = target->td_proc;
 		} else {
 			PROC_LOCK(curproc);
 			proc = curproc;
 			target = curthread;
 		}
 		break;
 	case CPU_WHICH_CPUSET:
 		if (id != -1) {
 			found = cpuset_lookup(id, curthread);
 		} else {
 			thread_lock(curthread);
 			found = cpuset_refbase(curthread->td_cpuset);
 			thread_unlock(curthread);
 		}
 		if (found == NULL)
 			return (ESRCH);
 		*setp = found;
 		return (0);
 	case CPU_WHICH_JAIL:
 		/* Find the set of the prison with the given id. */
 		sx_slock(&allprison_lock);
 		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
 		sx_sunlock(&allprison_lock);
 		if (pr == NULL)
 			return (ESRCH);
 		cpuset_ref(pr->pr_cpuset);
 		*setp = pr->pr_cpuset;
 		/*
 		 * Presumably prison_find_child() hands the prison back with
 		 * pr_mtx held, since nothing here acquires it.
 		 */
 		mtx_unlock(&pr->pr_mtx);
 		return (0);
 	case CPU_WHICH_IRQ:
 	case CPU_WHICH_DOMAIN:
 		return (0);
 	default:
 		return (EINVAL);
 	}
 
 	/* Only the PID and TID cases get here, with 'proc' locked. */
 	error = p_cansched(curthread, proc);
 	if (error != 0) {
 		PROC_UNLOCK(proc);
 		return (error);
 	}
 	if (target == NULL)
 		target = FIRST_THREAD_IN_PROC(proc);
 	*pp = proc;
 	*tdp = target;
 
 	return (0);
 }
 
 /*
  * Build an anonymous set with mask 'mask' in the storage 'fset'.  The new
  * set hangs off the parent of 'set' when 'set' is itself anonymous, and off
  * 'set' otherwise.  Returns EDEADLK when 'mask' is not contained in the
  * chosen parent's mask.
  */
 static int
 cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
 {
 	struct cpuset *base;
 
 	base = (set->cs_id == CPUSET_INVALID) ? set->cs_parent : set;
 	if (CPU_SUBSET(&base->cs_mask, mask))
 		return (_cpuset_create(fset, base, mask, CPUSET_INVALID));
 
 	return (EDEADLK);
 }
 
 /*
  * Handle two cases for replacing the base set or mask of an entire process.
  *
  * 1) Set is non-null and mask is null.  This reparents all anonymous sets
  *    to the provided set and replaces all non-anonymous td_cpusets with the
  *    provided set.
  * 2) Mask is non-null and set is null.  This replaces or creates anonymous
  *    sets for every thread with the existing base as a parent.
  *
  * This is overly complicated because we can't allocate while holding a
  * spinlock and spinlocks must be held while changing and examining thread
  * state.
  *
  * 'pid' is resolved through cpuset_which(CPU_WHICH_PID, ...), so -1 means
  * the current process.  Returns 0 on success, the cpuset_which() error, or
  * EDEADLK when the new set or mask would not fit one of the threads.
  */
 static int
 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
 {
 	struct setlist freelist;
 	struct setlist droplist;
 	struct cpuset *tdset;
 	struct cpuset *nset;
 	struct thread *td;
 	struct proc *p;
 	int threads;
 	int nfree;
 	int error;
 	/*
 	 * The algorithm requires two passes due to locking considerations.
 	 *
 	 * 1) Lookup the process and acquire the locks in the required order.
 	 * 2) If enough cpusets have not been allocated release the locks and
 	 *    allocate them.  Loop.
 	 */
 	LIST_INIT(&freelist);
 	LIST_INIT(&droplist);
 	nfree = 0;
 	for (;;) {
 		/* On success the process comes back locked. */
 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 		if (error)
 			goto out;
 		/* One preallocated set per thread is enough; keep the lock. */
 		if (nfree >= p->p_numthreads)
 			break;
 		threads = p->p_numthreads;
 		PROC_UNLOCK(p);
 		/* Allocate unlocked; the thread count is re-checked above. */
 		for (; nfree < threads; nfree++) {
 			nset = uma_zalloc(cpuset_zone, M_WAITOK);
 			LIST_INSERT_HEAD(&freelist, nset, cs_link);
 		}
 	}
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * Now that the appropriate locks are held and we have enough cpusets,
 	 * make sure the operation will succeed before applying changes.  The
 	 * proc lock prevents td_cpuset from changing between calls.
 	 */
 	error = 0;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		tdset = td->td_cpuset;
 		/*
 		 * Verify that a new mask doesn't specify cpus outside of
 		 * the set the thread is a member of.
 		 */
 		if (mask) {
 			if (tdset->cs_id == CPUSET_INVALID)
 				tdset = tdset->cs_parent;
 			if (!CPU_SUBSET(&tdset->cs_mask, mask))
 				error = EDEADLK;
 		/*
 		 * Verify that a new set won't leave an existing thread
 		 * mask without a cpu to run on.  It can, however, restrict
 		 * the set.
 		 */
 		} else if (tdset->cs_id == CPUSET_INVALID) {
 			if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
 				error = EDEADLK;
 		}
 		thread_unlock(td);
 		if (error)
 			goto unlock_out;
 	}
 	/*
 	 * Replace each thread's cpuset while using deferred release.  We
 	 * must do this because the thread lock must be held while operating
 	 * on the thread and this limits the type of operations allowed.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		/*
 		 * If we presently have an anonymous set or are applying a
 		 * mask we must create an anonymous shadow set.  That is
 		 * either parented to our existing base or the supplied set.
 		 *
 		 * If we have a base set with no anonymous shadow we simply
 		 * replace it outright.
 		 */
 		tdset = td->td_cpuset;
 		if (tdset->cs_id == CPUSET_INVALID || mask) {
 			nset = LIST_FIRST(&freelist);
 			LIST_REMOVE(nset, cs_link);
 			if (mask)
 				error = cpuset_shadow(tdset, nset, mask);
 			else
 				error = _cpuset_create(nset, set,
 				    &tdset->cs_mask, CPUSET_INVALID);
 			if (error) {
 				/* Hand the unused storage back for cleanup. */
 				LIST_INSERT_HEAD(&freelist, nset, cs_link);
 				thread_unlock(td);
 				break;
 			}
 		} else
 			nset = cpuset_ref(set);
 		/* The old set is only queued here; it is freed after unlock. */
 		cpuset_rel_defer(&droplist, tdset);
 		td->td_cpuset = nset;
 		sched_affinity(td);
 		thread_unlock(td);
 	}
 unlock_out:
 	PROC_UNLOCK(p);
 out:
 	/* No locks are held any more: finish deferred releases and cleanup. */
 	while ((nset = LIST_FIRST(&droplist)) != NULL)
 		cpuset_rel_complete(nset);
 	while ((nset = LIST_FIRST(&freelist)) != NULL) {
 		LIST_REMOVE(nset, cs_link);
 		uma_zfree(cpuset_zone, nset);
 	}
 	return (error);
 }
 
 /*
  * Format 'set' into 'buf' as its words in hexadecimal, separated by commas,
  * starting with word 0.  The caller must supply a buffer of at least
  * CPUSETBUFSIZ bytes.  Returns 'buf'.
  */
 char *
 cpusetobj_strprint(char *buf, const cpuset_t *set)
 {
 	char *cursor;
 	size_t word, written, remaining;
 
 	cursor = buf;
 	remaining = CPUSETBUFSIZ;
 
 	/* Every word but the last is followed by a comma. */
 	word = 0;
 	while (word < (_NCPUWORDS - 1)) {
 		written = snprintf(cursor, remaining, "%lx,", set->__bits[word]);
 		cursor += written;
 		remaining -= written;
 		word++;
 	}
 	snprintf(cursor, remaining, "%lx", set->__bits[_NCPUWORDS - 1]);
 
 	return (buf);
 }
 
 /*
  * Parse the comma-separated hexadecimal words in 'buf' into 'set', first
  * word into word 0.  Fewer than _NCPUWORDS words may be given; the missing
  * high words are left zero.  Returns 0 on success and -1 when the string is
  * longer than CPUSETBUFSIZ - 1 characters, holds too many words, or a word
  * cannot be parsed.
  */
 int
 cpusetobj_strscan(cpuset_t *set, const char *buf)
 {
 	const char *p;
 	u_int nwords, word;
 	int ret;
 
 	if (strlen(buf) > CPUSETBUFSIZ - 1)
 		return (-1);
 
 	/* Allow to pass a shorter version of the mask when necessary. */
 	nwords = 1;
 	for (p = buf; *p != '\0'; p++) {
 		if (*p == ',')
 			nwords++;
 	}
 	if (nwords > _NCPUWORDS)
 		return (-1);
 
 	CPU_ZERO(set);
 	for (word = 0; word + 1 < nwords; word++) {
 		ret = sscanf(buf, "%lx,", &set->__bits[word]);
 		if (ret == 0 || ret == -1)
 			return (-1);
 		/* Step past the comma that ends this word. */
 		buf = strstr(buf, ",");
 		if (buf == NULL)
 			return (-1);
 		buf++;
 	}
 	/* 'word' now indexes the last word, which has no trailing comma. */
 	ret = sscanf(buf, "%lx", &set->__bits[word]);
 	if (ret == 0 || ret == -1)
 		return (-1);
 
 	return (0);
 }
 
 /*
  * Give the thread 'id' (-1 for the current thread) a private anonymous set
  * carrying 'mask'.  Returns the cpuset_which() error, or EDEADLK from
  * cpuset_shadow() when the mask does not fit inside the thread's base set.
  */
 int
 cpuset_setthread(lwpid_t id, cpuset_t *mask)
 {
 	struct cpuset *fresh, *old;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	/* Allocate before any lock is taken. */
 	fresh = uma_zalloc(cpuset_zone, M_WAITOK);
 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old);
 	if (error != 0) {
 		uma_zfree(cpuset_zone, fresh);
 		return (error);
 	}
 
 	old = NULL;
 	thread_lock(td);
 	error = cpuset_shadow(td->td_cpuset, fresh, mask);
 	if (error == 0) {
 		old = td->td_cpuset;
 		td->td_cpuset = fresh;
 		sched_affinity(td);
 	}
 	thread_unlock(td);
 	PROC_UNLOCK(p);
 
 	/* Drop the replaced set, or the unused storage, outside the locks. */
 	if (old != NULL)
 		cpuset_rel(old);
 	if (error != 0)
 		uma_zfree(cpuset_zone, fresh);
 
 	return (error);
 }
 
 /*
  * Apply new cpumask to the ithread.
  *
  * 'id' names the thread; 'cpu' is the single cpu to bind it to, or NOCPU to
  * fall back to a shadow of the default set that carries the root mask.
  *
  * Returns 0 on success, ENFILE when no set id can be reserved, the
  * cpuset_which() error when the thread cannot be resolved, or the error
  * from creating the new set(s).
  */
 int
 cpuset_setithread(lwpid_t id, int cpu)
 {
 	struct cpuset *nset, *rset;
 	struct cpuset *parent, *old_set;
 	struct thread *td;
 	struct proc *p;
 	cpusetid_t cs_id;
 	cpuset_t mask;
 	int error;
 
 	nset = uma_zalloc(cpuset_zone, M_WAITOK);
 	rset = uma_zalloc(cpuset_zone, M_WAITOK);
 
 	CPU_ZERO(&mask);
 	if (cpu == NOCPU)
 		CPU_COPY(cpuset_root, &mask);
 	else
 		CPU_SET(cpu, &mask);
 
 	/*
 	 * Reserve the id for a possible new root set before the thread is
 	 * looked up.  This keeps the allocator call outside the proc lock,
 	 * and a failure is reported as ENFILE with no lock to release,
 	 * instead of returning 0 with the proc lock still held.
 	 */
 	cs_id = alloc_unr(cpuset_unr);
 	if (cs_id == CPUSET_INVALID) {
 		error = ENFILE;
 		goto out;
 	}
 
 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set);
 	if (error != 0)
 		goto out;
 
 	/* cpuset_which() returns with PROC_LOCK held. */
 	old_set = td->td_cpuset;
 
 	if (cpu == NOCPU) {
 
 		/*
 		 * roll back to default set. We're not using cpuset_shadow()
 		 * here because we can fail CPU_SUBSET() check. This can happen
 		 * if default set does not contain all CPUs.
 		 */
 		error = _cpuset_create(nset, cpuset_default, &mask,
 		    CPUSET_INVALID);
 
 		goto applyset;
 	}
 
 	if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID &&
 	    old_set->cs_parent->cs_id == 1)) {
 
 		/*
 		 * Current set is either default (1) or
 		 * shadowed version of default set.
 		 *
 		 * Allocate new root set to be able to shadow it
 		 * with any mask.
 		 */
 		error = _cpuset_create(rset, cpuset_zero,
 		    &cpuset_zero->cs_mask, cs_id);
 		if (error != 0) {
 			PROC_UNLOCK(p);
 			goto out;
 		}
 		rset->cs_flags |= CPU_SET_ROOT;
 		parent = rset;
 		/* rset and cs_id are now owned by the tree; don't free them. */
 		rset = NULL;
 		cs_id = CPUSET_INVALID;
 	} else {
 		/* Assume existing set was already allocated by previous call */
 		parent = old_set;
 		old_set = NULL;
 	}
 
 	error = cpuset_shadow(parent, nset, &mask);
 applyset:
 	if (error == 0) {
 		thread_lock(td);
 		td->td_cpuset = nset;
 		sched_affinity(td);
 		thread_unlock(td);
 		nset = NULL;
 	} else
 		old_set = NULL;
 	PROC_UNLOCK(p);
 	if (old_set != NULL)
 		cpuset_rel(old_set);
 out:
 	/* Give back whatever was preallocated but not consumed. */
 	if (nset != NULL)
 		uma_zfree(cpuset_zone, nset);
 	if (rset != NULL)
 		uma_zfree(cpuset_zone, rset);
 	if (cs_id != CPUSET_INVALID)
 		free_unr(cpuset_unr, cs_id);
 	return (error);
 }
 
 
 /*
  * Creates system-wide cpusets and the cpuset for thread0 including two
  * sets:
  * 
  * 0 - The root set which should represent all valid processors in the
  *     system.  It is initially created with a mask of all processors
  *     because we don't know what processors are valid until cpuset_init()
  *     runs.  This set is immutable.
  * 1 - The default set which all processes are a member of until changed.
  *     This allows an administrator to move all threads off of given cpus to
  *     dedicate them to high priority tasks or save power etc.
  */
 struct cpuset *
 cpuset_thread0(void)
 {
 	struct cpuset *set;
 	int error, i;
 
 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
 
 	/*
 	 * Create the root system set for the whole machine.  Doesn't use
 	 * cpuset_create() due to NULL parent.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	CPU_FILL(&set->cs_mask);
 	LIST_INIT(&set->cs_children);
 	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 	set->cs_ref = 1;
 	set->cs_flags = CPU_SET_ROOT;
 	cpuset_zero = set;
 	cpuset_root = &set->cs_mask;
 
 	/*
 	 * Now derive a default, modifiable set from that to give out.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK);
 	error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
 	cpuset_default = set;
 
 	/*
 	 * Initialize the unit allocator. 0 and 1 are allocated above.
 	 */
 	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
 
 	/*
 	 * If MD code has not initialized per-domain cpusets, place all
 	 * CPUs in domain 0.
 	 */
 	for (i = 0; i < MAXMEMDOM; i++)
 		if (!CPU_EMPTY(&cpuset_domain[i]))
 			goto domains_set;
 	CPU_COPY(&all_cpus, &cpuset_domain[0]);
 domains_set:
 
 	return (set);
 }
 
 /*
  * Create a cpuset, which would be cpuset_create() but
  * mark the new 'set' as root.
  *
  * We are not going to reparent the td to it.  Use cpuset_setproc_update_set()
  * for that.
  *
  * In case of no error, returns the set in *setp locked with a reference.
  */
 int
 cpuset_create_root(struct prison *pr, struct cpuset **setp)
 {
 	struct cpuset *set;
 	int error;
 
 	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
 	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
 
 	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
 	if (error)
 		return (error);
 
 	KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
 	    __func__, __LINE__));
 
 	/* Mark the set as root. */
 	set = *setp;
 	set->cs_flags |= CPU_SET_ROOT;
 
 	return (0);
 }
 
 /*
  * Move every thread of process 'p' into 'set' (see case 1 of
  * cpuset_setproc()).  A temporary reference keeps 'set' alive for the
  * duration of the call and is dropped again on success and on failure.
  * Returns 0 or the error from cpuset_setproc().
  */
 int
 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
 {
 	int error;
 
 	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
 
 	cpuset_ref(set);
 	error = cpuset_setproc(p->p_pid, set, NULL);
 	/* Release unconditionally so an error does not leak the reference. */
 	cpuset_rel(set);
 	return (error);
 }
 
 /*
  * This is called once the final set of system cpus is known.  Modifies
  * the root set and all children and mark the root read-only.  
  */
 static void
 cpuset_init(void *arg)
 {
 	cpuset_t mask;
 
 	mask = all_cpus;
 	if (cpuset_modify(cpuset_zero, &mask))
 		panic("Can't set initial cpuset mask.\n");
 	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
 }
 SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_args {
 	cpusetid_t	*setid;
 };
 #endif
 int
 sys_cpuset(struct thread *td, struct cpuset_args *uap)
 {
 	struct cpuset *root;
 	struct cpuset *set;
 	int error;
 
 	thread_lock(td);
 	root = cpuset_refroot(td->td_cpuset);
 	thread_unlock(td);
 	error = cpuset_create(&set, root, &root->cs_mask);
 	cpuset_rel(root);
 	if (error)
 		return (error);
 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
 	if (error == 0)
 		error = cpuset_setproc(-1, set, NULL);
 	cpuset_rel(set);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setid_args {
 	cpuwhich_t	which;
 	id_t		id;
 	cpusetid_t	setid;
 };
 #endif
 int
 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
 {
 
 	return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
 }
 
 /*
  * Move the process 'id' into the numbered set 'setid'.  Only
  * CPU_WHICH_PID is accepted for 'which'; anything else yields EINVAL.
  * Returns ESRCH when the set cannot be found (or is not visible to 'td'),
  * otherwise the result of cpuset_setproc().
  */
 int
 kern_cpuset_setid(struct thread *td, cpuwhich_t which,
     id_t id, cpusetid_t setid)
 {
 	struct cpuset *target;
 	int error;
 
 	/* Presently we only support per-process sets. */
 	if (which != CPU_WHICH_PID)
 		return (EINVAL);
 
 	target = cpuset_lookup(setid, td);
 	if (target == NULL)
 		return (ESRCH);
 
 	error = cpuset_setproc(id, target, NULL);
 	cpuset_rel(target);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getid_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	cpusetid_t	*setid;
 };
 #endif
 int
 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
 {
 
 	return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
 	    uap->setid));
 }
 
 /*
  * Copy out to 'setid' the id of the set selected by (level, which, id).
  *
  * For CPU_WHICH_PID / CPU_WHICH_TID the base set of the thread is used; for
  * CPU_WHICH_CPUSET / CPU_WHICH_JAIL the set found by cpuset_which().  With
  * CPU_LEVEL_ROOT the id of that set's root is reported instead.
  *
  * Returns EINVAL for CPU_LEVEL_WHICH combined with anything other than
  * CPU_WHICH_CPUSET and for CPU_WHICH_IRQ / CPU_WHICH_DOMAIN, the
  * cpuset_which() error when the lookup fails, or the copyout() error.
  */
 int
 kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, cpusetid_t *setid)
 {
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	cpusetid_t tmpid;
 	int error;
 
 	if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
 		return (EINVAL);
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error)
 		return (error);
 	switch (which) {
 	case CPU_WHICH_TID:
 	case CPU_WHICH_PID:
 		/* cpuset_which() left the process locked; take the base set. */
 		thread_lock(ttd);
 		set = cpuset_refbase(ttd->td_cpuset);
 		thread_unlock(ttd);
 		PROC_UNLOCK(p);
 		break;
 	case CPU_WHICH_CPUSET:
 	case CPU_WHICH_JAIL:
 		/* 'set' already carries a reference from cpuset_which(). */
 		break;
 	case CPU_WHICH_IRQ:
 	case CPU_WHICH_DOMAIN:
 		return (EINVAL);
 	}
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 		nset = cpuset_refroot(set);
 		cpuset_rel(set);
 		set = nset;
 		break;
 	case CPU_LEVEL_CPUSET:
 		break;
 	case CPU_LEVEL_WHICH:
 		break;
 	}
 	/* Read the id while the reference is still held. */
 	tmpid = set->cs_id;
 	cpuset_rel(set);
 	/*
 	 * Copy out exactly one cpusetid_t.  Sizing the copy by the id_t
 	 * argument 'id' would copy a different number of bytes than 'tmpid'
 	 * holds whenever the two types differ in size.
 	 */
 	error = copyout(&tmpid, setid, sizeof(tmpid));
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getaffinity_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		cpusetsize;
 	cpuset_t	*mask;
 };
 #endif
 int
 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
 {
 
 	return (kern_cpuset_getaffinity(td, uap->level, uap->which,
 	    uap->id, uap->cpusetsize, uap->mask));
 }
 
 /*
  * Copy out to 'maskp' the cpu mask selected by (level, which, id).
  *
  * 'cpusetsize' is the size of the user buffer; it must be at least
  * sizeof(cpuset_t) and at most CPU_MAXSIZE / NBBY, else ERANGE is returned.
  * The whole buffer is written, with the bytes past sizeof(cpuset_t) zero.
  *
  * CPU_LEVEL_ROOT / CPU_LEVEL_CPUSET report the mask of the root or base set
  * of the selected thread, process, set or jail.  CPU_LEVEL_WHICH reports the
  * mask of the object itself: the thread's set, the union over all threads
  * of a process, the set or jail set, the interrupt, or the memory domain.
  *
  * NOTE(review): cpuset_which() as written returns EINVAL for
  * CPU_WHICH_INTRHANDLER and CPU_WHICH_ITHREAD, so the cases for those two
  * values below look unreachable -- confirm against cpuset_which().
  */
 int
 kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t cpusetsize, cpuset_t *maskp)
 {
 	struct thread *ttd;
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct proc *p;
 	cpuset_t *mask;
 	int error;
 	size_t size;
 
 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 		return (ERANGE);
 	size = cpusetsize;
 	/* Zeroed so that bytes beyond sizeof(cpuset_t) are copied out as 0. */
 	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error)
 		goto out;
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
+		case CPU_WHICH_INTRHANDLER:
+		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			/* These have no root/base set; p and set are NULL here. */
 			error = EINVAL;
 			goto out;
 		}
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		CPU_COPY(&nset->cs_mask, mask);
 		cpuset_rel(nset);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			thread_lock(ttd);
 			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_PID:
 			/* Union of the masks of every thread in the process. */
 			FOREACH_THREAD_IN_PROC(p, ttd) {
 				thread_lock(ttd);
 				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
 				thread_unlock(ttd);
 			}
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			CPU_COPY(&set->cs_mask, mask);
 			break;
 		case CPU_WHICH_IRQ:
-			error = intr_getaffinity(id, mask);
+		case CPU_WHICH_INTRHANDLER:
+		case CPU_WHICH_ITHREAD:
+			error = intr_getaffinity(id, which, mask);
 			break;
 		case CPU_WHICH_DOMAIN:
 			if (id < 0 || id >= MAXMEMDOM)
 				error = ESRCH;
 			else
 				CPU_COPY(&cpuset_domain[id], mask);
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/* Release whatever cpuset_which() or the code above acquired. */
 	if (set)
 		cpuset_rel(set);
 	if (p)
 		PROC_UNLOCK(p);
 	if (error == 0)
 		error = copyout(mask, maskp, size);
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setaffinity_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		cpusetsize;
 	const cpuset_t	*mask;
 };
 #endif
 int
 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
 {
 
 	return (kern_cpuset_setaffinity(td, uap->level, uap->which,
 	    uap->id, uap->cpusetsize, uap->mask));
 }
 
 /*
  * Apply the user-supplied mask 'maskp' to the object selected by
  * (level, which, id).
  *
  * 'cpusetsize' is the size of the user mask; it must be at least
  * sizeof(cpuset_t) and at most CPU_MAXSIZE / NBBY, else ERANGE is returned.
  * Any bit set beyond sizeof(cpuset_t) makes the call fail with EINVAL.
  *
  * CPU_LEVEL_ROOT / CPU_LEVEL_CPUSET modify the root or base set of the
  * selected thread, process, set or jail through cpuset_modify().
  * CPU_LEVEL_WHICH changes the object itself: a thread's anonymous mask, the
  * masks of all threads of a process, a set or jail set, or an interrupt.
  */
 int
 kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t cpusetsize, const cpuset_t *maskp)
 {
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	cpuset_t *mask;
 	int error;
 
 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 		return (ERANGE);
 	mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
 	error = copyin(maskp, mask, cpusetsize);
 	if (error)
 		goto out;
 	/*
 	 * Verify that no high bits are set.
 	 */
 	if (cpusetsize > sizeof(cpuset_t)) {
 		char *end;
 		char *cp;
 
 		/* Scan the bytes between sizeof(cpuset_t) and cpusetsize. */
 		end = cp = (char *)&mask->__bits;
 		end += cpusetsize;
 		cp += sizeof(cpuset_t);
 		while (cp != end)
 			if (*cp++ != 0) {
 				error = EINVAL;
 				goto out;
 			}
 
 	}
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		error = cpuset_which(which, id, &p, &ttd, &set);
 		if (error)
 			break;
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			/* Pin the thread's set, then drop the proc lock. */
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			PROC_UNLOCK(p);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
+		case CPU_WHICH_INTRHANDLER:
+		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			/* These have no root/base set to modify. */
 			error = EINVAL;
 			goto out;
 		}
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		error = cpuset_modify(nset, mask);
 		cpuset_rel(nset);
 		cpuset_rel(set);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			error = cpuset_setthread(id, mask);
 			break;
 		case CPU_WHICH_PID:
 			error = cpuset_setproc(id, NULL, mask);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			error = cpuset_which(which, id, &p, &ttd, &set);
 			if (error == 0) {
 				error = cpuset_modify(set, mask);
 				cpuset_rel(set);
 			}
 			break;
 		case CPU_WHICH_IRQ:
-			error = intr_setaffinity(id, mask);
+		case CPU_WHICH_INTRHANDLER:
+		case CPU_WHICH_ITHREAD:
+			/* Handed to intr_setaffinity() without cpuset_which(). */
+			error = intr_setaffinity(id, which, mask);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifdef DDB
 /*
  * Print the cpus contained in 'set' on the debugger console as a
  * comma-separated list of numbers, or "<none>" for an empty set.
  */
 void
 ddb_display_cpuset(const cpuset_t *set)
 {
 	int cpu, printed;
 
 	printed = 0;
 	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
 		if (!CPU_ISSET(cpu, set))
 			continue;
 		/* Only entries after the first are preceded by a comma. */
 		if (printed == 0)
 			db_printf("%d", cpu);
 		else
 			db_printf(",%d", cpu);
 		printed = 1;
 	}
 	if (printed == 0)
 		db_printf("<none>");
 }
 
 /*
  * Debugger command that prints every numbered set: address, id, reference
  * count, flags, parent id (0 when there is no parent) and mask.  Stops early
  * when the pager is quit.
  */
 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
 {
 	struct cpuset *cur;
 	int parent_id;
 
 	LIST_FOREACH(cur, &cpuset_ids, cs_link) {
 		parent_id = 0;
 		if (cur->cs_parent != NULL)
 			parent_id = cur->cs_parent->cs_id;
 		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
 		    cur, cur->cs_id, cur->cs_ref, cur->cs_flags, parent_id);
 		db_printf("  mask=");
 		ddb_display_cpuset(&cur->cs_mask);
 		db_printf("\n");
 		if (db_pager_quit)
 			break;
 	}
 }
 #endif /* DDB */
Index: head/sys/kern/kern_intr.c
===================================================================
--- head/sys/kern/kern_intr.c	(revision 317755)
+++ head/sys/kern/kern_intr.c	(revision 317756)
@@ -1,1934 +1,2008 @@
 /*-
  * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_kstack_usage_prof.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/cpuset.h>
 #include <sys/rtprio.h>
 #include <sys/systm.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/random.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 #include <sys/vmmeter.h>
 #include <machine/atomic.h>
 #include <machine/cpu.h>
 #include <machine/md_var.h>
 #include <machine/stdarg.h>
 #ifdef DDB
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 /*
  * Describe an interrupt thread.  There is one of these per interrupt event.
  */
 struct intr_thread {
 	struct intr_event *it_event;
 	struct thread *it_thread;	/* Kernel thread. */
 	int	it_flags;		/* (j) IT_* flags. */
 	int	it_need;		/* Needs service. */
 };
 
 /* Interrupt thread flags kept in it_flags */
 #define	IT_DEAD		0x000001	/* Thread is waiting to exit. */
 #define	IT_WAIT		0x000002	/* Thread is waiting for completion. */
 
 struct	intr_entropy {
 	struct	thread *td;
 	uintptr_t event;
 };
 
 struct	intr_event *clk_intr_event;
 struct	intr_event *tty_intr_event;
 void	*vm_ih;
 struct proc *intrproc;
 
 static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads");
 
 static int intr_storm_threshold = 1000;
 SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RWTUN,
     &intr_storm_threshold, 0,
     "Number of consecutive interrupts before storm protection is enabled");
 static TAILQ_HEAD(, intr_event) event_list =
     TAILQ_HEAD_INITIALIZER(event_list);
 static struct mtx event_lock;
 MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF);
 
 static void	intr_event_update(struct intr_event *ie);
 #ifdef INTR_FILTER
 static int	intr_event_schedule_thread(struct intr_event *ie,
 		    struct intr_thread *ithd);
 static int	intr_filter_loop(struct intr_event *ie,
 		    struct trapframe *frame, struct intr_thread **ithd);
 static struct intr_thread *ithread_create(const char *name,
 			      struct intr_handler *ih);
 #else
 static int	intr_event_schedule_thread(struct intr_event *ie);
 static struct intr_thread *ithread_create(const char *name);
 #endif
 static void	ithread_destroy(struct intr_thread *ithread);
 static void	ithread_execute_handlers(struct proc *p, 
 		    struct intr_event *ie);
 #ifdef INTR_FILTER
 static void	priv_ithread_execute_handler(struct proc *p, 
 		    struct intr_handler *ih);
 #endif
 static void	ithread_loop(void *);
 static void	ithread_update(struct intr_thread *ithd);
 static void	start_softintr(void *);
 
 /* Map an interrupt type to an ithread priority. */
 u_char
 intr_priority(enum intr_type flags)
 {
 	u_char pri;
 
 	flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET |
 	    INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK | INTR_TYPE_AV);
 	switch (flags) {
 	case INTR_TYPE_TTY:
 		pri = PI_TTY;
 		break;
 	case INTR_TYPE_BIO:
 		pri = PI_DISK;
 		break;
 	case INTR_TYPE_NET:
 		pri = PI_NET;
 		break;
 	case INTR_TYPE_CAM:
 		pri = PI_DISK;
 		break;
 	case INTR_TYPE_AV:
 		pri = PI_AV;
 		break;
 	case INTR_TYPE_CLK:
 		pri = PI_REALTIME;
 		break;
 	case INTR_TYPE_MISC:
 		pri = PI_DULL;          /* don't care */
 		break;
 	default:
 		/* We didn't specify an interrupt level. */
 		panic("intr_priority: no interrupt type in flags");
 	}
 
 	return pri;
 }
 
 /*
  * Update an ithread based on the associated intr_event.
  */
 static void
 ithread_update(struct intr_thread *ithd)
 {
 	struct intr_event *ie;
 	struct thread *td;
 	u_char pri;
 
 	ie = ithd->it_event;
 	td = ithd->it_thread;
 
 	/* Determine the overall priority of this event. */
 	if (TAILQ_EMPTY(&ie->ie_handlers))
 		pri = PRI_MAX_ITHD;
 	else
 		pri = TAILQ_FIRST(&ie->ie_handlers)->ih_pri;
 
 	/* Update name and priority. */
 	strlcpy(td->td_name, ie->ie_fullname, sizeof(td->td_name));
 #ifdef KTR
 	sched_clear_tdname(td);
 #endif
 	thread_lock(td);
 	sched_prio(td, pri);
 	thread_unlock(td);
 }
 
 /*
  * Regenerate the full name of an interrupt event and update its priority.
  */
 static void
 intr_event_update(struct intr_event *ie)
 {
 	struct intr_handler *ih;
 	char *last;
 	int missed, space;
 
 	/* Start off with no entropy and just the name of the event. */
 	mtx_assert(&ie->ie_lock, MA_OWNED);
 	strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
 	ie->ie_flags &= ~IE_ENTROPY;
 	missed = 0;
 	space = 1;
 
 	/* Run through all the handlers updating values. */
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (strlen(ie->ie_fullname) + strlen(ih->ih_name) + 1 <
 		    sizeof(ie->ie_fullname)) {
 			strcat(ie->ie_fullname, " ");
 			strcat(ie->ie_fullname, ih->ih_name);
 			space = 0;
 		} else
 			missed++;
 		if (ih->ih_flags & IH_ENTROPY)
 			ie->ie_flags |= IE_ENTROPY;
 	}
 
 	/*
 	 * If the handler names were too long, add +'s to indicate missing
 	 * names. If we run out of room and still have +'s to add, change
 	 * the last character from a + to a *.
 	 */
 	last = &ie->ie_fullname[sizeof(ie->ie_fullname) - 2];
 	while (missed-- > 0) {
 		if (strlen(ie->ie_fullname) + 1 == sizeof(ie->ie_fullname)) {
 			if (*last == '+') {
 				*last = '*';
 				break;
 			} else
 				*last = '+';
 		} else if (space) {
 			strcat(ie->ie_fullname, " +");
 			space = 0;
 		} else
 			strcat(ie->ie_fullname, "+");
 	}
 
 	/*
 	 * If this event has an ithread, update it's priority and
 	 * name.
 	 */
 	if (ie->ie_thread != NULL)
 		ithread_update(ie->ie_thread);
 	CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname);
 }
 
 int
 intr_event_create(struct intr_event **event, void *source, int flags, int irq,
     void (*pre_ithread)(void *), void (*post_ithread)(void *),
     void (*post_filter)(void *), int (*assign_cpu)(void *, int),
     const char *fmt, ...)
 {
 	struct intr_event *ie;
 	va_list ap;
 
 	/* The only valid flag during creation is IE_SOFT. */
 	if ((flags & ~IE_SOFT) != 0)
 		return (EINVAL);
 	ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO);
 	ie->ie_source = source;
 	ie->ie_pre_ithread = pre_ithread;
 	ie->ie_post_ithread = post_ithread;
 	ie->ie_post_filter = post_filter;
 	ie->ie_assign_cpu = assign_cpu;
 	ie->ie_flags = flags;
 	ie->ie_irq = irq;
 	ie->ie_cpu = NOCPU;
 	TAILQ_INIT(&ie->ie_handlers);
 	mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF);
 
 	va_start(ap, fmt);
 	vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap);
 	va_end(ap);
 	strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
 	mtx_lock(&event_lock);
 	TAILQ_INSERT_TAIL(&event_list, ie, ie_list);
 	mtx_unlock(&event_lock);
 	if (event != NULL)
 		*event = ie;
 	CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
 	return (0);
 }
 
 /*
  * Bind an interrupt event to the specified CPU.  Note that not all
  * platforms support binding an interrupt to a CPU.  For those
- * platforms this request will fail.  For supported platforms, any
- * associated ithreads as well as the primary interrupt context will
- * be bound to the specificed CPU.  Using a cpu id of NOCPU unbinds
+ * platforms this request will fail.  Using a cpu id of NOCPU unbinds
  * the interrupt event.
  */
-int
-intr_event_bind(struct intr_event *ie, int cpu)
+static int
+_intr_event_bind(struct intr_event *ie, int cpu, bool bindirq, bool bindithread)
 {
 	lwpid_t id;
 	int error;
 
 	/* Need a CPU to bind to. */
 	if (cpu != NOCPU && CPU_ABSENT(cpu))
 		return (EINVAL);
 
 	if (ie->ie_assign_cpu == NULL)
 		return (EOPNOTSUPP);
 
 	error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR);
 	if (error)
 		return (error);
 
 	/*
 	 * If we have any ithreads try to set their mask first to verify
 	 * permissions, etc.
 	 */
-	mtx_lock(&ie->ie_lock);
-	if (ie->ie_thread != NULL) {
-		id = ie->ie_thread->it_thread->td_tid;
-		mtx_unlock(&ie->ie_lock);
-		error = cpuset_setithread(id, cpu);
-		if (error)
-			return (error);
-	} else
-		mtx_unlock(&ie->ie_lock);
-	error = ie->ie_assign_cpu(ie->ie_source, cpu);
-	if (error) {
+	if (bindithread) {
 		mtx_lock(&ie->ie_lock);
 		if (ie->ie_thread != NULL) {
-			cpu = ie->ie_cpu;
 			id = ie->ie_thread->it_thread->td_tid;
 			mtx_unlock(&ie->ie_lock);
-			(void)cpuset_setithread(id, cpu);
+			error = cpuset_setithread(id, cpu);
+			if (error)
+				return (error);
 		} else
 			mtx_unlock(&ie->ie_lock);
+	}
+	if (bindirq)
+		error = ie->ie_assign_cpu(ie->ie_source, cpu);
+	if (error) {
+		if (bindithread) {
+			mtx_lock(&ie->ie_lock);
+			if (ie->ie_thread != NULL) {
+				cpu = ie->ie_cpu;
+				id = ie->ie_thread->it_thread->td_tid;
+				mtx_unlock(&ie->ie_lock);
+				(void)cpuset_setithread(id, cpu);
+			} else
+				mtx_unlock(&ie->ie_lock);
+		}
 		return (error);
 	}
 
-	mtx_lock(&ie->ie_lock);
-	ie->ie_cpu = cpu;
-	mtx_unlock(&ie->ie_lock);
+	if (bindirq) {
+		mtx_lock(&ie->ie_lock);
+		ie->ie_cpu = cpu;
+		mtx_unlock(&ie->ie_lock);
+	}
 
 	return (error);
 }
 
+/*
+ * Bind an interrupt event to the specified CPU.  For supported platforms, any
+ * associated ithreads as well as the primary interrupt context will be bound
+ * to the specified CPU.
+ */
+int
+intr_event_bind(struct intr_event *ie, int cpu)
+{
+
+	return (_intr_event_bind(ie, cpu, true, true));
+}
+
+/*
+ * Bind an interrupt event to the specified CPU, but do not bind associated
+ * ithreads.
+ */
+int
+intr_event_bind_irqonly(struct intr_event *ie, int cpu)
+{
+
+	return (_intr_event_bind(ie, cpu, true, false));
+}
+
+/*
+ * Bind an interrupt event's ithread to the specified CPU.
+ */
+int
+intr_event_bind_ithread(struct intr_event *ie, int cpu)
+{
+
+	return (_intr_event_bind(ie, cpu, false, true));
+}
+
 static struct intr_event *
 intr_lookup(int irq)
 {
 	struct intr_event *ie;
 
 	mtx_lock(&event_lock);
 	TAILQ_FOREACH(ie, &event_list, ie_list)
 		if (ie->ie_irq == irq &&
 		    (ie->ie_flags & IE_SOFT) == 0 &&
 		    TAILQ_FIRST(&ie->ie_handlers) != NULL)
 			break;
 	mtx_unlock(&event_lock);
 	return (ie);
 }
 
 int
-intr_setaffinity(int irq, void *m)
+intr_setaffinity(int irq, int mode, void *m)
 {
 	struct intr_event *ie;
 	cpuset_t *mask;
 	int cpu, n;
 
 	mask = m;
 	cpu = NOCPU;
 	/*
 	 * If we're setting all cpus we can unbind.  Otherwise make sure
 	 * only one cpu is in the set.
 	 */
 	if (CPU_CMP(cpuset_root, mask)) {
 		for (n = 0; n < CPU_SETSIZE; n++) {
 			if (!CPU_ISSET(n, mask))
 				continue;
 			if (cpu != NOCPU)
 				return (EINVAL);
 			cpu = n;
 		}
 	}
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return (ESRCH);
-	return (intr_event_bind(ie, cpu));
+	switch (mode) {
+	case CPU_WHICH_IRQ:
+		return (intr_event_bind(ie, cpu));
+	case CPU_WHICH_INTRHANDLER:
+		return (intr_event_bind_irqonly(ie, cpu));
+	case CPU_WHICH_ITHREAD:
+		return (intr_event_bind_ithread(ie, cpu));
+	default:
+		return (EINVAL);
+	}
 }
 
 int
-intr_getaffinity(int irq, void *m)
+intr_getaffinity(int irq, int mode, void *m)
 {
 	struct intr_event *ie;
+	struct thread *td;
+	struct proc *p;
 	cpuset_t *mask;
+	lwpid_t id;
+	int error;
 
 	mask = m;
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return (ESRCH);
+
 	CPU_ZERO(mask);
-	mtx_lock(&ie->ie_lock);
-	if (ie->ie_cpu == NOCPU)
-		CPU_COPY(cpuset_root, mask);
-	else
-		CPU_SET(ie->ie_cpu, mask);
-	mtx_unlock(&ie->ie_lock);
+	switch (mode) {
+	case CPU_WHICH_IRQ:
+	case CPU_WHICH_INTRHANDLER:
+		mtx_lock(&ie->ie_lock);
+		if (ie->ie_cpu == NOCPU)
+			CPU_COPY(cpuset_root, mask);
+		else
+			CPU_SET(ie->ie_cpu, mask);
+		mtx_unlock(&ie->ie_lock);
+		break;
+	case CPU_WHICH_ITHREAD:
+		mtx_lock(&ie->ie_lock);
+		if (ie->ie_thread == NULL) {
+			mtx_unlock(&ie->ie_lock);
+			CPU_COPY(cpuset_root, mask);
+		} else {
+			id = ie->ie_thread->it_thread->td_tid;
+			mtx_unlock(&ie->ie_lock);
+			error = cpuset_which(CPU_WHICH_TID, id, &p, &td, NULL);
+			if (error != 0)
+				return (error);
+			CPU_COPY(&td->td_cpuset->cs_mask, mask);
+			PROC_UNLOCK(p);
+		}
+		break;	/* Success: do not fall into the EINVAL default. */
+	default:
+		return (EINVAL);
+	}
 	return (0);
 }
 
 int
 intr_event_destroy(struct intr_event *ie)
 {
 
 	mtx_lock(&event_lock);
 	mtx_lock(&ie->ie_lock);
 	if (!TAILQ_EMPTY(&ie->ie_handlers)) {
 		mtx_unlock(&ie->ie_lock);
 		mtx_unlock(&event_lock);
 		return (EBUSY);
 	}
 	TAILQ_REMOVE(&event_list, ie, ie_list);
 #ifndef notyet
 	if (ie->ie_thread != NULL) {
 		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
 	mtx_unlock(&ie->ie_lock);
 	mtx_unlock(&event_lock);
 	mtx_destroy(&ie->ie_lock);
 	free(ie, M_ITHREAD);
 	return (0);
 }
 
 #ifndef INTR_FILTER
 static struct intr_thread *
 ithread_create(const char *name)
 {
 	struct intr_thread *ithd;
 	struct thread *td;
 	int error;
 
 	ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
 
 	error = kproc_kthread_add(ithread_loop, ithd, &intrproc,
 		    &td, RFSTOPPED | RFHIGHPID,
 	    	    0, "intr", "%s", name);
 	if (error)
 		panic("kproc_create() failed with %d", error);
 	thread_lock(td);
 	sched_class(td, PRI_ITHD);
 	TD_SET_IWAIT(td);
 	thread_unlock(td);
 	td->td_pflags |= TDP_ITHREAD;
 	ithd->it_thread = td;
 	CTR2(KTR_INTR, "%s: created %s", __func__, name);
 	return (ithd);
 }
 #else
 static struct intr_thread *
 ithread_create(const char *name, struct intr_handler *ih)
 {
 	struct intr_thread *ithd;
 	struct thread *td;
 	int error;
 
 	ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
 
 	error = kproc_kthread_add(ithread_loop, ih, &intrproc,
 		    &td, RFSTOPPED | RFHIGHPID,
 	    	    0, "intr", "%s", name);
 	if (error)
 		panic("kproc_create() failed with %d", error);
 	thread_lock(td);
 	sched_class(td, PRI_ITHD);
 	TD_SET_IWAIT(td);
 	thread_unlock(td);
 	td->td_pflags |= TDP_ITHREAD;
 	ithd->it_thread = td;
 	CTR2(KTR_INTR, "%s: created %s", __func__, name);
 	return (ithd);
 }
 #endif
 
 static void
 ithread_destroy(struct intr_thread *ithread)
 {
 	struct thread *td;
 
 	CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name);
 	td = ithread->it_thread;
 	thread_lock(td);
 	ithread->it_flags |= IT_DEAD;
 	if (TD_AWAITING_INTR(td)) {
 		TD_CLR_IWAIT(td);
 		sched_add(td, SRQ_INTR);
 	}
 	thread_unlock(td);
 }
 
 #ifndef INTR_FILTER
 int
 intr_event_add_handler(struct intr_event *ie, const char *name,
     driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
     enum intr_type flags, void **cookiep)
 {
 	struct intr_handler *ih, *temp_ih;
 	struct intr_thread *it;
 
 	if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
 		return (EINVAL);
 
 	/* Allocate and populate an interrupt handler structure. */
 	ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
 	ih->ih_filter = filter;
 	ih->ih_handler = handler;
 	ih->ih_argument = arg;
 	strlcpy(ih->ih_name, name, sizeof(ih->ih_name));
 	ih->ih_event = ie;
 	ih->ih_pri = pri;
 	if (flags & INTR_EXCL)
 		ih->ih_flags = IH_EXCLUSIVE;
 	if (flags & INTR_MPSAFE)
 		ih->ih_flags |= IH_MPSAFE;
 	if (flags & INTR_ENTROPY)
 		ih->ih_flags |= IH_ENTROPY;
 
 	/* We can only have one exclusive handler in a event. */
 	mtx_lock(&ie->ie_lock);
 	if (!TAILQ_EMPTY(&ie->ie_handlers)) {
 		if ((flags & INTR_EXCL) ||
 		    (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
 			mtx_unlock(&ie->ie_lock);
 			free(ih, M_ITHREAD);
 			return (EINVAL);
 		}
 	}
 
 	/* Create a thread if we need one. */
 	while (ie->ie_thread == NULL && handler != NULL) {
 		if (ie->ie_flags & IE_ADDING_THREAD)
 			msleep(ie, &ie->ie_lock, 0, "ithread", 0);
 		else {
 			ie->ie_flags |= IE_ADDING_THREAD;
 			mtx_unlock(&ie->ie_lock);
 			it = ithread_create("intr: newborn");
 			mtx_lock(&ie->ie_lock);
 			ie->ie_flags &= ~IE_ADDING_THREAD;
 			ie->ie_thread = it;
 			it->it_event = ie;
 			ithread_update(it);
 			wakeup(ie);
 		}
 	}
 
 	/* Add the new handler to the event in priority order. */
 	TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
 		if (temp_ih->ih_pri > ih->ih_pri)
 			break;
 	}
 	if (temp_ih == NULL)
 		TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
 	else
 		TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
 	intr_event_update(ie);
 
 	CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
 	    ie->ie_name);
 	mtx_unlock(&ie->ie_lock);
 
 	if (cookiep != NULL)
 		*cookiep = ih;
 	return (0);
 }
 #else
 int
 intr_event_add_handler(struct intr_event *ie, const char *name,
     driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
     enum intr_type flags, void **cookiep)
 {
 	struct intr_handler *ih, *temp_ih;
 	struct intr_thread *it;
 
 	if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
 		return (EINVAL);
 
 	/* Allocate and populate an interrupt handler structure. */
 	ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
 	ih->ih_filter = filter;
 	ih->ih_handler = handler;
 	ih->ih_argument = arg;
 	strlcpy(ih->ih_name, name, sizeof(ih->ih_name));
 	ih->ih_event = ie;
 	ih->ih_pri = pri;
 	if (flags & INTR_EXCL)
 		ih->ih_flags = IH_EXCLUSIVE;
 	if (flags & INTR_MPSAFE)
 		ih->ih_flags |= IH_MPSAFE;
 	if (flags & INTR_ENTROPY)
 		ih->ih_flags |= IH_ENTROPY;
 
 	/* We can only have one exclusive handler in a event. */
 	mtx_lock(&ie->ie_lock);
 	if (!TAILQ_EMPTY(&ie->ie_handlers)) {
 		if ((flags & INTR_EXCL) ||
 		    (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
 			mtx_unlock(&ie->ie_lock);
 			free(ih, M_ITHREAD);
 			return (EINVAL);
 		}
 	}
 
 	/* For filtered handlers, create a private ithread to run on. */
 	if (filter != NULL && handler != NULL) {
 		mtx_unlock(&ie->ie_lock);
 		it = ithread_create("intr: newborn", ih);
 		mtx_lock(&ie->ie_lock);
 		it->it_event = ie;
 		ih->ih_thread = it;
 		ithread_update(it); /* XXX - do we really need this?!?!? */
 	} else { /* Create the global per-event thread if we need one. */
 		while (ie->ie_thread == NULL && handler != NULL) {
 			if (ie->ie_flags & IE_ADDING_THREAD)
 				msleep(ie, &ie->ie_lock, 0, "ithread", 0);
 			else {
 				ie->ie_flags |= IE_ADDING_THREAD;
 				mtx_unlock(&ie->ie_lock);
 				it = ithread_create("intr: newborn", ih);
 				mtx_lock(&ie->ie_lock);
 				ie->ie_flags &= ~IE_ADDING_THREAD;
 				ie->ie_thread = it;
 				it->it_event = ie;
 				ithread_update(it);
 				wakeup(ie);
 			}
 		}
 	}
 
 	/* Add the new handler to the event in priority order. */
 	TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
 		if (temp_ih->ih_pri > ih->ih_pri)
 			break;
 	}
 	if (temp_ih == NULL)
 		TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
 	else
 		TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
 	intr_event_update(ie);
 
 	CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
 	    ie->ie_name);
 	mtx_unlock(&ie->ie_lock);
 
 	if (cookiep != NULL)
 		*cookiep = ih;
 	return (0);
 }
 #endif
 
 /*
  * Append a description preceded by a ':' to the name of the specified
  * interrupt handler.
  */
 int
 intr_event_describe_handler(struct intr_event *ie, void *cookie,
     const char *descr)
 {
 	struct intr_handler *ih;
 	size_t space;
 	char *start;
 
 	mtx_lock(&ie->ie_lock);
 #ifdef INVARIANTS
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (ih == cookie)
 			break;
 	}
 	if (ih == NULL) {
 		mtx_unlock(&ie->ie_lock);
 		panic("handler %p not found in interrupt event %p", cookie, ie);
 	}
 #endif
 	ih = cookie;
 
 	/*
 	 * Look for an existing description by checking for an
 	 * existing ":".  This assumes device names do not include
 	 * colons.  If one is found, prepare to insert the new
 	 * description at that point.  If one is not found, find the
 	 * end of the name to use as the insertion point.
 	 */
 	start = strchr(ih->ih_name, ':');
 	if (start == NULL)
 		start = strchr(ih->ih_name, 0);
 
 	/*
 	 * See if there is enough remaining room in the string for the
 	 * description + ":".  The "- 1" leaves room for the trailing
 	 * '\0'.  The "+ 1" accounts for the colon.
 	 */
 	space = sizeof(ih->ih_name) - (start - ih->ih_name) - 1;
 	if (strlen(descr) + 1 > space) {
 		mtx_unlock(&ie->ie_lock);
 		return (ENOSPC);
 	}
 
 	/* Append a colon followed by the description. */
 	*start = ':';
 	strcpy(start + 1, descr);
 	intr_event_update(ie);
 	mtx_unlock(&ie->ie_lock);
 	return (0);
 }
 
 /*
  * Return the ie_source field from the intr_event an intr_handler is
  * associated with.
  */
 void *
 intr_handler_source(void *cookie)
 {
 	struct intr_handler *ih;
 	struct intr_event *ie;
 
 	ih = (struct intr_handler *)cookie;
 	if (ih == NULL)
 		return (NULL);
 	ie = ih->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    ih->ih_name));
 	return (ie->ie_source);
 }
 
 /*
  * Sleep until an ithread finishes executing an interrupt handler.
  *
  * XXX Doesn't currently handle interrupt filters or fast interrupt
  * handlers.  This is intended for compatibility with linux drivers
  * only.  Do not use in BSD code.
  */
 void
 _intr_drain(int irq)
 {
 	struct intr_event *ie;
 	struct intr_thread *ithd;
 	struct thread *td;
 
 	ie = intr_lookup(irq);
 	if (ie == NULL)
 		return;
 	if (ie->ie_thread == NULL)
 		return;
 	ithd = ie->ie_thread;
 	td = ithd->it_thread;
 	/*
 	 * We set the flag and wait for it to be cleared to avoid
 	 * long delays with potentially busy interrupt handlers
 	 * were we to only sample TD_AWAITING_INTR() every tick.
 	 */
 	thread_lock(td);
 	if (!TD_AWAITING_INTR(td)) {
 		ithd->it_flags |= IT_WAIT;
 		while (ithd->it_flags & IT_WAIT) {
 			thread_unlock(td);
 			pause("idrain", 1);
 			thread_lock(td);
 		}
 	}
 	thread_unlock(td);
 	return;
 }
 
 
 #ifndef INTR_FILTER
 int
 intr_event_remove_handler(void *cookie)
 {
 	struct intr_handler *handler = (struct intr_handler *)cookie;
 	struct intr_event *ie;
 #ifdef INVARIANTS
 	struct intr_handler *ih;
 #endif
 #ifdef notyet
 	int dead;
 #endif
 
 	if (handler == NULL)
 		return (EINVAL);
 	ie = handler->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    handler->ih_name));
 	mtx_lock(&ie->ie_lock);
 	CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
 	    ie->ie_name);
 #ifdef INVARIANTS
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
 		if (ih == handler)
 			goto ok;
 	mtx_unlock(&ie->ie_lock);
 	panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
-	    ih->ih_name, ie->ie_name);
+	    handler->ih_name, ie->ie_name);
 ok:
 #endif
 	/*
 	 * If there is no ithread, then just remove the handler and return.
 	 * XXX: Note that an INTR_FAST handler might be running on another
 	 * CPU!
 	 */
 	if (ie->ie_thread == NULL) {
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 		mtx_unlock(&ie->ie_lock);
 		free(handler, M_ITHREAD);
 		return (0);
 	}
 
 	/*
 	 * If the interrupt thread is already running, then just mark this
 	 * handler as being dead and let the ithread do the actual removal.
 	 *
 	 * During a cold boot while cold is set, msleep() does not sleep,
 	 * so we have to remove the handler here rather than letting the
 	 * thread do it.
 	 */
 	thread_lock(ie->ie_thread->it_thread);
 	if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) {
 		handler->ih_flags |= IH_DEAD;
 
 		/*
 		 * Ensure that the thread will process the handler list
 		 * again and remove this handler if it has already passed
 		 * it on the list.
 		 *
 		 * The release part of the following store ensures
 		 * that the update of ih_flags is ordered before the
 		 * it_need setting.  See the comment before
 		 * atomic_cmpset_acq(&ithd->it_need, ...) operation in
 		 * the ithread_execute_handlers().
 		 */
 		atomic_store_rel_int(&ie->ie_thread->it_need, 1);
 	} else
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 	thread_unlock(ie->ie_thread->it_thread);
 	while (handler->ih_flags & IH_DEAD)
 		msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
 	intr_event_update(ie);
 #ifdef notyet
 	/*
 	 * XXX: This could be bad in the case of ppbus(8).  Also, I think
 	 * this could lead to races of stale data when servicing an
 	 * interrupt.
 	 */
 	dead = 1;
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (!(ih->ih_flags & IH_FAST)) {
 			dead = 0;
 			break;
 		}
 	}
 	if (dead) {
 		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
 	mtx_unlock(&ie->ie_lock);
 	free(handler, M_ITHREAD);
 	return (0);
 }
 
 static int
 intr_event_schedule_thread(struct intr_event *ie)
 {
 	struct intr_entropy entropy;
 	struct intr_thread *it;
 	struct thread *td;
 	struct thread *ctd;
 	struct proc *p;
 
 	/*
 	 * If no ithread or no handlers, then we have a stray interrupt.
 	 */
 	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) ||
 	    ie->ie_thread == NULL)
 		return (EINVAL);
 
 	ctd = curthread;
 	it = ie->ie_thread;
 	td = it->it_thread;
 	p = td->td_proc;
 
 	/*
 	 * If any of the handlers for this ithread claim to be good
 	 * sources of entropy, then gather some.
 	 */
 	if (ie->ie_flags & IE_ENTROPY) {
 		entropy.event = (uintptr_t)ie;
 		entropy.td = ctd;
 		random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT);
 	}
 
 	KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
 
 	/*
 	 * Set it_need to tell the thread to keep running if it is already
 	 * running.  Then, lock the thread and see if we actually need to
 	 * put it on the runqueue.
 	 *
 	 * Use store_rel to arrange that the store to ih_need in
 	 * swi_sched() is before the store to it_need and prepare for
 	 * transfer of this order to loads in the ithread.
 	 */
 	atomic_store_rel_int(&it->it_need, 1);
 	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
 		CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
 		    td->td_name);
 		TD_CLR_IWAIT(td);
 		sched_add(td, SRQ_INTR);
 	} else {
 		CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
 		    __func__, p->p_pid, td->td_name, it->it_need, td->td_state);
 	}
 	thread_unlock(td);
 
 	return (0);
 }
 #else
 int
 intr_event_remove_handler(void *cookie)
 {
 	struct intr_handler *handler = (struct intr_handler *)cookie;
 	struct intr_event *ie;
 	struct intr_thread *it;
 #ifdef INVARIANTS
 	struct intr_handler *ih;
 #endif
 #ifdef notyet
 	int dead;
 #endif
 
 	if (handler == NULL)
 		return (EINVAL);
 	ie = handler->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
 	    handler->ih_name));
 	mtx_lock(&ie->ie_lock);
 	CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
 	    ie->ie_name);
 #ifdef INVARIANTS
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
 		if (ih == handler)
 			goto ok;
 	mtx_unlock(&ie->ie_lock);
 	panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
-	    ih->ih_name, ie->ie_name);
+	    handler->ih_name, ie->ie_name);
 ok:
 #endif
 	/*
 	 * If there are no ithreads (per event and per handler), then
 	 * just remove the handler and return.  
 	 * XXX: Note that an INTR_FAST handler might be running on another CPU!
 	 */
 	if (ie->ie_thread == NULL && handler->ih_thread == NULL) {
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 		mtx_unlock(&ie->ie_lock);
 		free(handler, M_ITHREAD);
 		return (0);
 	}
 
 	/* Private or global ithread? */
 	it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread;
 	/*
 	 * If the interrupt thread is already running, then just mark this
 	 * handler as being dead and let the ithread do the actual removal.
 	 *
 	 * During a cold boot while cold is set, msleep() does not sleep,
 	 * so we have to remove the handler here rather than letting the
 	 * thread do it.
 	 */
 	thread_lock(it->it_thread);
 	if (!TD_AWAITING_INTR(it->it_thread) && !cold) {
 		handler->ih_flags |= IH_DEAD;
 
 		/*
 		 * Ensure that the thread will process the handler list
 		 * again and remove this handler if it has already passed
 		 * it on the list.
 		 *
 		 * The release part of the following store ensures
 		 * that the update of ih_flags is ordered before the
 		 * it_need setting.  See the comment before
 		 * atomic_cmpset_acq(&ithd->it_need, ...) operation in
 		 * the ithread_execute_handlers().
 		 */
 		atomic_store_rel_int(&it->it_need, 1);
 	} else
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
 	thread_unlock(it->it_thread);
 	while (handler->ih_flags & IH_DEAD)
 		msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
 	/* 
 	 * At this point, the handler has been disconnected from the event,
 	 * so we can kill the private ithread if any.
 	 */
 	if (handler->ih_thread) {
 		ithread_destroy(handler->ih_thread);
 		handler->ih_thread = NULL;
 	}
 	intr_event_update(ie);
 #ifdef notyet
 	/*
 	 * XXX: This could be bad in the case of ppbus(8).  Also, I think
 	 * this could lead to races of stale data when servicing an
 	 * interrupt.
 	 */
 	dead = 1;
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		if (handler != NULL) {
 			dead = 0;
 			break;
 		}
 	}
 	if (dead) {
 		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
 	mtx_unlock(&ie->ie_lock);
 	free(handler, M_ITHREAD);
 	return (0);
 }
 
 /*
  * Schedule interrupt thread 'it' to service event 'ie'.
  *
  * Returns EINVAL when 'ie' is NULL, 'ie' has no handlers, or 'it' is NULL
  * (all treated as a stray interrupt).  Otherwise sets it_need, passes the
  * thread to sched_add() if it was awaiting an interrupt, and returns 0.
  */
 static int
 intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it)
 {
 	struct intr_entropy entropy;
 	struct thread *td;
 	struct thread *ctd;
 	struct proc *p;
 
 	/*
 	 * If no ithread or no handlers, then we have a stray interrupt.
 	 */
 	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL)
 		return (EINVAL);
 
 	/* ctd is the interrupted (current) thread, td the ithread itself. */
 	ctd = curthread;
 	td = it->it_thread;
 	p = td->td_proc;
 
 	/*
 	 * If any of the handlers for this ithread claim to be good
 	 * sources of entropy, then gather some.
 	 */
 	if (ie->ie_flags & IE_ENTROPY) {
 		entropy.event = (uintptr_t)ie;
 		entropy.td = ctd;
 		random_harvest_queue(&entropy, sizeof(entropy), 2, RANDOM_INTERRUPT);
 	}
 
 	KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
 
 	/*
 	 * Set it_need to tell the thread to keep running if it is already
 	 * running.  Then, lock the thread and see if we actually need to
 	 * put it on the runqueue.
 	 *
 	 * Use store_rel to arrange that the store to ih_need in
 	 * swi_sched() is before the store to it_need and prepare for
 	 * transfer of this order to loads in the ithread.
 	 */
 	atomic_store_rel_int(&it->it_need, 1);
 	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
 		/* The ithread is idle: clear its wait state and queue it. */
 		CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
 		    td->td_name);
 		TD_CLR_IWAIT(td);
 		sched_add(td, SRQ_INTR);
 	} else {
 		/* Not waiting for an interrupt; it_need alone is relied upon. */
 		CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
 		    __func__, p->p_pid, td->td_name, it->it_need, td->td_state);
 	}
 	thread_unlock(td);
 
 	return (0);
 }
 #endif
 
 /*
  * CPU-binding hook installed on software interrupt events.  A software
  * interrupt is raised by code rather than routed by a PIC, so there is
  * nothing to reprogram: the request is accepted and ignored.
  *
  * Always returns 0.
  */
 static int
 swi_assign_cpu(void *arg, int cpu)
 {
 
 	(void)arg;
 	(void)cpu;
 	return (0);
 }
 
 /*
  * Attach a software interrupt handler to an event.
  *
  * When eventp is NULL, or points at a NULL event, a fresh IE_SOFT event
  * named "swi<pri>:" is created first and, if eventp is non-NULL, stored
  * back through it.  An existing event must already be flagged IE_SOFT.
  *
  * Returns EINVAL if INTR_ENTROPY is requested or the supplied event is not
  * a software event; otherwise the error from intr_event_create() or the
  * result of intr_event_add_handler().
  */
 int
 swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler,
 	    void *arg, int pri, enum intr_type flags, void **cookiep)
 {
 	struct intr_event *swi_ev;
 	int rc;
 
 	if ((flags & INTR_ENTROPY) != 0)
 		return (EINVAL);
 
 	swi_ev = NULL;
 	if (eventp != NULL)
 		swi_ev = *eventp;
 
 	if (swi_ev == NULL) {
 		/* No event supplied: create one and report it to the caller. */
 		rc = intr_event_create(&swi_ev, NULL, IE_SOFT, 0,
 		    NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri);
 		if (rc != 0)
 			return (rc);
 		if (eventp != NULL)
 			*eventp = swi_ev;
 	} else if ((swi_ev->ie_flags & IE_SOFT) == 0) {
 		return (EINVAL);
 	}
 
 	return (intr_event_add_handler(swi_ev, name, NULL, handler, arg,
 	    PI_SWI(pri), flags, cookiep));
 }
 
 /*
  * Schedule a software interrupt thread.
  *
  * 'cookie' is the struct intr_handler pointer for the handler to run.
  * The handler's ih_need flag is always set; unless SWI_DELAY is present
  * in 'flags' the event's interrupt thread is scheduled as well.
  */
 void
 swi_sched(void *cookie, int flags)
 {
 	struct intr_handler *ih = (struct intr_handler *)cookie;
 	struct intr_event *ie = ih->ih_event;
 	struct intr_entropy entropy;
 	int error;
 
 	CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name,
 	    ih->ih_need);
 
 	/* Feed the handler address and current thread to the entropy queue. */
 	entropy.event = (uintptr_t)ih;
 	entropy.td = curthread;
 	random_harvest_queue(&entropy, sizeof(entropy), 1, RANDOM_SWI);
 
 	/*
 	 * Set ih_need for this handler so that if the ithread is already
 	 * running it will execute this handler on the next pass.  Otherwise,
 	 * it will execute it the next time it runs.
 	 */
 	ih->ih_need = 1;
 
 	if (!(flags & SWI_DELAY)) {
 		VM_CNT_INC(v_soft);
 		/* The scheduling helper's signature depends on INTR_FILTER. */
 #ifdef INTR_FILTER
 		error = intr_event_schedule_thread(ie, ie->ie_thread);
 #else
 		error = intr_event_schedule_thread(ie);
 #endif
 		KASSERT(error == 0, ("stray software interrupt"));
 	}
 }
 
 /*
  * Detach a software interrupt handler identified by 'cookie'.
  *
  * The owning interrupt event is left in place even if this was its last
  * handler; callers that want it gone must call intr_event_destroy()
  * themselves, which is admittedly not an ideal interface.
  *
  * Returns whatever intr_event_remove_handler() returns.
  */
 int
 swi_remove(void *cookie)
 {
 	int error;
 
 	error = intr_event_remove_handler(cookie);
 	return (error);
 }
 
 #ifdef INTR_FILTER
 /*
  * Run a single handler 'ih' from its private interrupt thread.
  *
  * 'p' is only used for tracing.  A handler flagged IH_DEAD is unlinked
  * from its event and its sleeper woken instead of being executed.
  */
 static void
 priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih)
 {
 	struct intr_event *ie;
 
 	ie = ih->ih_event;
 	/*
 	 * If this handler is marked for death, remove it from
 	 * the list of handlers and wake up the sleeper.
 	 */
 	if (ih->ih_flags & IH_DEAD) {
 		mtx_lock(&ie->ie_lock);
 		TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
 		ih->ih_flags &= ~IH_DEAD;
 		wakeup(ih);
 		mtx_unlock(&ie->ie_lock);
 		return;
 	}
 	
 	/* Execute this handler. */
 	CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
 	     __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument,
 	     ih->ih_name, ih->ih_flags);
 	
 	/* Handlers not flagged IH_MPSAFE run with Giant held. */
 	if (!(ih->ih_flags & IH_MPSAFE))
 		mtx_lock(&Giant);
 	ih->ih_handler(ih->ih_argument);
 	if (!(ih->ih_flags & IH_MPSAFE))
 		mtx_unlock(&Giant);
 }
 #endif
 
 /*
  * This is a public function for use by drivers that mux interrupt
  * handlers for child devices from their interrupt handler.
  *
  * Walks every handler attached to 'ie' and invokes its threaded handler
  * routine.  'p' is only used for tracing.  Handlers flagged IH_DEAD are
  * unlinked and their sleeper woken rather than run.
  */
 void
 intr_event_execute_handlers(struct proc *p, struct intr_event *ie)
 {
 	struct intr_handler *ih, *ihn;
 
 	/* The _SAFE variant is needed because entries may be removed below. */
 	TAILQ_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) {
 		/*
 		 * If this handler is marked for death, remove it from
 		 * the list of handlers and wake up the sleeper.
 		 */
 		if (ih->ih_flags & IH_DEAD) {
 			mtx_lock(&ie->ie_lock);
 			TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
 			ih->ih_flags &= ~IH_DEAD;
 			wakeup(ih);
 			mtx_unlock(&ie->ie_lock);
 			continue;
 		}
 
 		/* Skip filter only handlers */
 		if (ih->ih_handler == NULL)
 			continue;
 
 		/*
 		 * For software interrupt threads, we only execute
 		 * handlers that have their need flag set.  Hardware
 		 * interrupt threads always invoke all of their handlers.
 		 *
 		 * ih_need can only be 0 or 1.  Failed cmpset below
 		 * means that there is no request to execute handlers,
 		 * so a retry of the cmpset is not needed.
 		 */
 		if ((ie->ie_flags & IE_SOFT) != 0 &&
 		    atomic_cmpset_int(&ih->ih_need, 1, 0) == 0)
 			continue;
 
 		/* Execute this handler. */
 		CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
 		    __func__, p->p_pid, (void *)ih->ih_handler, 
 		    ih->ih_argument, ih->ih_name, ih->ih_flags);
 
 		/* Handlers not flagged IH_MPSAFE run with Giant held. */
 		if (!(ih->ih_flags & IH_MPSAFE))
 			mtx_lock(&Giant);
 		ih->ih_handler(ih->ih_argument);
 		if (!(ih->ih_flags & IH_MPSAFE))
 			mtx_unlock(&Giant);
 	}
 }
 
 /*
  * Run all handlers of event 'ie' from its interrupt thread, apply
  * interrupt storm throttling for hardware events, and finally invoke the
  * event's post_ithread hook when one is set.  'p' is passed through to
  * intr_event_execute_handlers().
  */
 static void
 ithread_execute_handlers(struct proc *p, struct intr_event *ie)
 {
 
 	/* Interrupt handlers should not sleep. */
 	if (!(ie->ie_flags & IE_SOFT))
 		THREAD_NO_SLEEPING();
 	intr_event_execute_handlers(p, ie);
 	if (!(ie->ie_flags & IE_SOFT))
 		THREAD_SLEEPING_OK();
 
 	/*
 	 * Interrupt storm handling:
 	 *
 	 * If this interrupt source is currently storming, then throttle
 	 * it to only fire the handler once per clock tick.
 	 *
 	 * If this interrupt source is not currently storming, but the
 	 * number of back to back interrupts exceeds the storm threshold,
 	 * then enter storming mode.
 	 *
 	 * A threshold of 0 disables the check, and software events are
 	 * never throttled.
 	 */
 	if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold &&
 	    !(ie->ie_flags & IE_SOFT)) {
 		/* Report the message only once every second. */
 		if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) {
 			printf(
 	"interrupt storm detected on \"%s\"; throttling interrupt source\n",
 			    ie->ie_name);
 		}
 		pause("istorm", 1);
 	} else
 		ie->ie_count++;
 
 	/*
 	 * Now that all the handlers have had a chance to run, reenable
 	 * the interrupt source.
 	 */
 	if (ie->ie_post_ithread != NULL)
 		ie->ie_post_ithread(ie->ie_source);
 }
 
 #ifndef INTR_FILTER
 /*
  * This is the main code for interrupt threads.
  *
  * 'arg' is the struct intr_thread describing this thread.  The function
  * never returns: it loops servicing its event until IT_DEAD is set, at
  * which point it frees the intr_thread and exits the kernel thread.
  */
 static void
 ithread_loop(void *arg)
 {
 	struct intr_thread *ithd;
 	struct intr_event *ie;
 	struct thread *td;
 	struct proc *p;
 	int wake;
 
 	td = curthread;
 	p = td->td_proc;
 	ithd = (struct intr_thread *)arg;
 	KASSERT(ithd->it_thread == td,
 	    ("%s: ithread and proc linkage out of sync", __func__));
 	ie = ithd->it_event;
 	ie->ie_count = 0;
 	wake = 0;
 
 	/*
 	 * As long as we have interrupts outstanding, go through the
 	 * list of handlers, giving each one a go at it.
 	 */
 	for (;;) {
 		/*
 		 * If we are an orphaned thread, then just die.
 		 */
 		if (ithd->it_flags & IT_DEAD) {
 			CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
 			    p->p_pid, td->td_name);
 			free(ithd, M_ITHREAD);
 			kthread_exit();
 		}
 
 		/*
 		 * Service interrupts.  If another interrupt arrives while
 		 * we are running, it will set it_need to note that we
 		 * should make another pass.
 		 *
 		 * The load_acq part of the following cmpset ensures
 		 * that the load of ih_need in ithread_execute_handlers()
 		 * is ordered after the load of it_need here.
 		 */
 		while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0)
 			ithread_execute_handlers(p, ie);
 		WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
 		mtx_assert(&Giant, MA_NOTOWNED);
 
 		/*
 		 * Processed all our interrupts.  Now get the sched
 		 * lock.  This may take a while and it_need may get
 		 * set again, so we have to check it again.
 		 */
 		thread_lock(td);
 		if (atomic_load_acq_int(&ithd->it_need) == 0 &&
 		    (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) {
 			/* Nothing pending: mark as waiting and switch away. */
 			TD_SET_IWAIT(td);
 			ie->ie_count = 0;
 			mi_switch(SW_VOL | SWT_IWAIT, NULL);
 		}
 		/*
 		 * Note an IT_WAIT request under the thread lock, but defer
 		 * the wakeup() until after the lock has been dropped.
 		 */
 		if (ithd->it_flags & IT_WAIT) {
 			wake = 1;
 			ithd->it_flags &= ~IT_WAIT;
 		}
 		thread_unlock(td);
 		if (wake) {
 			wakeup(ithd);
 			wake = 0;
 		}
 	}
 }
 
 /*
  * Main interrupt handling body.
  *
  * Runs every filter attached to the event directly, then either calls the
  * event's pre_ithread hook and schedules the interrupt thread, or calls
  * the post_filter hook when no thread is needed.
  *
  * Input:
  * o ie:                        the event connected to this interrupt.
  * o frame:                     some archs (e.g. i386) pass a frame to some
  *                              handlers as their main argument.
  * Return value:
  * o 0:                         everything ok.
  * o EINVAL:                    stray interrupt.
  */
 int
 intr_event_handle(struct intr_event *ie, struct trapframe *frame)
 {
 	struct intr_handler *ih;
 	struct trapframe *oldframe;
 	struct thread *td;
 	int error, ret, thread;
 
 	td = curthread;
 
 #ifdef KSTACK_USAGE_PROF
 	intr_prof_stack_use(td, frame);
 #endif
 
 	/* An interrupt with no event or handlers is a stray interrupt. */
 	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
 		return (EINVAL);
 
 	/*
 	 * Execute fast interrupt handlers directly.
 	 * To support clock handlers, if a handler registers
 	 * with a NULL argument, then we pass it a pointer to
 	 * a trapframe as its argument.
 	 */
 	td->td_intr_nesting_level++;
 	thread = 0;
 	ret = 0;
 	critical_enter();
 	/* Save the previous interrupt frame so it can be restored below. */
 	oldframe = td->td_intr_frame;
 	td->td_intr_frame = frame;
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		/* A handler without a filter always needs the ithread. */
 		if (ih->ih_filter == NULL) {
 			thread = 1;
 			continue;
 		}
 		CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__,
 		    ih->ih_filter, ih->ih_argument == NULL ? frame :
 		    ih->ih_argument, ih->ih_name);
 		if (ih->ih_argument == NULL)
 			ret = ih->ih_filter(frame);
 		else
 			ret = ih->ih_filter(ih->ih_argument);
 		/*
 		 * A filter must return FILTER_STRAY alone, or a non-empty
 		 * combination of FILTER_HANDLED and FILTER_SCHEDULE_THREAD.
 		 */
 		KASSERT(ret == FILTER_STRAY ||
 		    ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 &&
 		    (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0),
 		    ("%s: incorrect return value %#x from %s", __func__, ret,
 		    ih->ih_name));
 
 		/* 
 		 * Wrapper handler special handling:
 		 *
 		 * in some particular cases (like pccard and pccbb), 
 		 * the _real_ device handler is wrapped in a couple of
 		 * functions - a filter wrapper and an ithread wrapper.
 		 * In this case (and just in this case), the filter wrapper 
 		 * could ask the system to schedule the ithread and mask
 		 * the interrupt source if the wrapped handler is composed
 		 * of just an ithread handler.
 		 *
 		 * TODO: write a generic wrapper to avoid people rolling 
 		 * their own
 		 */
 		if (!thread) {
 			if (ret == FILTER_SCHEDULE_THREAD)
 				thread = 1;
 		}
 	}
 	td->td_intr_frame = oldframe;
 
 	/* Pick the hook matching whether the ithread is going to run. */
 	if (thread) {
 		if (ie->ie_pre_ithread != NULL)
 			ie->ie_pre_ithread(ie->ie_source);
 	} else {
 		if (ie->ie_post_filter != NULL)
 			ie->ie_post_filter(ie->ie_source);
 	}
 	
 	/* Schedule the ithread if needed. */
 	if (thread) {
 		error = intr_event_schedule_thread(ie);
 		KASSERT(error == 0, ("bad stray interrupt"));
 	}
 	critical_exit();
 	td->td_intr_nesting_level--;
 	return (0);
 }
 #else
 /*
  * This is the main code for interrupt threads.
  *
  * 'arg' is a struct intr_handler.  If that handler owns a private thread
  * (ih_thread != NULL) this loop services only that handler; otherwise it
  * services every handler of the handler's event through the event's
  * shared thread.  The function never returns: it exits the kernel thread
  * once IT_DEAD is observed.
  */
 static void
 ithread_loop(void *arg)
 {
 	struct intr_thread *ithd;
 	struct intr_handler *ih;
 	struct intr_event *ie;
 	struct thread *td;
 	struct proc *p;
 	int priv;
 	int wake;
 
 	td = curthread;
 	p = td->td_proc;
 	ih = (struct intr_handler *)arg;
 	/* priv selects between the per-handler and the per-event thread. */
 	priv = (ih->ih_thread != NULL) ? 1 : 0;
 	ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread;
 	KASSERT(ithd->it_thread == td,
 	    ("%s: ithread and proc linkage out of sync", __func__));
 	ie = ithd->it_event;
 	ie->ie_count = 0;
 	wake = 0;
 
 	/*
 	 * As long as we have interrupts outstanding, go through the
 	 * list of handlers, giving each one a go at it.
 	 */
 	for (;;) {
 		/*
 		 * If we are an orphaned thread, then just die.
 		 */
 		if (ithd->it_flags & IT_DEAD) {
 			CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
 			    p->p_pid, td->td_name);
 			free(ithd, M_ITHREAD);
 			kthread_exit();
 		}
 
 		/*
 		 * Service interrupts.  If another interrupt arrives while
 		 * we are running, it will set it_need to note that we
 		 * should make another pass.
 		 *
 		 * The load_acq part of the following cmpset ensures
 		 * that the load of ih_need in ithread_execute_handlers()
 		 * is ordered after the load of it_need here.
 		 */
 		while (atomic_cmpset_acq_int(&ithd->it_need, 1, 0) != 0) {
 			if (priv)
 				priv_ithread_execute_handler(p, ih);
 			else 
 				ithread_execute_handlers(p, ie);
 		}
 		WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
 		mtx_assert(&Giant, MA_NOTOWNED);
 
 		/*
 		 * Processed all our interrupts.  Now get the sched
 		 * lock.  This may take a while and it_need may get
 		 * set again, so we have to check it again.
 		 */
 		thread_lock(td);
 		if (atomic_load_acq_int(&ithd->it_need) == 0 &&
 		    (ithd->it_flags & (IT_DEAD | IT_WAIT)) == 0) {
 			/* Nothing pending: mark as waiting and switch away. */
 			TD_SET_IWAIT(td);
 			ie->ie_count = 0;
 			mi_switch(SW_VOL | SWT_IWAIT, NULL);
 		}
 		/*
 		 * Note an IT_WAIT request under the thread lock, but defer
 		 * the wakeup() until after the lock has been dropped.
 		 */
 		if (ithd->it_flags & IT_WAIT) {
 			wake = 1;
 			ithd->it_flags &= ~IT_WAIT;
 		}
 		thread_unlock(td);
 		if (wake) {
 			wakeup(ithd);
 			wake = 0;
 		}
 	}
 }
 
 /* 
  * Main loop for interrupt filter.
  *
  * Some architectures (i386, amd64 and arm) require the optional frame 
  * parameter, and use it as the main argument for fast handler execution
  * when ih_argument == NULL.
  *
  * The handlers of 'ie' are visited in list order; the walk stops at the
  * first filter that returns something other than FILTER_STRAY.
  *
  * Return value:
  * o FILTER_STRAY:              No filter recognized the event, and no
  *                              filter-less handler is registered on this 
  *                              line.
  * o FILTER_HANDLED:            A filter claimed the event and served it.
  * o FILTER_SCHEDULE_THREAD:    No filter claimed the event, but there's at
  *                              least one filter-less handler on this line.
  * o FILTER_HANDLED | 
  *   FILTER_SCHEDULE_THREAD:    A filter claimed the event, and asked for
  *                              scheduling the per-handler ithread.
  *
  * In case an ithread has to be scheduled, in *ithd there will be a 
  * pointer to a struct intr_thread containing the thread to be
  * scheduled.  *ithd is left untouched when FILTER_STRAY is returned.
  */
 
 static int
 intr_filter_loop(struct intr_event *ie, struct trapframe *frame, 
 		 struct intr_thread **ithd) 
 {
 	struct intr_handler *ih;
 	void *arg;
 	int ret, thread_only;
 
 	ret = 0;
 	thread_only = 0;
 	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
 		/*
 		 * Execute fast interrupt handlers directly.
 		 * To support clock handlers, if a handler registers
 		 * with a NULL argument, then we pass it a pointer to
 		 * a trapframe as its argument.
 		 */
 		arg = ((ih->ih_argument == NULL) ? frame : ih->ih_argument);
 		
 		CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__,
 		     ih->ih_filter, ih->ih_handler, arg, ih->ih_name);
 
 		if (ih->ih_filter != NULL)
 			ret = ih->ih_filter(arg);
 		else {
 			/* Remember that a filter-less handler is present. */
 			thread_only = 1;
 			continue;
 		}
 		/*
 		 * A filter must return FILTER_STRAY alone, or a non-empty
 		 * combination of FILTER_HANDLED and FILTER_SCHEDULE_THREAD.
 		 */
 		KASSERT(ret == FILTER_STRAY ||
 		    ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 &&
 		    (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0),
 		    ("%s: incorrect return value %#x from %s", __func__, ret,
 		    ih->ih_name));
 		if (ret & FILTER_STRAY)
 			continue;
 		else { 
 			/* First claiming filter wins; report its private thread. */
 			*ithd = ih->ih_thread;
 			return (ret);
 		}
 	}
 
 	/*
 	 * No filters handled the interrupt and we have at least
 	 * one handler without a filter.  In this case, we schedule
 	 * all of the filter-less handlers to run in the ithread.
 	 */	
 	if (thread_only) {
 		*ithd = ie->ie_thread;
 		return (FILTER_SCHEDULE_THREAD);
 	}
 	return (FILTER_STRAY);
 }
 
 /*
  * Main interrupt handling body.
  *
  * Runs intr_filter_loop() inside a critical section, calls the event's
  * post_filter or pre_ithread hook depending on the result, and schedules
  * the interrupt thread selected by the filter loop when requested.
  *
  * Input:
  * o ie:                        the event connected to this interrupt.
  * o frame:                     some archs (e.g. i386) pass a frame to some
  *                              handlers as their main argument.
  * Return value:
  * o 0:                         everything ok.
  * o EINVAL:                    stray interrupt.
  */
 int
 intr_event_handle(struct intr_event *ie, struct trapframe *frame)
 {
 	struct intr_thread *ithd;
 	struct trapframe *oldframe;
 	struct thread *td;
 	int thread;
 
 	ithd = NULL;
 	td = curthread;
 
 	/* An interrupt with no event or handlers is a stray interrupt. */
 	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
 		return (EINVAL);
 
 	td->td_intr_nesting_level++;
 	thread = 0;
 	critical_enter();
 	/* Save the previous interrupt frame so it can be restored below. */
 	oldframe = td->td_intr_frame;
 	td->td_intr_frame = frame;
 	/* 'thread' holds the FILTER_* result bits from here on. */
 	thread = intr_filter_loop(ie, frame, &ithd);	
 	if (thread & FILTER_HANDLED) {
 		if (ie->ie_post_filter != NULL)
 			ie->ie_post_filter(ie->ie_source);
 	} else {
 		/* Taken for every result without FILTER_HANDLED set. */
 		if (ie->ie_pre_ithread != NULL)
 			ie->ie_pre_ithread(ie->ie_source);
 	}
 	td->td_intr_frame = oldframe;
 	critical_exit();
 	
 	/* Interrupt storm logic */
 	if (thread & FILTER_STRAY) {
 		ie->ie_count++;
 		/* NOTE(review): placeholder message only; no throttling here. */
 		if (ie->ie_count < intr_storm_threshold)
 			printf("Interrupt stray detection not present\n");
 	}
 
 	/* Schedule an ithread if needed. */
 	if (thread & FILTER_SCHEDULE_THREAD) {
 		if (intr_event_schedule_thread(ie, ithd) != 0)
 			panic("%s: impossible stray interrupt", __func__);
 	}
 	td->td_intr_nesting_level--;
 	return (0);
 }
 #endif
 
 #ifdef DDB
 /*
  * DDB helper: print a single line describing interrupt handler 'ih'.
  *
  * The line holds the handler name, a four column priority tag, the filter
  * and/or handler symbols, the handler argument and, when any are set, a
  * brace-enclosed list of flags (EXCL, ENTROPY, DEAD, MPSAFE, NEED).
  */
 static void
 db_dump_intrhand(struct intr_handler *ih)
 {
 	int need_sep;
 
 	db_printf("\t%-10s ", ih->ih_name);
 
 	/* Priority tag; unknown priorities below PI_SOFT print numerically. */
 	if (ih->ih_pri == PI_REALTIME)
 		db_printf("CLK ");
 	else if (ih->ih_pri == PI_AV)
 		db_printf("AV  ");
 	else if (ih->ih_pri == PI_TTY)
 		db_printf("TTY ");
 	else if (ih->ih_pri == PI_NET)
 		db_printf("NET ");
 	else if (ih->ih_pri == PI_DISK)
 		db_printf("DISK");
 	else if (ih->ih_pri == PI_DULL)
 		db_printf("DULL");
 	else if (ih->ih_pri >= PI_SOFT)
 		db_printf("SWI ");
 	else
 		db_printf("%4u", ih->ih_pri);
 	db_printf(" ");
 
 	/* Filter and handler symbols, comma separated when both exist. */
 	if (ih->ih_filter != NULL) {
 		db_printf("[F]");
 		db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC);
 	}
 	if (ih->ih_handler != NULL) {
 		if (ih->ih_filter != NULL)
 			db_printf(",");
 		db_printf("[H]");
 		db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC);
 	}
 	db_printf("(%p)", ih->ih_argument);
 
 	/* Nothing to list: finish the line without a flag section. */
 	if (ih->ih_need == 0 &&
 	    (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD |
 	    IH_MPSAFE)) == 0) {
 		db_printf("\n");
 		return;
 	}
 
 	db_printf(" {");
 	need_sep = 0;
 	if ((ih->ih_flags & IH_EXCLUSIVE) != 0) {
 		db_printf("EXCL");
 		need_sep = 1;
 	}
 	if ((ih->ih_flags & IH_ENTROPY) != 0) {
 		if (need_sep)
 			db_printf(", ");
 		db_printf("ENTROPY");
 		need_sep = 1;
 	}
 	if ((ih->ih_flags & IH_DEAD) != 0) {
 		if (need_sep)
 			db_printf(", ");
 		db_printf("DEAD");
 		need_sep = 1;
 	}
 	if ((ih->ih_flags & IH_MPSAFE) != 0) {
 		if (need_sep)
 			db_printf(", ");
 		db_printf("MPSAFE");
 		need_sep = 1;
 	}
 	if (ih->ih_need != 0) {
 		if (need_sep)
 			db_printf(", ");
 		db_printf("NEED");
 	}
 	db_printf("}");
 	db_printf("\n");
 }
 
 /*
  * Dump details about a event.
  */
 void
 db_dump_intr_event(struct intr_event *ie, int handlers)
 {
 	struct intr_handler *ih;
 	struct intr_thread *it;
 	int comma;
 
 	db_printf("%s ", ie->ie_fullname);
 	it = ie->ie_thread;
 	if (it != NULL)
 		db_printf("(pid %d)", it->it_thread->td_proc->p_pid);
 	else
 		db_printf("(no thread)");
 	if ((ie->ie_flags & (IE_SOFT | IE_ENTROPY | IE_ADDING_THREAD)) != 0 ||
 	    (it != NULL && it->it_need)) {
 		db_printf(" {");
 		comma = 0;
 		if (ie->ie_flags & IE_SOFT) {
 			db_printf("SOFT");
 			comma = 1;
 		}
 		if (ie->ie_flags & IE_ENTROPY) {
 			if (comma)
 				db_printf(", ");
 			db_printf("ENTROPY");
 			comma = 1;
 		}
 		if (ie->ie_flags & IE_ADDING_THREAD) {
 			if (comma)
 				db_printf(", ");
 			db_printf("ADDING_THREAD");
 			comma = 1;
 		}
 		if (it != NULL && it->it_need) {
 			if (comma)
 				db_printf(", ");
 			db_printf("NEED");
 		}
 		db_printf("}");
 	}
 	db_printf("\n");
 
 	if (handlers)
 		TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
 		    db_dump_intrhand(ih);
 }
 
 /*
  * DDB "show intr" command: list interrupt events.
  *
  * Modifier 'a' also lists events that have no handlers; modifier 'v'
  * additionally dumps each event's handlers.  The listing stops early when
  * the DDB pager asks to quit.
  */
 DB_SHOW_COMMAND(intr, db_show_intr)
 {
 	struct intr_event *ev;
 	int show_all, show_handlers;
 
 	show_handlers = (strchr(modif, 'v') != NULL);
 	show_all = (strchr(modif, 'a') != NULL);
 	TAILQ_FOREACH(ev, &event_list, ie_list) {
 		if (show_all || !TAILQ_EMPTY(&ev->ie_handlers)) {
 			db_dump_intr_event(ev, show_handlers);
 			if (db_pager_quit)
 				break;
 		}
 	}
 }
 #endif /* DDB */
 
 /*
  * Start standard software interrupt threads
  */
 static void
 start_softintr(void *dummy)
 {
 
 	if (swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, INTR_MPSAFE, &vm_ih))
 		panic("died while creating vm swi ithread");
 }
 SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr,
     NULL);
 
 /*
  * Sysctls used by systat and others: hw.intrnames and hw.intrcnt.
  * The data for this machine dependent, and the declarations are in machine
  * dependent code.  The layout of intrnames and intrcnt however is machine
  * independent.
  *
  * We do not know the length of intrcnt and intrnames at compile time, so
  * calculate things at run time.
  */
 static int
 sysctl_intrnames(SYSCTL_HANDLER_ARGS)
 {
 	return (sysctl_handle_opaque(oidp, intrnames, sintrnames, req));
 }
 
 SYSCTL_PROC(_hw, OID_AUTO, intrnames, CTLTYPE_OPAQUE | CTLFLAG_RD,
     NULL, 0, sysctl_intrnames, "", "Interrupt Names");
 
 /*
  * Handler for hw.intrcnt: export the interrupt counter array as an opaque
  * blob.  When SCTL_MASK32 is defined and set in the request flags, the
  * u_long counters are copied into a temporary uint32_t array and that
  * half-sized array is exported instead.
  *
  * Returns ENOMEM if the temporary array cannot be allocated, otherwise
  * the result of sysctl_handle_opaque().
  */
 static int
 sysctl_intrcnt(SYSCTL_HANDLER_ARGS)
 {
 #ifdef SCTL_MASK32
 	uint32_t *intrcnt32;
 	unsigned i;
 	int error;
 
 	if (req->flags & SCTL_MASK32) {
 		/* Size query only: no data needs to be converted. */
 		if (!req->oldptr)
 			return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req));
 		/*
 		 * NOTE(review): sintrcnt / 2 presumes sizeof(u_long) is
 		 * twice sizeof(uint32_t) -- confirm for every platform that
 		 * defines SCTL_MASK32.
 		 */
 		intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT);
 		if (intrcnt32 == NULL)
 			return (ENOMEM);
 		/* Each counter is truncated to 32 bits by the assignment. */
 		for (i = 0; i < sintrcnt / sizeof (u_long); i++)
 			intrcnt32[i] = intrcnt[i];
 		error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req);
 		free(intrcnt32, M_TEMP);
 		return (error);
 	}
 #endif
 	return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req));
 }
 
 SYSCTL_PROC(_hw, OID_AUTO, intrcnt, CTLTYPE_OPAQUE | CTLFLAG_RD,
     NULL, 0, sysctl_intrcnt, "", "Interrupt Counts");
 
 #ifdef DDB
 /*
  * DDB command to dump the interrupt statistics.
  */
 DB_SHOW_COMMAND(intrcnt, db_show_intrcnt)
 {
 	u_long *i;
 	char *cp;
 	u_int j;
 
 	cp = intrnames;
 	j = 0;
 	for (i = intrcnt; j < (sintrcnt / sizeof(u_long)) && !db_pager_quit;
 	    i++, j++) {
 		if (*cp == '\0')
 			break;
 		if (*i != 0)
 			db_printf("%s\t%lu\n", cp, *i);
 		cp += strlen(cp) + 1;
 	}
 }
 #endif
Index: head/sys/kern/subr_gtaskqueue.c
===================================================================
--- head/sys/kern/subr_gtaskqueue.c	(revision 317755)
+++ head/sys/kern/subr_gtaskqueue.c	(revision 317756)
@@ -1,965 +1,965 @@
 /*-
  * Copyright (c) 2000 Doug Rabson
  * Copyright (c) 2014 Jeff Roberson
  * Copyright (c) 2016 Matthew Macy
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/cpuset.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/gtaskqueue.h>
 #include <sys/unistd.h>
 #include <machine/stdarg.h>
 
 /* Malloc type used for every allocation made by this file. */
 static MALLOC_DEFINE(M_GTASKQUEUE, "taskqueue", "Task Queues");
 /* Forward declarations; both are referenced before their definitions. */
 static void	gtaskqueue_thread_enqueue(void *);
 static void	gtaskqueue_thread_loop(void *arg);
 
 /* NOTE(review): presumably defines the "softirq" task queue group -- see TASKQGROUP_DEFINE. */
 TASKQGROUP_DEFINE(softirq, mp_ncpus, 1);
 
 /*
  * Entry on a queue's tq_active list.  gtaskqueue_run_locked() links one
  * in while it executes tasks; gtaskqueue_drain_tq_active() links in a
  * marker entry to wait for earlier entries to go away.
  */
 struct gtaskqueue_busy {
 	struct gtask	*tb_running;	/* task being run, NULL, or TB_DRAIN_WAITER */
 	TAILQ_ENTRY(gtaskqueue_busy) tb_link;	/* linkage on tq_active */
 };
 
 /* Sentinel tb_running value identifying a drain waiter's marker entry. */
 static struct gtask * const TB_DRAIN_WAITER = (struct gtask *)0x1;
 
 /*
  * A group task queue: a list of pending tasks, the set of currently
  * running tasks, and the mutex that protects both.
  */
 struct gtaskqueue {
 	STAILQ_HEAD(, gtask)	tq_queue;	/* tasks waiting to run */
 	gtaskqueue_enqueue_fn	tq_enqueue;	/* called after a task is queued */
 	void			*tq_context;	/* argument passed to tq_enqueue */
 	char			*tq_name;	/* malloc'ed name, also the mutex name */
 	TAILQ_HEAD(, gtaskqueue_busy) tq_active;	/* running tasks and drain markers */
 	struct mtx		tq_mutex;	/* sleep or spin mutex, see tq_spin */
 	struct thread		**tq_threads;	/* array of service thread pointers */
 	int			tq_tcount;	/* presumably live thread count; terminate waits for 0 */
 	int			tq_spin;	/* non-zero when tq_mutex is a spin mutex */
 	int			tq_flags;	/* TQ_FLAGS_* bits */
 	int			tq_callouts;	/* non-zero holds off gtaskqueue_terminate() */
 	taskqueue_callback_fn	tq_callbacks[TASKQUEUE_NUM_CALLBACKS];	/* NOTE(review): users not visible here */
 	void			*tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS];	/* presumably per-callback arguments */
 };
 
 /*
  * tq_flags bits.
  * ACTIVE is set at creation and cleared by gtaskqueue_free().
  * BLOCKED is set by gtaskqueue_block() and suppresses the enqueue callback.
  * UNLOCKED_ENQUEUE is set when the enqueue callback is
  * gtaskqueue_thread_enqueue().
  */
 #define	TQ_FLAGS_ACTIVE		(1 << 0)
 #define	TQ_FLAGS_BLOCKED	(1 << 1)
 #define	TQ_FLAGS_UNLOCKED_ENQUEUE	(1 << 2)
 
 /* NOTE(review): not referenced in the visible part of this file. */
 #define	DT_CALLOUT_ARMED	(1 << 0)
 
 /* Acquire the queue mutex using the primitive matching its type. */
 #define	TQ_LOCK(tq)							\
 	do {								\
 		if ((tq)->tq_spin)					\
 			mtx_lock_spin(&(tq)->tq_mutex);			\
 		else							\
 			mtx_lock(&(tq)->tq_mutex);			\
 	} while (0)
 /* Assert that the current thread owns the queue mutex. */
 #define	TQ_ASSERT_LOCKED(tq)	mtx_assert(&(tq)->tq_mutex, MA_OWNED)
 
 /* Release the queue mutex using the primitive matching its type. */
 #define	TQ_UNLOCK(tq)							\
 	do {								\
 		if ((tq)->tq_spin)					\
 			mtx_unlock_spin(&(tq)->tq_mutex);		\
 		else							\
 			mtx_unlock(&(tq)->tq_mutex);			\
 	} while (0)
 /* Assert that the current thread does not own the queue mutex. */
 #define	TQ_ASSERT_UNLOCKED(tq)	mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED)
 
 #ifdef INVARIANTS
 /*
  * Debug aid (INVARIANTS only): print the address, flags, priority,
  * function and context of 'gtask' on the console.
  */
 static void
 gtask_dump(struct gtask *gtask)
 {
 
 	printf("gtask: %p ta_flags=%x ta_priority=%d ta_func=%p ta_context=%p\n",
 	    gtask,
 	    gtask->ta_flags,
 	    gtask->ta_priority,
 	    gtask->ta_func,
 	    gtask->ta_context);
 }
 #endif
 
 static __inline int
 TQ_SLEEP(struct gtaskqueue *tq, void *p, struct mtx *m, int pri, const char *wm,
     int t)
 {
 	if (tq->tq_spin)
 		return (msleep_spin(p, m, wm, t));
 	return (msleep(p, m, pri, wm, t));
 }
 
 /*
  * Allocate and initialise a group task queue.
  *
  * name:     queue name, copied (truncated to TASKQUEUE_NAMELEN); NULL
  *           selects "taskqueue".
  * mflags:   malloc(9) flags for both allocations (M_ZERO is added).
  * enqueue:  callback invoked after a task has been queued.
  * context:  argument handed to 'enqueue'.
  * mtxflags: mtx_init() options; MTX_SPIN selects spin-mutex handling.
  * mtxname:  unused.
  *
  * Returns the new queue, or NULL if either allocation fails.  Nothing is
  * left allocated on failure.
  */
 static struct gtaskqueue *
 _gtaskqueue_create(const char *name, int mflags,
 		 taskqueue_enqueue_fn enqueue, void *context,
 		 int mtxflags, const char *mtxname __unused)
 {
 	struct gtaskqueue *queue;
 	char *tq_name;
 
 	tq_name = malloc(TASKQUEUE_NAMELEN, M_GTASKQUEUE, mflags | M_ZERO);
 	if (!tq_name)
 		return (NULL);
 
 	snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? name : "taskqueue");
 
 	queue = malloc(sizeof(struct gtaskqueue), M_GTASKQUEUE, mflags | M_ZERO);
 	if (!queue) {
 		/* Do not leak the name buffer allocated above. */
 		free(tq_name, M_GTASKQUEUE);
 		return (NULL);
 	}
 
 	STAILQ_INIT(&queue->tq_queue);
 	TAILQ_INIT(&queue->tq_active);
 	queue->tq_enqueue = enqueue;
 	queue->tq_context = context;
 	queue->tq_name = tq_name;
 	queue->tq_spin = (mtxflags & MTX_SPIN) != 0;
 	queue->tq_flags |= TQ_FLAGS_ACTIVE;
 	/* Record when the enqueue callback is the thread-based one. */
 	if (enqueue == gtaskqueue_thread_enqueue)
 		queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE;
 	/* The mutex shares the queue's name buffer. */
 	mtx_init(&queue->tq_mutex, tq_name, NULL, mtxflags);
 
 	return (queue);
 }
 
 
 /*
  * Signal a taskqueue thread to terminate.
  *
  * Repeatedly wakes sleepers on 'tq' and sleeps on 'pp' until both
  * tq_tcount and tq_callouts have dropped to zero.  The sleep uses
  * tq_mutex, so the caller is expected to hold the queue lock
  * (gtaskqueue_free() does).
  */
 static void
 gtaskqueue_terminate(struct thread **pp, struct gtaskqueue *tq)
 {
 
 	while (tq->tq_tcount > 0 || tq->tq_callouts > 0) {
 		wakeup(tq);
 		TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0);
 	}
 }
 
 /*
  * Tear down a queue: clear TQ_FLAGS_ACTIVE, wait in
  * gtaskqueue_terminate() for its threads and drain waiters to finish,
  * then destroy the mutex and release the thread array, the name and the
  * queue itself.
  */
 static void
 gtaskqueue_free(struct gtaskqueue *queue)
 {
 
 	TQ_LOCK(queue);
 	queue->tq_flags &= ~TQ_FLAGS_ACTIVE;
 	gtaskqueue_terminate(queue->tq_threads, queue);
 	KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?"));
 	KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks"));
 	/* The mutex is destroyed while still held; there is no TQ_UNLOCK. */
 	mtx_destroy(&queue->tq_mutex);
 	free(queue->tq_threads, M_GTASKQUEUE);
 	free(queue->tq_name, M_GTASKQUEUE);
 	free(queue, M_GTASKQUEUE);
 }
 
 /*
  * Queue 'gtask' on 'queue' unless it is already queued, then invoke the
  * queue's enqueue callback unless the queue is blocked.
  *
  * Always returns 0.  With INVARIANTS a NULL queue panics after dumping
  * the task.
  */
 int
 grouptaskqueue_enqueue(struct gtaskqueue *queue, struct gtask *gtask)
 {
 #ifdef INVARIANTS
 	if (queue == NULL) {
 		gtask_dump(gtask);
 		panic("queue == NULL");
 	}
 #endif
 	TQ_LOCK(queue);
 	/* Already pending: nothing to do. */
 	if (gtask->ta_flags & TASK_ENQUEUED) {
 		TQ_UNLOCK(queue);
 		return (0);
 	}
 	STAILQ_INSERT_TAIL(&queue->tq_queue, gtask, ta_link);
 	gtask->ta_flags |= TASK_ENQUEUED;
 	TQ_UNLOCK(queue);
 	/* tq_flags is read, and the callback made, without the queue lock. */
 	if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0)
 		queue->tq_enqueue(queue->tq_context);
 	return (0);
 }
 
 /*
  * Task function that does nothing; used as the body of the barrier task
  * queued by gtaskqueue_drain_tq_queue().
  */
 static void
 gtaskqueue_task_nop_fn(void *context)
 {
 
 	(void)context;
 }
 
 /*
  * Block until all currently queued tasks in this taskqueue
  * have begun execution.  Tasks queued during execution of
  * this function are ignored.
  *
  * Sleeps on tq_mutex, so the caller must hold the queue lock
  * (gtaskqueue_drain_all() does).  Returns at once if nothing is queued.
  */
 static void
 gtaskqueue_drain_tq_queue(struct gtaskqueue *queue)
 {
 	struct gtask t_barrier;
 
 	if (STAILQ_EMPTY(&queue->tq_queue))
 		return;
 
 	/*
 	 * Enqueue our barrier after all current tasks, but with
 	 * the highest priority so that newly queued tasks cannot
 	 * pass it.  Because of the high priority, we can not use
 	 * taskqueue_enqueue_locked directly (which drops the lock
 	 * anyway) so just insert it at tail while we have the
 	 * queue lock.
 	 */
 	GTASK_INIT(&t_barrier, 0, USHRT_MAX, gtaskqueue_task_nop_fn, &t_barrier);
 	STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link);
 	t_barrier.ta_flags |= TASK_ENQUEUED;
 
 	/*
 	 * Once the barrier has executed, all previously queued tasks
 	 * have completed or are currently executing.
 	 *
 	 * gtaskqueue_run_locked() clears TASK_ENQUEUED when it dequeues
 	 * the barrier and calls wakeup() on it after running it.
 	 */
 	while (t_barrier.ta_flags & TASK_ENQUEUED)
 		TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0);
 }
 
 /*
  * Block until all currently executing tasks for this taskqueue
  * complete.  Tasks that begin execution during the execution
  * of this function are ignored.
  *
  * Sleeps on tq_mutex, so the caller must hold the queue lock
  * (gtaskqueue_drain_all() does).  Returns at once if nothing is active.
  */
 static void
 gtaskqueue_drain_tq_active(struct gtaskqueue *queue)
 {
 	struct gtaskqueue_busy tb_marker, *tb_first;
 
 	if (TAILQ_EMPTY(&queue->tq_active))
 		return;
 
 	/* Block gtaskqueue_terminate(), which waits for tq_callouts == 0. */
 	queue->tq_callouts++;
 
 	/*
 	 * Wait for all currently executing taskqueue threads
 	 * to go idle.
 	 *
 	 * The marker is appended behind every entry that is active now;
 	 * once it reaches the head, all of those entries are gone.
 	 */
 	tb_marker.tb_running = TB_DRAIN_WAITER;
 	TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link);
 	while (TAILQ_FIRST(&queue->tq_active) != &tb_marker)
 		TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0);
 	TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link);
 
 	/*
 	 * Wakeup any other drain waiter that happened to queue up
 	 * without any intervening active thread.
 	 */
 	tb_first = TAILQ_FIRST(&queue->tq_active);
 	if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER)
 		wakeup(tb_first);
 
 	/* Release gtaskqueue_terminate(). */
 	queue->tq_callouts--;
 	/* The queue is being freed: wake the sleeper in terminate. */
 	if ((queue->tq_flags & TQ_FLAGS_ACTIVE) == 0)
 		wakeup_one(queue->tq_threads);
 }
 
 /*
  * Mark 'queue' as blocked.  While TQ_FLAGS_BLOCKED is set,
  * grouptaskqueue_enqueue() still queues tasks but does not invoke the
  * enqueue callback.
  */
 void
 gtaskqueue_block(struct gtaskqueue *queue)
 {
 
 	TQ_LOCK(queue);
 	queue->tq_flags |= TQ_FLAGS_BLOCKED;
 	TQ_UNLOCK(queue);
 }
 
 /*
  * Clear TQ_FLAGS_BLOCKED on 'queue' and, if tasks are pending, invoke the
  * enqueue callback so they get serviced.
  */
 void
 gtaskqueue_unblock(struct gtaskqueue *queue)
 {
 
 	TQ_LOCK(queue);
 	queue->tq_flags &= ~TQ_FLAGS_BLOCKED;
 	/* Note: the callback is made with the queue lock still held. */
 	if (!STAILQ_EMPTY(&queue->tq_queue))
 		queue->tq_enqueue(queue->tq_context);
 	TQ_UNLOCK(queue);
 }
 
 /*
  * Run every task pending on 'queue'.
  *
  * Must be called with the queue lock held; the lock is dropped around
  * each task function and re-taken afterwards, so it is held again on
  * return.
  */
 static void
 gtaskqueue_run_locked(struct gtaskqueue *queue)
 {
 	struct gtaskqueue_busy tb;
 	struct gtaskqueue_busy *tb_first;
 	struct gtask *gtask;
 
 	KASSERT(queue != NULL, ("tq is NULL"));
 	TQ_ASSERT_LOCKED(queue);
 	tb.tb_running = NULL;
 
 	while (STAILQ_FIRST(&queue->tq_queue)) {
 		/* Publish a busy entry so drain and cancel can see this run. */
 		TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link);
 
 		/*
 		 * Carefully remove the first task from the queue and
 		 * clear its TASK_ENQUEUED flag
 		 */
 		gtask = STAILQ_FIRST(&queue->tq_queue);
 		KASSERT(gtask != NULL, ("task is NULL"));
 		STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link);
 		gtask->ta_flags &= ~TASK_ENQUEUED;
 		tb.tb_running = gtask;
 		TQ_UNLOCK(queue);
 
 		/* The task function runs without the queue lock. */
 		KASSERT(gtask->ta_func != NULL, ("task->ta_func is NULL"));
 		gtask->ta_func(gtask->ta_context);
 
 		TQ_LOCK(queue);
 		tb.tb_running = NULL;
 		/* Wake anyone sleeping on this task, e.g. gtaskqueue_drain(). */
 		wakeup(gtask);
 
 		TAILQ_REMOVE(&queue->tq_active, &tb, tb_link);
 		/* If a drain marker is now at the head, wake its owner. */
 		tb_first = TAILQ_FIRST(&queue->tq_active);
 		if (tb_first != NULL &&
 		    tb_first->tb_running == TB_DRAIN_WAITER)
 			wakeup(tb_first);
 	}
 }
 
 /*
  * Report whether 'gtask' is currently being executed by 'queue', i.e.
  * whether some entry on the active list names it as its running task.
  * The queue lock must be held.  Returns 1 if running, 0 otherwise.
  */
 static int
 task_is_running(struct gtaskqueue *queue, struct gtask *gtask)
 {
 	struct gtaskqueue_busy *busy;
 	int found;
 
 	TQ_ASSERT_LOCKED(queue);
 	found = 0;
 	TAILQ_FOREACH(busy, &queue->tq_active, tb_link) {
 		if (busy->tb_running == gtask) {
 			found = 1;
 			break;
 		}
 	}
 	return (found);
 }
 
 /*
  * Remove `gtask' from the pending queue if it is enqueued and clear its
  * TASK_ENQUEUED flag.  Returns EBUSY when the task is executing right now
  * (it cannot be stopped from here), 0 otherwise.
  */
 static int
 gtaskqueue_cancel_locked(struct gtaskqueue *queue, struct gtask *gtask)
 {
 
 	if ((gtask->ta_flags & TASK_ENQUEUED) != 0)
 		STAILQ_REMOVE(&queue->tq_queue, gtask, gtask, ta_link);
 	gtask->ta_flags &= ~TASK_ENQUEUED;
 	if (task_is_running(queue, gtask))
 		return (EBUSY);
 	return (0);
 }
 
 /*
  * Locked wrapper around gtaskqueue_cancel_locked(); returns its result
  * (0, or EBUSY if the task is currently running).
  */
 int
 gtaskqueue_cancel(struct gtaskqueue *queue, struct gtask *gtask)
 {
 	int rc;
 
 	TQ_LOCK(queue);
 	rc = gtaskqueue_cancel_locked(queue, gtask);
 	TQ_UNLOCK(queue);
 	return (rc);
 }
 
 /*
  * Wait until `gtask' is neither enqueued on `queue' nor being executed.
  *
  * Sleeps on the task address; gtaskqueue_run_locked() issues wakeup(gtask)
  * after each run, at which point the condition is re-evaluated.  For
  * queues without tq_spin set, WITNESS is asked to warn if the caller
  * holds locks that make sleeping unsafe.
  */
 void
 gtaskqueue_drain(struct gtaskqueue *queue, struct gtask *gtask)
 {
 
 	if (!queue->tq_spin)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
 
 	TQ_LOCK(queue);
 	while ((gtask->ta_flags & TASK_ENQUEUED) || task_is_running(queue, gtask))
 		TQ_SLEEP(queue, gtask, &queue->tq_mutex, PWAIT, "-", 0);
 	TQ_UNLOCK(queue);
 }
 
 /*
  * Drain the whole queue: with the queue lock held, first drain the pending
  * list (gtaskqueue_drain_tq_queue()) and then the active list
  * (gtaskqueue_drain_tq_active()).  For queues without tq_spin set, WITNESS
  * is asked to warn if the caller may not sleep.
  */
 void
 gtaskqueue_drain_all(struct gtaskqueue *queue)
 {
 
 	if (!queue->tq_spin)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
 
 	TQ_LOCK(queue);
 	gtaskqueue_drain_tq_queue(queue);
 	gtaskqueue_drain_tq_active(queue);
 	TQ_UNLOCK(queue);
 }
 
 /*
  * Create and start `count' kernel threads servicing the queue *tqp.
  *
  * The thread name is formatted from `name'/`ap'; when more than one thread
  * is requested an "_<index>" suffix is appended.  Threads are created
  * stopped (RFSTOPPED), optionally given the CPU set `mask', assigned
  * priority `pri', and only then handed to the scheduler.
  *
  * Returns EINVAL if count <= 0, ENOMEM if the thread array cannot be
  * allocated, and 0 otherwise.  A kthread_add() failure for an individual
  * thread is logged and that slot is skipped; it does not fail the call.
  */
 static int
 _gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri,
     cpuset_t *mask, const char *name, va_list ap)
 {
 	char ktname[MAXCOMLEN + 1];
 	struct thread *td;
 	struct gtaskqueue *tq;
 	int i, error;
 
 	if (count <= 0)
 		return (EINVAL);
 
 	vsnprintf(ktname, sizeof(ktname), name, ap);
 	tq = *tqp;
 
 	tq->tq_threads = malloc(sizeof(struct thread *) * count, M_GTASKQUEUE,
 	    M_NOWAIT | M_ZERO);
 	if (tq->tq_threads == NULL) {
 		printf("%s: no memory for %s threads\n", __func__, ktname);
 		return (ENOMEM);
 	}
 
 	/* First pass: create the threads, still stopped. */
 	for (i = 0; i < count; i++) {
 		if (count == 1)
 			error = kthread_add(gtaskqueue_thread_loop, tqp, NULL,
 			    &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname);
 		else
 			error = kthread_add(gtaskqueue_thread_loop, tqp, NULL,
 			    &tq->tq_threads[i], RFSTOPPED, 0,
 			    "%s_%d", ktname, i);
 		if (error) {
 			/* should be ok to continue, taskqueue_free will dtrt */
 			/* Terminate the message so it does not run into the next one. */
 			printf("%s: kthread_add(%s): error %d\n", __func__,
 			    ktname, error);
 			tq->tq_threads[i] = NULL;		/* paranoid */
 		} else
 			tq->tq_tcount++;
 	}
 	/* Second pass: pin (optional), set priority and make runnable. */
 	for (i = 0; i < count; i++) {
 		if (tq->tq_threads[i] == NULL)
 			continue;
 		td = tq->tq_threads[i];
 		if (mask) {
 			error = cpuset_setthread(td->td_tid, mask);
 			/*
 			 * Failing to pin is rarely an actual fatal error;
 			 * it'll just affect performance.
 			 */
 			if (error)
 				printf("%s: curthread=%llu: can't pin; "
 				    "error=%d\n",
 				    __func__,
 				    (unsigned long long) td->td_tid,
 				    error);
 		}
 		thread_lock(td);
 		sched_prio(td, pri);
 		sched_add(td, SRQ_BORING);
 		thread_unlock(td);
 	}
 
 	return (0);
 }
 
 /*
  * Variadic front end for _gtaskqueue_start_threads(): packages the
  * printf-style name arguments into a va_list and starts the threads with
  * no CPU mask.  Returns the helper's result.
  */
 static int
 gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri,
     const char *name, ...)
 {
 	va_list args;
 	int rc;
 
 	va_start(args, name);
 	rc = _gtaskqueue_start_threads(tqp, count, pri, NULL, name, args);
 	va_end(args);
 	return (rc);
 }
 
 static inline void
 gtaskqueue_run_callback(struct gtaskqueue *tq,
     enum taskqueue_callback_type cb_type)
 {
 	taskqueue_callback_fn tq_callback;
 
 	TQ_ASSERT_UNLOCKED(tq);
 	tq_callback = tq->tq_callbacks[cb_type];
 	if (tq_callback != NULL)
 		tq_callback(tq->tq_cb_contexts[cb_type]);
 }
 
 /*
  * Body of a queue worker thread.  `arg' is a pointer to the queue pointer.
  *
  * Runs the INIT callback, then loops running pending tasks and sleeping on
  * the queue address until TQ_FLAGS_ACTIVE is cleared.  On the way out it
  * runs any remaining tasks, calls the SHUTDOWN callback with the lock
  * dropped, decrements tq_tcount, wakes one sleeper on tq_threads and exits.
  */
 static void
 gtaskqueue_thread_loop(void *arg)
 {
 	struct gtaskqueue **tqp, *tq;
 
 	tqp = arg;
 	tq = *tqp;
 	gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT);
 	TQ_LOCK(tq);
 	while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) {
 		/* XXX ? */
 		gtaskqueue_run_locked(tq);
 		/*
 		 * Because taskqueue_run() can drop tq_mutex, we need to
 		 * check if the TQ_FLAGS_ACTIVE flag wasn't removed in the
 		 * meantime, which means we missed a wakeup.
 		 */
 		if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0)
 			break;
 		TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0);
 	}
 	/* Final pass: run whatever is still pending before exiting. */
 	gtaskqueue_run_locked(tq);
 	/*
 	 * This thread is on its way out, so just drop the lock temporarily
 	 * in order to call the shutdown callback.  This allows the callback
 	 * to look at the taskqueue, even just before it dies.
 	 */
 	TQ_UNLOCK(tq);
 	gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN);
 	TQ_LOCK(tq);
 
 	/* rendezvous with thread that asked us to terminate */
 	tq->tq_tcount--;
 	wakeup_one(tq->tq_threads);
 	TQ_UNLOCK(tq);
 	kthread_exit();
 }
 
 /*
  * Enqueue hook for thread-backed queues: `context' points at the queue
  * pointer; wake one thread sleeping on the queue address.
  */
 static void
 gtaskqueue_thread_enqueue(void *context)
 {
 	struct gtaskqueue **tqp = context;
 
 	wakeup_one(*tqp);
 }
 
 
 /*
  * Create a queue via _gtaskqueue_create(), passing MTX_SPIN as the mutex
  * type and "fast_taskqueue" as the mutex name.
  */
 static struct gtaskqueue *
 gtaskqueue_create_fast(const char *name, int mflags,
     taskqueue_enqueue_fn enqueue, void *context)
 {
 	struct gtaskqueue *tq;
 
 	tq = _gtaskqueue_create(name, mflags, enqueue, context, MTX_SPIN,
 	    "fast_taskqueue");
 	return (tq);
 }
 
 
 /* State for one queue slot of a task queue group. */
 struct taskqgroup_cpu {
 	LIST_HEAD(, grouptask)	tgc_tasks;	/* Tasks attached to this queue. */
 	struct gtaskqueue	*tgc_taskq;	/* Queue backing this slot. */
 	int	tgc_cnt;	/* Number of tasks attached to this queue. */
 	int	tgc_cpu;	/* CPU id assigned when the slot is created. */
 };
 
 /* A group of task queues over which grouptasks are distributed. */
 struct taskqgroup {
 	struct taskqgroup_cpu tqg_queue[MAXCPU];	/* Queue slots; first tqg_cnt in use. */
 	struct mtx	tqg_lock;	/* Held around slot/task-list updates. */
 	char *		tqg_name;	/* Name prefix for queue threads. */
 	int		tqg_adjusting;	/* Nonzero while _taskqgroup_adjust() runs. */
 	int		tqg_stride;	/* Stride given to the last adjust. */
 	int		tqg_cnt;	/* Number of queue slots in use. */
 };
 
 /* One-shot task used by taskqgroup_bind() to bind a queue thread to a CPU. */
 struct taskq_bind_task {
 	struct gtask bt_task;	/* Task enqueued on the target queue. */
 	int	bt_cpuid;	/* CPU the running thread binds itself to. */
 };
 
 /*
  * Initialise queue slot `idx' of `qgroup': empty task list, a new fast
  * queue with one thread named "<group name>_<idx>" at PI_SOFT priority,
  * and `cpu' recorded as the slot's CPU.
  */
 static void
 taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
 {
 	struct taskqgroup_cpu *qcpu;
 
 	qcpu = &qgroup->tqg_queue[idx];
 	LIST_INIT(&qcpu->tgc_tasks);
 	/*
 	 * NOTE(review): the enqueue hook is taskqueue_thread_enqueue although
 	 * this file defines gtaskqueue_thread_enqueue -- confirm intended.
 	 */
 	qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK,
 	    taskqueue_thread_enqueue, &qcpu->tgc_taskq);
 	/* NOTE(review): the start_threads return value is ignored here. */
 	gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
 	    "%s_%d", qgroup->tqg_name, idx);
 	qcpu->tgc_cpu = cpu;
 }
 
 /* Free the queue backing slot `idx' of `qgroup'. */
 static void
 taskqgroup_cpu_remove(struct taskqgroup *qgroup, int idx)
 {
 
 	gtaskqueue_free(qgroup->tqg_queue[idx].tgc_taskq);
 }
 
 /*
  * Find the taskq with least # of tasks that doesn't currently have any
  * other queues from the uniq identifier.
  */
 static int
 taskqgroup_find(struct taskqgroup *qgroup, void *uniq)
 {
 	struct grouptask *n;
 	int i, idx, mincnt;
 	int strict;
 
 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
 	if (qgroup->tqg_cnt == 0)
 		return (0);
 	idx = -1;
 	mincnt = INT_MAX;
 	/*
 	 * Two passes;  First scan for a queue with the least tasks that
 	 * does not already service this uniq id.  If that fails simply find
 	 * the queue with the least total tasks;
 	 */
 	for (strict = 1; mincnt == INT_MAX; strict = 0) {
 		for (i = 0; i < qgroup->tqg_cnt; i++) {
 			if (qgroup->tqg_queue[i].tgc_cnt > mincnt)
 				continue;
 			if (strict) {
 				LIST_FOREACH(n,
 				    &qgroup->tqg_queue[i].tgc_tasks, gt_list)
 					if (n->gt_uniq == uniq)
 						break;
 				if (n != NULL)
 					continue;
 			}
 			mincnt = qgroup->tqg_queue[i].tgc_cnt;
 			idx = i;
 		}
 	}
 	if (idx == -1)
 		panic("taskqgroup_find: Failed to pick a qid.");
 
 	return (idx);
 }
 
 /*
  * smp_started is unusable since it is not set for UP kernels or even for
  * SMP kernels when there is 1 CPU.  This is usually handled by adding a
  * (mp_ncpus == 1) test, but that would be broken here since we need to
  * synchronize with the SI_SUB_SMP ordering.  Even in the pure SMP case
  * smp_started only gives a fuzzy ordering relative to SI_SUB_SMP.
  *
  * So maintain our own flag.  It must be set after all CPUs are started
  * and before SI_SUB_SMP:SI_ORDER_ANY so that the SYSINIT for delayed
  * adjustment is properly delayed.  SI_ORDER_FOURTH is clearly before
  * SI_ORDER_ANY and unclearly after the CPUs are started.  It would be
  * simpler for adjustment to pass a flag indicating if it is delayed.
  */ 
 
 /* Set to 1 by the SYSINIT below; tested before IRQ affinity is applied. */
 static int tqg_smp_started;
 
 /* SYSINIT hook: record that the SI_SUB_SMP/SI_ORDER_FOURTH point was reached. */
 static void
 tqg_record_smp_started(void *arg)
 {
 	tqg_smp_started = 1;
 }
 
 SYSINIT(tqg_record_smp_started, SI_SUB_SMP, SI_ORDER_FOURTH,
 	tqg_record_smp_started, NULL);
 
 /*
  * Attach `gtask' to a queue of `qgroup' chosen by taskqgroup_find().
  *
  * Records uniq/name/irq in the task and leaves gt_cpu at -1 unless an IRQ
  * was given (irq != -1) and tqg_smp_started is set; in that case gt_cpu
  * becomes the chosen queue's CPU and the IRQ's affinity is set to that
  * single CPU after the group lock has been dropped.
  */
 void
 taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
     void *uniq, int irq, char *name)
 {
 	cpuset_t mask;
 	int qid;
 
 	gtask->gt_uniq = uniq;
 	gtask->gt_name = name;
 	gtask->gt_irq = irq;
 	gtask->gt_cpu = -1;
 	mtx_lock(&qgroup->tqg_lock);
 	qid = taskqgroup_find(qgroup, uniq);
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	if (irq != -1 && tqg_smp_started) {
 		gtask->gt_cpu = qgroup->tqg_queue[qid].tgc_cpu;
 		CPU_ZERO(&mask);
 		CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask);
 		mtx_unlock(&qgroup->tqg_lock);
-		intr_setaffinity(irq, &mask);
+		intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
 	} else
 		mtx_unlock(&qgroup->tqg_lock);
 }
 
 /*
  * Re-attach a task whose uniq/irq fields are already filled in (used by
  * _taskqgroup_adjust()).  A queue is chosen with taskqgroup_find(); if the
  * task has an IRQ, the group lock is dropped while the IRQ affinity is
  * set to the queue's CPU and re-taken before the task is linked in.
  */
 static void
 taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	cpuset_t mask;
 	int qid, cpu;
 
 	mtx_lock(&qgroup->tqg_lock);
 	qid = taskqgroup_find(qgroup, gtask->gt_uniq);
 	cpu = qgroup->tqg_queue[qid].tgc_cpu;
 	if (gtask->gt_irq != -1) {
 		/* Affinity is set with the group lock released. */
 		mtx_unlock(&qgroup->tqg_lock);
 
 		CPU_ZERO(&mask);
 		CPU_SET(cpu, &mask);
-		intr_setaffinity(gtask->gt_irq, &mask);
+		intr_setaffinity(gtask->gt_irq, CPU_WHICH_IRQ, &mask);
 
 		mtx_lock(&qgroup->tqg_lock);
 	}
 	qgroup->tqg_queue[qid].tgc_cnt++;
 
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask,
 			 gt_list);
 	MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	mtx_unlock(&qgroup->tqg_lock);
 }
 
 /*
  * Attach `gtask' to the queue of `qgroup' that is assigned to `cpu'.
  *
  * Once tqg_smp_started is set, the slot whose tgc_cpu equals `cpu' is
  * used and EINVAL is returned if there is none; before that, slot 0 is
  * used unconditionally.  If an IRQ was given and tqg_smp_started is set,
  * its affinity is set to the selected slot's CPU after the lock is
  * dropped.  Returns 0 on success.
  */
 int
 taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask,
 	void *uniq, int cpu, int irq, char *name)
 {
 	cpuset_t mask;
 	int i, qid;
 
 	qid = -1;
 	gtask->gt_uniq = uniq;
 	gtask->gt_name = name;
 	gtask->gt_irq = irq;
 	gtask->gt_cpu = cpu;
 	mtx_lock(&qgroup->tqg_lock);
 	if (tqg_smp_started) {
 		for (i = 0; i < qgroup->tqg_cnt; i++)
 			if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
 				qid = i;
 				break;
 			}
 		if (qid == -1) {
 			mtx_unlock(&qgroup->tqg_lock);
 			return (EINVAL);
 		}
 	} else
 		qid = 0;
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	/* From here on `cpu' is the CPU of the slot actually selected. */
 	cpu = qgroup->tqg_queue[qid].tgc_cpu;
 	mtx_unlock(&qgroup->tqg_lock);
 
 	CPU_ZERO(&mask);
 	CPU_SET(cpu, &mask);
 	if (irq != -1 && tqg_smp_started)
-		intr_setaffinity(irq, &mask);
+		intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
 	return (0);
 }
 
 /*
  * Re-attach a task that asked for a specific CPU (gt_cpu), using the irq
  * and cpu already stored in the task.  Requires tqg_smp_started.  Returns
  * EINVAL if no slot is assigned to that CPU, 0 otherwise; when the task
  * has an IRQ, its affinity is set to the CPU after the lock is dropped.
  */
 static int
 taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	cpuset_t mask;
 	int i, qid, irq, cpu;
 
 	qid = -1;
 	irq = gtask->gt_irq;
 	cpu = gtask->gt_cpu;
 	MPASS(tqg_smp_started);
 	mtx_lock(&qgroup->tqg_lock);
 	for (i = 0; i < qgroup->tqg_cnt; i++)
 		if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
 			qid = i;
 			break;
 		}
 	if (qid == -1) {
 		mtx_unlock(&qgroup->tqg_lock);
 		return (EINVAL);
 	}
 	qgroup->tqg_queue[qid].tgc_cnt++;
 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
 	MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
 	mtx_unlock(&qgroup->tqg_lock);
 
 	CPU_ZERO(&mask);
 	CPU_SET(cpu, &mask);
 
 	if (irq != -1)
-		intr_setaffinity(irq, &mask);
+		intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
 	return (0);
 }
 
 /*
  * Detach `gtask' from `qgroup': find the slot whose queue the task is
  * attached to, drop that slot's task count, unlink the task and clear its
  * queue pointer.  Panics if no slot of the group owns the task's queue.
  */
 void
 taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask)
 {
 	int qid;
 
 	mtx_lock(&qgroup->tqg_lock);
 	qid = 0;
 	while (qid < qgroup->tqg_cnt &&
 	    qgroup->tqg_queue[qid].tgc_taskq != gtask->gt_taskqueue)
 		qid++;
 	if (qid == qgroup->tqg_cnt)
 		panic("taskqgroup_detach: task not in group\n");
 	qgroup->tqg_queue[qid].tgc_cnt--;
 	LIST_REMOVE(gtask, gt_list);
 	mtx_unlock(&qgroup->tqg_lock);
 	gtask->gt_taskqueue = NULL;
 }
 
 static void
 taskqgroup_binder(void *ctx)
 {
 	struct taskq_bind_task *gtask = (struct taskq_bind_task *)ctx;
 	cpuset_t mask;
 	int error;
 
 	CPU_ZERO(&mask);
 	CPU_SET(gtask->bt_cpuid, &mask);
 	error = cpuset_setthread(curthread->td_tid, &mask);
 	thread_lock(curthread);
 	sched_bind(curthread, gtask->bt_cpuid);
 	thread_unlock(curthread);
 
 	if (error)
 		printf("taskqgroup_binder: setaffinity failed: %d\n",
 		    error);
 	free(gtask, M_DEVBUF);
 }
 
 static void
 taskqgroup_bind(struct taskqgroup *qgroup)
 {
 	struct taskq_bind_task *gtask;
 	int i;
 
 	/*
 	 * Bind taskqueue threads to specific CPUs, if they have been assigned
 	 * one.
 	 */
 	if (qgroup->tqg_cnt == 1)
 		return;
 
 	for (i = 0; i < qgroup->tqg_cnt; i++) {
 		gtask = malloc(sizeof (*gtask), M_DEVBUF, M_WAITOK);
 		GTASK_INIT(&gtask->bt_task, 0, 0, taskqgroup_binder, gtask);
 		gtask->bt_cpuid = qgroup->tqg_queue[i].tgc_cpu;
 		grouptaskqueue_enqueue(qgroup->tqg_queue[i].tgc_taskq,
 		    &gtask->bt_task);
 	}
 }
 
 /*
  * Resize `qgroup' to `cnt' queues spaced `stride' CPUs apart and
  * redistribute all attached tasks over the new set of queues.
  *
  * Locking: entered with tqg_lock held and returns with it held on every
  * path, including the early error returns.  The lock is dropped and
  * re-taken several times in between; tqg_adjusting guards against a
  * concurrent adjust while it is dropped.
  *
  * Returns EINVAL for an invalid cnt/stride or if tqg_smp_started is not
  * set, EBUSY if an adjust is already in progress, and 0 on success.
  */
 static int
 _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 {
 	LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
 	struct grouptask *gtask;
 	int i, k, old_cnt, old_cpu, cpu;
 
 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
 
 	if (cnt < 1 || cnt * stride > mp_ncpus || !tqg_smp_started) {
 		printf("%s: failed cnt: %d stride: %d "
 		    "mp_ncpus: %d tqg_smp_started: %d\n",
 		    __func__, cnt, stride, mp_ncpus, tqg_smp_started);
 		return (EINVAL);
 	}
 	if (qgroup->tqg_adjusting) {
 		printf("taskqgroup_adjust failed: adjusting\n");
 		return (EBUSY);
 	}
 	qgroup->tqg_adjusting = 1;
 	old_cnt = qgroup->tqg_cnt;
 	old_cpu = 0;
 	/*
 	 * NOTE(review): this reads tgc_cpu of slot old_cnt, i.e. the first
 	 * slot that is about to be created -- confirm this is intended.
 	 */
 	if (old_cnt < cnt)
 		old_cpu = qgroup->tqg_queue[old_cnt].tgc_cpu;
 	mtx_unlock(&qgroup->tqg_lock);
 	/*
 	 * Set up queue for tasks added before boot.
 	 */
 	if (old_cnt == 0) {
 		/* Park the tasks from slot 0 on the local list (lock not held). */
 		LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
 		    grouptask, gt_list);
 		qgroup->tqg_queue[0].tgc_cnt = 0;
 	}
 
 	/*
 	 * If new taskq threads have been added.
 	 */
 	cpu = old_cpu;
 	for (i = old_cnt; i < cnt; i++) {
 		taskqgroup_cpu_create(qgroup, i, cpu);
 
 		/* Advance `stride' CPUs for the next slot. */
 		for (k = 0; k < stride; k++)
 			cpu = CPU_NEXT(cpu);
 	}
 	mtx_lock(&qgroup->tqg_lock);
 	qgroup->tqg_cnt = cnt;
 	qgroup->tqg_stride = stride;
 
 	/*
 	 * Adjust drivers to use new taskqs.
 	 */
 	for (i = 0; i < old_cnt; i++) {
 		while ((gtask = LIST_FIRST(&qgroup->tqg_queue[i].tgc_tasks))) {
 			LIST_REMOVE(gtask, gt_list);
 			qgroup->tqg_queue[i].tgc_cnt--;
 			LIST_INSERT_HEAD(&gtask_head, gtask, gt_list);
 		}
 	}
 	mtx_unlock(&qgroup->tqg_lock);
 
 	/*
 	 * Re-attach every collected task.  Tasks with a CPU request fall
 	 * back to the unconstrained path if that CPU has no queue.
 	 */
 	while ((gtask = LIST_FIRST(&gtask_head))) {
 		LIST_REMOVE(gtask, gt_list);
 		if (gtask->gt_cpu == -1)
 			taskqgroup_attach_deferred(qgroup, gtask);
 		else if (taskqgroup_attach_cpu_deferred(qgroup, gtask))
 			taskqgroup_attach_deferred(qgroup, gtask);
 	}
 
 #ifdef INVARIANTS
 	mtx_lock(&qgroup->tqg_lock);
 	for (i = 0; i < qgroup->tqg_cnt; i++) {
 		MPASS(qgroup->tqg_queue[i].tgc_taskq != NULL);
 		LIST_FOREACH(gtask, &qgroup->tqg_queue[i].tgc_tasks, gt_list)
 			MPASS(gtask->gt_taskqueue != NULL);
 	}
 	mtx_unlock(&qgroup->tqg_lock);
 #endif
 	/*
 	 * If taskq thread count has been reduced.
 	 */
 	for (i = cnt; i < old_cnt; i++)
 		taskqgroup_cpu_remove(qgroup, i);
 
 	taskqgroup_bind(qgroup);
 
 	/* Re-take the lock: the caller expects it held on return. */
 	mtx_lock(&qgroup->tqg_lock);
 	qgroup->tqg_adjusting = 0;
 
 	return (0);
 }
 
 /*
  * Public entry point for resizing a task queue group: takes the group
  * lock around _taskqgroup_adjust() and returns its result.
  */
 int
 taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
 {
 	int rc;
 
 	mtx_lock(&qgroup->tqg_lock);
 	rc = _taskqgroup_adjust(qgroup, cnt, stride);
 	mtx_unlock(&qgroup->tqg_lock);
 	return (rc);
 }
 
 /*
  * Allocate a zeroed task queue group named `name' (the pointer is stored,
  * not copied), initialise its lock and the task list of slot 0, and
  * return it.  No queues are created here.
  */
 struct taskqgroup *
 taskqgroup_create(char *name)
 {
 	struct taskqgroup *grp;
 
 	grp = malloc(sizeof(*grp), M_GTASKQUEUE, M_WAITOK | M_ZERO);
 	grp->tqg_name = name;
 	LIST_INIT(&grp->tqg_queue[0].tgc_tasks);
 	mtx_init(&grp->tqg_lock, "taskqgroup", NULL, MTX_DEF);
 
 	return (grp);
 }
 
 /*
  * Currently a no-op: nothing allocated by taskqgroup_create() is released
  * here.
  */
 void
 taskqgroup_destroy(struct taskqgroup *qgroup)
 {
 
 }
Index: head/sys/sys/cpuset.h
===================================================================
--- head/sys/sys/cpuset.h	(revision 317755)
+++ head/sys/sys/cpuset.h	(revision 317756)
@@ -1,153 +1,155 @@
 /*-
  * Copyright (c) 2008,	Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Copyright (c) 2008 Nokia Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_CPUSET_H_
 #define	_SYS_CPUSET_H_
 
 #include <sys/_cpuset.h>
 
 #include <sys/bitset.h>
 
 /* Bitset word geometry used for cpuset_t. */
 #define	_NCPUBITS	_BITSET_BITS
 #define	_NCPUWORDS	__bitset_words(CPU_SETSIZE)
 
 #define	CPUSETBUFSIZ	((2 + sizeof(long) * 2) * _NCPUWORDS)
 
 /*
  * cpuset_t operations.  Each CPU_* macro forwards to the BIT_* macro of
  * the same name with CPU_SETSIZE as the set size.
  */
 #define	CPU_CLR(n, p)			BIT_CLR(CPU_SETSIZE, n, p)
 #define	CPU_COPY(f, t)			BIT_COPY(CPU_SETSIZE, f, t)
 #define	CPU_ISSET(n, p)			BIT_ISSET(CPU_SETSIZE, n, p)
 #define	CPU_SET(n, p)			BIT_SET(CPU_SETSIZE, n, p)
 #define	CPU_ZERO(p) 			BIT_ZERO(CPU_SETSIZE, p)
 #define	CPU_FILL(p) 			BIT_FILL(CPU_SETSIZE, p)
 #define	CPU_SETOF(n, p)			BIT_SETOF(CPU_SETSIZE, n, p)
 #define	CPU_EMPTY(p)			BIT_EMPTY(CPU_SETSIZE, p)
 #define	CPU_ISFULLSET(p)		BIT_ISFULLSET(CPU_SETSIZE, p)
 #define	CPU_SUBSET(p, c)		BIT_SUBSET(CPU_SETSIZE, p, c)
 #define	CPU_OVERLAP(p, c)		BIT_OVERLAP(CPU_SETSIZE, p, c)
 #define	CPU_CMP(p, c)			BIT_CMP(CPU_SETSIZE, p, c)
 #define	CPU_OR(d, s)			BIT_OR(CPU_SETSIZE, d, s)
 #define	CPU_AND(d, s)			BIT_AND(CPU_SETSIZE, d, s)
 #define	CPU_NAND(d, s)			BIT_NAND(CPU_SETSIZE, d, s)
 #define	CPU_CLR_ATOMIC(n, p)		BIT_CLR_ATOMIC(CPU_SETSIZE, n, p)
 #define	CPU_SET_ATOMIC(n, p)		BIT_SET_ATOMIC(CPU_SETSIZE, n, p)
 #define	CPU_SET_ATOMIC_ACQ(n, p)	BIT_SET_ATOMIC_ACQ(CPU_SETSIZE, n, p)
 #define	CPU_AND_ATOMIC(n, p)		BIT_AND_ATOMIC(CPU_SETSIZE, n, p)
 #define	CPU_OR_ATOMIC(d, s)		BIT_OR_ATOMIC(CPU_SETSIZE, d, s)
 #define	CPU_COPY_STORE_REL(f, t)	BIT_COPY_STORE_REL(CPU_SETSIZE, f, t)
 #define	CPU_FFS(p)			BIT_FFS(CPU_SETSIZE, p)
 #define	CPU_COUNT(p)			BIT_COUNT(CPU_SETSIZE, p)
 #define	CPUSET_FSET			BITSET_FSET(_NCPUWORDS)
 #define	CPUSET_T_INITIALIZER		BITSET_T_INITIALIZER
 
 /*
  * Valid cpulevel_t values.  In the comments below "which" presumably
  * refers to the object selected by a cpuwhich_t/id pair -- confirm.
  */
 #define	CPU_LEVEL_ROOT		1	/* All system cpus. */
 #define	CPU_LEVEL_CPUSET	2	/* Available cpus for which. */
 #define	CPU_LEVEL_WHICH		3	/* Actual mask/id for which. */
 
 /*
  * Valid cpuwhich_t values.  Each value states how the accompanying id is
  * to be interpreted.
  */
 #define	CPU_WHICH_TID		1	/* Specifies a thread id. */
 #define	CPU_WHICH_PID		2	/* Specifies a process id. */
 #define	CPU_WHICH_CPUSET	3	/* Specifies a set id. */
 #define	CPU_WHICH_IRQ		4	/* Specifies an irq #. */
 #define	CPU_WHICH_JAIL		5	/* Specifies a jail id. */
 #define	CPU_WHICH_DOMAIN	6	/* Specifies a NUMA domain id. */
+#define	CPU_WHICH_INTRHANDLER	7	/* Specifies an irq # (not ithread). */
+#define	CPU_WHICH_ITHREAD	8	/* Specifies an irq's ithread. */
 
 /*
  * Reserved cpuset identifiers.
  */
 #define	CPUSET_INVALID	-1
 #define	CPUSET_DEFAULT	0
 
 #ifdef _KERNEL
 #include <sys/queue.h>
 
 LIST_HEAD(setlist, cpuset);
 
 /*
  * cpusets encapsulate cpu binding information for one or more threads.
  *
  * Field annotations used below:
  * 	a - Accessed with atomics.
  *	s - Set at creation, never modified.  Only a ref required to read.
  *	c - Locked internally by a cpuset lock.
  *
  * The bitmask (cs_mask) is only modified while holding the cpuset lock.
  * It may be read while only a reference is held but the consumer must be
  * prepared to deal with inconsistent results.
  */
 struct cpuset {
 	cpuset_t		cs_mask;	/* bitmask of valid cpus. */
 	volatile u_int		cs_ref;		/* (a) Reference count. */
 	int			cs_flags;	/* (s) Flags from below. */
 	cpusetid_t		cs_id;		/* (s) Id or INVALID. */
 	struct cpuset		*cs_parent;	/* (s) Pointer to our parent. */
 	LIST_ENTRY(cpuset)	cs_link;	/* (c) All identified sets. */
 	LIST_ENTRY(cpuset)	cs_siblings;	/* (c) Sibling set link. */
 	struct setlist		cs_children;	/* (c) List of children. */
 };
 
 /* Values for cs_flags. */
 #define CPU_SET_ROOT    0x0001  /* Set is a root set. */
 #define CPU_SET_RDONLY  0x0002  /* No modification allowed. */
 
 extern cpuset_t *cpuset_root;
 struct prison;
 struct proc;
 struct thread;
 
 /* Kernel-only cpuset interfaces. */
 struct cpuset *cpuset_thread0(void);
 struct cpuset *cpuset_ref(struct cpuset *);
 void	cpuset_rel(struct cpuset *);
 int	cpuset_setthread(lwpid_t id, cpuset_t *);
 int	cpuset_setithread(lwpid_t id, int cpu);
 int	cpuset_create_root(struct prison *, struct cpuset **);
 int	cpuset_setproc_update_set(struct proc *, struct cpuset *);
 int	cpuset_which(cpuwhich_t, id_t, struct proc **,
 	    struct thread **, struct cpuset **);
 
 /* Conversion between a cpuset_t and its string form. */
 char	*cpusetobj_strprint(char *, const cpuset_t *);
 int	cpusetobj_strscan(cpuset_t *, const char *);
 #ifdef DDB
 void	ddb_display_cpuset(const cpuset_t *);
 #endif
 
 #else
 /* Prototypes exposed when _KERNEL is not defined. */
 __BEGIN_DECLS
 int	cpuset(cpusetid_t *);
 int	cpuset_setid(cpuwhich_t, id_t, cpusetid_t);
 int	cpuset_getid(cpulevel_t, cpuwhich_t, id_t, cpusetid_t *);
 int	cpuset_getaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, cpuset_t *);
 int	cpuset_setaffinity(cpulevel_t, cpuwhich_t, id_t, size_t, const cpuset_t *);
 __END_DECLS
 #endif
 #endif /* !_SYS_CPUSET_H_ */
Index: head/sys/sys/interrupt.h
===================================================================
--- head/sys/sys/interrupt.h	(revision 317755)
+++ head/sys/sys/interrupt.h	(revision 317756)
@@ -1,186 +1,188 @@
 /*-
  * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_INTERRUPT_H_
 #define _SYS_INTERRUPT_H_
 
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 
 struct intr_event;
 struct intr_thread;
 struct trapframe;
 
 /*
  * Describe a hardware interrupt handler.
  *
  * Multiple interrupt handlers for a specific event can be chained
  * together.
  */
 struct intr_handler {
 	driver_filter_t	*ih_filter;	/* Filter handler function. */
 	driver_intr_t	*ih_handler;	/* Threaded handler function. */
 	void		*ih_argument;	/* Argument to pass to handlers. */
 	int		 ih_flags;	/* IH_* flags, see below. */
 	char		 ih_name[MAXCOMLEN + 1]; /* Name of handler. */
 	struct intr_event *ih_event;	/* Event we are connected to. */
 	int		 ih_need;	/* Needs service. */
 	TAILQ_ENTRY(intr_handler) ih_next; /* Next handler for this event. */
 	u_char		 ih_pri;	/* Priority of this handler. */
 	struct intr_thread *ih_thread;	/* Ithread for filtered handler. */
 };
 
 /* Interrupt handler flags kept in ih_flags (struct intr_handler). */
 #define	IH_EXCLUSIVE	0x00000002	/* Exclusive interrupt. */
 #define	IH_ENTROPY	0x00000004	/* Device is a good entropy source. */
 #define	IH_DEAD		0x00000008	/* Handler should be removed. */
 #define	IH_MPSAFE	0x80000000	/* Handler does not need Giant. */
 
 /*
  * Describe an interrupt event.  An event holds a list of handlers.
  * The 'pre_ithread', 'post_ithread', 'post_filter', and 'assign_cpu'
  * hooks are used to invoke MD code for certain operations.
  *
  * The 'pre_ithread' hook is called when an interrupt thread for
  * handlers without filters is scheduled.  It is responsible for
  * ensuring that 1) the system won't be swamped with an interrupt
  * storm from the associated source while the ithread runs and 2) the
  * current CPU is able to receive interrupts from other interrupt
  * sources.  The first is usually accomplished by disabling
  * level-triggered interrupts until the ithread completes.  The second
  * is accomplished on some platforms by acknowledging the interrupt
  * via an EOI.
  *
  * The 'post_ithread' hook is invoked when an ithread finishes.  It is
  * responsible for ensuring that the associated interrupt source will
  * trigger an interrupt when it is asserted in the future.  Usually
  * this is implemented by enabling a level-triggered interrupt that
  * was previously disabled via the 'pre_ithread' hook.
  *
  * The 'post_filter' hook is invoked when a filter handles an
  * interrupt.  It is responsible for ensuring that the current CPU is
  * able to receive interrupts again.  On some platforms this is done
  * by acknowledging the interrupts via an EOI.
  *
  * The 'assign_cpu' hook is used to bind an interrupt source to a
  * specific CPU.  If the interrupt cannot be bound, this function may
  * return an error.
  *
  * Note that device drivers may also use interrupt events to manage
  * multiplexing an interrupt handler into handlers for child
  * devices.  In that case, the above hooks are not used.  The device
  * can create an event for its interrupt resource and register child
  * event handlers with that event.  It can then use
  * intr_event_execute_handlers() to execute non-filter handlers.
  * Currently filter handlers are not supported by this, but that can
  * be added by splitting out the filter loop from intr_event_handle()
  * if desired.
  */
 struct intr_event {
 	TAILQ_ENTRY(intr_event) ie_list;	/* Linkage on a list of events. */
 	TAILQ_HEAD(, intr_handler) ie_handlers; /* Interrupt handlers. */
 	char		ie_name[MAXCOMLEN + 1]; /* Individual event name. */
 	char		ie_fullname[MAXCOMLEN + 1];
 	struct mtx	ie_lock;
 	void		*ie_source;	/* Cookie used by MD code. */
 	struct intr_thread *ie_thread;	/* Thread we are connected to. */
 	void		(*ie_pre_ithread)(void *);	/* 'pre_ithread' hook. */
 	void		(*ie_post_ithread)(void *);	/* 'post_ithread' hook. */
 	void		(*ie_post_filter)(void *);	/* 'post_filter' hook. */
 	int		(*ie_assign_cpu)(void *, int);	/* 'assign_cpu' hook. */
 	int		ie_flags;	/* IE_* flags, see below. */
 	int		ie_count;	/* Loop counter. */
 	int		ie_warncnt;	/* Rate-check interrupt storm warns. */
 	struct timeval	ie_warntm;
 	int		ie_irq;		/* Physical irq number if !SOFT. */
 	int		ie_cpu;		/* CPU this event is bound to. */
 };
 
 /* Interrupt event flags kept in ie_flags. */
 #define	IE_SOFT		0x000001	/* Software interrupt. */
 #define	IE_ENTROPY	0x000002	/* Interrupt is an entropy source. */
 #define	IE_ADDING_THREAD 0x000004	/* Currently building an ithread. */
 
 /* Flags to pass to sched_swi. */
 #define	SWI_DELAY	0x2
 
 /*
  * Software interrupt numbers in priority order.  The priority determines
  * the priority of the corresponding interrupt thread.
  * Note that SWI_TQ and SWI_TQ_GIANT share the same number.
  */
 #define	SWI_TTY		0
 #define	SWI_NET		1
 #define	SWI_CAMBIO	2
 #define	SWI_VM		3
 #define	SWI_CLOCK	4
 #define	SWI_TQ_FAST	5
 #define	SWI_TQ		6
 #define	SWI_TQ_GIANT	6
 
 struct proc;
 
 extern struct	intr_event *tty_intr_event;
 extern struct	intr_event *clk_intr_event;
 extern void	*vm_ih;
 
 /* Counts and names for statistics (defined in MD code). */
 extern u_long 	intrcnt[];	/* counts for each device and stray */
 extern char 	intrnames[];	/* string table containing device names */
 extern size_t	sintrcnt;	/* size of intrcnt table */
 extern size_t	sintrnames;	/* size of intrnames table */
 
 #ifdef DDB
 void	db_dump_intr_event(struct intr_event *ie, int handlers);
 #endif
 u_char	intr_priority(enum intr_type flags);
 int	intr_event_add_handler(struct intr_event *ie, const char *name,
 	    driver_filter_t filter, driver_intr_t handler, void *arg, 
 	    u_char pri, enum intr_type flags, void **cookiep);	    
 int	intr_event_bind(struct intr_event *ie, int cpu);
+int	intr_event_bind_irqonly(struct intr_event *ie, int cpu);
+int	intr_event_bind_ithread(struct intr_event *ie, int cpu);
 int	intr_event_create(struct intr_event **event, void *source,
 	    int flags, int irq, void (*pre_ithread)(void *),
 	    void (*post_ithread)(void *), void (*post_filter)(void *),
 	    int (*assign_cpu)(void *, int), const char *fmt, ...)
 	    __printflike(9, 10);
 int	intr_event_describe_handler(struct intr_event *ie, void *cookie,
 	    const char *descr);
 int	intr_event_destroy(struct intr_event *ie);
 void	intr_event_execute_handlers(struct proc *p, struct intr_event *ie);
 int	intr_event_handle(struct intr_event *ie, struct trapframe *frame);
 int	intr_event_remove_handler(void *cookie);
 /*
  * NOTE(review): `mode' in intr_getaffinity()/intr_setaffinity() presumably
  * takes a CPU_WHICH_* value (the task queue group callers pass
  * CPU_WHICH_IRQ) -- confirm against the implementation.
  */
-int	intr_getaffinity(int irq, void *mask);
+int	intr_getaffinity(int irq, int mode, void *mask);
 void	*intr_handler_source(void *cookie);
-int	intr_setaffinity(int irq, void *mask);
+int	intr_setaffinity(int irq, int mode, void *mask);
 void	_intr_drain(int irq);  /* Linux compat only. */
 int	swi_add(struct intr_event **eventp, const char *name,
 	    driver_intr_t handler, void *arg, int pri, enum intr_type flags,
 	    void **cookiep);
 void	swi_sched(void *cookie, int flags);
 int	swi_remove(void *cookie);
 
 #endif