Index: head/sys/amd64/include/intr_machdep.h
===================================================================
--- head/sys/amd64/include/intr_machdep.h	(revision 331697)
+++ head/sys/amd64/include/intr_machdep.h	(revision 331698)
@@ -1,198 +1,199 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef __MACHINE_INTR_MACHDEP_H__
 #define	__MACHINE_INTR_MACHDEP_H__
 
 #ifdef _KERNEL
 
 /*
  * The maximum number of I/O interrupts we allow.  This number is rather
  * arbitrary as it is just the maximum IRQ resource value.  The interrupt
  * source for a given IRQ maps that I/O interrupt to device interrupt
  * source whether it be a pin on an interrupt controller or an MSI interrupt.
  * The 16 ISA IRQs are assigned fixed IDT vectors, but all other device
  * interrupts allocate IDT vectors on demand.  Currently we have 191 IDT
  * vectors available for device interrupts.  On many systems with I/O APICs,
  * a lot of the IRQs are not used, so this number can be much larger than
  * 191 and still be safe since only interrupt sources in actual use will
  * allocate IDT vectors.
  *
  * The first 255 IRQs (0 - 254) are reserved for ISA IRQs and PCI intline IRQs.
  * IRQ values from 256 to 767 are used by MSI.  When running under the Xen
  * Hypervisor, IRQ values from 768 to 4863 are available for binding to
  * event channel events.  We leave 255 unused to avoid confusion since 255 is
  * used in PCI to indicate an invalid IRQ.
  */
 /* Size and first IRQ number of the range reserved for MSI interrupts. */
 #define	NUM_MSI_INTS	512
 #define	FIRST_MSI_INT	256
 #ifdef XENHVM
 #include <xen/xen-os.h>
 #include <xen/interface/event_channel.h>
 /* Event channel IRQs are numbered immediately after the MSI range. */
 #define	NUM_EVTCHN_INTS	NR_EVENT_CHANNELS
 #define	FIRST_EVTCHN_INT \
     (FIRST_MSI_INT + NUM_MSI_INTS)
 #define	LAST_EVTCHN_INT \
     (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1)
 #else /* !XENHVM */
 /* Without XENHVM no IRQ values are set aside for event channels. */
 #define	NUM_EVTCHN_INTS	0
 #endif
 /* Total IRQ values: [0, FIRST_MSI_INT) plus the MSI and event channel ranges. */
 #define	NUM_IO_INTS	(FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS)
 
 /*
  * Default base address for MSI messages on x86 platforms.
  */
 #define	MSI_INTEL_ADDR_BASE		0xfee00000
 
 /*
  * Number of interrupt counters to reserve:
  * - 1 ??? dummy counter.
  * - 2 counters for each I/O interrupt.
  * - 1 counter for each CPU for lapic timer.
  * - 8 counters for each CPU for IPI counters for SMP.
  *
  * Without SMP the per-CPU term collapses to a single extra counter and no
  * IPI counters are included.
  */
 #ifdef SMP
 #define	INTRCNT_COUNT	(1 + NUM_IO_INTS * 2 + (1 + 8) * MAXCPU)
 #else
 #define	INTRCNT_COUNT	(1 + NUM_IO_INTS * 2 + 1)
 #endif
 
 #ifndef LOCORE
 
 /* Type of an interrupt entry point: no arguments, no return value. */
 typedef void inthand_t(void);
 
 /* Build an entry point symbol name by pasting an 'X' prefix onto 'name'. */
 #define	IDTVEC(name)	__CONCAT(X,name)
 
 /* Forward declaration so that struct pic can refer to struct intsrc. */
 struct intsrc;
 
 /*
  * Methods that a PIC provides to mask/unmask a given interrupt source,
  * "turn on" the interrupt on the CPU side by setting up an IDT entry, and
  * return the vector associated with this source.
  *
  * Every method takes the interrupt source it acts on, except pic_suspend
  * and pic_resume which take the PIC itself.
  */
 struct pic {
 	void (*pic_enable_source)(struct intsrc *);
 	/* The int argument is one of the PIC_EOI / PIC_NO_EOI flags. */
 	void (*pic_disable_source)(struct intsrc *, int);
 	void (*pic_eoi_source)(struct intsrc *);
 	void (*pic_enable_intr)(struct intsrc *);
 	void (*pic_disable_intr)(struct intsrc *);
 	int (*pic_vector)(struct intsrc *);
 	int (*pic_source_pending)(struct intsrc *);
 	void (*pic_suspend)(struct pic *);
 	void (*pic_resume)(struct pic *, bool suspend_cancelled);
 	int (*pic_config_intr)(struct intsrc *, enum intr_trigger,
 	    enum intr_polarity);
 	int (*pic_assign_cpu)(struct intsrc *, u_int apic_id);
 	void (*pic_reprogram_pin)(struct intsrc *);
 	TAILQ_ENTRY(pic) pics;		/* tail queue linkage between PICs */
 };
 
 /*
  * Flags for pic_disable_source(), passed as its int argument.
  * NOTE(review): presumably selects whether an EOI is issued while the
  * source is being disabled -- confirm against the PIC drivers.
  */
 enum {
 	PIC_EOI,
 	PIC_NO_EOI,
 };
 
 /*
  * An interrupt source.  The upper-layer code uses the PIC methods to
  * control a given source.  The lower-layer PIC drivers can store additional
  * private data in a given interrupt source such as an interrupt pin number
  * or an I/O APIC pointer.
  *
  * NOTE(review): is_count and is_straycount presumably point at slots in the
  * interrupt counter table sized by INTRCNT_COUNT -- confirm against the
  * code that registers sources.
  */
 struct intsrc {
 	struct pic *is_pic;		/* PIC whose methods control this source */
 	struct intr_event *is_event;
 	u_long *is_count;
 	u_long *is_straycount;
 	u_int is_index;
 	u_int is_handlers;
+	u_int is_domain;
 	u_int is_cpu;
 };
 
 /* Forward declaration; this header only passes pointers to it. */
 struct trapframe;
 
 /*
  * The following data structure holds per-cpu data, and is placed just
  * above the top of the space used for the NMI and MC# stacks.
  */
 struct nmi_pcpu {
 	register_t	np_pcpu;
 	register_t	__padding;	/* pad to 16 bytes */
 };
 
 /* Global state shared with the machine-dependent interrupt code. */
 #ifdef SMP
 extern cpuset_t intr_cpus;
 #endif
 extern struct mtx icu_lock;
 extern int elcr_found;
 #ifdef SMP
 extern int msix_disable_migration;
 #endif
 
 /* atpic_reset() is only declared when the kernel lacks DEV_ATPIC. */
 #ifndef DEV_ATPIC
 void	atpic_reset(void);
 #endif
 /* XXX: The elcr_* prototypes probably belong somewhere else. */
 int	elcr_probe(void);
 enum intr_trigger elcr_read_trigger(u_int irq);
 void	elcr_resume(void);
 void	elcr_write_trigger(u_int irq, enum intr_trigger trigger);
 /* Interrupt source and handler management. */
 #ifdef SMP
 void	intr_add_cpu(u_int cpu);
 #endif
 int	intr_add_handler(const char *name, int vector, driver_filter_t filter, 
 			 driver_intr_t handler, void *arg, enum intr_type flags, 
-			 void **cookiep);    
+			 void **cookiep, int domain);    
 #ifdef SMP
 int	intr_bind(u_int vector, u_char cpu);
 #endif
 int	intr_config_intr(int vector, enum intr_trigger trig,
     enum intr_polarity pol);
 int	intr_describe(u_int vector, void *ih, const char *descr);
 void	intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame);
-u_int	intr_next_cpu(void);
+u_int	intr_next_cpu(int domain);
 struct intsrc *intr_lookup_source(int vector);
 int	intr_register_pic(struct pic *pic);
 int	intr_register_source(struct intsrc *isrc);
 int	intr_remove_handler(void *cookie);
 void	intr_resume(bool suspend_cancelled);
 void	intr_suspend(void);
 void	intr_reprogram(void);
 void	intrcnt_add(const char *name, u_long **countp);
 void	nexus_add_irq(u_long irq);
 /* MSI and MSI-X IRQ allocation. */
 int	msi_alloc(device_t dev, int count, int maxcount, int *irqs);
 void	msi_init(void);
 int	msi_map(int irq, uint64_t *addr, uint32_t *data);
 int	msi_release(int *irqs, int count);
 int	msix_alloc(device_t dev, int *irq);
 int	msix_release(int irq);
 
 #endif	/* !LOCORE */
 #endif	/* _KERNEL */
 #endif	/* !__MACHINE_INTR_MACHDEP_H__ */
Index: head/sys/i386/include/intr_machdep.h
===================================================================
--- head/sys/i386/include/intr_machdep.h	(revision 331697)
+++ head/sys/i386/include/intr_machdep.h	(revision 331698)
@@ -1,188 +1,190 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef __MACHINE_INTR_MACHDEP_H__
 #define	__MACHINE_INTR_MACHDEP_H__
 
 #ifdef _KERNEL
 
 /*
  * The maximum number of I/O interrupts we allow.  This number is rather
  * arbitrary as it is just the maximum IRQ resource value.  The interrupt
  * source for a given IRQ maps that I/O interrupt to device interrupt
  * source whether it be a pin on an interrupt controller or an MSI interrupt.
  * The 16 ISA IRQs are assigned fixed IDT vectors, but all other device
  * interrupts allocate IDT vectors on demand.  Currently we have 191 IDT
  * vectors available for device interrupts.  On many systems with I/O APICs,
  * a lot of the IRQs are not used, so this number can be much larger than
  * 191 and still be safe since only interrupt sources in actual use will
  * allocate IDT vectors.
  *
  * The first 255 IRQs (0 - 254) are reserved for ISA IRQs and PCI intline IRQs.
  * IRQ values from 256 to 767 are used by MSI.  When running under the Xen
  * Hypervisor, IRQ values from 768 to 4863 are available for binding to
  * event channel events.  We leave 255 unused to avoid confusion since 255 is
  * used in PCI to indicate an invalid IRQ.
  */
 /* Size and first IRQ number of the range reserved for MSI interrupts. */
 #define	NUM_MSI_INTS	512
 #define	FIRST_MSI_INT	256
 #ifdef XENHVM
 #include <xen/xen-os.h>
 #include <xen/interface/event_channel.h>
 /* Event channel IRQs are numbered immediately after the MSI range. */
 #define	NUM_EVTCHN_INTS	NR_EVENT_CHANNELS
 #define	FIRST_EVTCHN_INT \
     (FIRST_MSI_INT + NUM_MSI_INTS)
 #define	LAST_EVTCHN_INT \
     (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1)
 #else /* !XENHVM */
 /* Without XENHVM no IRQ values are set aside for event channels. */
 #define	NUM_EVTCHN_INTS	0
 #endif
 /* Total IRQ values: [0, FIRST_MSI_INT) plus the MSI and event channel ranges. */
 #define	NUM_IO_INTS	(FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS)
 
 /*
  * Default base address for MSI messages on x86 platforms.
  */
 #define	MSI_INTEL_ADDR_BASE		0xfee00000
 
 /*
  * Number of interrupt counters to reserve:
  * - 1 ??? dummy counter.
  * - 2 counters for each I/O interrupt.
  * - 1 counter for each CPU for lapic timer.
  * - 9 counters for each CPU for IPI counters for SMP.
  *
  * Without SMP the per-CPU term collapses to a single extra counter and no
  * IPI counters are included.
  */
 #ifdef SMP
 #define	INTRCNT_COUNT	(1 + NUM_IO_INTS * 2 + (1 + 9) * MAXCPU)
 #else
 #define	INTRCNT_COUNT	(1 + NUM_IO_INTS * 2 + 1)
 #endif
 
 #ifndef LOCORE
 
 /* Type of an interrupt entry point: no arguments, no return value. */
 typedef void inthand_t(void);
 
 /* Build an entry point symbol name by pasting an 'X' prefix onto 'name'. */
 #define	IDTVEC(name)	__CONCAT(X,name)
 
 /* Forward declaration so that struct pic can refer to struct intsrc. */
 struct intsrc;
 
 /*
  * Methods that a PIC provides to mask/unmask a given interrupt source,
  * "turn on" the interrupt on the CPU side by setting up an IDT entry, and
  * return the vector associated with this source.
  *
  * Every method takes the interrupt source it acts on, except pic_suspend
  * and pic_resume which take the PIC itself.
  */
 struct pic {
 	void (*pic_enable_source)(struct intsrc *);
 	/* The int argument is one of the PIC_EOI / PIC_NO_EOI flags. */
 	void (*pic_disable_source)(struct intsrc *, int);
 	void (*pic_eoi_source)(struct intsrc *);
 	void (*pic_enable_intr)(struct intsrc *);
 	void (*pic_disable_intr)(struct intsrc *);
 	int (*pic_vector)(struct intsrc *);
 	int (*pic_source_pending)(struct intsrc *);
 	void (*pic_suspend)(struct pic *);
 	void (*pic_resume)(struct pic *, bool suspend_cancelled);
 	int (*pic_config_intr)(struct intsrc *, enum intr_trigger,
 	    enum intr_polarity);
 	int (*pic_assign_cpu)(struct intsrc *, u_int apic_id);
 	void (*pic_reprogram_pin)(struct intsrc *);
 	TAILQ_ENTRY(pic) pics;		/* tail queue linkage between PICs */
 };
 
 /*
  * Flags for pic_disable_source(), passed as its int argument.
  * NOTE(review): presumably selects whether an EOI is issued while the
  * source is being disabled -- confirm against the PIC drivers.
  */
 enum {
 	PIC_EOI,
 	PIC_NO_EOI,
 };
 
 /*
  * An interrupt source.  The upper-layer code uses the PIC methods to
  * control a given source.  The lower-layer PIC drivers can store additional
  * private data in a given interrupt source such as an interrupt pin number
  * or an I/O APIC pointer.
  *
  * NOTE(review): is_count and is_straycount presumably point at slots in the
  * interrupt counter table sized by INTRCNT_COUNT -- confirm against the
  * code that registers sources.
  */
 struct intsrc {
 	struct pic *is_pic;		/* PIC whose methods control this source */
 	struct intr_event *is_event;
 	u_long *is_count;
 	u_long *is_straycount;
 	u_int is_index;
 	u_int is_handlers;
+	u_int is_domain;
 	u_int is_cpu;
 };
 
 /* Forward declaration; this header only passes pointers to it. */
 struct trapframe;
 
 /* Global state shared with the machine-dependent interrupt code. */
 #ifdef SMP
 extern cpuset_t intr_cpus;
 #endif
 extern struct mtx icu_lock;
 extern int elcr_found;
 #ifdef SMP
 extern int msix_disable_migration;
 #endif
 
 /* atpic_reset() is only declared when the kernel lacks DEV_ATPIC. */
 #ifndef DEV_ATPIC
 void	atpic_reset(void);
 #endif
 /* XXX: The elcr_* prototypes probably belong somewhere else. */
 int	elcr_probe(void);
 enum intr_trigger elcr_read_trigger(u_int irq);
 void	elcr_resume(void);
 void	elcr_write_trigger(u_int irq, enum intr_trigger trigger);
 /* Interrupt source and handler management. */
 #ifdef SMP
 void	intr_add_cpu(u_int cpu);
 #endif
 int	intr_add_handler(const char *name, int vector, driver_filter_t filter,
-    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep);
+    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep,
+    int domain);
 #ifdef SMP
 int	intr_bind(u_int vector, u_char cpu);
 #endif
 int	intr_config_intr(int vector, enum intr_trigger trig,
     enum intr_polarity pol);
 int	intr_describe(u_int vector, void *ih, const char *descr);
 void	intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame);
-u_int	intr_next_cpu(void);
+u_int	intr_next_cpu(int domain);
 struct intsrc *intr_lookup_source(int vector);
 int	intr_register_pic(struct pic *pic);
 int	intr_register_source(struct intsrc *isrc);
 int	intr_remove_handler(void *cookie);
 void	intr_resume(bool suspend_cancelled);
 void	intr_suspend(void);
 void	intr_reprogram(void);
 void	intrcnt_add(const char *name, u_long **countp);
 void	nexus_add_irq(u_long irq);
 /* MSI and MSI-X IRQ allocation. */
 int	msi_alloc(device_t dev, int count, int maxcount, int *irqs);
 void	msi_init(void);
 int	msi_map(int irq, uint64_t *addr, uint32_t *data);
 int	msi_release(int* irqs, int count);
 int	msix_alloc(device_t dev, int *irq);
 int	msix_release(int irq);
 
 #endif	/* !LOCORE */
 #endif	/* _KERNEL */
 #endif	/* !__MACHINE_INTR_MACHDEP_H__ */
Index: head/sys/kern/kern_cpuset.c
===================================================================
--- head/sys/kern/kern_cpuset.c	(revision 331697)
+++ head/sys/kern/kern_cpuset.c	(revision 331698)
@@ -1,2192 +1,2175 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008,  Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  * 
  * Copyright (c) 2008 Nokia Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/capsicum.h>
 #include <sys/cpuset.h>
 #include <sys/domainset.h>
 #include <sys/sx.h>
 #include <sys/queue.h>
 #include <sys/libkern.h>
 #include <sys/limits.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/vmmeter.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif /* DDB */
 
 /*
  * cpusets provide a mechanism for creating and manipulating sets of
  * processors for the purpose of constraining the scheduling of threads to
  * specific processors.
  *
  * Each process belongs to an identified set, by default this is set 1.  Each
  * thread may further restrict the cpus it may run on to a subset of this
  * named set.  This creates an anonymous set which other threads and processes
  * may not join by number.
  *
  * The named set is referred to herein as the 'base' set to avoid ambiguity.
  * This set is usually a child of a 'root' set while the anonymous set may
  * simply be referred to as a mask.  In the syscall api these are referred to
  * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
  *
  * Threads inherit their set from their creator whether it be anonymous or
  * not.  This means that anonymous sets are immutable because they may be
  * shared.  To modify an anonymous set a new set is created with the desired
  * mask and the same parent as the existing anonymous set.  This gives the
  * illusion of each thread having a private mask.
  *
  * Via the syscall apis a user may ask to retrieve or modify the root, base,
  * or mask that is discovered via a pid, tid, or setid.  Modifying a set
  * modifies all numbered and anonymous child sets to comply with the new mask.
  * Modifying a pid or tid's mask applies only to that tid but must still
  * exist within the assigned parent set.
  *
  * A thread may not be assigned to a group separate from other threads in
  * the process.  This is to remove ambiguity when the setid is queried with
  * a pid argument.  There is no other technical limitation.
  *
  * This somewhat complex arrangement is intended to make it easy for
  * applications to query available processors and bind their threads to
  * specific processors while also allowing administrators to dynamically
  * reprovision by changing sets which apply to groups of processes.
  *
  * A simple application should not concern itself with sets at all and
  * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
  * meaning 'curthread'.  It may query available cpus for that tid with a
  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  */
 /* UMA zones that struct cpuset and struct domainset are allocated from. */
 static uma_zone_t cpuset_zone;
 static uma_zone_t domainset_zone;
 /* Spin mutex held around updates to the set tree and the lists below. */
 static struct mtx cpuset_lock;
 /* Numbered (non-anonymous) sets, linked through cs_link. */
 static struct setlist cpuset_ids;
 /* Every domainset that has been created, linked through ds_link. */
 static struct domainlist cpuset_domains;
 /* Unit number allocator that hands out cpuset ids. */
 static struct unrhdr *cpuset_unr;
 static struct cpuset *cpuset_zero, *cpuset_default;
 
 /* Return the size of cpuset_t at the kernel level */
 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
     SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
 
 cpuset_t *cpuset_root;
 cpuset_t cpuset_domain[MAXMEMDOM];
 
 /* Declared early because _cpuset_create() uses it before its definition. */
 static int domainset_valid(const struct domainset *, const struct domainset *);
 
 /*
  * Return the nearest named (non-anonymous) set for 'set': the set itself
  * when it carries a valid id, otherwise its parent.
  */
 static struct cpuset *
 cpuset_getbase(struct cpuset *set)
 {
 
 	return (set->cs_id == CPUSET_INVALID ? set->cs_parent : set);
 }
 
 /*
  * Climb the parent chain starting at 'set' and return the first set that
  * is flagged CPU_SET_ROOT, or the topmost set if none is flagged.
  */
 static struct cpuset *
 cpuset_getroot(struct cpuset *set)
 {
 	struct cpuset *cur;
 
 	for (cur = set; cur->cs_parent != NULL; cur = cur->cs_parent)
 		if ((cur->cs_flags & CPU_SET_ROOT) != 0)
 			break;
 	return (cur);
 }
 
 /*
  * Acquire a reference to a cpuset, all pointers must be tracked with refs.
  * Returns 'set' itself so that the call can be nested in an expression.
  */
 struct cpuset *
 cpuset_ref(struct cpuset *set)
 {
 
 	refcount_acquire(&set->cs_ref);
 	return (set);
 }
 
 /*
  * Locate the root of the tree that 'set' belongs to and return it with an
  * additional reference held.
  */
 static struct cpuset *
 cpuset_refroot(struct cpuset *set)
 {
 	struct cpuset *root;
 
 	root = cpuset_getroot(set);
 	return (cpuset_ref(root));
 }
 
 /*
  * Locate the first non-anonymous set starting from 'set' and return it
  * with an additional reference held.  When 'set' is itself not anonymous
  * it is the set that gets the extra reference.
  */
 static struct cpuset *
 cpuset_refbase(struct cpuset *set)
 {
 	struct cpuset *base;
 
 	base = cpuset_getbase(set);
 	return (cpuset_ref(base));
 }
 
 /*
  * Release a reference in a context where it is safe to allocate.
  *
  * When the last reference goes away the set is unlinked from its parent's
  * child list and, if it is a numbered set, from the id list.  The reference
  * it held on its parent is then dropped (recursively), the set is freed and
  * its id is handed back to cpuset_unr.
  */
 void
 cpuset_rel(struct cpuset *set)
 {
 	cpusetid_t id;
 
 	/* Nothing more to do while other references remain. */
 	if (refcount_release(&set->cs_ref) == 0)
 		return;
 	mtx_lock_spin(&cpuset_lock);
 	LIST_REMOVE(set, cs_siblings);
 	/* Remember the id; 'set' is freed before the id is released below. */
 	id = set->cs_id;
 	if (id != CPUSET_INVALID)
 		LIST_REMOVE(set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 	cpuset_rel(set->cs_parent);
 	uma_zfree(cpuset_zone, set);
 	if (id != CPUSET_INVALID)
 		free_unr(cpuset_unr, id);
 }
 
 /*
  * Deferred release must be used when in a context that is not safe to
  * allocate/free.  This places any unreferenced sets on the list 'head'.
  *
  * Only the unlinking is done here: the set leaves its parent's child list
  * and, if numbered, the id list, and is then queued on 'head' through the
  * same cs_link field.  The parent reference and the set's storage are not
  * released by this function.
  */
 static void
 cpuset_rel_defer(struct setlist *head, struct cpuset *set)
 {
 
 	/* Nothing more to do while other references remain. */
 	if (refcount_release(&set->cs_ref) == 0)
 		return;
 	mtx_lock_spin(&cpuset_lock);
 	LIST_REMOVE(set, cs_siblings);
 	if (set->cs_id != CPUSET_INVALID)
 		LIST_REMOVE(set, cs_link);
 	/* cs_link is free again at this point and is reused for 'head'. */
 	LIST_INSERT_HEAD(head, set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 }
 
 /*
  * Complete a deferred release.  Removes the set from the list provided to
  * cpuset_rel_defer.
  */
 static void
 cpuset_rel_complete(struct cpuset *set)
 {
 	LIST_REMOVE(set, cs_link);
 	cpuset_rel(set->cs_parent);
 	uma_zfree(cpuset_zone, set);
 }
 
 /*
  * Find a set based on an id.  Returns it with a ref.
  *
  * Returns NULL when 'setid' is CPUSET_INVALID or no numbered set carries
  * that id.  If 'td' is jailed, the set is only returned when the jail's
  * cpuset is the set itself or one of its ancestors; otherwise the
  * reference is dropped again and NULL is returned.
  */
 static struct cpuset *
 cpuset_lookup(cpusetid_t setid, struct thread *td)
 {
 	struct cpuset *set;
 
 	if (setid == CPUSET_INVALID)
 		return (NULL);
 	mtx_lock_spin(&cpuset_lock);
 	LIST_FOREACH(set, &cpuset_ids, cs_link)
 		if (set->cs_id == setid)
 			break;
 	/* Take the reference while the lock still pins the list entry. */
 	if (set)
 		cpuset_ref(set);
 	mtx_unlock_spin(&cpuset_lock);
 
 	KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
 	if (set != NULL && jailed(td->td_ucred)) {
 		struct cpuset *jset, *tset;
 
 		/* Walk towards the root looking for the jail's own set. */
 		jset = td->td_ucred->cr_prison->pr_cpuset;
 		for (tset = set; tset != NULL; tset = tset->cs_parent)
 			if (tset == jset)
 				break;
 		if (tset == NULL) {
 			cpuset_rel(set);
 			set = NULL;
 		}
 	}
 
 	return (set);
 }
 
 /*
  * Create a set in the space provided in 'set' with the provided parameters.
  * The set is returned with a single ref.  May return EDEADLK if the set
  * will have no valid cpu based on restrictions from the parent.
  *
  * A NULL 'domain' or 'mask' means "inherit from the parent".  EDEADLK is
  * also returned when 'domain' fails domainset_valid() against the parent's
  * domainset.  On success the new set holds a reference on 'parent', is
  * linked into the parent's child list and, when 'id' is not
  * CPUSET_INVALID, into the id list.  Returns 0 on success.
  */
 static int
 _cpuset_create(struct cpuset *set, struct cpuset *parent,
     const cpuset_t *mask, struct domainset *domain, cpusetid_t id)
 {
 
 	if (domain == NULL)
 		domain = parent->cs_domain;
 	if (mask == NULL)
 		mask = &parent->cs_mask;
 	if (!CPU_OVERLAP(&parent->cs_mask, mask))
 		return (EDEADLK);
 	/* The domain must be prepared ahead of time. */
 	if (!domainset_valid(parent->cs_domain, domain))
 		return (EDEADLK);
 	CPU_COPY(mask, &set->cs_mask);
 	LIST_INIT(&set->cs_children);
 	refcount_init(&set->cs_ref, 1);
 	set->cs_flags = 0;
 	mtx_lock_spin(&cpuset_lock);
 	set->cs_domain = domain;
 	/* Clip the requested mask to the parent's mask under the lock. */
 	CPU_AND(&set->cs_mask, &parent->cs_mask);
 	set->cs_id = id;
 	set->cs_parent = cpuset_ref(parent);
 	LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
 	if (set->cs_id != CPUSET_INVALID)
 		LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 	mtx_unlock_spin(&cpuset_lock);
 
 	return (0);
 }
 
 /*
  * Create a new non-anonymous set with the requested parent and mask.  May
  * return failures if the mask is invalid or a new number can not be
  * allocated.
  *
  * Returns ENFILE when no id is available, or the error reported by
  * _cpuset_create().  '*setp' is written only on success, so a failed call
  * never leaves the caller holding a pointer to storage that has already
  * been returned to the zone.  On success the new set carries one reference.
  */
 static int
 cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
 {
 	struct cpuset *set;
 	cpusetid_t id;
 	int error;
 
 	id = alloc_unr(cpuset_unr);
 	if (id == -1)
 		return (ENFILE);
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	error = _cpuset_create(set, parent, mask, NULL, id);
 	if (error != 0) {
 		/* Undo both allocations; the set was never linked anywhere. */
 		free_unr(cpuset_unr, id);
 		uma_zfree(cpuset_zone, set);
 		return (error);
 	}
 	*setp = set;
 
 	return (0);
 }
 
 /*
  * Allocate 'count' zeroed cpusets from cpuset_zone and push each one onto
  * the head of 'list'.  Nothing is added when 'count' is not positive.
  */
 static void
 cpuset_freelist_add(struct setlist *list, int count)
 {
 	struct cpuset *nset;
 	int remaining;
 
 	for (remaining = count; remaining > 0; remaining--) {
 		nset = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK);
 		LIST_INSERT_HEAD(list, nset, cs_link);
 	}
 }
 
 /*
  * Initialize 'list' as an empty list and pre-populate it with 'count'
  * freshly allocated cpusets.
  */
 static void
 cpuset_freelist_init(struct setlist *list, int count)
 {
 
 	LIST_INIT(list);
 	cpuset_freelist_add(list, count);
 }
 
 /*
  * Drain 'list', returning every cpuset still on it to cpuset_zone.
  */
 static void
 cpuset_freelist_free(struct setlist *list)
 {
 	struct cpuset *victim;
 
 	for (;;) {
 		victim = LIST_FIRST(list);
 		if (victim == NULL)
 			break;
 		LIST_REMOVE(victim, cs_link);
 		uma_zfree(cpuset_zone, victim);
 	}
 }
 
 /*
  * Allocate 'count' zeroed domainsets from domainset_zone and push each one
  * onto the head of 'list'.  Nothing is added when 'count' is not positive.
  */
 static void
 domainset_freelist_add(struct domainlist *list, int count)
 {
 	struct domainset *ndomain;
 	int remaining;
 
 	for (remaining = count; remaining > 0; remaining--) {
 		ndomain = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK);
 		LIST_INSERT_HEAD(list, ndomain, ds_link);
 	}
 }
 
 /*
  * Initialize 'list' as an empty list and pre-populate it with 'count'
  * freshly allocated domainsets.
  */
 static void
 domainset_freelist_init(struct domainlist *list, int count)
 {
 
 	LIST_INIT(list);
 	domainset_freelist_add(list, count);
 }
 
 /*
  * Drain 'list', returning every domainset still on it to domainset_zone.
  */
 static void
 domainset_freelist_free(struct domainlist *list)
 {
 	struct domainset *victim;
 
 	for (;;) {
 		victim = LIST_FIRST(list);
 		if (victim == NULL)
 			break;
 		LIST_REMOVE(victim, ds_link);
 		uma_zfree(domainset_zone, victim);
 	}
 }
 
 /*
  * Copy the key fields of a domainset -- mask, policy and preferred
  * domain -- from 'from' into 'to'.  No other field of 'to' is touched.
  */
 static void
 domainset_copy(const struct domainset *from, struct domainset *to)
 {
 
 	to->ds_prefer = from->ds_prefer;
 	to->ds_policy = from->ds_policy;
 	DOMAINSET_COPY(&from->ds_mask, &to->ds_mask);
 }
 
 /*
  * Compare the key fields of two domainsets.  Returns 1 when mask, policy
  * and preferred domain all match, otherwise 0.
  */
 static int
 domainset_equal(const struct domainset *one, const struct domainset *two)
 {
 
 	if (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) != 0)
 		return (0);
 	if (one->ds_policy != two->ds_policy)
 		return (0);
 	return (one->ds_prefer == two->ds_prefer);
 }
 
 /*
  * Return nonzero if 'child' is a valid subset of 'parent'.  For the prefer
  * policy only the preferred domain has to be present in the parent's mask;
  * for every other policy the masks are compared with DOMAINSET_SUBSET.
  */
 static int
 domainset_valid(const struct domainset *parent, const struct domainset *child)
 {
 	if (child->ds_policy == DOMAINSET_POLICY_PREFER)
 		return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
 	return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask));
 }
 
 /*
  * Return nonzero if 'child' can be restricted by 'parent'.  For the prefer
  * policy the preferred domain has to be present in the parent's mask; for
  * every other policy it is enough that the two masks overlap.
  */
 static int
 domainset_restrict(const struct domainset *parent,
     const struct domainset *child)
 {
 	if (child->ds_policy == DOMAINSET_POLICY_PREFER)
 		return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask));
 	return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask));
 }
 
 /*
  * Lookup or create a domainset.  The key is provided in ds_mask,
  * ds_policy and ds_prefer (the fields domainset_equal() compares).  If the
  * domainset does not yet exist the storage in 'domain' is inserted and
  * returned.  Otherwise the existing domainset is returned and the storage
  * in 'domain' is put back on 'freelist' or, when 'freelist' is NULL, freed
  * to the domainset_zone.
  */
 static struct domainset *
 _domainset_create(struct domainset *domain, struct domainlist *freelist)
 {
 	struct domainset *ndomain;
 
 	mtx_lock_spin(&cpuset_lock);
 	LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
 		if (domainset_equal(ndomain, domain))
 			break;
 	/*
 	 * If the domain does not yet exist we insert it and initialize
 	 * various iteration helpers which are not part of the key.
 	 */
 	if (ndomain == NULL) {
 		LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
 		domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
 		domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1;
 	}
 	mtx_unlock_spin(&cpuset_lock);
 	if (ndomain == NULL)
 		return (domain);
 	/* A match was found; dispose of the caller's storage outside the lock. */
 	if (freelist != NULL)
 		LIST_INSERT_HEAD(freelist, domain, ds_link);
 	else
 		uma_zfree(domainset_zone, domain);
 	return (ndomain);
 	
 }
 
 /*
  * Return the domainset matching the key held in 'domain', creating it if
  * needed.  A zeroed domainset is allocated to carry a copy of the key; it
  * is handed to _domainset_create() without a freelist.
  */
 static struct domainset *
 domainset_create(const struct domainset *domain)
 {
 	struct domainset *key;
 
 	key = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
 	domainset_copy(domain, key);
 	return (_domainset_create(key, NULL));
 }
 
 /*
  * Update thread domainset pointers.
  *
  * Walks every process in the system with allproc_lock held shared and, for
  * each thread, points td_domain.dr_policy at the domainset of the thread's
  * cpuset.  Processes still in state PRS_NEW are skipped.  Finally the
  * kernel object's policy is pointed at cpuset_default's domainset.
  */
 static void
 domainset_notify(void)
 {
 	struct thread *td;
 	struct proc *p;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		FOREACH_THREAD_IN_PROC(p, td) {
 			/* Each pointer update is done under that thread's lock. */
 			thread_lock(td);
 			td->td_domain.dr_policy = td->td_cpuset->cs_domain;
 			thread_unlock(td);
 		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 	kernel_object->domain.dr_policy = cpuset_default->cs_domain;
 }
 
 /*
  * Create a new set that is a subset of a parent.
  *
  * The key is taken from 'domain' and its mask is then intersected with the
  * mask of 'pdomain'.  Storage comes from the first entry of 'freelist',
  * which is dereferenced without a NULL check, so the caller must have
  * stocked the list beforehand.  The result is the matching domainset
  * returned by _domainset_create(), which puts the storage back on
  * 'freelist' if an equal domainset already existed.
  */
 static struct domainset *
 domainset_shadow(const struct domainset *pdomain,
     const struct domainset *domain, struct domainlist *freelist)
 {
 	struct domainset *ndomain;
 
 	ndomain = LIST_FIRST(freelist);
 	LIST_REMOVE(ndomain, ds_link);
 
 	/*
 	 * Initialize the key from the request.
 	 */
 	domainset_copy(domain, ndomain);
 
 	/*
 	 * Restrict the key by the parent.
 	 */
 	DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask);
 
 	return _domainset_create(ndomain, freelist);
 }
 
 /*
  * Recursively check for errors that would occur from applying mask to
  * the tree of sets starting at 'set'.  Checks for sets that would become
  * empty as well as RDONLY flags.
  *
  * With 'check_mask' set, 'mask' must overlap the set's current mask and
  * the children are tested against the intersection of the two.  With it
  * clear, 'mask' is taken as the set's new mask as is.  Children are always
  * tested with 'check_mask' set.  Returns 0, EPERM for a read-only set or
  * EDEADLK for a set that would end up empty.  The caller must hold
  * cpuset_lock.
  */
 static int
 cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
 {
 	struct cpuset *nset;
 	cpuset_t newmask;
 	int error;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	if (set->cs_flags & CPU_SET_RDONLY)
 		return (EPERM);
 	if (check_mask) {
 		if (!CPU_OVERLAP(&set->cs_mask, mask))
 			return (EDEADLK);
 		CPU_COPY(&set->cs_mask, &newmask);
 		CPU_AND(&newmask, mask);
 	} else
 		CPU_COPY(mask, &newmask);
 	error = 0;
 	/* Stop at the first child subtree that reports an error. */
 	LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
 		if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
 			break;
 	return (error);
 }
 
 /*
  * Intersect the mask of 'set' with 'mask' and push the result down through
  * every descendant.  No validity or permission checks are made here.  The
  * caller must hold cpuset_lock.
  */
 static void
 cpuset_update(struct cpuset *set, cpuset_t *mask)
 {
 	struct cpuset *child;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	CPU_AND(&set->cs_mask, mask);
 	LIST_FOREACH(child, &set->cs_children, cs_siblings)
 		cpuset_update(child, &set->cs_mask);
 }
 
 /*
  * Modify the set 'set' to use a copy of the mask provided.  Apply this new
  * mask to restrict all children in the tree.  Checks for validity before
  * applying the changes.
  *
  * Returns 0 on success, the error from priv_check(), EPERM when a jailed
  * thread targets a root set, EINVAL when 'mask' is not a subset of the
  * root's mask, or the error from cpuset_testupdate().  Nothing is changed
  * unless every check passes.
  */
 static int
 cpuset_modify(struct cpuset *set, cpuset_t *mask)
 {
 	struct cpuset *root;
 	int error;
 
 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
 	if (error)
 		return (error);
 	/*
 	 * In case we are called from within the jail
 	 * we do not allow modifying the dedicated root
 	 * cpuset of the jail but may still allow to
 	 * change child sets.
 	 */
 	if (jailed(curthread->td_ucred) &&
 	    set->cs_flags & CPU_SET_ROOT)
 		return (EPERM);
 	/*
 	 * Verify that we have access to this set of
 	 * cpus.
 	 */
 	root = cpuset_getroot(set);
 	mtx_lock_spin(&cpuset_lock);
 	if (root && !CPU_SUBSET(&root->cs_mask, mask)) {
 		error = EINVAL;
 		goto out;
 	}
 	/* Dry run over the whole subtree before anything is modified. */
 	error = cpuset_testupdate(set, mask, 0);
 	if (error)
 		goto out;
 	/* Install the new mask, then propagate it to the descendants. */
 	CPU_COPY(mask, &set->cs_mask);
 	cpuset_update(set, mask);
 out:
 	mtx_unlock_spin(&cpuset_lock);
 
 	return (error);
 }
 
 /*
  * Recursively check for errors that would occur from applying the
  * domainset 'dset' to the tree of sets starting at 'set'.  Checks for sets
  * that could not be restricted to 'dset' as well as RDONLY flags.
  *
  * 'orig' is the domainset the level above is using; the recursion passes
  * the current set's cs_domain.  A set whose domainset differs from 'orig'
  * must pass domainset_restrict() against 'dset' and is counted in
  * '*count'.  'check_mask' is not referenced by the body.  Returns 0, EPERM
  * for a read-only set or EDEADLK when the restriction fails.  The caller
  * must hold cpuset_lock.
  */
 static int
 cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset,
     struct domainset *orig, int *count, int check_mask)
 {
 	struct cpuset *nset;
 	struct domainset *domain;
 	struct domainset newset;
 	int error;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	if (set->cs_flags & CPU_SET_RDONLY)
 		return (EPERM);
 	domain = set->cs_domain;
 	/* Work on a stack copy of the key; the real domainset is untouched. */
 	domainset_copy(domain, &newset);
 	if (!domainset_equal(domain, orig)) {
 		if (!domainset_restrict(domain, dset))
 			return (EDEADLK);
 		DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask);
 		/* Count the number of domains that are changing. */
 		(*count)++;
 	}
 	error = 0;
 	/* Stop at the first child subtree that reports an error. */
 	LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
 		if ((error = cpuset_testupdate_domain(nset, &newset, domain,
 		    count, 1)) != 0)
 			break;
 	return (error);
 }
 
 /*
  * Applies the domainset 'domain' to 'set' and, recursively, to all of its
  * children without checking for empty sets or permissions.
  *
  * 'orig' is the domainset the parent pointed at before the update; it is
  * compared by pointer to decide whether 'set' merely inherited it.
  * 'domains' is the list handed to domainset_shadow() when a new domainset
  * has to be derived.  The caller must own cpuset_lock.
  */
 static void
 cpuset_update_domain(struct cpuset *set, struct domainset *domain,
     struct domainset *orig, struct domainlist *domains)
 {
 	struct cpuset *nset;
 
 	mtx_assert(&cpuset_lock, MA_OWNED);
 	/*
 	 * If this domainset has changed from the parent we must calculate
 	 * a new set.  Otherwise it simply inherits from the parent.  When
 	 * we inherit from the parent we get a new mask and policy.  If the
 	 * set is modified from the parent we keep the policy and only
 	 * update the mask.
 	 */
 	if (set->cs_domain != orig) {
 		/* Remember our own old domainset for the children's compare. */
 		orig = set->cs_domain;
 		set->cs_domain = domainset_shadow(domain, orig, domains);
 	} else
 		set->cs_domain = domain;
 	LIST_FOREACH(nset, &set->cs_children, cs_siblings) 
 		cpuset_update_domain(nset, set->cs_domain, orig, domains);
 
 	return;
 }
 
 /*
  * Modify the set 'set' to use a copy the domainset provided.  Apply this new
  * mask to restrict all children in the tree.  Checks for validity before
  * applying the changes.
  */
 static int
 cpuset_modify_domain(struct cpuset *set, struct domainset *domain)
 {
 	struct domainlist domains;
 	struct domainset temp;
 	struct domainset *dset;
 	struct cpuset *root;
 	int ndomains, needed;
 	int error;
 
 	error = priv_check(curthread, PRIV_SCHED_CPUSET);
 	if (error)
 		return (error);
 	/*
 	 * In case we are called from within the jail
 	 * we do not allow modifying the dedicated root
 	 * cpuset of the jail but may still allow to
 	 * change child sets.
 	 */
 	if (jailed(curthread->td_ucred) &&
 	    set->cs_flags & CPU_SET_ROOT)
 		return (EPERM);
 	domainset_freelist_init(&domains, 0);
 	domain = domainset_create(domain);
 	ndomains = needed = 0;
 	do {
 		if (ndomains < needed) {
 			domainset_freelist_add(&domains, needed - ndomains);
 			ndomains = needed;
 		}
 		root = cpuset_getroot(set);
 		mtx_lock_spin(&cpuset_lock);
 		dset = root->cs_domain;
 		/*
 		 * Verify that we have access to this set of domains.
 		 */
 		if (root && !domainset_valid(dset, domain)) {
 			error = EINVAL;
 			goto out;
 		}
 		/*
 		 * If applying prefer we keep the current set as the fallback.
 		 */
 		if (domain->ds_policy == DOMAINSET_POLICY_PREFER)
 			DOMAINSET_COPY(&set->cs_domain->ds_mask,
 			    &domain->ds_mask);
 		/*
 		 * Determine whether we can apply this set of domains and
 		 * how many new domain structures it will require.
 		 */
 		domainset_copy(domain, &temp);
 		needed = 0;
 		error = cpuset_testupdate_domain(set, &temp, set->cs_domain,
 		    &needed, 0);
 		if (error)
 			goto out;
 	} while (ndomains < needed);
 	dset = set->cs_domain;
 	cpuset_update_domain(set, domain, dset, &domains);
 out:
 	mtx_unlock_spin(&cpuset_lock);
 	domainset_freelist_free(&domains);
 	if (error == 0)
 		domainset_notify();
 
 	return (error);
 }
 
 /*
  * Resolve the 'which' parameter of several cpuset apis.
  *
  * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid.  Also
  * checks for permission via p_cansched().
  *
  * For WHICH_SET returns a valid set with a new reference.
  *
  * For WHICH_JAIL returns the prison's cpuset with a new reference.
  *
  * For WHICH_IRQ and WHICH_DOMAIN returns 0 with *pp, *tdp and *setp all
  * left NULL; the caller interprets 'id' itself.
  *
  * -1 may be supplied for any argument to mean the current proc/thread or
  * the base set of the current thread.  May fail with ESRCH/EPERM, or with
  * EINVAL for any other 'which' value.
  */
 int
 cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
     struct cpuset **setp)
 {
 	struct cpuset *set;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	/* Every output starts out NULL so callers can test what was filled. */
 	*pp = p = NULL;
 	*tdp = td = NULL;
 	*setp = set = NULL;
 	switch (which) {
 	case CPU_WHICH_PID:
 		if (id == -1) {
 			PROC_LOCK(curproc);
 			p = curproc;
 			break;
 		}
 		if ((p = pfind(id)) == NULL)
 			return (ESRCH);
 		break;
 	case CPU_WHICH_TID:
 		if (id == -1) {
 			PROC_LOCK(curproc);
 			p = curproc;
 			td = curthread;
 			break;
 		}
 		td = tdfind(id, -1);
 		if (td == NULL)
 			return (ESRCH);
 		p = td->td_proc;
 		break;
 	case CPU_WHICH_CPUSET:
 		if (id == -1) {
 			thread_lock(curthread);
 			set = cpuset_refbase(curthread->td_cpuset);
 			thread_unlock(curthread);
 		} else
 			set = cpuset_lookup(id, curthread);
 		if (set) {
 			*setp = set;
 			return (0);
 		}
 		return (ESRCH);
 	case CPU_WHICH_JAIL:
 	{
 		/* Find `set' for prison with given id. */
 		struct prison *pr;
 
 		sx_slock(&allprison_lock);
 		pr = prison_find_child(curthread->td_ucred->cr_prison, id);
 		sx_sunlock(&allprison_lock);
 		if (pr == NULL)
 			return (ESRCH);
 		cpuset_ref(pr->pr_cpuset);
 		*setp = pr->pr_cpuset;
 		/*
 		 * NOTE(review): presumably prison_find_child() returns the
 		 * prison with pr_mtx held, which is released here - confirm.
 		 */
 		mtx_unlock(&pr->pr_mtx);
 		return (0);
 	}
 	case CPU_WHICH_IRQ:
 	case CPU_WHICH_DOMAIN:
 		return (0);
 	default:
 		return (EINVAL);
 	}
 	/* Only the PID and TID cases reach this point, with 'p' locked. */
 	error = p_cansched(curthread, p);
 	if (error) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	/* A PID lookup reports the first thread of the process. */
 	if (td == NULL)
 		td = FIRST_THREAD_IN_PROC(p);
 	*pp = p;
 	*tdp = td;
 	return (0);
 }
 
 /*
  * Check that a shadow of 'set' could be created with the given CPU mask
  * and/or domainset.  Either argument may be NULL to skip its check.
  * Returns 0 when acceptable and EINVAL otherwise.
  */
 static int
 cpuset_testshadow(struct cpuset *set, const cpuset_t *mask,
     const struct domainset *domain)
 {
 	struct cpuset *base;
 
 	base = cpuset_getbase(set);
 
 	/*
 	 * A requested CPU mask has to stay inside the base set's mask,
 	 * otherwise invalid CPUs were named.
 	 */
 	if (mask != NULL) {
 		if (!CPU_SUBSET(&base->cs_mask, mask))
 			return (EINVAL);
 	}
 
 	/*
 	 * Likewise a requested domainset has to be valid with respect to
 	 * the base set's domainset.
 	 */
 	if (domain != NULL) {
 		if (!domainset_valid(base->cs_domain, domain))
 			return (EINVAL);
 	}
 
 	return (0);
 }
 
 /*
  * Create an anonymous set with the provided mask in the space provided by
  * 'nset'.  If the passed in set is anonymous we use its parent otherwise
  * the new set is a child of 'set'.
  *
  * 'mask' and 'domain' may each be NULL, in which case the corresponding
  * value of 'set' is reused.  The storage for the new set is the first entry
  * of 'cpusets'; it is unlinked from that list only on success.  'domains'
  * is passed to domainset_shadow() when a domainset is supplied.
  *
  * Returns 0 and stores the new set in *nsetp, or the error from
  * cpuset_testshadow() / _cpuset_create().
  */
 static int
 cpuset_shadow(struct cpuset *set, struct cpuset **nsetp,
    const cpuset_t *mask, const struct domainset *domain,
    struct setlist *cpusets, struct domainlist *domains)
 {
 	struct cpuset *parent;
 	struct cpuset *nset;
 	struct domainset *dset;
 	struct domainset *d;
 	int error;
 
 	error = cpuset_testshadow(set, mask, domain);
 	if (error)
 		return (error);
 
 	parent = cpuset_getbase(set);
 	dset = parent->cs_domain;
 	if (mask == NULL)
 		mask = &set->cs_mask;
 	if (domain != NULL)
 		d = domainset_shadow(dset, domain, domains);
 	else
 		d = set->cs_domain;
 	/* Peek at the spare; it stays on the list unless creation succeeds. */
 	nset = LIST_FIRST(cpusets);
 	error = _cpuset_create(nset, parent, mask, d, CPUSET_INVALID);
 	if (error == 0) {
 		LIST_REMOVE(nset, cs_link);
 		*nsetp = nset;
 	}
 	return (error);
 }
 
 /*
  * Point thread 'td' at the cpuset 'nset', mirror the set's domainset into
  * the thread's domain policy and let the scheduler react through
  * sched_affinity().  The set the thread used before is handed back to the
  * caller.
  */
 static struct cpuset *
 cpuset_update_thread(struct thread *td, struct cpuset *nset)
 {
 	struct cpuset *previous;
 
 	previous = td->td_cpuset;
 
 	td->td_cpuset = nset;
 	td->td_domain.dr_policy = nset->cs_domain;
 	sched_affinity(td);
 
 	return (previous);
 }
 
 /*
  * Dry-run counterpart of cpuset_setproc_maskthread(): report whether a
  * shadow with 'mask'/'domain' could be made for the thread set 'tdset'.
  * A NULL 'mask' or 'domain' stands for the value 'tdset' already has.
  */
 static int
 cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask,
     struct domainset *domain)
 {
 	struct cpuset *base;
 	struct domainset *want_domain;
 	cpuset_t *want_mask;
 
 	base = cpuset_getbase(tdset);
 	want_mask = (mask != NULL) ? mask : &tdset->cs_mask;
 	want_domain = (domain != NULL) ? domain : tdset->cs_domain;
 
 	return (cpuset_testshadow(base, want_mask, want_domain));
 }
 
 /*
  * Build an anonymous set for a thread currently using 'tdset', applying
  * 'mask' and 'domain' on top of the thread's base set.  A NULL 'mask' or
  * 'domain' stands for the value 'tdset' already has.  The result is stored
  * in *nsetp; the return value is that of cpuset_shadow().
  */
 static int
 cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask,
     struct domainset *domain, struct cpuset **nsetp,
     struct setlist *freelist, struct domainlist *domainlist)
 {
 	struct cpuset *base;
 	struct domainset *want_domain;
 	cpuset_t *want_mask;
 
 	base = cpuset_getbase(tdset);
 	want_mask = (mask != NULL) ? mask : &tdset->cs_mask;
 	want_domain = (domain != NULL) ? domain : tdset->cs_domain;
 
 	return (cpuset_shadow(base, nsetp, want_mask, want_domain, freelist,
 	    domainlist));
 }
 
 /*
  * Work out the CPU mask and domainset a thread using 'tdset' should get
  * when it is moved onto 'set'.  The results are written to 'mask' and
  * 'domain'.  Returns EDEADLK when either result would be empty, else 0.
  */
 static int
 cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set,
     cpuset_t *mask, struct domainset *domain)
 {
 	struct cpuset *base;
 
 	base = cpuset_getbase(tdset);
 
 	/*
 	 * A thread whose mask matches its base takes the new set's mask
 	 * wholesale.  One that narrowed its mask keeps that narrowing,
 	 * intersected with the new set.
 	 */
 	if (CPU_CMP(&tdset->cs_mask, &base->cs_mask) == 0) {
 		CPU_COPY(&set->cs_mask, mask);
 	} else {
 		CPU_COPY(&tdset->cs_mask, mask);
 		CPU_AND(mask, &set->cs_mask);
 	}
 
 	/*
 	 * Same idea for the domainset: an inherited one is replaced by the
 	 * new set's, a private one keeps its policy and is restricted to
 	 * the new set's domain mask.
 	 */
 	if (tdset->cs_domain == base->cs_domain) {
 		domainset_copy(set->cs_domain, domain);
 	} else {
 		domainset_copy(tdset->cs_domain, domain);
 		DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask);
 	}
 
 	if (CPU_EMPTY(mask))
 		return (EDEADLK);
 	if (DOMAINSET_EMPTY(&domain->ds_mask))
 		return (EDEADLK);
 
 	return (0);
 }
 
 static int
 cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set)
 {
 	struct domainset domain;
 	cpuset_t mask;
 
 	if (tdset->cs_id != CPUSET_INVALID)
 		return (0);
 	return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
 }
 
 static int
 cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set,
     struct cpuset **nsetp, struct setlist *freelist,
     struct domainlist *domainlist)
 {
 	struct domainset domain;
 	cpuset_t mask;
 	int error;
 
 	/*
 	 * If we're replacing on a thread that has not constrained the
 	 * original set we can simply accept the new set.
 	 */
 	if (tdset->cs_id != CPUSET_INVALID) {
 		*nsetp = cpuset_ref(set);
 		return (0);
 	}
 	error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain);
 	if (error)
 		return (error);
 
 	return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist,
 	    domainlist);
 }
 
 /*
  * Handle three cases for updating an entire process.
  *
  * 1) Set is non-null.  This reparents all anonymous sets to the provided
  *    set and replaces all non-anonymous td_cpusets with the provided set.
  * 2) Mask is non-null.  This replaces or creates anonymous sets for every
  *    thread with the existing base as a parent.
  * 3) domain is non-null.  This creates anonymous sets for every thread
  *    and replaces the domain set.
  *
  * This is overly complicated because we can't allocate while holding a
  * spinlock and spinlocks must be held while changing and examining thread
  * state.
  *
  * Returns 0 on success, the cpuset_which() error when the process cannot
  * be resolved, or the first error reported while testing or applying the
  * change to a thread.
  */
 static int
 cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
     struct domainset *domain)
 {
 	struct setlist freelist;
 	struct setlist droplist;
 	struct domainlist domainlist;
 	struct cpuset *nset;
 	struct thread *td;
 	struct proc *p;
 	int threads;
 	int nfree;
 	int error;
 
 	/*
 	 * The algorithm requires two passes due to locking considerations.
 	 *
 	 * 1) Lookup the process and acquire the locks in the required order.
 	 * 2) If enough cpusets have not been allocated release the locks and
 	 *    allocate them.  Loop.
 	 *
 	 * Both free lists are seeded with one entry that 'nfree' does not
 	 * count; 'nfree' only tracks what the loop below adds.
 	 */
 	cpuset_freelist_init(&freelist, 1);
 	domainset_freelist_init(&domainlist, 1);
 	LIST_INIT(&droplist);
 	nfree = 0;
 	for (;;) {
 		error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
 		if (error)
 			goto out;
 		if (nfree >= p->p_numthreads)
 			break;
 		/* Sample the thread count, then unlock before allocating. */
 		threads = p->p_numthreads;
 		PROC_UNLOCK(p);
 		if (nfree < threads) {
 			cpuset_freelist_add(&freelist, threads - nfree);
 			domainset_freelist_add(&domainlist, threads - nfree);
 			nfree = threads;
 		}
 	}
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * Now that the appropriate locks are held and we have enough cpusets,
 	 * make sure the operation will succeed before applying changes. The
 	 * proc lock prevents td_cpuset from changing between calls.
 	 */
 	error = 0;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (set != NULL)
 			error = cpuset_setproc_test_setthread(td->td_cpuset,
 			    set);
 		else
 			error = cpuset_setproc_test_maskthread(td->td_cpuset,
 			    mask, domain);
 		thread_unlock(td);
 		if (error)
 			goto unlock_out;
 	}
 	/*
 	 * Replace each thread's cpuset while using deferred release.  We
 	 * must do this because the thread lock must be held while operating
 	 * on the thread and this limits the type of operations allowed.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (set != NULL)
 			error = cpuset_setproc_setthread(td->td_cpuset, set,
 			    &nset, &freelist, &domainlist);
 		else
 			error = cpuset_setproc_maskthread(td->td_cpuset, mask,
 			    domain, &nset, &freelist, &domainlist);
 		if (error) {
 			thread_unlock(td);
 			break;
 		}
 		/* The replaced set is queued and released after unlocking. */
 		cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset));
 		thread_unlock(td);
 	}
 unlock_out:
 	PROC_UNLOCK(p);
 out:
 	while ((nset = LIST_FIRST(&droplist)) != NULL)
 		cpuset_rel_complete(nset);
 	cpuset_freelist_free(&freelist);
 	domainset_freelist_free(&domainlist);
 	return (error);
 }
 
 /*
  * Format 'set' into 'buf' as its words in hexadecimal, separated by
  * commas, and return 'buf'.  The caller has to supply a buffer of at
  * least CPUSETBUFSIZ bytes.
  */
 char *
 cpusetobj_strprint(char *buf, const cpuset_t *set)
 {
 	char *cursor;
 	size_t word, written, remaining;
 
 	cursor = buf;
 	remaining = CPUSETBUFSIZ;
 
 	/* Every word but the last is followed by a comma. */
 	word = 0;
 	while (word < (_NCPUWORDS - 1)) {
 		written = snprintf(cursor, remaining, "%lx,",
 		    set->__bits[word]);
 		remaining -= written;
 		cursor += written;
 		word++;
 	}
 	snprintf(cursor, remaining, "%lx", set->__bits[_NCPUWORDS - 1]);
 
 	return (buf);
 }
 
 /*
  * Build a valid cpuset_t object from a string representation.
  *
  * The string is a comma separated list of hexadecimal words, as written by
  * cpusetobj_strprint().  It must be no longer than CPUSETBUFSIZ - 1
  * characters and may name fewer than _NCPUWORDS words, in which case the
  * remaining words of 'set' are left zero.
  *
  * Returns 0 on success and -1 on a malformed or oversized string.
  */
 int
 cpusetobj_strscan(cpuset_t *set, const char *buf)
 {
 	u_int nwords;
 	int i, ret;
 
 	if (strlen(buf) > CPUSETBUFSIZ - 1)
 		return (-1);
 
 	/* Allow to pass a shorter version of the mask when necessary. */
 	nwords = 1;
 	for (i = 0; buf[i] != '\0'; i++)
 		if (buf[i] == ',')
 			nwords++;
 	if (nwords > _NCPUWORDS)
 		return (-1);
 
 	CPU_ZERO(set);
 	/* Parse every word but the last, stepping past its trailing comma. */
 	for (i = 0; i < (nwords - 1); i++) {
 		ret = sscanf(buf, "%lx,", &set->__bits[i]);
 		if (ret == 0 || ret == -1)
 			return (-1);
 		buf = strstr(buf, ",");
 		if (buf == NULL)
 			return (-1);
 		buf++;
 	}
 	ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
 	if (ret == 0 || ret == -1)
 		return (-1);
 	return (0);
 }
 
 /*
  * Apply an anonymous mask or a domain to a single thread.
  *
  * 'id' is resolved through cpuset_which(CPU_WHICH_TID).  Either 'mask' or
  * 'domain' may be NULL; cpuset_shadow() then reuses the thread's current
  * value.  Returns 0 or the error from cpuset_which() / cpuset_shadow().
  */
 static int
 _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain)
 {
 	struct setlist cpusets;
 	struct domainlist domainlist;
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *td;
 	struct proc *p;
 	int error;
 
 	/* Set up the spare lists before any lock is taken. */
 	cpuset_freelist_init(&cpusets, 1);
 	domainset_freelist_init(&domainlist, domain != NULL);
 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
 	if (error)
 		goto out;
 	/* From here on 'set' holds the thread's previous set, if replaced. */
 	set = NULL;
 	thread_lock(td);
 	error = cpuset_shadow(td->td_cpuset, &nset, mask, domain,
 	    &cpusets, &domainlist);
 	if (error == 0)
 		set = cpuset_update_thread(td, nset);
 	thread_unlock(td);
 	PROC_UNLOCK(p);
 	/* Drop the old set only after both locks have been released. */
 	if (set)
 		cpuset_rel(set);
 out:
 	cpuset_freelist_free(&cpusets);
 	domainset_freelist_free(&domainlist);
 	return (error);
 }
 
 /*
  * Give thread 'id' an anonymous CPU mask, leaving its domainset alone.
  * Returns whatever _cpuset_setthread() reports.
  */
 int
 cpuset_setthread(lwpid_t id, cpuset_t *mask)
 {
 	int error;
 
 	error = _cpuset_setthread(id, mask, NULL);
 	return (error);
 }
 
 /*
  * Apply new cpumask to the ithread.
  *
  * 'cpu' selects the single CPU the thread 'id' is bound to.  NOCPU rolls
  * the thread back to an anonymous child of the default set that carries
  * the root mask.
  *
  * Returns 0 on success, the cpuset_which() error when the thread cannot be
  * resolved, ENFILE when no cpuset id can be allocated, or the error from
  * _cpuset_create() / cpuset_shadow().
  */
 int
 cpuset_setithread(lwpid_t id, int cpu)
 {
 	struct setlist cpusets;
 	struct cpuset *nset, *rset;
 	struct cpuset *parent, *old_set;
 	struct thread *td;
 	struct proc *p;
 	cpusetid_t cs_id;
 	cpuset_t mask;
 	int error;
 
 	/* Reserve everything that may be needed before taking locks. */
 	cpuset_freelist_init(&cpusets, 1);
 	rset = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	cs_id = CPUSET_INVALID;
 
 	CPU_ZERO(&mask);
 	if (cpu == NOCPU)
 		CPU_COPY(cpuset_root, &mask);
 	else
 		CPU_SET(cpu, &mask);
 
 	error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set);
 	if (error != 0)
 		goto out;
 
 	/* cpuset_which() returns with PROC_LOCK held. */
 	cs_id = alloc_unr(cpuset_unr);
 	if (cs_id == CPUSET_INVALID) {
 		/*
 		 * Out of cpuset ids.  Report it instead of returning
 		 * success, and release the proc lock that the common
 		 * exit path does not drop.
 		 */
 		PROC_UNLOCK(p);
 		error = ENFILE;
 		goto out;
 	}
 	old_set = td->td_cpuset;
 
 	if (cpu == NOCPU) {
 		/*
 		 * roll back to default set. We're not using cpuset_shadow()
 		 * here because we can fail CPU_SUBSET() check. This can happen
 		 * if default set does not contain all CPUs.
 		 *
 		 * The spare set is unlinked only once it has been created,
 		 * so that a failure leaves it on the list to be freed.
 		 */
 		nset = LIST_FIRST(&cpusets);
 		error = _cpuset_create(nset, cpuset_default, &mask, NULL,
 		    CPUSET_INVALID);
 		if (error == 0)
 			LIST_REMOVE(nset, cs_link);
 
 		goto applyset;
 	}
 
 	if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID &&
 	    old_set->cs_parent->cs_id == 1)) {
 
 		/*
 		 * Current set is either default (1) or
 		 * shadowed version of default set.
 		 *
 		 * Allocate new root set to be able to shadow it
 		 * with any mask.
 		 */
 		error = _cpuset_create(rset, cpuset_zero,
 		    &cpuset_zero->cs_mask, NULL, cs_id);
 		if (error != 0) {
 			PROC_UNLOCK(p);
 			goto out;
 		}
 		rset->cs_flags |= CPU_SET_ROOT;
 		parent = rset;
 		/* 'rset' and 'cs_id' are consumed; keep 'out' from freeing. */
 		rset = NULL;
 		cs_id = CPUSET_INVALID;
 	} else {
 		/* Assume existing set was already allocated by previous call */
 		parent = old_set;
 		old_set = NULL;
 	}
 
 	error = cpuset_shadow(parent, &nset, &mask, NULL, &cpusets, NULL);
 applyset:
 	if (error == 0) {
 		thread_lock(td);
 		old_set = cpuset_update_thread(td, nset);
 		thread_unlock(td);
 	} else
 		old_set = NULL;
 	PROC_UNLOCK(p);
 	if (old_set != NULL)
 		cpuset_rel(old_set);
 out:
 	cpuset_freelist_free(&cpusets);
 	if (rset != NULL)
 		uma_zfree(cpuset_zone, rset);
 	if (cs_id != CPUSET_INVALID)
 		free_unr(cpuset_unr, cs_id);
 	return (error);
 }
 
 /* Template from which domainset_zero() creates the initial domainset. */
 static struct domainset domainset0;
 
 /*
  * Initialize cpuset_lock and create the initial domainset: a round-robin
  * policy with no preferred domain over domains 0 .. vm_ndomains - 1.  The
  * result becomes the domain policy of the current thread and of
  * kernel_object.
  */
 void
 domainset_zero(void)
 {
 	struct domainset *dset;
 	int i;
 
 	/* Recursive spin mutex; also used by the cpuset code in this file. */
 	mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
 
 	dset = &domainset0;
 	DOMAINSET_ZERO(&dset->ds_mask);
 	for (i = 0; i < vm_ndomains; i++)
 		DOMAINSET_SET(i, &dset->ds_mask);
 	dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
 	dset->ds_prefer = -1;
 	curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
 	kernel_object->domain.dr_policy = curthread->td_domain.dr_policy;
 }
 
 /*
  * Creates system-wide cpusets and the cpuset for thread0 including two
  * sets:
  * 
  * 0 - The root set which should represent all valid processors in the
  *     system.  It is initially created with a mask of all processors
  *     because we don't know what processors are valid until cpuset_init()
  *     runs.  This set is immutable.
  * 1 - The default set which all processes are a member of until changed.
  *     This allows an administrator to move all threads off of given cpus to
  *     dedicate them to high priority tasks or save power etc.
  *
  * Also creates the UMA zones for cpusets and domainsets and the cpuset id
  * allocator.  Returns the default set (id 1).
  *
  * NOTE(review): the '+' lines of this change build the root mask from
  * all_cpus and mark the root CPU_SET_RDONLY right here, so the reference
  * to cpuset_init() above appears to predate the change - confirm and
  * refresh.
  */
 struct cpuset *
 cpuset_thread0(void)
 {
 	struct cpuset *set;
 	int error;
+	int i;
 
 	cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	domainset_zone = uma_zcreate("domainset", sizeof(struct domainset),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 	/*
 	 * Create the root system set for the whole machine.  Doesn't use
 	 * cpuset_create() due to NULL parent.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
-	CPU_FILL(&set->cs_mask);
+	CPU_COPY(&all_cpus, &set->cs_mask);
 	LIST_INIT(&set->cs_children);
 	LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
 	set->cs_ref = 1;
-	set->cs_flags = CPU_SET_ROOT;
+	set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY;
 	set->cs_domain = &domainset0;
 	/* Publish the root set and a shortcut to its mask. */
 	cpuset_zero = set;
 	cpuset_root = &set->cs_mask;
 
 	/*
 	 * Now derive a default, modifiable set from that to give out.
 	 */
 	set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
 	error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1);
 	KASSERT(error == 0, ("Error creating default set: %d\n", error));
 	cpuset_default = set;
 
 	/*
 	 * Initialize the unit allocator. 0 and 1 are allocated above.
 	 */
 	cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
 
+	/*
+	 * If MD code has not initialized per-domain cpusets, place all
+	 * CPUs in domain 0.
+	 */
+	for (i = 0; i < MAXMEMDOM; i++)
+		if (!CPU_EMPTY(&cpuset_domain[i]))
+			goto domains_set;
+	CPU_COPY(&all_cpus, &cpuset_domain[0]);
+domains_set:
+
 	return (set);
 }
 
 /*
  * Create a new cpuset below the cpuset of prison 'pr', using the same CPU
  * mask, and flag it as a root set.
  *
  * The thread is not reparented to the new set.  Use
  * cpuset_setproc_update_set() for that.
  *
  * On success 0 is returned and the new set is stored in *setp; otherwise
  * the cpuset_create() error is returned.
  */
 int
 cpuset_create_root(struct prison *pr, struct cpuset **setp)
 {
 	int error;
 
 	KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
 	KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
 
 	error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
 	if (error == 0) {
 		KASSERT(*setp != NULL,
 		    ("[%s:%d] cpuset_create returned invalid data",
 		    __func__, __LINE__));
 
 		/* Mark the set as root. */
 		(*setp)->cs_flags |= CPU_SET_ROOT;
 	}
 
 	return (error);
 }
 
 /*
  * Move every thread of process 'p' onto the cpuset 'set'.
  *
  * A temporary reference keeps 'set' alive for the duration of the call.
  * It is dropped on success and on failure alike, because cpuset_setproc()
  * does not consume the caller's reference.
  *
  * Returns 0 on success or the error from cpuset_setproc().
  */
 int
 cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
 {
 	int error;
 
 	KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
 	KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
 
 	cpuset_ref(set);
 	error = cpuset_setproc(p->p_pid, set, NULL, NULL);
 	cpuset_rel(set);
 	return (error);
 }
-
-/*
- * This is called once the final set of system cpus is known.  Modifies
- * the root set and all children and mark the root read-only.  
- */
-static void
-cpuset_init(void *arg)
-{
-	cpuset_t mask;
-	int i;
-
-	mask = all_cpus;
-	if (cpuset_modify(cpuset_zero, &mask))
-		panic("Can't set initial cpuset mask.\n");
-	cpuset_zero->cs_flags |= CPU_SET_RDONLY;
-
-	/*
-	 * If MD code has not initialized per-domain cpusets, place all
-	 * CPUs in domain 0.
-	 */
-	for (i = 0; i < MAXMEMDOM; i++)
-		if (!CPU_EMPTY(&cpuset_domain[i]))
-			goto domains_set;
-	CPU_COPY(&all_cpus, &cpuset_domain[0]);
-domains_set:
-	return;
-}
-SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_args {
 	cpusetid_t	*setid;
 };
 #endif
 /*
  * cpuset(2) system call: create a new numbered set under the root set of
  * the calling thread, with the root's mask, copy its id out to uap->setid
  * and move the calling process onto it.
  *
  * Returns 0 or the error from cpuset_create(), copyout() or
  * cpuset_setproc().
  */
 int
 sys_cpuset(struct thread *td, struct cpuset_args *uap)
 {
 	struct cpuset *root;
 	struct cpuset *set;
 	int error;
 
 	thread_lock(td);
 	root = cpuset_refroot(td->td_cpuset);
 	thread_unlock(td);
 	error = cpuset_create(&set, root, &root->cs_mask);
 	cpuset_rel(root);
 	if (error)
 		return (error);
 	error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
 	/* A pid of -1 makes cpuset_which() pick the current process. */
 	if (error == 0)
 		error = cpuset_setproc(-1, set, NULL, NULL);
 	/* Drop our reference whether or not the process was moved. */
 	cpuset_rel(set);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setid_args {
 	cpuwhich_t	which;
 	id_t		id;
 	cpusetid_t	setid;
 };
 #endif
 int
 sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
 {
 
 	return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid));
 }
 
 /*
  * Assign the process 'id' to the numbered cpuset 'setid'.  Returns EINVAL
  * for anything but CPU_WHICH_PID, ESRCH when the set cannot be looked up,
  * and otherwise the result of cpuset_setproc().
  */
 int
 kern_cpuset_setid(struct thread *td, cpuwhich_t which,
     id_t id, cpusetid_t setid)
 {
 	struct cpuset *target;
 	int rv;
 
 	/* Only whole processes can be assigned to a set at present. */
 	if (which != CPU_WHICH_PID)
 		return (EINVAL);
 
 	target = cpuset_lookup(setid, td);
 	if (target != NULL) {
 		rv = cpuset_setproc(id, target, NULL, NULL);
 		cpuset_rel(target);
 	} else
 		rv = ESRCH;
 
 	return (rv);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getid_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	cpusetid_t	*setid;
 };
 #endif
 int
 sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
 {
 
 	return (kern_cpuset_getid(td, uap->level, uap->which, uap->id,
 	    uap->setid));
 }
 
 /*
  * Look up the id of the cpuset selected by 'level', 'which' and 'id' and
  * copy it out to the user address 'setid'.
  *
  * Returns EINVAL when CPU_LEVEL_WHICH is combined with anything other
  * than CPU_WHICH_CPUSET, or when 'which' is CPU_WHICH_IRQ or
  * CPU_WHICH_DOMAIN.  Otherwise returns the error from cpuset_which() or
  * copyout(), or 0.
  */
 int
 kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, cpusetid_t *setid)
 {
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	cpusetid_t tmpid;
 	int error;
 
 	if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET)
 		return (EINVAL);
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error)
 		return (error);
 	switch (which) {
 	case CPU_WHICH_TID:
 	case CPU_WHICH_PID:
 		/* Take a reference on the thread's base set, then unlock. */
 		thread_lock(ttd);
 		set = cpuset_refbase(ttd->td_cpuset);
 		thread_unlock(ttd);
 		PROC_UNLOCK(p);
 		break;
 	case CPU_WHICH_CPUSET:
 	case CPU_WHICH_JAIL:
 		/* cpuset_which() already returned a referenced set. */
 		break;
 	case CPU_WHICH_IRQ:
 	case CPU_WHICH_DOMAIN:
 		return (EINVAL);
 	}
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 		/* Swap the reference for one on the root of that set. */
 		nset = cpuset_refroot(set);
 		cpuset_rel(set);
 		set = nset;
 		break;
 	case CPU_LEVEL_CPUSET:
 		break;
 	case CPU_LEVEL_WHICH:
 		break;
 	}
 	/* Read the id while the reference is still held. */
 	tmpid = set->cs_id;
 	cpuset_rel(set);
 	/* 'error' has not been assigned since the check above. */
 	if (error == 0)
 		error = copyout(&tmpid, setid, sizeof(tmpid));
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getaffinity_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		cpusetsize;
 	cpuset_t	*mask;
 };
 #endif
 int
 sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
 {
 
 	return (kern_cpuset_getaffinity(td, uap->level, uap->which,
 	    uap->id, uap->cpusetsize, uap->mask));
 }
 
 /*
  * Fetch the CPU mask selected by 'level', 'which' and 'id' and copy
  * 'cpusetsize' bytes of it out to the user address 'maskp'.
  *
  * The temporary buffer is allocated zeroed at 'cpusetsize' bytes, so any
  * bytes beyond sizeof(cpuset_t) are copied out as zero.
  *
  * Returns ERANGE when 'cpusetsize' is smaller than sizeof(cpuset_t) or
  * larger than CPU_MAXSIZE / NBBY, ECAPMODE when a capability mode caller
  * asks about anything but its own thread or process at CPU_LEVEL_WHICH,
  * EINVAL for an unsupported level/which combination, ESRCH for an out of
  * range domain id, or the error from cpuset_which(), intr_getaffinity()
  * or copyout().
  */
 int
 kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t cpusetsize, cpuset_t *maskp)
 {
 	struct thread *ttd;
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct proc *p;
 	cpuset_t *mask;
 	int error;
 	size_t size;
 
 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 		return (ERANGE);
 	/* In Capability mode, you can only get your own CPU set. */
 	if (IN_CAPABILITY_MODE(td)) {
 		if (level != CPU_LEVEL_WHICH)
 			return (ECAPMODE);
 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 			return (ECAPMODE);
 		if (id != -1)
 			return (ECAPMODE);
 	}
 	size = cpusetsize;
 	mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error)
 		goto out;
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			/* cpuset_which() left 'p' and 'set' NULL for these. */
 			error = EINVAL;
 			goto out;
 		}
 		/* Report the mask of the root or base of the chosen set. */
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		CPU_COPY(&nset->cs_mask, mask);
 		cpuset_rel(nset);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			thread_lock(ttd);
 			CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_PID:
 			/* A process reports the union of its threads' masks. */
 			FOREACH_THREAD_IN_PROC(p, ttd) {
 				thread_lock(ttd);
 				CPU_OR(mask, &ttd->td_cpuset->cs_mask);
 				thread_unlock(ttd);
 			}
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			CPU_COPY(&set->cs_mask, mask);
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 			error = intr_getaffinity(id, which, mask);
 			break;
 		case CPU_WHICH_DOMAIN:
 			if (id < 0 || id >= MAXMEMDOM)
 				error = ESRCH;
 			else
 				CPU_COPY(&cpuset_domain[id], mask);
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/* Release whatever the lookup above left referenced or locked. */
 	if (set)
 		cpuset_rel(set);
 	if (p)
 		PROC_UNLOCK(p);
 	if (error == 0)
 		error = copyout(mask, maskp, size);
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_setaffinity_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		cpusetsize;
 	const cpuset_t	*mask;
 };
 #endif
 int
 sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
 {
 
 	return (kern_cpuset_setaffinity(td, uap->level, uap->which,
 	    uap->id, uap->cpusetsize, uap->mask));
 }
 
 /*
  * Copy a CPU mask of 'cpusetsize' bytes in from the user address 'maskp'
  * and apply it to the object selected by 'level', 'which' and 'id'.
  *
  * Returns ERANGE when 'cpusetsize' is smaller than sizeof(cpuset_t) or
  * larger than CPU_MAXSIZE / NBBY, ECAPMODE when a capability mode caller
  * targets anything but its own thread or process at CPU_LEVEL_WHICH,
  * EINVAL when a byte beyond sizeof(cpuset_t) is non-zero or the
  * level/which combination is unsupported, or the error from copyin(),
  * cpuset_which(), cpuset_modify(), cpuset_setthread(), cpuset_setproc() or
  * intr_setaffinity().
  */
 int
 kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t cpusetsize, const cpuset_t *maskp)
 {
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	cpuset_t *mask;
 	int error;
 
 	if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY)
 		return (ERANGE);
 	/* In Capability mode, you can only set your own CPU set. */
 	if (IN_CAPABILITY_MODE(td)) {
 		if (level != CPU_LEVEL_WHICH)
 			return (ECAPMODE);
 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 			return (ECAPMODE);
 		if (id != -1)
 			return (ECAPMODE);
 	}
 	mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
 	error = copyin(maskp, mask, cpusetsize);
 	if (error)
 		goto out;
 	/*
 	 * Verify that no high bits are set.
 	 */
 	if (cpusetsize > sizeof(cpuset_t)) {
 		char *end;
 		char *cp;
 
 		/* Scan the bytes between sizeof(cpuset_t) and cpusetsize. */
 		end = cp = (char *)&mask->__bits;
 		end += cpusetsize;
 		cp += sizeof(cpuset_t);
 		while (cp != end)
 			if (*cp++ != 0) {
 				error = EINVAL;
 				goto out;
 			}
 
 	}
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		error = cpuset_which(which, id, &p, &ttd, &set);
 		if (error)
 			break;
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			PROC_UNLOCK(p);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			error = EINVAL;
 			goto out;
 		}
 		/* Modify the root or base of the chosen set. */
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		error = cpuset_modify(nset, mask);
 		cpuset_rel(nset);
 		cpuset_rel(set);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			error = cpuset_setthread(id, mask);
 			break;
 		case CPU_WHICH_PID:
 			error = cpuset_setproc(id, NULL, mask, NULL);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			error = cpuset_which(which, id, &p, &ttd, &set);
 			if (error == 0) {
 				error = cpuset_modify(set, mask);
 				cpuset_rel(set);
 			}
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 			error = intr_setaffinity(id, which, mask);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct cpuset_getdomain_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		domainsetsize;
 	domainset_t	*mask;
 	int 		*policy;
 };
 #endif
 int
 sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap)
 {
 
 	return (kern_cpuset_getdomain(td, uap->level, uap->which,
 	    uap->id, uap->domainsetsize, uap->mask, uap->policy));
 }
 
 /*
  * Fetch the domain mask and policy selected by 'level', 'which' and 'id'.
  * 'domainsetsize' bytes of mask are copied out to 'maskp' and the policy
  * is stored at 'policyp'.
  *
  * Returns ERANGE when 'domainsetsize' is smaller than sizeof(domainset_t)
  * or larger than DOMAINSET_MAXSIZE / NBBY, ECAPMODE when a capability mode
  * caller asks about anything but its own thread or process at
  * CPU_LEVEL_WHICH, EINVAL for an unsupported level/which combination,
  * EFAULT when the policy cannot be stored, or the error from
  * cpuset_which() or copyout().
  */
 int
 kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp)
 {
 	struct domainset outset;
 	struct thread *ttd;
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct domainset *dset;
 	struct proc *p;
 	domainset_t *mask;
 	int error;
 
 	if (domainsetsize < sizeof(domainset_t) ||
 	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
 		return (ERANGE);
 	/* In Capability mode, you can only get your own domain set. */
 	if (IN_CAPABILITY_MODE(td)) {
 		if (level != CPU_LEVEL_WHICH)
 			return (ECAPMODE);
 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 			return (ECAPMODE);
 		if (id != -1)
 			return (ECAPMODE);
 	}
 	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
 	/* 'outset' collects the answer; it starts out all zero. */
 	bzero(&outset, sizeof(outset));
 	error = cpuset_which(which, id, &p, &ttd, &set);
 	if (error)
 		goto out;
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			/* cpuset_which() left 'p' and 'set' NULL for these. */
 			error = EINVAL;
 			goto out;
 		}
 		/* Report the domainset of the root or base of the set. */
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		domainset_copy(nset->cs_domain, &outset);
 		cpuset_rel(nset);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			thread_lock(ttd);
 			domainset_copy(ttd->td_cpuset->cs_domain, &outset);
 			thread_unlock(ttd);
 			break;
 		case CPU_WHICH_PID:
 			FOREACH_THREAD_IN_PROC(p, ttd) {
 				thread_lock(ttd);
 				dset = ttd->td_cpuset->cs_domain;
 				/* Show all domains in the proc. */
 				DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask);
 				/* Last policy wins. */
 				outset.ds_policy = dset->ds_policy;
 				outset.ds_prefer = dset->ds_prefer;
 				thread_unlock(ttd);
 			}
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			domainset_copy(set->cs_domain, &outset);
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	/* Release whatever the lookup above left referenced or locked. */
 	if (set)
 		cpuset_rel(set);
 	if (p)
 		PROC_UNLOCK(p);
 	/*
 	 * Translate prefer into a set containing only the preferred domain,
 	 * not the entire fallback set.
 	 */
 	if (outset.ds_policy == DOMAINSET_POLICY_PREFER) {
 		DOMAINSET_ZERO(&outset.ds_mask);
 		DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask);
 	}
 	DOMAINSET_COPY(&outset.ds_mask, mask);
 	if (error == 0)
 		error = copyout(mask, maskp, domainsetsize);
 	if (error == 0)
 		if (suword32(policyp, outset.ds_policy) != 0)
 			error = EFAULT;
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 /*
  * Argument block for sys_cpuset_setdomain().  This copy is compiled only
  * when _SYS_SYSPROTO_H_ is not defined.  sys_cpuset_setdomain() passes
  * every field, unchanged, to kern_cpuset_setdomain().
  */
 struct cpuset_setdomain_args {
 	cpulevel_t	level;
 	cpuwhich_t	which;
 	id_t		id;
 	size_t		domainsetsize;
 	domainset_t	*mask;
 	int 		policy;
 };
 #endif
 int
 sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap)
 {
 
 	return (kern_cpuset_setdomain(td, uap->level, uap->which,
 	    uap->id, uap->domainsetsize, uap->mask, uap->policy));
 }
 
 /*
  * Apply a domain mask and policy to the object selected by
  * (level, which, id).
  *
  * The mask is read from maskp with copyin() (domainsetsize bytes).  Any
  * byte beyond sizeof(domainset_t) must be zero.  For
  * DOMAINSET_POLICY_PREFER the mask must name exactly one domain; that
  * domain becomes ds_prefer and the mask is widened to all domains before
  * being applied.
  *
  * Returns 0 on success; ERANGE for an unacceptable domainsetsize;
  * ECAPMODE when the request is not allowed in capability mode; EINVAL
  * for stray high bits, an out-of-range policy, a PREFER mask that does
  * not contain exactly one domain, or an unsupported level/which
  * combination; otherwise the error reported by copyin(), cpuset_which()
  * or the routine that applies the change.
  *
  * Every failure after the temporary mask has been allocated leaves
  * through the "out" label so that the allocation is always released.
  */
 int
 kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which,
     id_t id, size_t domainsetsize, const domainset_t *maskp, int policy)
 {
 	struct cpuset *nset;
 	struct cpuset *set;
 	struct thread *ttd;
 	struct proc *p;
 	struct domainset domain;
 	domainset_t *mask;
 	int error;
 
 	if (domainsetsize < sizeof(domainset_t) ||
 	    domainsetsize > DOMAINSET_MAXSIZE / NBBY)
 		return (ERANGE);
 	/* In Capability mode, you can only set your own domain set. */
 	if (IN_CAPABILITY_MODE(td)) {
 		if (level != CPU_LEVEL_WHICH)
 			return (ECAPMODE);
 		if (which != CPU_WHICH_TID && which != CPU_WHICH_PID)
 			return (ECAPMODE);
 		if (id != -1)
 			return (ECAPMODE);
 	}
 	memset(&domain, 0, sizeof(domain));
 	mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO);
 	error = copyin(maskp, mask, domainsetsize);
 	if (error)
 		goto out;
 	/*
 	 * Verify that no high bits are set.
 	 */
 	if (domainsetsize > sizeof(domainset_t)) {
 		char *end;
 		char *cp;
 
 		end = cp = (char *)&mask->__bits;
 		end += domainsetsize;
 		cp += sizeof(domainset_t);
 		while (cp != end)
 			if (*cp++ != 0) {
 				error = EINVAL;
 				goto out;
 			}
 
 	}
 	DOMAINSET_COPY(mask, &domain.ds_mask);
 	domain.ds_policy = policy;
 	/* Must not return directly from here on: mask is still allocated. */
 	if (policy <= DOMAINSET_POLICY_INVALID ||
 	    policy > DOMAINSET_POLICY_MAX) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Translate preferred policy into a mask and fallback. */
 	if (policy == DOMAINSET_POLICY_PREFER) {
 		/* Only support a single preferred domain. */
 		if (DOMAINSET_COUNT(&domain.ds_mask) != 1) {
 			error = EINVAL;
 			goto out;
 		}
 		domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1;
 		/* This will be constrained by domainset_shadow(). */
 		DOMAINSET_FILL(&domain.ds_mask);
 	}
 
 	switch (level) {
 	case CPU_LEVEL_ROOT:
 	case CPU_LEVEL_CPUSET:
 		error = cpuset_which(which, id, &p, &ttd, &set);
 		if (error)
 			break;
 		switch (which) {
 		case CPU_WHICH_TID:
 		case CPU_WHICH_PID:
 			/* Work on the thread's cpuset, holding a reference. */
 			thread_lock(ttd);
 			set = cpuset_ref(ttd->td_cpuset);
 			thread_unlock(ttd);
 			PROC_UNLOCK(p);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			/* cpuset_which() already supplied the set. */
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		case CPU_WHICH_DOMAIN:
 			error = EINVAL;
 			goto out;
 		}
 		/* Modify the root or base set derived from it. */
 		if (level == CPU_LEVEL_ROOT)
 			nset = cpuset_refroot(set);
 		else
 			nset = cpuset_refbase(set);
 		error = cpuset_modify_domain(nset, &domain);
 		cpuset_rel(nset);
 		cpuset_rel(set);
 		break;
 	case CPU_LEVEL_WHICH:
 		switch (which) {
 		case CPU_WHICH_TID:
 			error = _cpuset_setthread(id, NULL, &domain);
 			break;
 		case CPU_WHICH_PID:
 			error = cpuset_setproc(id, NULL, NULL, &domain);
 			break;
 		case CPU_WHICH_CPUSET:
 		case CPU_WHICH_JAIL:
 			error = cpuset_which(which, id, &p, &ttd, &set);
 			if (error == 0) {
 				error = cpuset_modify_domain(set, &domain);
 				cpuset_rel(set);
 			}
 			break;
 		case CPU_WHICH_IRQ:
 		case CPU_WHICH_INTRHANDLER:
 		case CPU_WHICH_ITHREAD:
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 out:
 	free(mask, M_TEMP);
 	return (error);
 }
 
 #ifdef DDB
 BITSET_DEFINE(bitset, 1);
 /*
  * Print the indices of the bits set in 'set', examining bits 0..size-1,
  * as a comma-separated list.  Prints "<none>" when no bit is set.
  */
 static void
 ddb_display_bitset(const struct bitset *set, int size)
 {
 	int idx, printed;
 
 	printed = 0;
 	for (idx = 0; idx < size; idx++) {
 		if (!CPU_ISSET(idx, set))
 			continue;
 		/* Every entry after the first is preceded by a comma. */
 		if (printed)
 			db_printf(",%d", idx);
 		else
 			db_printf("%d", idx);
 		printed = 1;
 	}
 	if (!printed)
 		db_printf("<none>");
 }
 
 /* Print a cpuset_t by viewing it as a bitset of CPU_SETSIZE bits. */
 void
 ddb_display_cpuset(const cpuset_t *set)
 {
 	const struct bitset *bits;
 
 	bits = (const struct bitset *)set;
 	ddb_display_bitset(bits, CPU_SETSIZE);
 }
 
 /* Print a domainset_t by viewing it as a bitset of DOMAINSET_SETSIZE bits. */
 static void
 ddb_display_domainset(const domainset_t *set)
 {
 	const struct bitset *bits;
 
 	bits = (const struct bitset *)set;
 	ddb_display_bitset(bits, DOMAINSET_SETSIZE);
 }
 
 /*
  * DDB "show cpusets": walk the cpuset_ids list and, for each set, print
  * its address, id, reference count, flags and parent id (0 when there is
  * no parent), followed by its CPU mask and its domain policy, preferred
  * domain and domain mask.
  */
 DB_SHOW_COMMAND(cpusets, db_show_cpusets)
 {
 	struct cpuset *set;
 
 	LIST_FOREACH(set, &cpuset_ids, cs_link) {
 		db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
 		    set, set->cs_id, set->cs_ref, set->cs_flags,
 		    (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
 		db_printf("  cpu mask=");
 		ddb_display_cpuset(&set->cs_mask);
 		db_printf("\n");
 		db_printf("  domain policy %d prefer %d mask=",
 		    set->cs_domain->ds_policy, set->cs_domain->ds_prefer);
 		ddb_display_domainset(&set->cs_domain->ds_mask);
 		db_printf("\n");
 		/* Stop once db_pager_quit has been set. */
 		if (db_pager_quit)
 			break;
 	}
 }
 
 /*
  * DDB "show domainsets": walk the cpuset_domains list and, for each
  * entry, print its address, policy, preferred domain, ds_cnt and ds_max,
  * followed by its domain mask.  The walk stops once db_pager_quit is set,
  * matching the behaviour of "show cpusets".
  */
 DB_SHOW_COMMAND(domainsets, db_show_domainsets)
 {
 	struct domainset *set;
 
 	LIST_FOREACH(set, &cpuset_domains, ds_link) {
 		db_printf("set=%p policy %d prefer %d cnt %d max %d\n",
 		    set, set->ds_policy, set->ds_prefer, set->ds_cnt,
 		    set->ds_max);
 		db_printf("  mask =");
 		ddb_display_domainset(&set->ds_mask);
 		db_printf("\n");
 		if (db_pager_quit)
 			break;
 	}
 }
 #endif /* DDB */
Index: head/sys/x86/x86/intr_machdep.c
===================================================================
--- head/sys/x86/x86/intr_machdep.c	(revision 331697)
+++ head/sys/x86/x86/intr_machdep.c	(revision 331698)
@@ -1,743 +1,774 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * Machine dependent interrupt code for x86.  For x86, we have to
  * deal with different PICs.  Thus, we use the passed in vector to lookup
  * an interrupt source associated with that vector.  The interrupt source
  * describes which PIC the source belongs to and includes methods to handle
  * that source.
  */
 
 #include "opt_atpic.h"
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/ktr.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 #include <sys/vmmeter.h>
 #include <machine/clock.h>
 #include <machine/intr_machdep.h>
 #include <machine/smp.h>
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #ifndef DEV_ATPIC
 #include <machine/segments.h>
 #include <machine/frame.h>
 #include <dev/ic/i8259.h>
 #include <x86/isa/icu.h>
 #include <isa/isareg.h>
 #endif
 
+#include <vm/vm.h>
+
 #define	MAX_STRAY_LOG	5
 
 typedef void (*mask_fn)(void *);
 
 static int intrcnt_index;
 static struct intsrc *interrupt_sources[NUM_IO_INTS];
 #ifdef SMP
 static struct intsrc *interrupt_sorted[NUM_IO_INTS];
 CTASSERT(sizeof(interrupt_sources) == sizeof(interrupt_sorted));
 static int intrbalance;
 SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RW, &intrbalance, 0,
     "Interrupt auto-balance interval (seconds).  Zero disables.");
 static struct timeout_task intrbalance_task;
 #endif
 static struct sx intrsrc_lock;
 static struct mtx intrpic_lock;
 static struct mtx intrcnt_lock;
 static TAILQ_HEAD(pics_head, pic) pics;
 
 #if defined(SMP) && !defined(EARLY_AP_STARTUP)
 static int assign_cpu;
 #endif
 
 u_long intrcnt[INTRCNT_COUNT];
 char intrnames[INTRCNT_COUNT * (MAXCOMLEN + 1)];
 size_t sintrcnt = sizeof(intrcnt);
 size_t sintrnames = sizeof(intrnames);
 
 static int	intr_assign_cpu(void *arg, int cpu);
 static void	intr_disable_src(void *arg);
 static void	intr_init(void *__dummy);
 static int	intr_pic_registered(struct pic *pic);
 static void	intrcnt_setname(const char *name, int index);
 static void	intrcnt_updatename(struct intsrc *is);
 static void	intrcnt_register(struct intsrc *is);
 
 /*
  * Return 1 if 'pic' is already on the list of registered interrupt
  * controllers, 0 otherwise.  The list is walked without taking any lock
  * here.
  */
 static int
 intr_pic_registered(struct pic *pic)
 {
 	struct pic *cur;
 	int found;
 
 	found = 0;
 	TAILQ_FOREACH(cur, &pics, pics) {
 		if (cur == pic) {
 			found = 1;
 			break;
 		}
 	}
 	return (found);
 }
 
 /*
  * Register a new interrupt controller (PIC).  Registration exists so that
  * suspend and resume can operate on whole controllers instead of on
  * individual sources, which also lets controllers with no active sources
  * (such as 8259As in a system using the APICs) take part in suspend and
  * resume.
  *
  * Returns 0 after appending the controller to the list, or EBUSY if it
  * was already registered.
  */
 int
 intr_register_pic(struct pic *pic)
 {
 	int error;
 
 	error = 0;
 	mtx_lock(&intrpic_lock);
 	if (!intr_pic_registered(pic))
 		TAILQ_INSERT_TAIL(&pics, pic, pics);
 	else
 		error = EBUSY;
 	mtx_unlock(&intrpic_lock);
 	return (error);
 }
 
 /*
  * Register a new interrupt source with the global interrupt system.
  * The global interrupts need to be disabled when this function is
  * called.
  *
  * Returns 0 on success, EEXIST if the source's vector already has a
  * registered source, or the error from intr_event_create().
  */
 int
 intr_register_source(struct intsrc *isrc)
 {
 	int error, vector;
 
 	KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC"));
 	vector = isrc->is_pic->pic_vector(isrc);
 	/* Cheap unlocked check; repeated under the lock below. */
 	if (interrupt_sources[vector] != NULL)
 		return (EEXIST);
 	error = intr_event_create(&isrc->is_event, isrc, 0, vector,
 	    intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source,
 	    (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:",
 	    vector);
 	if (error)
 		return (error);
 	sx_xlock(&intrsrc_lock);
 	if (interrupt_sources[vector] != NULL) {
 		/* The slot was taken in the meantime; undo the event creation. */
 		sx_xunlock(&intrsrc_lock);
 		intr_event_destroy(isrc->is_event);
 		return (EEXIST);
 	}
 	intrcnt_register(isrc);
 	interrupt_sources[vector] = isrc;
 	isrc->is_handlers = 0;
 	sx_xunlock(&intrsrc_lock);
 	return (0);
 }
 
 /*
  * Return the interrupt source registered for 'vector', or NULL when the
  * vector is outside the interrupt_sources[] table or has no source.
  */
 struct intsrc *
 intr_lookup_source(int vector)
 {
 
 	if (vector >= 0 && vector < nitems(interrupt_sources))
 		return (interrupt_sources[vector]);
 	return (NULL);
 }
 
 int
 intr_add_handler(const char *name, int vector, driver_filter_t filter,
-    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep)
+    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep,
+    int domain)
 {
 	struct intsrc *isrc;
 	int error;
 
 	isrc = intr_lookup_source(vector);
 	if (isrc == NULL)
 		return (EINVAL);
 	error = intr_event_add_handler(isrc->is_event, name, filter, handler,
 	    arg, intr_priority(flags), flags, cookiep);
 	if (error == 0) {
 		sx_xlock(&intrsrc_lock);
 		intrcnt_updatename(isrc);
 		isrc->is_handlers++;
 		if (isrc->is_handlers == 1) {
+			isrc->is_domain = domain;
 			isrc->is_pic->pic_enable_intr(isrc);
 			isrc->is_pic->pic_enable_source(isrc);
 		}
 		sx_xunlock(&intrsrc_lock);
 	}
 	return (error);
 }
 
 /*
  * Remove the handler identified by 'cookie' from its interrupt event.
  * When the last handler of a source goes away the source is masked and
  * its interrupt disabled in the controller.  Returns the result of
  * intr_event_remove_handler().
  */
 int
 intr_remove_handler(void *cookie)
 {
 	struct intsrc *src;
 	int error;
 
 	/* Look the source up first; the cookie is consumed by the removal. */
 	src = intr_handler_source(cookie);
 	error = intr_event_remove_handler(cookie);
 	if (error != 0)
 		return (error);
 
 	sx_xlock(&intrsrc_lock);
 	src->is_handlers--;
 	if (src->is_handlers == 0) {
 		src->is_pic->pic_disable_source(src, PIC_NO_EOI);
 		src->is_pic->pic_disable_intr(src);
 	}
 	intrcnt_updatename(src);
 	sx_xunlock(&intrsrc_lock);
 	return (error);
 }
 
 /*
  * Forward a trigger/polarity configuration request to the controller that
  * owns 'vector'.  Returns EINVAL when no source is registered for it.
  */
 int
 intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol)
 {
 	struct intsrc *src;
 
 	src = intr_lookup_source(vector);
 	if (src != NULL)
 		return (src->is_pic->pic_config_intr(src, trig, pol));
 	return (EINVAL);
 }
 
 /*
  * Callback handed to intr_event_create(): ask the owning controller to
  * disable the source passed in 'arg', requesting PIC_EOI.
  */
 static void
 intr_disable_src(void *arg)
 {
 	struct intsrc *src = arg;
 
 	src->is_pic->pic_disable_source(src, PIC_EOI);
 }
 
 /*
  * Account for and dispatch one hardware interrupt from 'isrc'.  Bumps the
  * per-source and global interrupt counters, hands the event to
  * intr_event_handle(), and treats a non-zero return from it as a stray
  * interrupt.
  */
 void
 intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame)
 {
 	struct intr_event *ie;
 	int vector;
 
 	/*
 	 * We count software interrupts when we process them.  The
 	 * code here follows previous practice, but there's an
 	 * argument for counting hardware interrupts when they're
 	 * processed too.
 	 */
 	(*isrc->is_count)++;
 	VM_CNT_INC(v_intr);
 
 	ie = isrc->is_event;
 
 	/*
 	 * XXX: We assume that IRQ 0 is only used for the ISA timer
 	 * device (clk).
 	 */
 	vector = isrc->is_pic->pic_vector(isrc);
 	if (vector == 0)
 		clkintr_pending = 1;
 
 	/*
 	 * For stray interrupts, mask and EOI the source, bump the
 	 * stray count, and log the condition.
 	 */
 	if (intr_event_handle(ie, frame) != 0) {
 		isrc->is_pic->pic_disable_source(isrc, PIC_EOI);
 		(*isrc->is_straycount)++;
 		/* Log each stray until MAX_STRAY_LOG is reached, then once more. */
 		if (*isrc->is_straycount < MAX_STRAY_LOG)
 			log(LOG_ERR, "stray irq%d\n", vector);
 		else if (*isrc->is_straycount == MAX_STRAY_LOG)
 			log(LOG_CRIT,
 			    "too many stray irq %d's: not logging anymore\n",
 			    vector);
 	}
 }
 
 /*
  * Resume every registered interrupt controller that provides a resume
  * method, in registration order.  When DEV_ATPIC is not defined,
  * atpic_reset() is called first.
  */
 void
 intr_resume(bool suspend_cancelled)
 {
 	struct pic *cur;
 
 #ifndef DEV_ATPIC
 	atpic_reset();
 #endif
 	mtx_lock(&intrpic_lock);
 	TAILQ_FOREACH(cur, &pics, pics) {
 		if (cur->pic_resume == NULL)
 			continue;
 		cur->pic_resume(cur, suspend_cancelled);
 	}
 	mtx_unlock(&intrpic_lock);
 }
 
 /*
  * Suspend every registered interrupt controller that provides a suspend
  * method, walking the list in reverse registration order.
  */
 void
 intr_suspend(void)
 {
 	struct pic *cur;
 
 	mtx_lock(&intrpic_lock);
 	TAILQ_FOREACH_REVERSE(cur, &pics, pics_head, pics) {
 		if (cur->pic_suspend == NULL)
 			continue;
 		cur->pic_suspend(cur);
 	}
 	mtx_unlock(&intrpic_lock);
 }
 
 /*
  * Callback handed to intr_event_create(): route the interrupt source in
  * 'arg' to 'cpu'.
  *
  * With SMP, the controller's pic_assign_cpu method is called with the
  * APIC id of 'cpu' and is_cpu is updated on success; the request is
  * silently accepted (0 is returned) when cpu is NOCPU or when the
  * surrounding condition says assignment should not happen yet.  Without
  * SMP, EOPNOTSUPP is returned.
  */
 static int
 intr_assign_cpu(void *arg, int cpu)
 {
 #ifdef SMP
 	struct intsrc *isrc;
 	int error;
 
 	/* Both arms below open the same "if"; its body follows the #endif. */
 #ifdef EARLY_AP_STARTUP
 	MPASS(mp_ncpus == 1 || smp_started);
 
 	/* Nothing to do if there is only a single CPU. */
 	if (mp_ncpus > 1 && cpu != NOCPU) {
 #else
 	/*
 	 * Don't do anything during early boot.  We will pick up the
 	 * assignment once the APs are started.
 	 */
 	if (assign_cpu && cpu != NOCPU) {
 #endif
 		isrc = arg;
 		sx_xlock(&intrsrc_lock);
 		error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]);
 		if (error == 0)
 			isrc->is_cpu = cpu;
 		sx_xunlock(&intrsrc_lock);
 	} else
 		error = 0;
 	return (error);
 #else
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * Store 'name' in slot 'index' of the intrnames[] table.  Each slot is
  * MAXCOMLEN + 1 bytes; the name is left-justified and padded to MAXCOMLEN
  * characters by the format string.
  */
 static void
 intrcnt_setname(const char *name, int index)
 {
 	char *slot;
 
 	slot = intrnames + (MAXCOMLEN + 1) * index;
 	snprintf(slot, MAXCOMLEN + 1, "%-*s", MAXCOMLEN, name);
 }
 
 /*
  * Refresh the intrnames[] entry of a source from the full name of its
  * interrupt event.
  */
 static void
 intrcnt_updatename(struct intsrc *is)
 {
 	const char *fullname;
 
 	fullname = is->is_event->ie_fullname;
 	intrcnt_setname(fullname, is->is_index);
 }
 
 static void
 intrcnt_register(struct intsrc *is)
 {
 	char straystr[MAXCOMLEN + 1];
 
 	KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__));
 	mtx_lock_spin(&intrcnt_lock);
 	is->is_index = intrcnt_index;
 	intrcnt_index += 2;
 	snprintf(straystr, MAXCOMLEN + 1, "stray irq%d",
 	    is->is_pic->pic_vector(is));
 	intrcnt_updatename(is);
 	is->is_count = &intrcnt[is->is_index];
 	intrcnt_setname(straystr, is->is_index + 1);
 	is->is_straycount = &intrcnt[is->is_index + 1];
 	mtx_unlock_spin(&intrcnt_lock);
 }
 
 /*
  * Reserve the next intrcnt[] slot under the given name and return a
  * pointer to the counter through 'countp'.
  */
 void
 intrcnt_add(const char *name, u_long **countp)
 {
 
 	mtx_lock_spin(&intrcnt_lock);
 	/* One slot is consumed below; make sure it lies inside intrcnt[]. */
 	KASSERT(intrcnt_index < INTRCNT_COUNT,
 	    ("%s: interrupt counter table exhausted", __func__));
 	*countp = &intrcnt[intrcnt_index];
 	intrcnt_setname(name, intrcnt_index);
 	intrcnt_index++;
 	mtx_unlock_spin(&intrcnt_lock);
 }
 
 /*
  * One-time setup of this file's global state, run via SYSINIT at
  * SI_SUB_INTR / SI_ORDER_FIRST: names counter slot 0 "???", starts
  * handing out counter slots at index 1, and initializes the controller
  * list and the three locks.
  */
 static void
 intr_init(void *dummy __unused)
 {
 
 	intrcnt_setname("???", 0);
 	intrcnt_index = 1;
 	TAILQ_INIT(&pics);
 	mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF);
 	sx_init(&intrsrc_lock, "intrsrc");
 	mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN);
 }
 SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL);
 
 /*
  * Last step of the SI_SUB_INTR stage (registered with SI_ORDER_ANY):
  * turn interrupts on for the processor running this code.
  */
 static void
 intr_init_final(void *dummy __unused)
 {
 
 	/*
 	 * Enable interrupts on the BSP after all of the interrupt
 	 * controllers are initialized.  Device interrupts are still
 	 * disabled in the interrupt controllers until interrupt
 	 * handlers are registered.  Interrupts are enabled on each AP
 	 * after their first context switch.
 	 */
 	enable_intr();
 }
 SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL);
 
 #ifndef DEV_ATPIC
 /*
  * Initialize the two 8259A's to a known-good shutdown state.  Compiled
  * only when DEV_ATPIC is not defined.  The same six-write sequence is
  * issued to each controller, first at IO_ICU1 and then at IO_ICU2.
  */
 void
 atpic_reset(void)
 {
 
 	/* First controller (IO_ICU1). */
 	outb(IO_ICU1, ICW1_RESET | ICW1_IC4);
 	outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS);
 	outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID));
 	outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE);
 	/* NOTE(review): 0xff presumably masks all eight inputs - confirm. */
 	outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff);
 	outb(IO_ICU1, OCW3_SEL | OCW3_RR);
 
 	/* Second controller (IO_ICU2); its vector base is offset by 8. */
 	outb(IO_ICU2, ICW1_RESET | ICW1_IC4);
 	outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8);
 	outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID);
 	outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE);
 	outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff);
 	outb(IO_ICU2, OCW3_SEL | OCW3_RR);
 }
 #endif
 
 /*
  * Add a description to an active interrupt handler.  Returns EINVAL when
  * no source is registered for 'vector', otherwise the result of
  * intr_event_describe_handler(); on success the source's counter name is
  * refreshed as well.
  */
 int
 intr_describe(u_int vector, void *ih, const char *descr)
 {
 	struct intsrc *src;
 	int error;
 
 	src = intr_lookup_source(vector);
 	if (src == NULL)
 		return (EINVAL);
 	error = intr_event_describe_handler(src->is_event, ih, descr);
 	if (error == 0)
 		intrcnt_updatename(src);
 	return (error);
 }
 
 /*
  * Walk every registered interrupt source and, where the owning controller
  * provides a pic_reprogram_pin method, invoke it on the source.
  */
 void
 intr_reprogram(void)
 {
 	struct intsrc *src;
 	int idx;
 
 	sx_xlock(&intrsrc_lock);
 	for (idx = 0; idx < NUM_IO_INTS; idx++) {
 		src = interrupt_sources[idx];
 		if (src != NULL && src->is_pic->pic_reprogram_pin != NULL)
 			src->is_pic->pic_reprogram_pin(src);
 	}
 	sx_xunlock(&intrsrc_lock);
 }
 
 #ifdef DDB
 /*
  * DDB "show irqs": dump the interrupt event of every registered source.
  * The "v" modifier selects verbose output.  The walk stops once
  * db_pager_quit is set.
  */
 DB_SHOW_COMMAND(irqs, db_show_irqs)
 {
 	struct intsrc *src;
 	int idx, verbose;
 
 	verbose = (strcmp(modif, "v") == 0) ? 1 : 0;
 	for (idx = 0; idx < NUM_IO_INTS && !db_pager_quit; idx++) {
 		src = interrupt_sources[idx];
 		if (src != NULL)
 			db_dump_intr_event(src->is_event, verbose);
 	}
 }
 #endif
 
 #ifdef SMP
 /*
  * Support for balancing interrupt sources across CPUs.  For now we just
  * allocate CPUs round-robin.
  */
 
 cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1);
-static int current_cpu;
+static int current_cpu[MAXMEMDOM];
 
+static void
+intr_init_cpus(void)
+{
+	int i;
+
+	for (i = 0; i < vm_ndomains; i++) {
+		current_cpu[i] = 0;
+		if (!CPU_ISSET(current_cpu[i], &intr_cpus) ||
+		    !CPU_ISSET(current_cpu[i], &cpuset_domain[i]))
+			intr_next_cpu(i);
+	}
+}
+
 /*
  * Return the CPU that the next interrupt source should use.  For now
  * this just returns the next local APIC according to round-robin.
  */
 u_int
-intr_next_cpu(void)
+intr_next_cpu(int domain)
 {
 	u_int apic_id;
 
 #ifdef EARLY_AP_STARTUP
 	MPASS(mp_ncpus == 1 || smp_started);
 	if (mp_ncpus == 1)
 		return (PCPU_GET(apic_id));
 #else
 	/* Leave all interrupts on the BSP during boot. */
 	if (!assign_cpu)
 		return (PCPU_GET(apic_id));
 #endif
 
 	mtx_lock_spin(&icu_lock);
-	apic_id = cpu_apic_ids[current_cpu];
+	apic_id = cpu_apic_ids[current_cpu[domain]];
 	do {
-		current_cpu++;
-		if (current_cpu > mp_maxid)
-			current_cpu = 0;
-	} while (!CPU_ISSET(current_cpu, &intr_cpus));
+		current_cpu[domain]++;
+		if (current_cpu[domain] > mp_maxid)
+			current_cpu[domain] = 0;
+	} while (!CPU_ISSET(current_cpu[domain], &intr_cpus) ||
+	    !CPU_ISSET(current_cpu[domain], &cpuset_domain[domain]));
 	mtx_unlock_spin(&icu_lock);
 	return (apic_id);
 }
 
 /*
  * Attempt to bind the specified IRQ to the specified CPU.  Returns EINVAL
  * when no source is registered for 'vector', otherwise the result of
  * intr_event_bind().
  */
 int
 intr_bind(u_int vector, u_char cpu)
 {
 	struct intsrc *src;
 
 	src = intr_lookup_source(vector);
 	if (src != NULL)
 		return (intr_event_bind(src->is_event, cpu));
 	return (EINVAL);
 }
 
 /*
  * Add a CPU to our mask of valid CPUs that can be destinations of
  * interrupts.  Panics if 'cpu' is not below MAXCPU.
  */
 void
 intr_add_cpu(u_int cpu)
 {
 
 	if (cpu >= MAXCPU)
 		panic("%s: Invalid CPU ID", __func__);
 	/* Under bootverbose, report the local APIC id that maps to this CPU. */
 	if (bootverbose)
 		printf("INTR: Adding local APIC %d as a target\n",
 		    cpu_apic_ids[cpu]);
 
 	CPU_SET(cpu, &intr_cpus);
 }
 
-#ifndef EARLY_AP_STARTUP
+#ifdef EARLY_AP_STARTUP
+static void
+intr_smp_startup(void *arg __unused)
+{
+
+	intr_init_cpus();
+	return;
+}
+SYSINIT(intr_smp_startup, SI_SUB_SMP, SI_ORDER_SECOND, intr_smp_startup,
+    NULL);
+
+#else
 /*
  * Distribute all the interrupt sources among the available CPUs once the
  * AP's have been launched.
  */
 static void
 intr_shuffle_irqs(void *arg __unused)
 {
 	struct intsrc *isrc;
 	u_int cpu;
 	int i;
 
+	intr_init_cpus();
 	/* Don't bother on UP. */
 	if (mp_ncpus == 1)
 		return;
 
 	/* Round-robin assign a CPU to each enabled source. */
 	sx_xlock(&intrsrc_lock);
 	assign_cpu = 1;
 	for (i = 0; i < NUM_IO_INTS; i++) {
 		isrc = interrupt_sources[i];
 		if (isrc != NULL && isrc->is_handlers > 0) {
 			/*
 			 * If this event is already bound to a CPU,
 			 * then assign the source to that CPU instead
 			 * of picking one via round-robin.  Note that
 			 * this is careful to only advance the
 			 * round-robin if the CPU assignment succeeds.
 			 */
 			cpu = isrc->is_event->ie_cpu;
 			if (cpu == NOCPU)
-				cpu = current_cpu;
+				cpu = current_cpu[isrc->is_domain];
 			if (isrc->is_pic->pic_assign_cpu(isrc,
 			    cpu_apic_ids[cpu]) == 0) {
 				isrc->is_cpu = cpu;
 				if (isrc->is_event->ie_cpu == NOCPU)
-					intr_next_cpu();
+					intr_next_cpu(isrc->is_domain);
 			}
 		}
 	}
 	sx_xunlock(&intrsrc_lock);
 }
 SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs,
     NULL);
 #endif
 
 /*
  * TODO: Export this information in a non-MD fashion, integrate with vmstat -i.
  */
 static int
 sysctl_hw_intrs(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct intsrc *isrc;
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	sx_slock(&intrsrc_lock);
 	for (i = 0; i < NUM_IO_INTS; i++) {
 		isrc = interrupt_sources[i];
 		if (isrc == NULL)
 			continue;
-		sbuf_printf(&sbuf, "%s:%d @%d: %ld\n",
+		sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n",
 		    isrc->is_event->ie_fullname,
 		    isrc->is_index,
 		    isrc->is_cpu,
+		    isrc->is_domain,
 		    *isrc->is_count);
 	}
 
 	sx_sunlock(&intrsrc_lock);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 SYSCTL_PROC(_hw, OID_AUTO, intrs, CTLTYPE_STRING | CTLFLAG_RW,
     0, 0, sysctl_hw_intrs, "A", "interrupt:number @cpu: count");
 
 /*
  * Compare two, possibly NULL, entries in the interrupt source array
  * by load.
  *
  * Returns a negative value, zero or a positive value when the first
  * entry's interrupt count is respectively lower than, equal to or higher
  * than the second's.  A NULL entry orders before any non-NULL entry and
  * two NULL entries compare equal.
  *
  * The counts are u_long, so they are compared explicitly: returning
  * their difference would truncate it to int and could report the wrong
  * sign, or zero for counts that differ.
  */
 static int
 intrcmp(const void *one, const void *two)
 {
 	const struct intsrc *i1, *i2;
 
 	i1 = *(const struct intsrc * const *)one;
 	i2 = *(const struct intsrc * const *)two;
 	if (i1 != NULL && i2 != NULL) {
 		if (*i1->is_count < *i2->is_count)
 			return (-1);
 		if (*i1->is_count > *i2->is_count)
 			return (1);
 		return (0);
 	}
 	if (i1 != NULL)
 		return (1);
 	if (i2 != NULL)
 		return (-1);
 	return (0);
 }
 
 /*
  * Balance IRQs across available CPUs according to load.
  */
 static void
 intr_balance(void *dummy __unused, int pending __unused)
 {
 	struct intsrc *isrc;
 	int interval;
 	u_int cpu;
 	int i;
 
 	interval = intrbalance;
 	if (interval == 0)
 		goto out;
 
 	/*
 	 * Sort interrupts according to count.
 	 */
 	sx_xlock(&intrsrc_lock);
 	memcpy(interrupt_sorted, interrupt_sources, sizeof(interrupt_sorted));
 	qsort(interrupt_sorted, NUM_IO_INTS, sizeof(interrupt_sorted[0]),
 	    intrcmp);
 
 	/*
 	 * Restart the scan from the same location to avoid moving in the
 	 * common case.
 	 */
-	current_cpu = 0;
+	intr_init_cpus();
 
 	/*
 	 * Assign round-robin from most loaded to least.
 	 */
 	for (i = NUM_IO_INTS - 1; i >= 0; i--) {
 		isrc = interrupt_sorted[i];
 		if (isrc == NULL  || isrc->is_event->ie_cpu != NOCPU)
 			continue;
-		cpu = current_cpu;
-		intr_next_cpu();
+		cpu = current_cpu[isrc->is_domain];
+		intr_next_cpu(isrc->is_domain);
 		if (isrc->is_cpu != cpu &&
 		    isrc->is_pic->pic_assign_cpu(isrc,
 		    cpu_apic_ids[cpu]) == 0)
 			isrc->is_cpu = cpu;
 	}
 	sx_xunlock(&intrsrc_lock);
 out:
 	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task,
 	    interval ? hz * interval : hz * 60);
 
 }
 
 /*
  * Run via SYSINIT at SI_SUB_SMP / SI_ORDER_ANY: set up the timeout task
  * that runs intr_balance() on taskqueue_thread and enqueue it with a
  * timeout of hz ticks.
  */
 static void
 intr_balance_init(void *dummy __unused)
 {
 
 	TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance,
 	    NULL);
 	taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz);
 }
 SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL);
 
 #else
 /*
  * Always route interrupts to the current processor in the UP case.
  */
 u_int
-intr_next_cpu(void)
+intr_next_cpu(int domain)
 {
 
 	return (PCPU_GET(apic_id));
 }
 #endif
Index: head/sys/x86/x86/io_apic.c
===================================================================
--- head/sys/x86/x86/io_apic.c	(revision 331697)
+++ head/sys/x86/x86/io_apic.c	(revision 331698)
@@ -1,1234 +1,1234 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 #include "opt_isa.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/rman.h>
 #include <sys/sysctl.h>
 
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <x86/apicreg.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <machine/resource.h>
 #include <machine/segments.h>
 #include <x86/iommu/iommu_intrmap.h>
 
 /* IRQs 1 .. IOAPIC_ISA_INTS - 1 are set up as ISA pins by ioapic_create(). */
 #define IOAPIC_ISA_INTS		16
 /* Size passed when mapping and reserving an I/O APIC register window. */
 #define	IOAPIC_MEM_REGION	32
 /*
  * Register indices of the low and high 32-bit words of redirection table
  * entry 'i': each entry occupies two consecutive registers starting at
  * IOAPIC_REDTBL.
  */
 #define	IOAPIC_REDTBL_LO(i)	(IOAPIC_REDTBL + (i) * 2)
 #define	IOAPIC_REDTBL_HI(i)	(IOAPIC_REDTBL_LO(i) + 1)
 
 static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures");
 
 /*
  * I/O APIC interrupt source driver.  Each pin is assigned an IRQ cookie
  * as laid out in the ACPI System Interrupt number model where each I/O
  * APIC has a contiguous chunk of the System Interrupt address space.
  * We assume that IRQs 1 - 15 behave like ISA IRQs and that all other
  * IRQs behave as PCI IRQs by default.  We also assume that the pin for
  * IRQ 0 is actually an ExtINT pin.  The apic enumerators override the
  * configuration of individual pins as indicated by their tables.
  *
  * Documentation for the I/O APIC: "82093AA I/O Advanced Programmable
  * Interrupt Controller (IOAPIC)", May 1996, Intel Corp.
  * ftp://download.intel.com/design/chipsets/datashts/29056601.pdf
  */
 
 /*
  * Per-pin state.  io_intsrc is the first member and the PIC methods in
  * this file cast a struct intsrc pointer straight to a
  * struct ioapic_intsrc pointer, so it has to stay first.
  */
 struct ioapic_intsrc {
 	struct intsrc io_intsrc;	/* generic interrupt source */
 	u_int io_irq;			/* IRQ number or an IRQ_* special value */
 	u_int io_intpin:8;		/* pin index within its I/O APIC */
 	u_int io_vector:8;		/* allocated APIC vector, 0 if none */
 	u_int io_cpu;			/* destination local APIC ID */
 	u_int io_activehi:1;		/* 1: active-high, 0: active-low */
 	u_int io_edgetrigger:1;		/* 1: edge, 0: level triggered */
 	u_int io_masked:1;		/* software view of the mask bit */
 	int io_bus:4;			/* APIC_BUS_* bus type */
 	uint32_t io_lowreg;		/* last low word programmed for pin */
 	u_int io_remap_cookie;		/* handed to the iommu_*_ioapic_intr() calls */
 };
 
 /*
  * Per-I/O APIC state.  io_pic is the first member; ioapic_resume() and the
  * intsrc methods cast a struct pic pointer to a struct ioapic pointer.
  * The structure is allocated together with io_numintr trailing io_pins
  * entries (see ioapic_create()).
  */
 struct ioapic {
 	struct pic io_pic;
 	u_int io_id:8;			/* logical ID */
 	u_int io_apic_id:4;		/* ID from/for the IOAPIC_ID register */
 	u_int io_intbase:8;		/* System Interrupt base */
 	u_int io_numintr:8;		/* number of pins */
 	u_int io_haseoi:1;		/* version >= 0x20: EOI register usable */
 	volatile ioapic_t *io_addr;	/* XXX: should use bus_space */
 	vm_paddr_t io_paddr;		/* physical address of the registers */
 	STAILQ_ENTRY(ioapic) io_next;	/* linkage on ioapic_list */
 	device_t pci_dev;		/* matched pci device, if found */
 	struct resource *pci_wnd;	/* BAR 0, should be same or alias to
 					   io_paddr */
 	struct ioapic_intsrc io_pins[0];
 };
 
 static u_int	ioapic_read(volatile ioapic_t *apic, int reg);
 static void	ioapic_write(volatile ioapic_t *apic, int reg, u_int val);
 static const char *ioapic_bus_string(int bus_type);
 static void	ioapic_print_irq(struct ioapic_intsrc *intpin);
 static void	ioapic_enable_source(struct intsrc *isrc);
 static void	ioapic_disable_source(struct intsrc *isrc, int eoi);
 static void	ioapic_eoi_source(struct intsrc *isrc);
 static void	ioapic_enable_intr(struct intsrc *isrc);
 static void	ioapic_disable_intr(struct intsrc *isrc);
 static int	ioapic_vector(struct intsrc *isrc);
 static int	ioapic_source_pending(struct intsrc *isrc);
 static int	ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig,
 		    enum intr_polarity pol);
 static void	ioapic_resume(struct pic *pic, bool suspend_cancelled);
 static int	ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id);
 static void	ioapic_program_intpin(struct ioapic_intsrc *intpin);
 static void	ioapic_reprogram_intpin(struct intsrc *isrc);
 
 /* All registered I/O APICs; entries are appended by ioapic_register(). */
 static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list);
 /*
  * Method table copied into the io_pic member of every I/O APIC created by
  * ioapic_create().  There is no suspend method.
  */
 struct pic ioapic_template = {
 	.pic_enable_source = ioapic_enable_source,
 	.pic_disable_source = ioapic_disable_source,
 	.pic_eoi_source = ioapic_eoi_source,
 	.pic_enable_intr = ioapic_enable_intr,
 	.pic_disable_intr = ioapic_disable_intr,
 	.pic_vector = ioapic_vector,
 	.pic_source_pending = ioapic_source_pending,
 	.pic_suspend = NULL,
 	.pic_resume = ioapic_resume,
 	.pic_config_intr = ioapic_config_intr,
 	.pic_assign_cpu = ioapic_assign_cpu,
 	.pic_reprogram_pin = ioapic_reprogram_intpin,
 };
 
 /* IRQ base expected for the next I/O APIC passed to ioapic_create(). */
 static int next_ioapic_base;
 /* Logical ID (io_id) handed to the next I/O APIC created. */
 static u_int next_id;
 
 /*
  * When zero (the default), ioapic_set_extint() leaves the ExtINT pin
  * masked.
  */
 static int enable_extint;
 SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0,
     "Enable the ExtINT pin in the first I/O APIC");
 
 /*
  * Signal end-of-interrupt for a source.
  *
  * The local APIC is always EOI'd.  Nothing more is done unless
  * lapic_eoi_suppression is set and the pin is level-triggered; in that
  * case the I/O APIC is EOI'd as well, either through its EOI register
  * or by toggling the pin's trigger mode.
  *
  * 'locked' is non-zero when the caller already holds icu_lock; otherwise
  * the lock is taken here around the redirection table writes.
  */
 static void
 _ioapic_eoi_source(struct intsrc *isrc, int locked)
 {
 	struct ioapic_intsrc *src;
 	struct ioapic *io;
 	volatile uint32_t *apic_eoi;
 	uint32_t low1;
 
 	lapic_eoi();
 	if (!lapic_eoi_suppression)
 		return;
 	src = (struct ioapic_intsrc *)isrc;
 	if (src->io_edgetrigger)
 		return;
 	io = (struct ioapic *)isrc->is_pic;
 
 	/*
 	 * Handle targeted EOI for level-triggered pins, if broadcast
 	 * EOI suppression is supported by LAPICs.
 	 */
 	if (io->io_haseoi) {
 		/*
 		 * If IOAPIC has EOI Register, simply write vector
 		 * number into the reg.
 		 */
 		apic_eoi = (volatile uint32_t *)((volatile char *)
 		    io->io_addr + IOAPIC_EOIR);
 		*apic_eoi = src->io_vector;
 	} else {
 		/*
 		 * Otherwise, if IO-APIC is too old to provide EOIR,
 		 * do what Intel did for the Linux kernel.  Temporarily
 		 * switch the pin to edge-trigger and back, masking
 		 * the pin during the trick.
 		 */
 		if (!locked)
 			mtx_lock_spin(&icu_lock);
 		low1 = src->io_lowreg;
 		low1 &= ~IOART_TRGRLVL;
 		low1 |= IOART_TRGREDG | IOART_INTMSET;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
 		    low1);
 		/* Restore the cached low word (original trigger mode). */
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin),
 		    src->io_lowreg);
 		if (!locked)
 			mtx_unlock_spin(&icu_lock);
 	}
 }
 
 static u_int
 ioapic_read(volatile ioapic_t *apic, int reg)
 {
 
 	mtx_assert(&icu_lock, MA_OWNED);
 	apic->ioregsel = reg;
 	return (apic->iowin);
 }
 
 /*
  * Write 'val' to I/O APIC register 'reg': the register index is stored
  * to ioregsel first and the value is then stored to iowin.  The caller
  * must hold icu_lock.
  */
 static void
 ioapic_write(volatile ioapic_t *apic, int reg, u_int val)
 {
 
 	mtx_assert(&icu_lock, MA_OWNED);
 	apic->ioregsel = reg;
 	apic->iowin = val;
 }
 
 /*
  * Map an APIC_BUS_* bus type to a printable name; anything that is not
  * ISA, EISA or PCI is reported as "unknown".
  */
 static const char *
 ioapic_bus_string(int bus_type)
 {
 
 	if (bus_type == APIC_BUS_ISA)
 		return ("ISA");
 	if (bus_type == APIC_BUS_EISA)
 		return ("EISA");
 	if (bus_type == APIC_BUS_PCI)
 		return ("PCI");
 	return ("unknown");
 }
 
 static void
 ioapic_print_irq(struct ioapic_intsrc *intpin)
 {
 
 	switch (intpin->io_irq) {
 	case IRQ_DISABLED:
 		printf("disabled");
 		break;
 	case IRQ_EXTINT:
 		printf("ExtINT");
 		break;
 	case IRQ_NMI:
 		printf("NMI");
 		break;
 	case IRQ_SMI:
 		printf("SMI");
 		break;
 	default:
 		printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus),
 		    intpin->io_irq);
 	}
 }
 
 /*
  * pic_enable_source method: unmask the pin if it is currently recorded
  * as masked.  The cached low word with the mask bits cleared is written
  * to the redirection table; io_lowreg itself is not modified.
  */
 static void
 ioapic_enable_source(struct intsrc *isrc)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	struct ioapic *io = (struct ioapic *)isrc->is_pic;
 	uint32_t flags;
 
 	mtx_lock_spin(&icu_lock);
 	if (intpin->io_masked) {
 		flags = intpin->io_lowreg & ~IOART_INTMASK;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    flags);
 		intpin->io_masked = 0;
 	}
 	mtx_unlock_spin(&icu_lock);
 }
 
 /*
  * pic_disable_source method: mask the pin and, when 'eoi' is PIC_EOI,
  * also signal end-of-interrupt.  Only level-triggered pins that are not
  * already masked are masked here; edge-triggered pins are left alone.
  */
 static void
 ioapic_disable_source(struct intsrc *isrc, int eoi)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	struct ioapic *io = (struct ioapic *)isrc->is_pic;
 	uint32_t flags;
 
 	mtx_lock_spin(&icu_lock);
 	if (!intpin->io_masked && !intpin->io_edgetrigger) {
 		flags = intpin->io_lowreg | IOART_INTMSET;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    flags);
 		intpin->io_masked = 1;
 	}
 
 	/* icu_lock is already held, so tell the helper not to take it. */
 	if (eoi == PIC_EOI)
 		_ioapic_eoi_source(isrc, 1);
 
 	mtx_unlock_spin(&icu_lock);
 }
 
 /*
  * pic_eoi_source method: signal end-of-interrupt for a source.  Called
  * without icu_lock held, hence locked == 0.
  */
 static void
 ioapic_eoi_source(struct intsrc *isrc)
 {
 
 	_ioapic_eoi_source(isrc, 0);
 }
 
 /*
  * Completely program an intpin based on the data in its interrupt source
  * structure.
  *
  * The caller must hold icu_lock.  With ACPI_DMAR the lock is dropped and
  * re-taken around the calls into the IOMMU interrupt remapping code, so
  * callers cannot assume it is held continuously across this function.
  */
 static void
 ioapic_program_intpin(struct ioapic_intsrc *intpin)
 {
 	struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic;
 	uint32_t low, high;
 #ifdef ACPI_DMAR
 	int error;
 #endif
 
 	/*
 	 * If a pin is completely invalid or if it is valid but hasn't
 	 * been enabled yet, just ensure that the pin is masked.
 	 */
 	mtx_assert(&icu_lock, MA_OWNED);
 	if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS &&
 	    intpin->io_vector == 0)) {
 		low = ioapic_read(io->io_addr,
 		    IOAPIC_REDTBL_LO(intpin->io_intpin));
 		if ((low & IOART_INTMASK) == IOART_INTMCLR)
 			ioapic_write(io->io_addr,
 			    IOAPIC_REDTBL_LO(intpin->io_intpin),
 			    low | IOART_INTMSET);
 #ifdef ACPI_DMAR
 		mtx_unlock_spin(&icu_lock);
 		iommu_unmap_ioapic_intr(io->io_apic_id,
 		    &intpin->io_remap_cookie);
 		mtx_lock_spin(&icu_lock);
 #endif
 		return;
 	}
 
 #ifdef ACPI_DMAR
 	/*
 	 * Ask the IOMMU code for the register values first.  On success
 	 * they are written as-is; EOPNOTSUPP falls through to the plain
 	 * programming below; any other error leaves the pin untouched.
 	 */
 	mtx_unlock_spin(&icu_lock);
 	error = iommu_map_ioapic_intr(io->io_apic_id,
 	    intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
 	    intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie,
 	    &high, &low);
 	mtx_lock_spin(&icu_lock);
 	if (error == 0) {
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin),
 		    high);
 		intpin->io_lowreg = low;
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    low);
 		return;
 	} else if (error != EOPNOTSUPP) {
 		return;
 	}
 #endif
 
 	/*
 	 * Set the destination.  Note that with Intel interrupt remapping,
 	 * the previously reserved bits 55:48 now have a purpose so ensure
 	 * these are zero.
 	 */
 	low = IOART_DESTPHY;
 	high = intpin->io_cpu << APIC_ID_SHIFT;
 
 	/* Program the rest of the low word. */
 	if (intpin->io_edgetrigger)
 		low |= IOART_TRGREDG;
 	else
 		low |= IOART_TRGRLVL;
 	if (intpin->io_activehi)
 		low |= IOART_INTAHI;
 	else
 		low |= IOART_INTALO;
 	if (intpin->io_masked)
 		low |= IOART_INTMSET;
 	/* Pick the delivery mode; ordinary IRQs also carry their vector. */
 	switch (intpin->io_irq) {
 	case IRQ_EXTINT:
 		KASSERT(intpin->io_edgetrigger,
 		    ("ExtINT not edge triggered"));
 		low |= IOART_DELEXINT;
 		break;
 	case IRQ_NMI:
 		KASSERT(intpin->io_edgetrigger,
 		    ("NMI not edge triggered"));
 		low |= IOART_DELNMI;
 		break;
 	case IRQ_SMI:
 		KASSERT(intpin->io_edgetrigger,
 		    ("SMI not edge triggered"));
 		low |= IOART_DELSMI;
 		break;
 	default:
 		KASSERT(intpin->io_vector != 0, ("No vector for IRQ %u",
 		    intpin->io_irq));
 		low |= IOART_DELFIXED | intpin->io_vector;
 	}
 
 	/*
 	 * Write the values to the APIC.  The high word goes first and the
 	 * low word is cached in io_lowreg for later mask/unmask/EOI use.
 	 */
 	ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), high);
 	intpin->io_lowreg = low;
 	ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low);
 }
 
 /*
  * pic_reprogram_pin method: take icu_lock and reprogram the pin from its
  * software state.
  */
 static void
 ioapic_reprogram_intpin(struct intsrc *isrc)
 {
 
 	mtx_lock_spin(&icu_lock);
 	ioapic_program_intpin((struct ioapic_intsrc *)isrc);
 	mtx_unlock_spin(&icu_lock);
 }
 
 /*
  * pic_assign_cpu method: route a pin to the local APIC 'apic_id'.
  *
  * A vector is allocated on the new CPU, the pin is reprogrammed, and
  * only then is any previous vector released.
  *
  * Returns 0 on success (including when the pin already has a vector on
  * the requested CPU), EINVAL when a move is refused on Hyper-V, and
  * ENOSPC when no vector could be allocated.
  */
 static int
 ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	struct ioapic *io = (struct ioapic *)isrc->is_pic;
 	u_int old_vector, new_vector;
 	u_int old_id;
 
 	/*
 	 * On Hyper-V:
 	 * - Stick to the first cpu for all I/O APIC pins.
 	 * - And don't allow destination cpu changes.
 	 */
 	if (vm_guest == VM_GUEST_HV) {
 		if (intpin->io_vector)
 			return (EINVAL);
 		else
 			apic_id = 0;
 	}
 
 	/*
 	 * keep 1st core as the destination for NMI
 	 */
 	if (intpin->io_irq == IRQ_NMI)
 		apic_id = 0;
 
 	/*
 	 * Set us up to free the old irq.
 	 */
 	old_vector = intpin->io_vector;
 	old_id = intpin->io_cpu;
 	if (old_vector && apic_id == old_id)
 		return (0);
 
 	/*
 	 * Allocate an APIC vector for this interrupt pin.  Once
 	 * we have a vector we program the interrupt pin.
 	 */
 	new_vector = apic_alloc_vector(apic_id, intpin->io_irq);
 	if (new_vector == 0)
 		return (ENOSPC);
 
 	/*
 	 * Mask the old intpin if it is enabled while it is migrated.
 	 *
 	 * At least some level-triggered interrupts seem to need the
 	 * extra DELAY() to avoid being stuck in a non-EOI'd state.
 	 */
 	mtx_lock_spin(&icu_lock);
 	if (!intpin->io_masked && !intpin->io_edgetrigger) {
 		ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin),
 		    intpin->io_lowreg | IOART_INTMSET);
 		/* The lock is not held across the delay. */
 		mtx_unlock_spin(&icu_lock);
 		DELAY(100);
 		mtx_lock_spin(&icu_lock);
 	}
 
 	intpin->io_cpu = apic_id;
 	intpin->io_vector = new_vector;
 	/* Only enable the new vector if handlers are attached. */
 	if (isrc->is_handlers > 0)
 		apic_enable_vector(intpin->io_cpu, intpin->io_vector);
 	if (bootverbose) {
 		printf("ioapic%u: routing intpin %u (", io->io_id,
 		    intpin->io_intpin);
 		ioapic_print_irq(intpin);
 		printf(") to lapic %u vector %u\n", intpin->io_cpu,
 		    intpin->io_vector);
 	}
 	ioapic_program_intpin(intpin);
 	mtx_unlock_spin(&icu_lock);
 
 	/*
 	 * Free the old vector after the new one is established.  This is done
 	 * to prevent races where we could miss an interrupt.
 	 */
 	if (old_vector) {
 		if (isrc->is_handlers > 0)
 			apic_disable_vector(old_id, old_vector);
 		apic_free_vector(old_id, old_vector, intpin->io_irq);
 	}
 	return (0);
 }
 
 /*
  * pic_enable_intr method: make sure the pin has an APIC vector (assigning
  * it to the CPU chosen by intr_next_cpu() if it has none yet) and enable
  * that vector.  Panics if no vector can be obtained.
  */
 static void
 ioapic_enable_intr(struct intsrc *isrc)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 
 	if (intpin->io_vector == 0)
-		if (ioapic_assign_cpu(isrc, intr_next_cpu()) != 0)
+		if (ioapic_assign_cpu(isrc, intr_next_cpu(isrc->is_domain)) != 0)
 			panic("Couldn't find an APIC vector for IRQ %d",
 			    intpin->io_irq);
 	apic_enable_vector(intpin->io_cpu, intpin->io_vector);
 }
 
 
 /*
  * pic_disable_intr method: if the pin has a vector, disable it, mark the
  * pin masked with no vector, reprogram the pin and free the vector.
  * Does nothing when no vector is assigned.
  */
 static void
 ioapic_disable_intr(struct intsrc *isrc)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	u_int vector;
 
 	if (intpin->io_vector != 0) {
 		/* Mask this interrupt pin and free its APIC vector. */
 		vector = intpin->io_vector;
 		apic_disable_vector(intpin->io_cpu, vector);
 		mtx_lock_spin(&icu_lock);
 		intpin->io_masked = 1;
 		intpin->io_vector = 0;
 		/* With io_vector == 0 this only ensures the pin is masked. */
 		ioapic_program_intpin(intpin);
 		mtx_unlock_spin(&icu_lock);
 		apic_free_vector(intpin->io_cpu, vector, intpin->io_irq);
 	}
 }
 
 /*
  * pic_vector method: report the IRQ value recorded for the pin.
  */
 static int
 ioapic_vector(struct intsrc *isrc)
 {
 
 	return (((struct ioapic_intsrc *)isrc)->io_irq);
 }
 
 /*
  * pic_source_pending method: ask the local APIC whether the pin's vector
  * is pending.  A pin without a vector is never pending.
  */
 static int
 ioapic_source_pending(struct intsrc *isrc)
 {
 	struct ioapic_intsrc *pin;
 
 	pin = (struct ioapic_intsrc *)isrc;
 	if (pin->io_vector != 0)
 		return (lapic_intr_pending(pin->io_vector));
 	return (0);
 }
 
 /*
  * pic_config_intr method: set the trigger mode and polarity of a pin and
  * reprogram it if either actually changed.  Neither argument may be the
  * CONFORM value.  Always returns 0.
  */
 static int
 ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig,
     enum intr_polarity pol)
 {
 	struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc;
 	struct ioapic *io = (struct ioapic *)isrc->is_pic;
 	int changed;
 
 	KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM),
 	    ("%s: Conforming trigger or polarity\n", __func__));
 
 	/*
 	 * EISA interrupts always use active high polarity, so don't allow
 	 * them to be set to active low.
 	 *
 	 * XXX: Should we write to the ELCR if the trigger mode changes for
 	 * an EISA IRQ or an ISA IRQ with the ELCR present?
 	 */
 	mtx_lock_spin(&icu_lock);
 	if (intpin->io_bus == APIC_BUS_EISA)
 		pol = INTR_POLARITY_HIGH;
 	changed = 0;
 	if (intpin->io_edgetrigger != (trig == INTR_TRIGGER_EDGE)) {
 		if (bootverbose)
 			printf("ioapic%u: Changing trigger for pin %u to %s\n",
 			    io->io_id, intpin->io_intpin,
 			    trig == INTR_TRIGGER_EDGE ? "edge" : "level");
 		intpin->io_edgetrigger = (trig == INTR_TRIGGER_EDGE);
 		changed++;
 	}
 	if (intpin->io_activehi != (pol == INTR_POLARITY_HIGH)) {
 		if (bootverbose)
 			printf("ioapic%u: Changing polarity for pin %u to %s\n",
 			    io->io_id, intpin->io_intpin,
 			    pol == INTR_POLARITY_HIGH ? "high" : "low");
 		intpin->io_activehi = (pol == INTR_POLARITY_HIGH);
 		changed++;
 	}
 	/* Skip the hardware writes when nothing changed. */
 	if (changed)
 		ioapic_program_intpin(intpin);
 	mtx_unlock_spin(&icu_lock);
 	return (0);
 }
 
 /*
  * pic_resume method: reprogram every pin of the I/O APIC from its saved
  * software state.  suspend_cancelled is not consulted.
  */
 static void
 ioapic_resume(struct pic *pic, bool suspend_cancelled)
 {
 	struct ioapic *io;
 	struct ioapic_intsrc *pin;
 
 	io = (struct ioapic *)pic;
 	mtx_lock_spin(&icu_lock);
 	for (pin = io->io_pins; pin < io->io_pins + io->io_numintr; pin++)
 		ioapic_program_intpin(pin);
 	mtx_unlock_spin(&icu_lock);
 }
 
 /*
  * Create a plain I/O APIC object.
  *
  * 'addr' is the physical address of the register window.  'apic_id' is
  * the APIC ID to program into the device, or -1 to keep the ID it
  * reports.  'intbase' is the first IRQ served by this I/O APIC, or -1 to
  * continue from the previous one.
  *
  * Returns an opaque cookie for the other ioapic_*() calls, or NULL if the
  * version register reads back as all ones.  All pins start out masked;
  * the object is not visible to the interrupt code until
  * ioapic_register() is called.
  */
 void *
 ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase)
 {
 	struct ioapic *io;
 	struct ioapic_intsrc *intpin;
 	volatile ioapic_t *apic;
 	u_int numintr, i;
 	uint32_t value;
 
 	/* Map the register window so we can access the device. */
 	apic = pmap_mapdev(addr, IOAPIC_MEM_REGION);
 	mtx_lock_spin(&icu_lock);
 	value = ioapic_read(apic, IOAPIC_VER);
 	mtx_unlock_spin(&icu_lock);
 
 	/* If its version register doesn't seem to work, punt. */
 	if (value == 0xffffffff) {
 		pmap_unmapdev((vm_offset_t)apic, IOAPIC_MEM_REGION);
 		return (NULL);
 	}
 
 	/*
 	 * Determine the number of vectors and set the APIC ID.  The
 	 * version register holds the index of the last redirection entry,
 	 * hence the + 1.
 	 */
 	numintr = ((value & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) + 1;
 	io = malloc(sizeof(struct ioapic) +
 	    numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK);
 	io->io_pic = ioapic_template;
 	io->pci_dev = NULL;
 	io->pci_wnd = NULL;
 	mtx_lock_spin(&icu_lock);
 	io->io_id = next_id++;
 	io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT;
 	if (apic_id != -1 && io->io_apic_id != apic_id) {
 		ioapic_write(apic, IOAPIC_ID, apic_id << APIC_ID_SHIFT);
 		mtx_unlock_spin(&icu_lock);
 		io->io_apic_id = apic_id;
 		printf("ioapic%u: Changing APIC ID to %d\n", io->io_id,
 		    apic_id);
 	} else
 		mtx_unlock_spin(&icu_lock);
 	if (intbase == -1) {
 		intbase = next_ioapic_base;
 		printf("ioapic%u: Assuming intbase of %d\n", io->io_id,
 		    intbase);
 	} else if (intbase != next_ioapic_base && bootverbose)
 		printf("ioapic%u: WARNING: intbase %d != expected base %d\n",
 		    io->io_id, intbase, next_ioapic_base);
 	io->io_intbase = intbase;
 	next_ioapic_base = intbase + numintr;
 	io->io_numintr = numintr;
 	io->io_addr = apic;
 	io->io_paddr = addr;
 
 	if (bootverbose) {
 		printf("ioapic%u: ver 0x%02x maxredir 0x%02x\n", io->io_id,
 		    (value & IOART_VER_VERSION), (value & IOART_VER_MAXREDIR)
 		    >> MAXREDIRSHIFT);
 	}
 	/*
 	 * The summary information about IO-APIC versions is taken from
 	 * the Linux kernel source:
 	 *     0Xh     82489DX
 	 *     1Xh     I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
 	 *     2Xh     I/O(x)APIC which is PCI 2.2 Compliant
 	 *     30h-FFh Reserved
 	 * IO-APICs with version >= 0x20 have working EOIR register.
 	 */
 	io->io_haseoi = (value & IOART_VER_VERSION) >= 0x20;
 
 	/*
 	 * Initialize pins.  Start off with interrupts disabled.  Default
 	 * to active-hi and edge-triggered for ISA interrupts and active-lo
 	 * and level-triggered for all others.
 	 */
 	bzero(io->io_pins, sizeof(struct ioapic_intsrc) * numintr);
 	mtx_lock_spin(&icu_lock);
 	for (i = 0, intpin = io->io_pins; i < numintr; i++, intpin++) {
 		intpin->io_intsrc.is_pic = (struct pic *)io;
 		intpin->io_intpin = i;
 		intpin->io_irq = intbase + i;
 
 		/*
 		 * Assume that pin 0 on the first I/O APIC is an ExtINT pin.
 		 * Assume that pins 1-15 are ISA interrupts and that all
 		 * other pins are PCI interrupts.
 		 */
 		if (intpin->io_irq == 0)
 			ioapic_set_extint(io, i);
 		else if (intpin->io_irq < IOAPIC_ISA_INTS) {
 			intpin->io_bus = APIC_BUS_ISA;
 			intpin->io_activehi = 1;
 			intpin->io_edgetrigger = 1;
 			intpin->io_masked = 1;
 		} else {
 			intpin->io_bus = APIC_BUS_PCI;
 			intpin->io_activehi = 0;
 			intpin->io_edgetrigger = 0;
 			intpin->io_masked = 1;
 		}
 
 		/*
 		 * Route interrupts to the BSP by default.  Interrupts may
 		 * be routed to other CPUs later after they are enabled.
 		 */
 		intpin->io_cpu = PCPU_GET(apic_id);
 		/* Mask the pin in hardware whatever its previous state. */
 		value = ioapic_read(apic, IOAPIC_REDTBL_LO(i));
 		ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET);
 #ifdef ACPI_DMAR
 		/* dummy, but sets cookie */
 		mtx_unlock_spin(&icu_lock);
 		iommu_map_ioapic_intr(io->io_apic_id,
 		    intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger,
 		    intpin->io_activehi, intpin->io_irq,
 		    &intpin->io_remap_cookie, NULL, NULL);
 		mtx_lock_spin(&icu_lock);
 #endif
 	}
 	mtx_unlock_spin(&icu_lock);
 
 	return (io);
 }
 
 /*
  * Return the IRQ value currently assigned to 'pin', or -1 if the pin
  * number is out of range for this I/O APIC.
  */
 int
 ioapic_get_vector(void *cookie, u_int pin)
 {
 	struct ioapic *io = cookie;
 
 	if (pin < io->io_numintr)
 		return (io->io_pins[pin].io_irq);
 	return (-1);
 }
 
 /*
  * Mark 'pin' as disabled.  Returns EINVAL if the pin number is out of
  * range or the pin is already disabled, 0 otherwise.
  */
 int
 ioapic_disable_pin(void *cookie, u_int pin)
 {
 	struct ioapic *io = cookie;
 	struct ioapic_intsrc *intpin;
 
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	intpin = &io->io_pins[pin];
 	if (intpin->io_irq == IRQ_DISABLED)
 		return (EINVAL);
 	intpin->io_irq = IRQ_DISABLED;
 	if (bootverbose)
 		printf("ioapic%u: intpin %d disabled\n", io->io_id, pin);
 	return (0);
 }
 
 /*
  * Change the IRQ value associated with 'pin' to 'vector'.  Returns
  * EINVAL for a negative vector, an out-of-range pin, or a pin whose
  * current IRQ value is not below NUM_IO_INTS; 0 otherwise.
  */
 int
 ioapic_remap_vector(void *cookie, u_int pin, int vector)
 {
 	struct ioapic *io = cookie;
 	struct ioapic_intsrc *intpin;
 
 	if (vector < 0 || pin >= io->io_numintr)
 		return (EINVAL);
 	intpin = &io->io_pins[pin];
 	if (intpin->io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	intpin->io_irq = vector;
 	if (bootverbose)
 		printf("ioapic%u: Routing IRQ %d -> intpin %d\n", io->io_id,
 		    vector, pin);
 	return (0);
 }
 
 /*
  * Record the bus type (APIC_BUS_*) of 'pin'.  Returns EINVAL for an
  * unknown bus type, an out-of-range pin, or a pin whose IRQ value is
  * not below NUM_IO_INTS; 0 otherwise, including when nothing changes.
  */
 int
 ioapic_set_bus(void *cookie, u_int pin, int bus_type)
 {
 	struct ioapic *io;
 	struct ioapic_intsrc *intpin;
 
 	if (bus_type < 0 || bus_type > APIC_BUS_MAX)
 		return (EINVAL);
 	io = cookie;
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	intpin = &io->io_pins[pin];
 	if (intpin->io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	if (intpin->io_bus != bus_type) {
 		intpin->io_bus = bus_type;
 		if (bootverbose)
 			printf("ioapic%u: intpin %d bus %s\n", io->io_id, pin,
 			    ioapic_bus_string(bus_type));
 	}
 	return (0);
 }
 
 /*
  * Turn 'pin' into an NMI pin: unmasked, edge-triggered, active-high,
  * with an unknown bus type.  Returns 0 on success or if the pin is
  * already an NMI pin, EINVAL for an out-of-range pin or one whose IRQ
  * value is not below NUM_IO_INTS.
  */
 int
 ioapic_set_nmi(void *cookie, u_int pin)
 {
 	struct ioapic *io = cookie;
 	struct ioapic_intsrc *intpin;
 
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	intpin = &io->io_pins[pin];
 	if (intpin->io_irq == IRQ_NMI)
 		return (0);
 	if (intpin->io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	intpin->io_bus = APIC_BUS_UNKNOWN;
 	intpin->io_irq = IRQ_NMI;
 	intpin->io_masked = 0;
 	intpin->io_edgetrigger = 1;
 	intpin->io_activehi = 1;
 	if (bootverbose)
 		printf("ioapic%u: Routing NMI -> intpin %d\n",
 		    io->io_id, pin);
 	return (0);
 }
 
 /*
  * Turn 'pin' into an SMI pin: unmasked, edge-triggered, active-high,
  * with an unknown bus type.  Returns 0 on success or if the pin is
  * already an SMI pin, EINVAL for an out-of-range pin or one whose IRQ
  * value is not below NUM_IO_INTS.
  */
 int
 ioapic_set_smi(void *cookie, u_int pin)
 {
 	struct ioapic *io = cookie;
 	struct ioapic_intsrc *intpin;
 
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	intpin = &io->io_pins[pin];
 	if (intpin->io_irq == IRQ_SMI)
 		return (0);
 	if (intpin->io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	intpin->io_bus = APIC_BUS_UNKNOWN;
 	intpin->io_irq = IRQ_SMI;
 	intpin->io_masked = 0;
 	intpin->io_edgetrigger = 1;
 	intpin->io_activehi = 1;
 	if (bootverbose)
 		printf("ioapic%u: Routing SMI -> intpin %d\n",
 		    io->io_id, pin);
 	return (0);
 }
 
 /*
  * Turn 'pin' into an ExtINT pin: edge-triggered, active-high, with an
  * unknown bus type.  The pin is left masked unless enable_extint is
  * set.  Returns 0 on success or if the pin is already an ExtINT pin,
  * EINVAL for an out-of-range pin or one whose IRQ value is not below
  * NUM_IO_INTS.
  */
 int
 ioapic_set_extint(void *cookie, u_int pin)
 {
 	struct ioapic *io = cookie;
 	struct ioapic_intsrc *intpin;
 
 	if (pin >= io->io_numintr)
 		return (EINVAL);
 	intpin = &io->io_pins[pin];
 	if (intpin->io_irq == IRQ_EXTINT)
 		return (0);
 	if (intpin->io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	intpin->io_bus = APIC_BUS_UNKNOWN;
 	intpin->io_irq = IRQ_EXTINT;
 	intpin->io_masked = enable_extint ? 0 : 1;
 	intpin->io_edgetrigger = 1;
 	intpin->io_activehi = 1;
 	if (bootverbose)
 		printf("ioapic%u: Routing external 8259A's -> intpin %d\n",
 		    io->io_id, pin);
 	return (0);
 }
 
 /*
  * Record the polarity of 'pin'.  Returns EINVAL for an out-of-range pin,
  * a CONFORM polarity, or a pin whose IRQ value is not below NUM_IO_INTS;
  * 0 otherwise, including when the polarity is already as requested.
  */
 int
 ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol)
 {
 	struct ioapic *io = cookie;
 	struct ioapic_intsrc *intpin;
 	int activehi;
 
 	if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM)
 		return (EINVAL);
 	intpin = &io->io_pins[pin];
 	if (intpin->io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	activehi = (pol == INTR_POLARITY_HIGH);
 	if (intpin->io_activehi != activehi) {
 		intpin->io_activehi = activehi;
 		if (bootverbose)
 			printf("ioapic%u: intpin %d polarity: %s\n",
 			    io->io_id, pin, activehi ? "high" : "low");
 	}
 	return (0);
 }
 
 /*
  * Record the trigger mode of 'pin'.  Returns EINVAL for an out-of-range
  * pin, a CONFORM trigger, or a pin whose IRQ value is not below
  * NUM_IO_INTS; 0 otherwise, including when the mode is already as
  * requested.
  */
 int
 ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger)
 {
 	struct ioapic *io = cookie;
 	struct ioapic_intsrc *intpin;
 	int edgetrigger;
 
 	if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM)
 		return (EINVAL);
 	intpin = &io->io_pins[pin];
 	if (intpin->io_irq >= NUM_IO_INTS)
 		return (EINVAL);
 	edgetrigger = (trigger == INTR_TRIGGER_EDGE);
 	if (intpin->io_edgetrigger != edgetrigger) {
 		intpin->io_edgetrigger = edgetrigger;
 		if (bootverbose)
 			printf("ioapic%u: intpin %d trigger: %s\n",
 			    io->io_id, pin, edgetrigger ? "edge" : "level");
 	}
 	return (0);
 }
 
 /*
  * Register a complete I/O APIC object with the interrupt subsystem.
  *
  * The object is appended to ioapic_list, announced on the console,
  * registered as a PIC, and every pin is programmed from its software
  * state.  Pins whose IRQ value is below NUM_IO_INTS are additionally
  * registered as interrupt sources.
  */
 void
 ioapic_register(void *cookie)
 {
 	struct ioapic_intsrc *pin;
 	struct ioapic *io;
 	volatile ioapic_t *apic;
 	uint32_t flags;
 	int i;
 
 	io = (struct ioapic *)cookie;
 	apic = io->io_addr;
 	mtx_lock_spin(&icu_lock);
 	/* Despite its name, 'flags' holds the version field. */
 	flags = ioapic_read(apic, IOAPIC_VER) & IOART_VER_VERSION;
 	STAILQ_INSERT_TAIL(&ioapic_list, io, io_next);
 	mtx_unlock_spin(&icu_lock);
 	printf("ioapic%u <Version %u.%u> irqs %u-%u on motherboard\n",
 	    io->io_id, flags >> 4, flags & 0xf, io->io_intbase,
 	    io->io_intbase + io->io_numintr - 1);
 
 	/*
 	 * Reprogram pins to handle special case pins (such as NMI and
 	 * SMI) and register valid pins as interrupt sources.
 	 */
 	intr_register_pic(&io->io_pic);
 	for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) {
 		ioapic_reprogram_intpin(&pin->io_intsrc);
 		if (pin->io_irq < NUM_IO_INTS)
 			intr_register_source(&pin->io_intsrc);
 	}
 }
 
 /*
  * A simple new-bus driver to consume PCI I/O APIC devices.
  *
  * Probe: accept base-peripheral PIC devices whose programming interface
  * identifies an IO APIC or IO(x) APIC, quietly and with a probe priority
  * of -10000; reject everything else with ENXIO.
  */
 static int
 ioapic_pci_probe(device_t dev)
 {
 	const char *desc;
 
 	if (pci_get_class(dev) != PCIC_BASEPERIPH ||
 	    pci_get_subclass(dev) != PCIS_BASEPERIPH_PIC)
 		return (ENXIO);
 
 	switch (pci_get_progif(dev)) {
 	case PCIP_BASEPERIPH_PIC_IO_APIC:
 		desc = "IO APIC";
 		break;
 	case PCIP_BASEPERIPH_PIC_IOX_APIC:
 		desc = "IO(x) APIC";
 		break;
 	default:
 		return (ENXIO);
 	}
 	device_set_desc(dev, desc);
 	device_quiet(dev);
 	return (-10000);
 }
 
 /*
  * Attach: tie a PCI I/O APIC function to the matching I/O APIC that was
  * already registered on ioapic_list.
  *
  * BAR 0 is activated and the APIC ID is read through it.  The I/O APIC
  * is matched first by physical address and then by APIC ID.  On a match
  * the device and BAR resource are recorded in the ioapic and 0 is
  * returned; the resource is kept.  On any failure the BAR is released
  * and ENXIO is returned.
  */
 static int
 ioapic_pci_attach(device_t dev)
 {
 	struct resource *res;
 	volatile ioapic_t *apic;
 	struct ioapic *io;
 	int rid;
 	u_int apic_id;
 
 	/*
 	 * Try to match the enumerated ioapic.  Match BAR start
 	 * against io_paddr.  Due to a fear that PCI window is not the
 	 * same as the MADT reported io window, but an alias, read the
 	 * APIC ID from the mapped BAR and match against it.
 	 */
 	rid = PCIR_BAR(0);
 	res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
 	    RF_ACTIVE | RF_SHAREABLE);
 	if (res == NULL) {
 		if (bootverbose)
 			device_printf(dev, "cannot activate BAR0\n");
 		return (ENXIO);
 	}
 	apic = (volatile ioapic_t *)rman_get_virtual(res);
 	if (rman_get_size(res) < IOAPIC_WND_SIZE) {
 		/* The size is passed as uintmax_t, so print it with %ju. */
 		if (bootverbose)
 			device_printf(dev,
 			    "BAR0 too small (%ju) for IOAPIC window\n",
 			    (uintmax_t)rman_get_size(res));
 		goto fail;
 	}
 	mtx_lock_spin(&icu_lock);
 	apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT;
 	/* First match by io window address */
 	STAILQ_FOREACH(io, &ioapic_list, io_next) {
 		if (io->io_paddr == (vm_paddr_t)rman_get_start(res))
 			goto found;
 	}
 	/* Then by apic id */
 	STAILQ_FOREACH(io, &ioapic_list, io_next) {
 		if (io->io_apic_id == apic_id)
 			goto found;
 	}
 	mtx_unlock_spin(&icu_lock);
 	/* apic_id is unsigned, so print it with %u. */
 	if (bootverbose)
 		device_printf(dev,
 		    "cannot match pci bar apic id %u against MADT\n",
 		    apic_id);
 fail:
 	/* Reached without icu_lock held. */
 	bus_release_resource(dev, SYS_RES_MEMORY, rid, res);
 	return (ENXIO);
 found:
 	/* Reached with icu_lock still held; it is dropped below. */
 	KASSERT(io->pci_dev == NULL,
 	    ("ioapic %d pci_dev not NULL", io->io_id));
 	KASSERT(io->pci_wnd == NULL,
 	    ("ioapic %d pci_wnd not NULL", io->io_id));
 
 	io->pci_dev = dev;
 	io->pci_wnd = res;
 	/* Report the mismatch when only one of the two criteria matched. */
 	if (bootverbose && (io->io_paddr != (vm_paddr_t)rman_get_start(res) ||
 	    io->io_apic_id != apic_id)) {
 		device_printf(dev, "pci%d:%d:%d:%d pci BAR0@%jx id %u "
 		    "MADT id %d paddr@%jx\n",
 		    pci_get_domain(dev), pci_get_bus(dev),
 		    pci_get_slot(dev), pci_get_function(dev),
 		    (uintmax_t)rman_get_start(res), apic_id,
 		    io->io_apic_id, (uintmax_t)io->io_paddr);
 	}
 	mtx_unlock_spin(&icu_lock);
 	return (0);
 }
 
 /* Method table for the PCI I/O APIC driver: probe and attach only. */
 static device_method_t ioapic_pci_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		ioapic_pci_probe),
 	DEVMETHOD(device_attach,	ioapic_pci_attach),
 
 	/* Table terminator. */
 	{ 0, 0 }
 };
 
 /* Driver class "ioapic"; the softc size is 0. */
 DEFINE_CLASS_0(ioapic, ioapic_pci_driver, ioapic_pci_methods, 0);
 
 /* Attach the driver to the pci bus. */
 static devclass_t ioapic_devclass;
 DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0);
 
 /*
  * Look up the PCI requester ID of the I/O APIC with the given APIC ID.
  *
  * On success the ID is stored in *ridp and 0 is returned.  EINVAL is
  * returned when no registered I/O APIC has that APIC ID or when it has
  * no matched PCI device; an error from pci_get_id() is passed through.
  */
 int
 ioapic_get_rid(u_int apic_id, uint16_t *ridp)
 {
 	struct ioapic *io;
 	uintptr_t rid;
 	int error;
 
 	mtx_lock_spin(&icu_lock);
 	STAILQ_FOREACH(io, &ioapic_list, io_next) {
 		if (io->io_apic_id == apic_id)
 			break;
 	}
 	mtx_unlock_spin(&icu_lock);
 	/* io is NULL here if the loop ran off the end of the list. */
 	if (io == NULL || io->pci_dev == NULL)
 		return (EINVAL);
 	error = pci_get_id(io->pci_dev, PCI_ID_RID, &rid);
 	if (error != 0)
 		return (error);
 	/* Narrowing store: uintptr_t into the caller's uint16_t. */
 	*ridp = rid;
 	return (0);
 }
 
 /*
  * A new-bus driver to consume the memory resources associated with
  * the APICs in the system.  On some systems ACPI or PnPBIOS system
  * resource devices may already claim these resources.  To keep from
  * breaking those devices, we attach ourselves to the nexus device after
  * legacy0 and acpi0 and ignore any allocation failures.
  *
  * Identify: add a single "apic" child to the parent, but only when a
  * local APIC address is known (lapic_paddr != 0).
  */
 static void
 apic_identify(driver_t *driver, device_t parent)
 {
 
 	/*
 	 * Add at order 12.  acpi0 is probed at order 10 and legacy0
 	 * is probed at order 11.
 	 */
 	if (lapic_paddr != 0)
 		BUS_ADD_CHILD(parent, 12, "apic", 0);
 }
 
 /*
  * Probe: always succeeds (returns 0); sets the description and marks the
  * device quiet.
  */
 static int
 apic_probe(device_t dev)
 {
 
 	device_set_desc(dev, "APIC resources");
 	device_quiet(dev);
 	return (0);
 }
 
 /*
  * Describe the memory range [base, base + length) as resource 'rid' of
  * the device and try to allocate it.  Failing to set the resource is
  * fatal (panic).  The result of the allocation itself is not checked,
  * so an allocation failure is silently ignored.
  */
 static void
 apic_add_resource(device_t dev, int rid, vm_paddr_t base, size_t length)
 {
 	int error;
 
 	error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length);
 	if (error)
 		panic("apic_add_resource: resource %d failed set with %d", rid,
 		    error);
 	bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_SHAREABLE);
 }
 
 /*
  * Attach: claim the register windows of the local APIC (resource 0) and
  * of every registered I/O APIC (resources 1, 2, ... in list order).
  * Always returns 0.
  */
 static int
 apic_attach(device_t dev)
 {
 	struct ioapic *io;
 	int rid;
 
 	/* Reserve the local APIC. */
 	rid = 0;
 	apic_add_resource(dev, rid++, lapic_paddr, LAPIC_MEM_REGION);
 
 	/* Then one resource per I/O APIC. */
 	STAILQ_FOREACH(io, &ioapic_list, io_next)
 		apic_add_resource(dev, rid++, io->io_paddr, IOAPIC_MEM_REGION);
 	return (0);
 }
 
 /* Method table for the "apic" resource-holder driver. */
 static device_method_t apic_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_identify,	apic_identify),
 	DEVMETHOD(device_probe,		apic_probe),
 	DEVMETHOD(device_attach,	apic_attach),
 
 	/* Table terminator. */
 	{ 0, 0 }
 };
 
 /* Driver class "apic"; the softc size is 0. */
 DEFINE_CLASS_0(apic, apic_driver, apic_methods, 0);
 
 /* Attach the driver to the nexus bus. */
 static devclass_t apic_devclass;
 DRIVER_MODULE(apic, nexus, apic_driver, apic_devclass, 0, 0);
 
 #include "opt_ddb.h"
 
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /*
  * Translate the delivery mode field of a redirection entry (already
  * masked with IOART_DELMOD by the caller) into a printable name.  An
  * unrecognized value yields an empty string.
  */
 static const char *
 ioapic_delivery_mode(uint32_t mode)
 {
 	static const struct {
 		uint32_t	dm_mode;
 		const char	*dm_name;
 	} modes[] = {
 		{ IOART_DELFIXED,	"fixed" },
 		{ IOART_DELLOPRI,	"lowestpri" },
 		{ IOART_DELSMI,		"SMI" },
 		{ IOART_DELRSV1,	"rsrvd1" },
 		{ IOART_DELNMI,		"NMI" },
 		{ IOART_DELINIT,	"INIT" },
 		{ IOART_DELRSV2,	"rsrvd2" },
 		{ IOART_DELEXINT,	"ExtINT" },
 	};
 	u_int idx;
 
 	for (idx = 0; idx < sizeof(modes) / sizeof(modes[0]); idx++) {
 		if (modes[idx].dm_mode == mode)
 			return (modes[idx].dm_name);
 	}
 	return ("");
 }
 
 /*
  * Debugger variant of ioapic_read(): performs the same select-then-read
  * register access but does not assert that icu_lock is held.
  */
 static u_int
 db_ioapic_read(volatile ioapic_t *apic, int reg)
 {
 
 	apic->ioregsel = reg;
 	return (apic->iowin);
 }
 
 /*
  * Dump the ID and version registers and every redirection table entry
  * of one I/O APIC to the debugger console.
  *
  * The "maximum redirection entry" field of the version register is the
  * index of the last entry rather than the entry count (ioapic_create()
  * adds one to it to obtain the number of pins), so the walk below is
  * inclusive of 'mre'.
  */
 static void
 db_show_ioapic_one(volatile ioapic_t *io_addr)
 {
 	uint32_t r, lo, hi;
 	int mre, i;
 
 	r = db_ioapic_read(io_addr, IOAPIC_VER);
 	mre = (r & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT;
 	db_printf("Id 0x%08x Ver 0x%02x MRE %d\n",
 	    db_ioapic_read(io_addr, IOAPIC_ID), r & IOART_VER_VERSION, mre);
 	/* Entries are numbered 0 .. mre; include the last one. */
 	for (i = 0; i <= mre; i++) {
 		lo = db_ioapic_read(io_addr, IOAPIC_REDTBL_LO(i));
 		hi = db_ioapic_read(io_addr, IOAPIC_REDTBL_HI(i));
 		db_printf("  pin %d Dest %s/%x %smasked Trig %s RemoteIRR %d "
 		    "Polarity %s Status %s DeliveryMode %s Vec %d\n", i,
 		    (lo & IOART_DESTMOD) == IOART_DESTLOG ? "log" : "phy",
 		    (hi & IOART_DEST) >> 24,
 		    (lo & IOART_INTMASK) == IOART_INTMSET ? "" : "not",
 		    (lo & IOART_TRGRMOD) == IOART_TRGRLVL ? "lvl" : "edge",
 		    (lo & IOART_REM_IRR) == IOART_REM_IRR ? 1 : 0,
 		    (lo & IOART_INTPOL) == IOART_INTALO ? "low" : "high",
 		    (lo & IOART_DELIVS) == IOART_DELIVS ? "pend" : "idle",
 		    ioapic_delivery_mode(lo & IOART_DELMOD),
 		    (lo & IOART_INTVEC));
 	}
 }
 
 /*
  * DDB "show ioapic <index>": dump the I/O APIC at the given position on
  * ioapic_list (0 is the first one registered).  Prints nothing if the
  * index is past the end of the list.
  *
  * NOTE(review): have_addr and addr are not declared here; presumably
  * they are parameters supplied by the DB_SHOW_COMMAND() wrapper.
  */
 DB_SHOW_COMMAND(ioapic, db_show_ioapic)
 {
 	struct ioapic *ioapic;
 	int idx, i;
 
 	if (!have_addr) {
 		db_printf("usage: show ioapic index\n");
 		return;
 	}
 
 	idx = (int)addr;
 	i = 0;
 	STAILQ_FOREACH(ioapic, &ioapic_list, io_next) {
 		if (idx == i) {
 			db_show_ioapic_one(ioapic->io_addr);
 			break;
 		}
 		i++;
 	}
 }
 
 /*
  * DDB "show all ioapics": dump every I/O APIC on ioapic_list, in list
  * order.
  */
 DB_SHOW_ALL_COMMAND(ioapics, db_show_all_ioapics)
 {
 	struct ioapic *ioapic;
 
 	STAILQ_FOREACH(ioapic, &ioapic_list, io_next)
 		db_show_ioapic_one(ioapic->io_addr);
 }
 #endif
Index: head/sys/x86/x86/msi.c
===================================================================
--- head/sys/x86/x86/msi.c	(revision 331697)
+++ head/sys/x86/x86/msi.c	(revision 331698)
@@ -1,732 +1,738 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2006 Yahoo!, Inc.
  * All rights reserved.
  * Written by: John Baldwin <jhb@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the author nor the names of any co-contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Support for PCI Message Signalled Interrupts (MSI).  MSI interrupts on
  * x86 are basically APIC messages that the northbridge delivers directly
  * to the local APICs as if they had come from an I/O APIC.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_acpi.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <x86/apicreg.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/frame.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <x86/iommu/iommu_intrmap.h>
 #include <machine/specialreg.h>
 #include <dev/pci/pcivar.h>
 
 /* Fields in address for Intel MSI messages. */
 #define	MSI_INTEL_ADDR_DEST		0x000ff000
 #define	MSI_INTEL_ADDR_RH		0x00000008
 # define MSI_INTEL_ADDR_RH_ON		0x00000008
 # define MSI_INTEL_ADDR_RH_OFF		0x00000000
 #define	MSI_INTEL_ADDR_DM		0x00000004
 # define MSI_INTEL_ADDR_DM_PHYSICAL	0x00000000
 # define MSI_INTEL_ADDR_DM_LOGICAL	0x00000004
 
 /* Fields in data for Intel MSI messages. */
 #define	MSI_INTEL_DATA_TRGRMOD		IOART_TRGRMOD	/* Trigger mode. */
 # define MSI_INTEL_DATA_TRGREDG		IOART_TRGREDG
 # define MSI_INTEL_DATA_TRGRLVL		IOART_TRGRLVL
 #define	MSI_INTEL_DATA_LEVEL		0x00004000	/* Polarity. */
 # define MSI_INTEL_DATA_DEASSERT	0x00000000
 # define MSI_INTEL_DATA_ASSERT		0x00004000
 #define	MSI_INTEL_DATA_DELMOD		IOART_DELMOD	/* Delivery mode. */
 # define MSI_INTEL_DATA_DELFIXED	IOART_DELFIXED
 # define MSI_INTEL_DATA_DELLOPRI	IOART_DELLOPRI
 # define MSI_INTEL_DATA_DELSMI		IOART_DELSMI
 # define MSI_INTEL_DATA_DELNMI		IOART_DELNMI
 # define MSI_INTEL_DATA_DELINIT		IOART_DELINIT
 # define MSI_INTEL_DATA_DELEXINT	IOART_DELEXINT
 #define	MSI_INTEL_DATA_INTVEC		IOART_INTVEC	/* Interrupt vector. */
 
 /*
  * Build Intel MSI message and data values from a source.  AMD64 systems
  * seem to be compatible, so we use the same function for both.
  */
 #define	INTEL_ADDR(msi)							\
 	(MSI_INTEL_ADDR_BASE | (msi)->msi_cpu << 12 |			\
 	    MSI_INTEL_ADDR_RH_OFF | MSI_INTEL_ADDR_DM_PHYSICAL)
 #define	INTEL_DATA(msi)							\
 	(MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED | (msi)->msi_vector)
 
 static MALLOC_DEFINE(M_MSI, "msi", "PCI MSI");
 
 /*
  * MSI sources are bunched into groups.  This is because MSI forces
  * all of the messages to share the address and data registers and
  * thus certain properties (such as the local APIC ID target on x86).
  * Each group has a 'first' source that contains information global to
  * the group.  These fields are marked with (g) below.
  *
  * Note that local APIC ID is kind of special.  Each message will be
  * assigned an ID by the system; however, a group will use the ID from
  * the first message.
  *
  * For MSI-X, each message is isolated.
  */
 struct msi_intsrc {
 	struct intsrc msi_intsrc;
 	device_t msi_dev;		/* Owning device. (g) */
 	struct msi_intsrc *msi_first;	/* First source in group. */
 	u_int msi_irq;			/* IRQ cookie. */
 	u_int msi_msix;			/* MSI-X message. */
 	u_int msi_vector:8;		/* IDT vector. */
 	u_int msi_cpu;			/* Local APIC ID. (g) */
 	u_int msi_count:8;		/* Messages in this group. (g) */
 	u_int msi_maxcount:8;		/* Alignment for this group. (g) */
 	int *msi_irqs;			/* Group's IRQ list. (g) */
 	u_int msi_remap_cookie;
 };
 
 static void	msi_create_source(void);
 static void	msi_enable_source(struct intsrc *isrc);
 static void	msi_disable_source(struct intsrc *isrc, int eoi);
 static void	msi_eoi_source(struct intsrc *isrc);
 static void	msi_enable_intr(struct intsrc *isrc);
 static void	msi_disable_intr(struct intsrc *isrc);
 static int	msi_vector(struct intsrc *isrc);
 static int	msi_source_pending(struct intsrc *isrc);
 static int	msi_config_intr(struct intsrc *isrc, enum intr_trigger trig,
 		    enum intr_polarity pol);
 static int	msi_assign_cpu(struct intsrc *isrc, u_int apic_id);
 
 struct pic msi_pic = {
 	.pic_enable_source = msi_enable_source,
 	.pic_disable_source = msi_disable_source,
 	.pic_eoi_source = msi_eoi_source,
 	.pic_enable_intr = msi_enable_intr,
 	.pic_disable_intr = msi_disable_intr,
 	.pic_vector = msi_vector,
 	.pic_source_pending = msi_source_pending,
 	.pic_suspend = NULL,
 	.pic_resume = NULL,
 	.pic_config_intr = msi_config_intr,
 	.pic_assign_cpu = msi_assign_cpu,
 	.pic_reprogram_pin = NULL,
 };
 
 #ifdef SMP
 /**
  * Xen hypervisors prior to 4.6.0 do not properly handle updates to
  * enabled MSI-X table entries.  Allow migration of MSI-X interrupts
  * to be disabled via a tunable. Values have the following meaning:
  *
  * -1: automatic detection by FreeBSD
  *  0: enable migration
  *  1: disable migration
  */
 int msix_disable_migration = -1;
 SYSCTL_INT(_machdep, OID_AUTO, disable_msix_migration, CTLFLAG_RDTUN,
     &msix_disable_migration, 0,
     "Disable migration of MSI-X interrupts between CPUs");
 #endif
 
 static int msi_enabled;
 static int msi_last_irq;
 static struct mtx msi_lock;
 
 static void
 msi_enable_source(struct intsrc *isrc)
 {
 }
 
 static void
 msi_disable_source(struct intsrc *isrc, int eoi)
 {
 
 	if (eoi == PIC_EOI)
 		lapic_eoi();
 }
 
 static void
 msi_eoi_source(struct intsrc *isrc)
 {
 
 	lapic_eoi();
 }
 
 static void
 msi_enable_intr(struct intsrc *isrc)
 {
 	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
 
 	apic_enable_vector(msi->msi_cpu, msi->msi_vector);
 }
 
 static void
 msi_disable_intr(struct intsrc *isrc)
 {
 	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
 
 	apic_disable_vector(msi->msi_cpu, msi->msi_vector);
 }
 
 static int
 msi_vector(struct intsrc *isrc)
 {
 	struct msi_intsrc *msi = (struct msi_intsrc *)isrc;
 
 	return (msi->msi_irq);
 }
 
 static int
 msi_source_pending(struct intsrc *isrc)
 {
 
 	return (0);
 }
 
 static int
 msi_config_intr(struct intsrc *isrc, enum intr_trigger trig,
     enum intr_polarity pol)
 {
 
 	return (ENODEV);
 }
 
 static int
 msi_assign_cpu(struct intsrc *isrc, u_int apic_id)
 {
 	struct msi_intsrc *sib, *msi = (struct msi_intsrc *)isrc;
 	int old_vector;
 	u_int old_id;
 	int i, vector;
 
 	/*
 	 * Only allow CPUs to be assigned to the first message for an
 	 * MSI group.
 	 */
 	if (msi->msi_first != msi)
 		return (EINVAL);
 
 #ifdef SMP
 	if (msix_disable_migration && msi->msi_msix)
 		return (EINVAL);
 #endif
 
 	/* Store information to free existing irq. */
 	old_vector = msi->msi_vector;
 	old_id = msi->msi_cpu;
 	if (old_id == apic_id)
 		return (0);
 
 	/* Allocate IDT vectors on this cpu. */
 	if (msi->msi_count > 1) {
 		KASSERT(msi->msi_msix == 0, ("MSI-X message group"));
 		vector = apic_alloc_vectors(apic_id, msi->msi_irqs,
 		    msi->msi_count, msi->msi_maxcount);
 	} else
 		vector = apic_alloc_vector(apic_id, msi->msi_irq);
 	if (vector == 0)
 		return (ENOSPC);
 
 	msi->msi_cpu = apic_id;
 	msi->msi_vector = vector;
 	if (msi->msi_intsrc.is_handlers > 0)
 		apic_enable_vector(msi->msi_cpu, msi->msi_vector);
 	if (bootverbose)
 		printf("msi: Assigning %s IRQ %d to local APIC %u vector %u\n",
 		    msi->msi_msix ? "MSI-X" : "MSI", msi->msi_irq,
 		    msi->msi_cpu, msi->msi_vector);
 	for (i = 1; i < msi->msi_count; i++) {
 		sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]);
 		sib->msi_cpu = apic_id;
 		sib->msi_vector = vector + i;
 		if (sib->msi_intsrc.is_handlers > 0)
 			apic_enable_vector(sib->msi_cpu, sib->msi_vector);
 		if (bootverbose)
 			printf(
 		    "msi: Assigning MSI IRQ %d to local APIC %u vector %u\n",
 			    sib->msi_irq, sib->msi_cpu, sib->msi_vector);
 	}
 	BUS_REMAP_INTR(device_get_parent(msi->msi_dev), msi->msi_dev,
 	    msi->msi_irq);
 
 	/*
 	 * Free the old vector after the new one is established.  This is done
 	 * to prevent races where we could miss an interrupt.
 	 */
 	if (msi->msi_intsrc.is_handlers > 0)
 		apic_disable_vector(old_id, old_vector);
 	apic_free_vector(old_id, old_vector, msi->msi_irq);
 	for (i = 1; i < msi->msi_count; i++) {
 		sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]);
 		if (sib->msi_intsrc.is_handlers > 0)
 			apic_disable_vector(old_id, old_vector + i);
 		apic_free_vector(old_id, old_vector + i, msi->msi_irqs[i]);
 	}
 	return (0);
 }
 
 void
 msi_init(void)
 {
 
 	/* Check if we have a supported CPU. */
 	switch (cpu_vendor_id) {
 	case CPU_VENDOR_INTEL:
 	case CPU_VENDOR_AMD:
 		break;
 	case CPU_VENDOR_CENTAUR:
 		if (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
 		    CPUID_TO_MODEL(cpu_id) >= 0xf)
 			break;
 		/* FALLTHROUGH */
 	default:
 		return;
 	}
 
 #ifdef SMP
 	if (msix_disable_migration == -1) {
 		/* The default is to allow migration of MSI-X interrupts. */
 		msix_disable_migration = 0;
 	}
 #endif
 
 	msi_enabled = 1;
 	intr_register_pic(&msi_pic);
 	mtx_init(&msi_lock, "msi", NULL, MTX_DEF);
 }
 
 static void
 msi_create_source(void)
 {
 	struct msi_intsrc *msi;
 	u_int irq;
 
 	mtx_lock(&msi_lock);
 	if (msi_last_irq >= NUM_MSI_INTS) {
 		mtx_unlock(&msi_lock);
 		return;
 	}
 	irq = msi_last_irq + FIRST_MSI_INT;
 	msi_last_irq++;
 	mtx_unlock(&msi_lock);
 
 	msi = malloc(sizeof(struct msi_intsrc), M_MSI, M_WAITOK | M_ZERO);
 	msi->msi_intsrc.is_pic = &msi_pic;
 	msi->msi_irq = irq;
 	intr_register_source(&msi->msi_intsrc);
 	nexus_add_irq(irq);
 }
 
 /*
  * Try to allocate 'count' interrupt sources with contiguous IDT values.
  */
 int
 msi_alloc(device_t dev, int count, int maxcount, int *irqs)
 {
 	struct msi_intsrc *msi, *fsrc;
-	u_int cpu;
+	u_int cpu, domain;
 	int cnt, i, *mirqs, vector;
 #ifdef ACPI_DMAR
 	u_int cookies[count];
 	int error;
 #endif
 
 	if (!msi_enabled)
 		return (ENXIO);
 
+	if (bus_get_domain(dev, &domain) != 0)
+		domain = 0;
+
 	if (count > 1)
 		mirqs = malloc(count * sizeof(*mirqs), M_MSI, M_WAITOK);
 	else
 		mirqs = NULL;
 again:
 	mtx_lock(&msi_lock);
 
 	/* Try to find 'count' free IRQs. */
 	cnt = 0;
 	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(i);
 
 		/* End of allocated sources, so break. */
 		if (msi == NULL)
 			break;
 
 		/* If this is a free one, save its IRQ in the array. */
 		if (msi->msi_dev == NULL) {
 			irqs[cnt] = i;
 			cnt++;
 			if (cnt == count)
 				break;
 		}
 	}
 
 	/* Do we need to create some new sources? */
 	if (cnt < count) {
 		/* If we would exceed the max, give up. */
 		if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) {
 			mtx_unlock(&msi_lock);
 			free(mirqs, M_MSI);
 			return (ENXIO);
 		}
 		mtx_unlock(&msi_lock);
 
 		/* We need count - cnt more sources. */
 		while (cnt < count) {
 			msi_create_source();
 			cnt++;
 		}
 		goto again;
 	}
 
 	/* Ok, we now have the IRQs allocated. */
 	KASSERT(cnt == count, ("count mismatch"));
 
 	/* Allocate 'count' IDT vectors. */
-	cpu = intr_next_cpu();
+	cpu = intr_next_cpu(domain);
 	vector = apic_alloc_vectors(cpu, irqs, count, maxcount);
 	if (vector == 0) {
 		mtx_unlock(&msi_lock);
 		free(mirqs, M_MSI);
 		return (ENOSPC);
 	}
 
 #ifdef ACPI_DMAR
 	mtx_unlock(&msi_lock);
 	error = iommu_alloc_msi_intr(dev, cookies, count);
 	mtx_lock(&msi_lock);
 	if (error == EOPNOTSUPP)
 		error = 0;
 	if (error != 0) {
 		for (i = 0; i < count; i++)
 			apic_free_vector(cpu, vector + i, irqs[i]);
 		free(mirqs, M_MSI);
 		return (error);
 	}
 	for (i = 0; i < count; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
 		msi->msi_remap_cookie = cookies[i];
 	}
 #endif
 
 	/* Assign IDT vectors and make these messages owned by 'dev'. */
 	fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
 	for (i = 0; i < count; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
 		msi->msi_cpu = cpu;
 		msi->msi_dev = dev;
 		msi->msi_vector = vector + i;
 		if (bootverbose)
 			printf(
 		    "msi: routing MSI IRQ %d to local APIC %u vector %u\n",
 			    msi->msi_irq, msi->msi_cpu, msi->msi_vector);
 		msi->msi_first = fsrc;
 		KASSERT(msi->msi_intsrc.is_handlers == 0,
 		    ("dead MSI has handlers"));
 	}
 	fsrc->msi_count = count;
 	fsrc->msi_maxcount = maxcount;
 	if (count > 1)
 		bcopy(irqs, mirqs, count * sizeof(*mirqs));
 	fsrc->msi_irqs = mirqs;
 	mtx_unlock(&msi_lock);
 	return (0);
 }
 
 int
 msi_release(int *irqs, int count)
 {
 	struct msi_intsrc *msi, *first;
 	int i;
 
 	mtx_lock(&msi_lock);
 	first = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
 	if (first == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENOENT);
 	}
 
 	/* Make sure this isn't an MSI-X message. */
 	if (first->msi_msix) {
 		mtx_unlock(&msi_lock);
 		return (EINVAL);
 	}
 
 	/* Make sure this message is allocated to a group. */
 	if (first->msi_first == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENXIO);
 	}
 
 	/*
 	 * Make sure this is the start of a group and that we are releasing
 	 * the entire group.
 	 */
 	if (first->msi_first != first || first->msi_count != count) {
 		mtx_unlock(&msi_lock);
 		return (EINVAL);
 	}
 	KASSERT(first->msi_dev != NULL, ("unowned group"));
 
 	/* Clear all the extra messages in the group. */
 	for (i = 1; i < count; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
 		KASSERT(msi->msi_first == first, ("message not in group"));
 		KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch"));
 #ifdef ACPI_DMAR
 		iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie);
 #endif
 		msi->msi_first = NULL;
 		msi->msi_dev = NULL;
 		apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
 		msi->msi_vector = 0;
 	}
 
 	/* Clear out the first message. */
 #ifdef ACPI_DMAR
 	mtx_unlock(&msi_lock);
 	iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie);
 	mtx_lock(&msi_lock);
 #endif
 	first->msi_first = NULL;
 	first->msi_dev = NULL;
 	apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq);
 	first->msi_vector = 0;
 	first->msi_count = 0;
 	first->msi_maxcount = 0;
 	free(first->msi_irqs, M_MSI);
 	first->msi_irqs = NULL;
 
 	mtx_unlock(&msi_lock);
 	return (0);
 }
 
 int
 msi_map(int irq, uint64_t *addr, uint32_t *data)
 {
 	struct msi_intsrc *msi;
 	int error;
 #ifdef ACPI_DMAR
 	struct msi_intsrc *msi1;
 	int i, k;
 #endif
 
 	mtx_lock(&msi_lock);
 	msi = (struct msi_intsrc *)intr_lookup_source(irq);
 	if (msi == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENOENT);
 	}
 
 	/* Make sure this message is allocated to a device. */
 	if (msi->msi_dev == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENXIO);
 	}
 
 	/*
 	 * If this message isn't an MSI-X message, make sure it's part
 	 * of a group, and switch to the first message in the
 	 * group.
 	 */
 	if (!msi->msi_msix) {
 		if (msi->msi_first == NULL) {
 			mtx_unlock(&msi_lock);
 			return (ENXIO);
 		}
 		msi = msi->msi_first;
 	}
 
 #ifdef ACPI_DMAR
 	if (!msi->msi_msix) {
 		for (k = msi->msi_count - 1, i = FIRST_MSI_INT; k > 0 &&
 		    i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
 			if (i == msi->msi_irq)
 				continue;
 			msi1 = (struct msi_intsrc *)intr_lookup_source(i);
 			if (!msi1->msi_msix && msi1->msi_first == msi) {
 				mtx_unlock(&msi_lock);
 				iommu_map_msi_intr(msi1->msi_dev,
 				    msi1->msi_cpu, msi1->msi_vector,
 				    msi1->msi_remap_cookie, NULL, NULL);
 				k--;
 				mtx_lock(&msi_lock);
 			}
 		}
 	}
 	mtx_unlock(&msi_lock);
 	error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu,
 	    msi->msi_vector, msi->msi_remap_cookie, addr, data);
 #else
 	mtx_unlock(&msi_lock);
 	error = EOPNOTSUPP;
 #endif
 	if (error == EOPNOTSUPP) {
 		*addr = INTEL_ADDR(msi);
 		*data = INTEL_DATA(msi);
 		error = 0;
 	}
 	return (error);
 }
 
 int
 msix_alloc(device_t dev, int *irq)
 {
 	struct msi_intsrc *msi;
-	u_int cpu;
+	u_int cpu, domain;
 	int i, vector;
 #ifdef ACPI_DMAR
 	u_int cookie;
 	int error;
 #endif
 
 	if (!msi_enabled)
 		return (ENXIO);
 
+	if (bus_get_domain(dev, &domain) != 0)
+		domain = 0;
+
 again:
 	mtx_lock(&msi_lock);
 
 	/* Find a free IRQ. */
 	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
 		msi = (struct msi_intsrc *)intr_lookup_source(i);
 
 		/* End of allocated sources, so break. */
 		if (msi == NULL)
 			break;
 
 		/* Stop at the first free source. */
 		if (msi->msi_dev == NULL)
 			break;
 	}
 
 	/* Do we need to create a new source? */
 	if (msi == NULL) {
 		/* If we would exceed the max, give up. */
 		if (i + 1 > FIRST_MSI_INT + NUM_MSI_INTS) {
 			mtx_unlock(&msi_lock);
 			return (ENXIO);
 		}
 		mtx_unlock(&msi_lock);
 
 		/* Create a new source. */
 		msi_create_source();
 		goto again;
 	}
 
 	/* Allocate an IDT vector. */
-	cpu = intr_next_cpu();
+	cpu = intr_next_cpu(domain);
 	vector = apic_alloc_vector(cpu, i);
 	if (vector == 0) {
 		mtx_unlock(&msi_lock);
 		return (ENOSPC);
 	}
 
 	msi->msi_dev = dev;
 #ifdef ACPI_DMAR
 	mtx_unlock(&msi_lock);
 	error = iommu_alloc_msi_intr(dev, &cookie, 1);
 	mtx_lock(&msi_lock);
 	if (error == EOPNOTSUPP)
 		error = 0;
 	if (error != 0) {
 		msi->msi_dev = NULL;
 		apic_free_vector(cpu, vector, i);
 		return (error);
 	}
 	msi->msi_remap_cookie = cookie;
 #endif
 
 	if (bootverbose)
 		printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n",
 		    msi->msi_irq, cpu, vector);
 
 	/* Setup source. */
 	msi->msi_cpu = cpu;
 	msi->msi_first = msi;
 	msi->msi_vector = vector;
 	msi->msi_msix = 1;
 	msi->msi_count = 1;
 	msi->msi_maxcount = 1;
 	msi->msi_irqs = NULL;
 
 	KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI-X has handlers"));
 	mtx_unlock(&msi_lock);
 
 	*irq = i;
 	return (0);
 }
 
 int
 msix_release(int irq)
 {
 	struct msi_intsrc *msi;
 
 	mtx_lock(&msi_lock);
 	msi = (struct msi_intsrc *)intr_lookup_source(irq);
 	if (msi == NULL) {
 		mtx_unlock(&msi_lock);
 		return (ENOENT);
 	}
 
 	/* Make sure this is an MSI-X message. */
 	if (!msi->msi_msix) {
 		mtx_unlock(&msi_lock);
 		return (EINVAL);
 	}
 
 	KASSERT(msi->msi_dev != NULL, ("unowned message"));
 
 	/* Clear out the message. */
 #ifdef ACPI_DMAR
 	mtx_unlock(&msi_lock);
 	iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie);
 	mtx_lock(&msi_lock);
 #endif
 	msi->msi_first = NULL;
 	msi->msi_dev = NULL;
 	apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
 	msi->msi_vector = 0;
 	msi->msi_msix = 0;
 	msi->msi_count = 0;
 	msi->msi_maxcount = 0;
 
 	mtx_unlock(&msi_lock);
 	return (0);
 }
Index: head/sys/x86/x86/nexus.c
===================================================================
--- head/sys/x86/x86/nexus.c	(revision 331697)
+++ head/sys/x86/x86/nexus.c	(revision 331698)
@@ -1,905 +1,907 @@
 /*-
  * Copyright 1998 Massachusetts Institute of Technology
  *
  * Permission to use, copy, modify, and distribute this software and
  * its documentation for any purpose and without fee is hereby
  * granted, provided that both the above copyright notice and this
  * permission notice appear in all copies, that both the above
  * copyright notice and this permission notice appear in all
  * supporting documentation, and that the name of M.I.T. not be used
  * in advertising or publicity pertaining to distribution of the
  * software without specific, written prior permission.  M.I.T. makes
  * no representations about the suitability of this software for any
  * purpose.  It is provided "as is" without express or implied
  * warranty.
  *
  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * This code implements a `root nexus' for Intel Architecture
  * machines.  The function of the root nexus is to serve as an
  * attachment point for both processors and buses, and to manage
  * resources which are common to all of them.  In particular,
  * this code implements the core resource managers for interrupt
  * requests, DMA requests (which rightfully should be a part of the
  * ISA code but it's easier to do it here for now), I/O port addresses,
  * and I/O memory address space.
  */
 
 #ifdef __amd64__
 #define	DEV_APIC
 #else
 #include "opt_apic.h"
 #endif
 #include "opt_isa.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/linker.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <machine/bus.h>
 #include <machine/intr_machdep.h>
 #include <sys/rman.h>
 #include <sys/interrupt.h>
 
 #include <machine/vmparam.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/metadata.h>
 #include <machine/nexusvar.h>
 #include <machine/resource.h>
 #include <machine/pc/bios.h>
 
 #ifdef DEV_APIC
 #include "pcib_if.h"
 #endif
 
 #ifdef DEV_ISA
 #include <isa/isavar.h>
 #include <isa/isareg.h>
 #endif
 #include <sys/rtprio.h>
 
 #define	ELF_KERN_STR	("elf"__XSTRING(__ELF_WORD_SIZE)" kernel")
 
 static MALLOC_DEFINE(M_NEXUSDEV, "nexusdev", "Nexus device");
 
 #define DEVTONX(dev)	((struct nexus_device *)device_get_ivars(dev))
 
 struct rman irq_rman, drq_rman, port_rman, mem_rman;
 
 static	int nexus_probe(device_t);
 static	int nexus_attach(device_t);
 static	int nexus_print_all_resources(device_t dev);
 static	int nexus_print_child(device_t, device_t);
 static device_t nexus_add_child(device_t bus, u_int order, const char *name,
 				int unit);
 static	struct resource *nexus_alloc_resource(device_t, device_t, int, int *,
 					      rman_res_t, rman_res_t, rman_res_t,
 					      u_int);
 static	int nexus_adjust_resource(device_t, device_t, int, struct resource *,
 				  rman_res_t, rman_res_t);
 #ifdef SMP
 static	int nexus_bind_intr(device_t, device_t, struct resource *, int);
 #endif
 static	int nexus_config_intr(device_t, int, enum intr_trigger,
 			      enum intr_polarity);
 static	int nexus_describe_intr(device_t dev, device_t child,
 				struct resource *irq, void *cookie,
 				const char *descr);
 static	int nexus_activate_resource(device_t, device_t, int, int,
 				    struct resource *);
 static	int nexus_deactivate_resource(device_t, device_t, int, int,
 				      struct resource *);
 static	int nexus_map_resource(device_t bus, device_t child, int type,
     			       struct resource *r,
 			       struct resource_map_request *argsp,
 			       struct resource_map *map);
 static	int nexus_unmap_resource(device_t bus, device_t child, int type,
 				 struct resource *r, struct resource_map *map);
 static	int nexus_release_resource(device_t, device_t, int, int,
 				   struct resource *);
 static	int nexus_setup_intr(device_t, device_t, struct resource *, int flags,
 			     driver_filter_t filter, void (*)(void *), void *,
 			      void **);
 static	int nexus_teardown_intr(device_t, device_t, struct resource *,
 				void *);
 static struct resource_list *nexus_get_reslist(device_t dev, device_t child);
 static	int nexus_set_resource(device_t, device_t, int, int,
 			       rman_res_t, rman_res_t);
 static	int nexus_get_resource(device_t, device_t, int, int,
 			       rman_res_t *, rman_res_t *);
 static void nexus_delete_resource(device_t, device_t, int, int);
 static	int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t,
 			   cpuset_t *);
 #ifdef DEV_APIC
 static	int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs);
 static	int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs);
 static	int nexus_alloc_msix(device_t pcib, device_t dev, int *irq);
 static	int nexus_release_msix(device_t pcib, device_t dev, int irq);
 static	int nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data);
 #endif
 
 static device_method_t nexus_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		nexus_probe),
 	DEVMETHOD(device_attach,	nexus_attach),
 	DEVMETHOD(device_detach,	bus_generic_detach),
 	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
 	DEVMETHOD(device_suspend,	bus_generic_suspend),
 	DEVMETHOD(device_resume,	bus_generic_resume),
 
 	/* Bus interface */
 	DEVMETHOD(bus_print_child,	nexus_print_child),
 	DEVMETHOD(bus_add_child,	nexus_add_child),
 	DEVMETHOD(bus_alloc_resource,	nexus_alloc_resource),
 	DEVMETHOD(bus_adjust_resource,	nexus_adjust_resource),
 	DEVMETHOD(bus_release_resource,	nexus_release_resource),
 	DEVMETHOD(bus_activate_resource, nexus_activate_resource),
 	DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource),
 	DEVMETHOD(bus_map_resource,	nexus_map_resource),
 	DEVMETHOD(bus_unmap_resource,	nexus_unmap_resource),
 	DEVMETHOD(bus_setup_intr,	nexus_setup_intr),
 	DEVMETHOD(bus_teardown_intr,	nexus_teardown_intr),
 #ifdef SMP
 	DEVMETHOD(bus_bind_intr,	nexus_bind_intr),
 #endif
 	DEVMETHOD(bus_config_intr,	nexus_config_intr),
 	DEVMETHOD(bus_describe_intr,	nexus_describe_intr),
 	DEVMETHOD(bus_get_resource_list, nexus_get_reslist),
 	DEVMETHOD(bus_set_resource,	nexus_set_resource),
 	DEVMETHOD(bus_get_resource,	nexus_get_resource),
 	DEVMETHOD(bus_delete_resource,	nexus_delete_resource),
 	DEVMETHOD(bus_get_cpus,		nexus_get_cpus),
 
 	/* pcib interface */
 #ifdef DEV_APIC
 	DEVMETHOD(pcib_alloc_msi,	nexus_alloc_msi),
 	DEVMETHOD(pcib_release_msi,	nexus_release_msi),
 	DEVMETHOD(pcib_alloc_msix,	nexus_alloc_msix),
 	DEVMETHOD(pcib_release_msix,	nexus_release_msix),
 	DEVMETHOD(pcib_map_msi,		nexus_map_msi),
 #endif
 
 	{ 0, 0 }
 };
 
 DEFINE_CLASS_0(nexus, nexus_driver, nexus_methods, 1);
 static devclass_t nexus_devclass;
 
 DRIVER_MODULE(nexus, root, nexus_driver, nexus_devclass, 0, 0);
 
 static int
 nexus_probe(device_t dev)
 {
 
 	device_quiet(dev);	/* suppress attach message for neatness */
 	return (BUS_PROBE_GENERIC);
 }
 
 void
 nexus_init_resources(void)
 {
 	int irq;
 
 	/*
 	 * XXX working notes:
 	 *
 	 * - IRQ resource creation should be moved to the PIC/APIC driver.
 	 * - DRQ resource creation should be moved to the DMAC driver.
 	 * - The above should be sorted to probe earlier than any child buses.
 	 *
 	 * - Leave I/O and memory creation here, as child probes may need them.
 	 *   (especially eg. ACPI)
 	 */
 
 	/*
 	 * IRQ's are on the mainboard on old systems, but on the ISA part
 	 * of PCI->ISA bridges.  There would be multiple sets of IRQs on
 	 * multi-ISA-bus systems.  PCI interrupts are routed to the ISA
 	 * component, so in a way, PCI can be a partial child of an ISA bus(!).
 	 * APIC interrupts are global though.
 	 */
 	irq_rman.rm_start = 0;
 	irq_rman.rm_type = RMAN_ARRAY;
 	irq_rman.rm_descr = "Interrupt request lines";
 	irq_rman.rm_end = NUM_IO_INTS - 1;
 	if (rman_init(&irq_rman))
 		panic("nexus_init_resources irq_rman");
 
 	/*
 	 * We search for regions of existing IRQs and add those to the IRQ
 	 * resource manager.
 	 */
 	for (irq = 0; irq < NUM_IO_INTS; irq++)
 		if (intr_lookup_source(irq) != NULL)
 			if (rman_manage_region(&irq_rman, irq, irq) != 0)
 				panic("nexus_init_resources irq_rman add");
 
 	/*
 	 * ISA DMA on PCI systems is implemented in the ISA part of each
 	 * PCI->ISA bridge and the channels can be duplicated if there are
 	 * multiple bridges.  (eg: laptops with docking stations)
 	 */
 	drq_rman.rm_start = 0;
 	drq_rman.rm_end = 7;
 	drq_rman.rm_type = RMAN_ARRAY;
 	drq_rman.rm_descr = "DMA request lines";
 	/* XXX drq 0 not available on some machines */
 	if (rman_init(&drq_rman)
 	    || rman_manage_region(&drq_rman,
 				  drq_rman.rm_start, drq_rman.rm_end))
 		panic("nexus_init_resources drq_rman");
 
 	/*
 	 * However, IO ports and Memory truely are global at this level,
 	 * as are APIC interrupts (however many IO APICS there turn out
 	 * to be on large systems..)
 	 */
 	port_rman.rm_start = 0;
 	port_rman.rm_end = 0xffff;
 	port_rman.rm_type = RMAN_ARRAY;
 	port_rman.rm_descr = "I/O ports";
 	if (rman_init(&port_rman)
 	    || rman_manage_region(&port_rman, 0, 0xffff))
 		panic("nexus_init_resources port_rman");
 
 	mem_rman.rm_start = 0;
 #ifndef PAE
 	mem_rman.rm_end = BUS_SPACE_MAXADDR;
 #else
 	mem_rman.rm_end = ((1ULL << cpu_maxphyaddr) - 1);
 #endif
 	mem_rman.rm_type = RMAN_ARRAY;
 	mem_rman.rm_descr = "I/O memory addresses";
 	if (rman_init(&mem_rman)
 	    || rman_manage_region(&mem_rman, 0, mem_rman.rm_end))
 		panic("nexus_init_resources mem_rman");
 }
 
 static int
 nexus_attach(device_t dev)
 {
 
 	nexus_init_resources();
 	bus_generic_probe(dev);
 
 	/*
 	 * Explicitly add the legacy0 device here.  Other platform
 	 * types (such as ACPI), use their own nexus(4) subclass
 	 * driver to override this routine and add their own root bus.
 	 */
 	if (BUS_ADD_CHILD(dev, 10, "legacy", 0) == NULL)
 		panic("legacy: could not attach");
 	bus_generic_attach(dev);
 	return 0;
 }
 
 static int
 nexus_print_all_resources(device_t dev)
 {
 	struct	nexus_device *ndev = DEVTONX(dev);
 	struct resource_list *rl = &ndev->nx_resources;
 	int retval = 0;
 
 	if (STAILQ_FIRST(rl))
 		retval += printf(" at");
 
 	retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx");
 	retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx");
 	retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd");
 
 	return retval;
 }
 
 static int
 nexus_print_child(device_t bus, device_t child)
 {
 	int retval = 0;
 
 	retval += bus_print_child_header(bus, child);
 	retval += nexus_print_all_resources(child);
 	if (device_get_flags(child))
 		retval += printf(" flags %#x", device_get_flags(child));
 	retval += printf(" on motherboard\n");	/* XXX "motherboard", ick */
 
 	return (retval);
 }
 
 /*
  * Add a child device to the nexus and attach a freshly allocated,
  * zeroed struct nexus_device (holding an empty resource list) as its
  * ivars.
  *
  * Returns the new child, or NULL if either the ivars allocation
  * (M_NOWAIT) or device_add_child_ordered() fails.
  */
 static device_t
 nexus_add_child(device_t bus, u_int order, const char *name, int unit)
 {
 	device_t		child;
 	struct nexus_device	*ndev;
 
 	ndev = malloc(sizeof(struct nexus_device), M_NEXUSDEV, M_NOWAIT|M_ZERO);
 	if (ndev == NULL)
 		return (NULL);
 	resource_list_init(&ndev->nx_resources);
 
 	child = device_add_child_ordered(bus, order, name, unit);
 	if (child == NULL) {
 		/*
 		 * No device will ever own these ivars: release them instead
 		 * of leaking them and passing NULL to device_set_ivars().
 		 */
 		free(ndev, M_NEXUSDEV);
 		return (NULL);
 	}
 
 	/* should we free this in nexus_child_detached? */
 	device_set_ivars(child, ndev);
 
 	return (child);
 }
 
 /*
  * Map a resource type to the resource manager that tracks it.
  * Returns NULL for types the nexus does not manage.
  */
 static struct rman *
 nexus_rman(int type)
 {
 
 	if (type == SYS_RES_IRQ)
 		return (&irq_rman);
 	if (type == SYS_RES_DRQ)
 		return (&drq_rman);
 	if (type == SYS_RES_IOPORT)
 		return (&port_rman);
 	if (type == SYS_RES_MEMORY)
 		return (&mem_rman);
 	return (NULL);
 }
 
 /*
  * Reserve a resource for child from the matching nexus resource manager.
  * NB: child is usually a child of one of our descendants rather than a
  * direct child of nexus0.  (Exceptions include npx.)
  *
  * Returns the reserved (and, if RF_ACTIVE was requested, activated)
  * resource, or NULL on any failure.
  */
 static struct resource *
 nexus_alloc_resource(device_t bus, device_t child, int type, int *rid,
 		     rman_res_t start, rman_res_t end, rman_res_t count,
 		     u_int flags)
 {
 	struct nexus_device *ndev;
 	struct resource_list_entry *entry;
 	struct resource *res;
 	struct rman *rm;
 	int want_active;
 
 	ndev = DEVTONX(child);
 	want_active = flags & RF_ACTIVE;
 
 	/*
 	 * A request for the "default" range of a RID is resolved through
 	 * the child's resource list, which only exists when the child is
 	 * ours (i.e. its resources aren't maintained by a child bus).
 	 */
 	if (RMAN_IS_DEFAULT_RANGE(start, end) && count == 1) {
 		if (device_get_parent(child) != bus || ndev == NULL)
 			return (NULL);
 		entry = resource_list_find(&ndev->nx_resources, type, *rid);
 		if (entry == NULL)
 			return (NULL);
 		start = entry->start;
 		end = entry->end;
 		count = entry->count;
 	}
 
 	/* Activation is done separately below, never by the rman itself. */
 	flags &= ~RF_ACTIVE;
 
 	rm = nexus_rman(type);
 	if (rm == NULL)
 		return (NULL);
 
 	res = rman_reserve_resource(rm, start, end, count, flags, child);
 	if (res == NULL)
 		return (NULL);
 	rman_set_rid(res, *rid);
 
 	if (want_active &&
 	    bus_activate_resource(child, type, *rid, res) != 0) {
 		rman_release_resource(res);
 		return (NULL);
 	}
 
 	return (res);
 }
 
 /*
  * Adjust the range of a resource owned by one of the nexus resource
  * managers.  Returns ENXIO for an unmanaged type and EINVAL when r does
  * not belong to the manager for that type.
  */
 static int
 nexus_adjust_resource(device_t bus, device_t child, int type,
     struct resource *r, rman_res_t start, rman_res_t end)
 {
 	struct rman *owner;
 
 	owner = nexus_rman(type);
 	if (owner == NULL)
 		return (ENXIO);
 	if (rman_is_region_manager(r, owner))
 		return (rman_adjust_resource(r, start, end));
 	return (EINVAL);
 }
 
 /*
  * Activate a resource and, for memory and I/O port resources that were
  * not allocated with RF_UNMAPPED, create and record its default mapping.
  * If the mapping fails the resource is deactivated again and the error
  * is returned.
  */
 static int
 nexus_activate_resource(device_t bus, device_t child, int type, int rid,
 			struct resource *r)
 {
 	struct resource_map mapping;
 	int rc;
 
 	rc = rman_activate_resource(r);
 	if (rc != 0)
 		return (rc);
 
 	/* Nothing to map: caller asked for an unmapped resource. */
 	if ((rman_get_flags(r) & RF_UNMAPPED) != 0)
 		return (0);
 	/* Only memory and I/O port resources get a mapping. */
 	if (type != SYS_RES_MEMORY && type != SYS_RES_IOPORT)
 		return (0);
 
 	rc = nexus_map_resource(bus, child, type, r, NULL, &mapping);
 	if (rc != 0) {
 		rman_deactivate_resource(r);
 		return (rc);
 	}
 	rman_set_mapping(r, &mapping);
 	return (0);
 }
 
 /*
  * Deactivate a resource and tear down the mapping recorded for it, if it
  * is a memory or I/O port resource that was not allocated RF_UNMAPPED.
  */
 static int
 nexus_deactivate_resource(device_t bus, device_t child, int type, int rid,
 			  struct resource *r)
 {
 	struct resource_map mapping;
 	int rc;
 
 	rc = rman_deactivate_resource(r);
 	if (rc != 0)
 		return (rc);
 
 	if ((rman_get_flags(r) & RF_UNMAPPED) != 0)
 		return (0);
 	if (type != SYS_RES_MEMORY && type != SYS_RES_IOPORT)
 		return (0);
 
 	rman_get_mapping(r, &mapping);
 	nexus_unmap_resource(bus, child, type, r, &mapping);
 	return (0);
 }
 
 /*
  * Create a mapping for an active I/O port or memory resource, or for a
  * sub-range of it, and describe the mapping in *map.
  *
  * argsp may be NULL, in which case the values set up by
  * resource_init_map_request() are used.
  *
  * Returns ENXIO if the resource is not active, EINVAL for any other
  * resource type or a range that does not fit inside the resource, and
  * 0 on success.
  */
 static int
 nexus_map_resource(device_t bus, device_t child, int type, struct resource *r,
     struct resource_map_request *argsp, struct resource_map *map)
 {
 	struct resource_map_request args;
 	rman_res_t end, length, start;
 
 	/* Resources must be active to be mapped. */
 	if (!(rman_get_flags(r) & RF_ACTIVE))
 		return (ENXIO);
 
 	/* Mappings are only supported on I/O and memory resources. */
 	switch (type) {
 	case SYS_RES_IOPORT:
 	case SYS_RES_MEMORY:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	resource_init_map_request(&args);
 	/* Copy at most the smaller of the caller's and our request size. */
 	if (argsp != NULL)
 		bcopy(argsp, &args, imin(argsp->size, args.size));
 	start = rman_get_start(r) + args.offset;
 	/* A requested length of zero selects rman_get_size(r). */
 	if (args.length == 0)
 		length = rman_get_size(r);
 	else
 		length = args.length;
 	end = start + length - 1;
 	/* Reject ranges outside the resource or that wrapped around. */
 	if (start > rman_get_end(r) || start < rman_get_start(r))
 		return (EINVAL);
 	if (end > rman_get_end(r) || end < start)
 		return (EINVAL);
 
 	/*
 	 * If this is a memory resource, map it into the kernel.
 	 * I/O port resources need no virtual mapping: the handle is the
 	 * port address itself.
 	 */
 	switch (type) {
 	case SYS_RES_IOPORT:
 		map->r_bushandle = start;
 		map->r_bustag = X86_BUS_SPACE_IO;
 		map->r_size = length;
 		map->r_vaddr = NULL;
 		break;
 	case SYS_RES_MEMORY:
 		map->r_vaddr = pmap_mapdev_attr(start, length, args.memattr);
 		map->r_bustag = X86_BUS_SPACE_MEM;
 		map->r_size = length;
 
 		/*
 		 * The handle is the virtual address.
 		 */
 		map->r_bushandle = (bus_space_handle_t)map->r_vaddr;
 		break;
 	}
 	return (0);
 }
 
 /*
  * Undo a mapping created by nexus_map_resource().  Memory mappings are
  * released with pmap_unmapdev(); I/O port mappings need no work.  Any
  * other resource type yields EINVAL.
  */
 static int
 nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r,
     struct resource_map *map)
 {
 
 	if (type == SYS_RES_MEMORY) {
 		pmap_unmapdev((vm_offset_t)map->r_vaddr, map->r_size);
 		return (0);
 	}
 	if (type == SYS_RES_IOPORT)
 		return (0);
 	return (EINVAL);
 }
 
 /*
  * Release a resource back to its resource manager, deactivating it
  * first if it is still active.  A deactivation error is returned as-is
  * and the resource is then left allocated.
  */
 static int
 nexus_release_resource(device_t bus, device_t child, int type, int rid,
 		       struct resource *r)
 {
 	int rc;
 
 	if ((rman_get_flags(r) & RF_ACTIVE) != 0) {
 		rc = bus_deactivate_resource(child, type, rid, r);
 		if (rc != 0)
 			return (rc);
 	}
 	return (rman_release_resource(r));
 }
 
 /*
  * Currently this uses the really grody interface from kern/kern_intr.c
  * (which really doesn't belong in kern/anything.c).  Eventually, all of
  * the code in kern_intr.c and machdep_intr.c should get moved here, since
  * this is going to be the official interface.
  */
 static int
 nexus_setup_intr(device_t bus, device_t child, struct resource *irq,
 		 int flags, driver_filter_t filter, void (*ihand)(void *),
 		 void *arg, void **cookiep)
 {
-	int		error;
+	int		error, domain;
 
 	/* somebody tried to setup an irq that failed to allocate! */
 	if (irq == NULL)
 		panic("nexus_setup_intr: NULL irq resource!");
 
 	*cookiep = NULL;
 	if ((rman_get_flags(irq) & RF_SHAREABLE) == 0)
 		flags |= INTR_EXCL;
 
 	/*
 	 * We depend here on rman_activate_resource() being idempotent.
 	 */
 	error = rman_activate_resource(irq);
 	if (error)
 		return (error);
+	if (bus_get_domain(child, &domain) != 0)
+		domain = 0;
 
 	error = intr_add_handler(device_get_nameunit(child),
-	    rman_get_start(irq), filter, ihand, arg, flags, cookiep);
+	    rman_get_start(irq), filter, ihand, arg, flags, cookiep, domain);
 
 	return (error);
 }
 
 /*
  * Remove an interrupt handler.  ih is handed straight to
  * intr_remove_handler(); dev, child and r are not used.
  */
 static int
 nexus_teardown_intr(device_t dev, device_t child, struct resource *r, void *ih)
 {
 	return (intr_remove_handler(ih));
 }
 
 #ifdef SMP
 /*
  * Bind the interrupt identified by the start of the irq resource to the
  * given CPU via intr_bind().  Only built on SMP kernels.
  */
 static int
 nexus_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu)
 {
 	return (intr_bind(rman_get_start(irq), cpu));
 }
 #endif
 
 /*
  * Configure trigger mode and polarity of an IRQ by forwarding to
  * intr_config_intr(); dev is not used.
  */
 static int
 nexus_config_intr(device_t dev, int irq, enum intr_trigger trig,
     enum intr_polarity pol)
 {
 	return (intr_config_intr(irq, trig, pol));
 }
 
 /*
  * Attach a description to an interrupt handler by forwarding the IRQ
  * number (start of the irq resource), the handler cookie and the
  * description string to intr_describe().
  */
 static int
 nexus_describe_intr(device_t dev, device_t child, struct resource *irq,
     void *cookie, const char *descr)
 {
 
 	return (intr_describe(rman_get_start(irq), cookie, descr));
 }
 
 /*
  * Return the resource list stored in a nexus child's ivars.
  */
 static struct resource_list *
 nexus_get_reslist(device_t dev, device_t child)
 {
 	struct nexus_device *ivars;
 
 	ivars = DEVTONX(child);
 	return (&ivars->nx_resources);
 }
 
 /*
  * Record the range [start, start + count - 1] under (type, rid) in the
  * child's resource list.  Always returns 0.
  */
 static int
 nexus_set_resource(device_t dev, device_t child, int type, int rid,
     rman_res_t start, rman_res_t count)
 {
 	struct nexus_device *ivars;
 	rman_res_t last;
 
 	ivars = DEVTONX(child);
 	last = start + count - 1;
 
 	/* XXX this should return a success/failure indicator */
 	resource_list_add(&ivars->nx_resources, type, rid, start, last, count);
 	return (0);
 }
 
 /*
  * Look up (type, rid) in the child's resource list and report its start
  * and count through the optional output pointers.  Returns ENOENT when
  * no such entry exists.
  */
 static int
 nexus_get_resource(device_t dev, device_t child, int type, int rid,
     rman_res_t *startp, rman_res_t *countp)
 {
 	struct nexus_device *ivars;
 	struct resource_list_entry *entry;
 
 	ivars = DEVTONX(child);
 	entry = resource_list_find(&ivars->nx_resources, type, rid);
 	if (entry == NULL)
 		return (ENOENT);
 
 	/* Either output pointer may be NULL if the caller is not interested. */
 	if (startp != NULL)
 		*startp = entry->start;
 	if (countp != NULL)
 		*countp = entry->count;
 	return (0);
 }
 
 /*
  * Delete the (type, rid) entry from the child's resource list.
  */
 static void
 nexus_delete_resource(device_t dev, device_t child, int type, int rid)
 {
 	struct nexus_device *ivars;
 
 	ivars = DEVTONX(child);
 	resource_list_delete(&ivars->nx_resources, type, rid);
 }
 
 /*
  * Report a CPU set for a child.  On SMP kernels an INTR_CPUS query is
  * answered with intr_cpus (EINVAL if setsize is not sizeof(cpuset_t));
  * every other query is passed to bus_generic_get_cpus().
  */
 static int
 nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize,
     cpuset_t *cpuset)
 {
 
 #ifdef SMP
 	if (op == INTR_CPUS) {
 		if (setsize != sizeof(cpuset_t))
 			return (EINVAL);
 		*cpuset = intr_cpus;
 		return (0);
 	}
 #endif
 	return (bus_generic_get_cpus(dev, child, op, setsize, cpuset));
 }
 
 /*
  * Add a single new IRQ to the IRQ resource manager; called from the MSI
  * code.  Panics if the region cannot be added.
  */
 void
 nexus_add_irq(u_long irq)
 {
 	int rc;
 
 	rc = rman_manage_region(&irq_rman, irq, irq);
 	if (rc != 0)
 		panic("%s: failed", __func__);
 }
 
 #ifdef DEV_APIC
 /*
  * MSI and MSI-X methods.  Each one is a thin wrapper that forwards to
  * the corresponding msi_*()/msix_*() routine; the pcib argument is not
  * used by any of them.
  */
 
 /* Allocate one MSI-X IRQ for dev; the IRQ is returned through *irq. */
 static int
 nexus_alloc_msix(device_t pcib, device_t dev, int *irq)
 {
 
 	return (msix_alloc(dev, irq));
 }
 
 /* Release an MSI-X IRQ; dev is not used. */
 static int
 nexus_release_msix(device_t pcib, device_t dev, int irq)
 {
 
 	return (msix_release(irq));
 }
 
 /* Allocate MSI IRQs for dev by forwarding to msi_alloc(). */
 static int
 nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs)
 {
 
 	return (msi_alloc(dev, count, maxcount, irqs));
 }
 
 /* Release count MSI IRQs listed in irqs; dev is not used. */
 static int
 nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs)
 {
 
 	return (msi_release(irqs, count));
 }
 
 /* Fetch the address/data pair for an MSI IRQ via msi_map(). */
 static int
 nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data)
 {
 
 	return (msi_map(irq, addr, data));
 }
 #endif
 
 /*
  * Placeholder device for system RAM: add a "ram0" child to the parent
  * unless it has been disabled through resource hints.  Panics if the
  * child cannot be added.
  */
 static void
 ram_identify(driver_t *driver, device_t parent)
 {
 	device_t ram;
 
 	if (resource_disabled("ram", 0))
 		return;
 
 	ram = BUS_ADD_CHILD(parent, 0, "ram", 0);
 	if (ram == NULL)
 		panic("ram_identify");
 }
 
 /*
  * Probe method for the RAM placeholder: mark the device quiet, set its
  * description and return 0.
  */
 static int
 ram_probe(device_t dev)
 {
 
 	device_quiet(dev);
 	device_set_desc(dev, "System RAM");
 	return (0);
 }
 
 /*
  * Attach method for the RAM placeholder: claim every region of system
  * memory as a SYS_RES_MEMORY resource of this device.
  *
  * The regions come from the loader-supplied SMAP when it is available,
  * otherwise from dump_avail[].  Any failure to set or allocate a
  * resource panics; the function itself always returns 0.
  */
 static int
 ram_attach(device_t dev)
 {
 	struct bios_smap *smapbase, *smap, *smapend;
 	struct resource *res;
 	vm_paddr_t *p;
 	caddr_t kmdp;
 	uint32_t smapsize;
 	int error, rid;
 
 	/* Retrieve the system memory map from the loader. */
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type(ELF_KERN_STR);  
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase != NULL) {
 		/*
 		 * The size of the SMAP data is read from the 32-bit word
 		 * stored immediately before it.
 		 */
 		smapsize = *((u_int32_t *)smapbase - 1);
 		smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 
 		/* rid only advances for entries that are actually claimed. */
 		rid = 0;
 		for (smap = smapbase; smap < smapend; smap++) {
 			/* Skip entries that are not usable memory or are empty. */
 			if (smap->type != SMAP_TYPE_MEMORY ||
 			    smap->length == 0)
 				continue;
 #ifdef __i386__
 			/*
 			 * Resources use long's to track resources, so
 			 * we can't include memory regions above 4GB.
 			 */
 			if (smap->base > ~0ul)
 				continue;
 #endif
 			error = bus_set_resource(dev, SYS_RES_MEMORY, rid,
 			    smap->base, smap->length);
 			if (error)
 				panic(
 				    "ram_attach: resource %d failed set with %d",
 				    rid, error);
 			res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid,
 			    0);
 			if (res == NULL)
 				panic("ram_attach: resource %d failed to attach",
 				    rid);
 			rid++;
 		}
 		return (0);
 	}
 
 	/*
 	 * If the system map is not available, fall back to using
 	 * dump_avail[].  We use the dump_avail[] array rather than
 	 * phys_avail[] for the memory map as phys_avail[] contains
 	 * holes for kernel memory, page 0, the message buffer, and
 	 * the dcons buffer.  We test the end address in the loop
 	 * instead of the start since the start address for the first
 	 * segment is 0.
 	 */
 	for (rid = 0, p = dump_avail; p[1] != 0; rid++, p += 2) {
 #ifdef PAE
 		/*
 		 * Resources use long's to track resources, so we can't
 		 * include memory regions above 4GB.
 		 */
 		if (p[0] > ~0ul)
 			break;
 #endif
 		/* Each dump_avail[] pair is (start, end); the count is end - start. */
 		error = bus_set_resource(dev, SYS_RES_MEMORY, rid, p[0],
 		    p[1] - p[0]);
 		if (error)
 			panic("ram_attach: resource %d failed set with %d", rid,
 			    error);
 		res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0);
 		if (res == NULL)
 			panic("ram_attach: resource %d failed to attach", rid);
 	}
 	return (0);
 }
 
 /* Method table for the "ram" placeholder driver. */
 static device_method_t ram_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_identify,	ram_identify),
 	DEVMETHOD(device_probe,		ram_probe),
 	DEVMETHOD(device_attach,	ram_attach),
 	{ 0, 0 }	/* table terminator */
 };
 
 /* Driver description: name, method table and softc size. */
 static driver_t ram_driver = {
 	"ram",
 	ram_methods,
 	1,		/* no softc */
 };
 
 static devclass_t ram_devclass;
 
 /* Register the "ram" driver as a child driver of nexus. */
 DRIVER_MODULE(ram, nexus, ram_driver, ram_devclass, 0, 0);
 
 #ifdef DEV_ISA
 /*
  * Placeholder which claims PnP 'devices' which describe system
  * resources.
  */
 static struct isa_pnp_id sysresource_ids[] = {
 	{ 0x010cd041 /* PNP0c01 */, "System Memory" },
 	{ 0x020cd041 /* PNP0c02 */, "System Resource" },
 	{ 0 }
 };
 
 /*
  * Probe for the PnP system-resource IDs listed in sysresource_ids.  The
  * device is marked quiet whenever the PnP probe result is <= 0; the
  * probe result is returned unchanged.
  */
 static int
 sysresource_probe(device_t dev)
 {
 	int	rv;
 
 	rv = ISA_PNP_PROBE(device_get_parent(dev), dev, sysresource_ids);
 	if (rv <= 0)
 		device_quiet(dev);
 	return (rv);
 }
 
 /*
  * Attach method for the system-resource placeholder: there is nothing
  * to set up, so it only reports success.
  */
 static int
 sysresource_attach(device_t dev)
 {
 	return(0);
 }
 
 /*
  * Method table for the "sysresource" placeholder driver.  Apart from
  * probe and attach, every method is the bus_generic_*() implementation.
  */
 static device_method_t sysresource_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		sysresource_probe),
 	DEVMETHOD(device_attach,	sysresource_attach),
 	DEVMETHOD(device_detach,	bus_generic_detach),
 	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
 	DEVMETHOD(device_suspend,	bus_generic_suspend),
 	DEVMETHOD(device_resume,	bus_generic_resume),
 	{ 0, 0 }	/* table terminator */
 };
 
 /* Driver description: name, method table and softc size. */
 static driver_t sysresource_driver = {
 	"sysresource",
 	sysresource_methods,
 	1,		/* no softc */
 };
 
 static devclass_t sysresource_devclass;
 
 /* Register the driver on the isa bus and export its PnP ID table. */
 DRIVER_MODULE(sysresource, isa, sysresource_driver, sysresource_devclass, 0, 0);
 ISA_PNP_INFO(sysresource_ids);
 #endif /* DEV_ISA */
Index: head/sys/x86/xen/xen_intr.c
===================================================================
--- head/sys/x86/xen/xen_intr.c	(revision 331697)
+++ head/sys/x86/xen/xen_intr.c	(revision 331698)
@@ -1,1668 +1,1668 @@
 /******************************************************************************
  * xen_intr.c
  *
  * Xen event and interrupt services for x86 HVM guests.
  *
  * Copyright (c) 2002-2005, K A Fraser
  * Copyright (c) 2005, Intel Corporation <xiaofeng.ling@intel.com>
  * Copyright (c) 2012, Spectra Logic Corporation
  *
  * This file may be distributed separately from the Linux kernel, or
  * incorporated into other software packages, subject to the following license:
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this source file (the "Software"), to deal in the Software without
  * restriction, including without limitation the rights to use, copy, modify,
  * merge, publish, distribute, sublicense, and/or sell copies of the Software,
  * and to permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/interrupt.h>
 #include <sys/pcpu.h>
 #include <sys/smp.h>
 #include <sys/refcount.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <x86/apicreg.h>
 #include <machine/smp.h>
 #include <machine/stdarg.h>
 
 #include <machine/xen/synch_bitops.h>
 #include <machine/xen/xen-os.h>
 
 #include <xen/hypervisor.h>
 #include <xen/xen_intr.h>
 #include <xen/evtchn/evtchnvar.h>
 
 #include <dev/xen/xenpci/xenpcivar.h>
 #include <dev/pci/pcivar.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 static MALLOC_DEFINE(M_XENINTR, "xen_intr", "Xen Interrupt Services");
 
 /**
  * Per-cpu event channel processing state.
  */
 struct xen_intr_pcpu_data {
 	/**
 	 * The last event channel bitmap section (level one bit) processed.
 	 * This is used to ensure we scan all ports before
 	 * servicing an already serviced port again.
 	 */
 	u_int	last_processed_l1i;
 
 	/**
 	 * The last event channel processed within the event channel
 	 * bitmap being scanned.
 	 */
 	u_int	last_processed_l2i;
 
 	/** Pointer to this CPU's interrupt statistic counter. */
 	u_long *evtchn_intrcnt;
 
 	/**
 	 * A bitmap of ports that can be serviced from this CPU.
 	 * A set bit means interrupt handling is enabled.
 	 *
 	 * xen_intr_active_ports() asserts at compile time that this array
 	 * has the same size as the shared info evtchn_mask array.
 	 */
 	u_long	evtchn_enabled[sizeof(u_long) * 8];
 };
 
 /*
  * Start the scan at port 0 by initializing the last scanned
  * location as the highest numbered event channel port.
  */
 static DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = {
 	.last_processed_l1i = LONG_BIT - 1,
 	.last_processed_l2i = LONG_BIT - 1
 };
 
 DPCPU_DECLARE(struct vcpu_info *, vcpu_info);
 
 #define	XEN_EEXIST		17 /* Xen "already exists" error */
 #define	XEN_ALLOCATE_VECTOR	0 /* Allocate a vector for this event channel */
 #define	XEN_INVALID_EVTCHN	0 /* Invalid event channel */
 
 #define	is_valid_evtchn(x)	((x) != XEN_INVALID_EVTCHN)
 
 /**
  * State kept for one Xen event channel interrupt source.
  *
  * Pointers to this structure are cast to and from struct intsrc
  * pointers elsewhere in this file, so xi_intsrc must remain the first
  * member.
  */
 struct xenisrc {
 	struct intsrc	xi_intsrc;	/* Generic interrupt source. */
 	enum evtchn_type xi_type;	/* EVTCHN_TYPE_UNBOUND when free. */
 	int		xi_cpu;		/* VCPU for delivery. */
 	int		xi_vector;	/* Global isrc vector number. */
 	evtchn_port_t	xi_port;	/* Bound event channel port. */
 	int		xi_pirq;
 	int		xi_virq;
 	void		*xi_cookie;
 	u_int		xi_close:1;	/* close on unbind? */
 	u_int		xi_activehi:1;
 	u_int		xi_edgetrigger:1;
 	u_int		xi_masked:1;
 	volatile u_int	xi_refcount;	/* Set to 1 when the port is bound. */
 };
 
 static void	xen_intr_suspend(struct pic *);
 static void	xen_intr_resume(struct pic *, bool suspend_cancelled);
 static void	xen_intr_enable_source(struct intsrc *isrc);
 static void	xen_intr_disable_source(struct intsrc *isrc, int eoi);
 static void	xen_intr_eoi_source(struct intsrc *isrc);
 static void	xen_intr_enable_intr(struct intsrc *isrc);
 static void	xen_intr_disable_intr(struct intsrc *isrc);
 static int	xen_intr_vector(struct intsrc *isrc);
 static int	xen_intr_source_pending(struct intsrc *isrc);
 static int	xen_intr_config_intr(struct intsrc *isrc,
 		     enum intr_trigger trig, enum intr_polarity pol);
 static int	xen_intr_assign_cpu(struct intsrc *isrc, u_int apic_id);
 
 static void	xen_intr_pirq_enable_source(struct intsrc *isrc);
 static void	xen_intr_pirq_disable_source(struct intsrc *isrc, int eoi);
 static void	xen_intr_pirq_eoi_source(struct intsrc *isrc);
 static void	xen_intr_pirq_enable_intr(struct intsrc *isrc);
 static void	xen_intr_pirq_disable_intr(struct intsrc *isrc);
 static int	xen_intr_pirq_config_intr(struct intsrc *isrc,
 		     enum intr_trigger trig, enum intr_polarity pol);
 
 /**
  * PIC interface for all event channel port types except physical IRQs.
  */
 struct pic xen_intr_pic = {
 	.pic_enable_source  = xen_intr_enable_source,
 	.pic_disable_source = xen_intr_disable_source,
 	.pic_eoi_source     = xen_intr_eoi_source,
 	.pic_enable_intr    = xen_intr_enable_intr,
 	.pic_disable_intr   = xen_intr_disable_intr,
 	.pic_vector         = xen_intr_vector,
 	.pic_source_pending = xen_intr_source_pending,
 	.pic_suspend        = xen_intr_suspend,
 	.pic_resume         = xen_intr_resume,
 	.pic_config_intr    = xen_intr_config_intr,
 	.pic_assign_cpu     = xen_intr_assign_cpu
 };
 
 /**
  * PIC interface for all event channels representing
  * physical interrupt sources.
  */
 struct pic xen_intr_pirq_pic = {
 	.pic_enable_source  = xen_intr_pirq_enable_source,
 	.pic_disable_source = xen_intr_pirq_disable_source,
 	.pic_eoi_source     = xen_intr_pirq_eoi_source,
 	.pic_enable_intr    = xen_intr_pirq_enable_intr,
 	.pic_disable_intr   = xen_intr_pirq_disable_intr,
 	.pic_vector         = xen_intr_vector,
 	.pic_source_pending = xen_intr_source_pending,
 	.pic_config_intr    = xen_intr_pirq_config_intr,
 	.pic_assign_cpu     = xen_intr_assign_cpu
 };
 
 static struct mtx	 xen_intr_isrc_lock;
 static int		 xen_intr_auto_vector_count;
 static struct xenisrc	*xen_intr_port_to_isrc[NR_EVENT_CHANNELS];
 static u_long		*xen_intr_pirq_eoi_map;
 static boolean_t	 xen_intr_pirq_eoi_map_enabled;
 
 /*------------------------- Private Functions --------------------------------*/
 /**
  * Disable signal delivery for an event channel port on the
  * specified CPU.
  *
  * \param cpu   The CPU whose per-cpu enable bitmap is updated.
  * \param port  The event channel port to mask.
  *
  * This API is used to manage the port<=>CPU binding of event
  * channel handlers.
  *
  * \note  This operation does not preclude reception of an event
  *        for this event channel on another CPU.  To mask the
  *        event channel globally, use evtchn_mask().
  */
 static inline void
 evtchn_cpu_mask_port(u_int cpu, evtchn_port_t port)
 {
 	struct xen_intr_pcpu_data *pcpu;
 
 	pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
 	/* Clear the port's bit in that CPU's evtchn_enabled bitmap. */
 	xen_clear_bit(port, pcpu->evtchn_enabled);
 }
 
 /**
  * Enable signal delivery for an event channel port on the
  * specified CPU.
  *
  * \param cpu   The CPU whose per-cpu enable bitmap is updated.
  * \param port  The event channel port to unmask.
  *
  * This API is used to manage the port<=>CPU binding of event
  * channel handlers.
  *
  * \note  This operation does not guarantee that event delivery
  *        is enabled for this event channel port.  The port must
  *        also be globally enabled.  See evtchn_unmask().
  */
 static inline void
 evtchn_cpu_unmask_port(u_int cpu, evtchn_port_t port)
 {
 	struct xen_intr_pcpu_data *pcpu;
 
 	pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
 	/* Set the port's bit in that CPU's evtchn_enabled bitmap. */
 	xen_set_bit(port, pcpu->evtchn_enabled);
 }
 
 /**
  * Allocate and register a per-cpu Xen upcall interrupt counter.
  *
  * The counter is named "cpu<N>:xen".  Calling this again for a CPU
  * whose counter pointer is already set is a no-op.
  *
  * \param cpu  The cpu for which to register this interrupt count.
  */
 static void
 xen_intr_intrcnt_add(u_int cpu)
 {
 	char buf[MAXCOMLEN + 1];
 	struct xen_intr_pcpu_data *pcpu;
 
 	pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
 	if (pcpu->evtchn_intrcnt != NULL)
 		return;
 
 	/* cpu is unsigned, so format it with %u rather than %d. */
 	snprintf(buf, sizeof(buf), "cpu%u:xen", cpu);
 	intrcnt_add(buf, &pcpu->evtchn_intrcnt);
 }
 
 /**
  * Look for a previously allocated Xen interrupt source object that is
  * currently unbound and claim it for the given type.
  *
  * The caller must hold xen_intr_isrc_lock.
  *
  * \param type  The event channel type assigned to the object that is
  *              returned.
  *
  * \return  A pointer to a free Xen interrupt source object or NULL.
  */
 static struct xenisrc *
 xen_intr_find_unused_isrc(enum evtchn_type type)
 {
 	struct xenisrc *candidate;
 	u_int vector;
 	int idx;
 
 	KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn isrc lock not held"));
 
 	/* Walk every vector handed out so far. */
 	for (idx = 0; idx < xen_intr_auto_vector_count; idx++) {
 		vector = FIRST_EVTCHN_INT + idx;
 		candidate = (struct xenisrc *)intr_lookup_source(vector);
 		if (candidate == NULL ||
 		    candidate->xi_type != EVTCHN_TYPE_UNBOUND)
 			continue;
 
 		KASSERT(candidate->xi_intsrc.is_handlers == 0,
 		    ("Free evtchn still has handlers"));
 		candidate->xi_type = type;
 		return (candidate);
 	}
 	return (NULL);
 }
 
 /**
  * Allocate a Xen interrupt source object.
  *
  * The caller must hold xen_intr_isrc_lock.  The lock is dropped while
  * the object is allocated and registered, and re-acquired before
  * returning.
  *
  * \param type    The type of interrupt source to create.
  * \param vector  The vector to use for EVTCHN_TYPE_PIRQ sources.  It is
  *                ignored for all other types, which are assigned the
  *                next automatically allocated vector.
  *
  * \return  A pointer to a newly allocated Xen interrupt source
  *          object or NULL.
  */
 static struct xenisrc *
 xen_intr_alloc_isrc(enum evtchn_type type, int vector)
 {
 	static int warned;
 	struct xenisrc *isrc;
 
 	KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn alloc lock not held"));
 
 	/*
 	 * Automatically allocated vectors are FIRST_EVTCHN_INT + count, so
 	 * once count reaches NR_EVENT_CHANNELS the next one would already be
 	 * past the last event channel vector.
 	 */
 	if (xen_intr_auto_vector_count >= NR_EVENT_CHANNELS) {
 		if (!warned) {
 			warned = 1;
 			printf("xen_intr_alloc: Event channels exhausted.\n");
 		}
 		return (NULL);
 	}
 
 	if (type != EVTCHN_TYPE_PIRQ) {
 		vector = FIRST_EVTCHN_INT + xen_intr_auto_vector_count;
 		xen_intr_auto_vector_count++;
 	}
 
 	KASSERT((intr_lookup_source(vector) == NULL),
 	    ("Trying to use an already allocated vector"));
 
 	/* The allocation below uses M_WAITOK, so drop the mutex around it. */
 	mtx_unlock(&xen_intr_isrc_lock);
 	isrc = malloc(sizeof(*isrc), M_XENINTR, M_WAITOK | M_ZERO);
 	isrc->xi_intsrc.is_pic =
 	    (type == EVTCHN_TYPE_PIRQ) ? &xen_intr_pirq_pic : &xen_intr_pic;
 	isrc->xi_vector = vector;
 	isrc->xi_type = type;
 	intr_register_source(&isrc->xi_intsrc);
 	mtx_lock(&xen_intr_isrc_lock);
 
 	return (isrc);
 }
 
 /**
  * Release an active Xen interrupt source object so that it can be
  * reused: mask and clear its port, move the port's per-cpu binding back
  * to CPU 0, optionally close the event channel and reset the object to
  * the unbound state.
  *
  * The source must no longer have any handlers; this is only checked by
  * an assertion.
  *
  * \param isrc  The interrupt source object to release.
  *
  * \returns  Always 0.
  */
 static int
 xen_intr_release_isrc(struct xenisrc *isrc)
 {
 
 	mtx_lock(&xen_intr_isrc_lock);
 	KASSERT(isrc->xi_intsrc.is_handlers == 0,
 	    ("Release called, but xenisrc still in use"));
 	evtchn_mask_port(isrc->xi_port);
 	evtchn_clear_port(isrc->xi_port);
 
 	/* Rebind port to CPU 0. */
 	evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
 	evtchn_cpu_unmask_port(0, isrc->xi_port);
 
 	/* Only close ports that were marked close-on-unbind and are valid. */
 	if (isrc->xi_close != 0 && is_valid_evtchn(isrc->xi_port)) {
 		struct evtchn_close close = { .port = isrc->xi_port };
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
 			panic("EVTCHNOP_close failed");
 	}
 
 	/* Return the object to the unbound state for later reuse. */
 	xen_intr_port_to_isrc[isrc->xi_port] = NULL;
 	isrc->xi_cpu = 0;
 	isrc->xi_type = EVTCHN_TYPE_UNBOUND;
 	isrc->xi_port = 0;
 	isrc->xi_cookie = NULL;
 	mtx_unlock(&xen_intr_isrc_lock);
 	return (0);
 }
 
 /**
  * Associate an interrupt handler with an already allocated local Xen
  * event channel port.
  *
  * \param isrcp       The returned Xen interrupt object associated with
  *                    the specified local port.
  * \param local_port  The event channel to bind.
  * \param type        The event channel type of local_port.
  * \param intr_owner  The device making this bind request.
  * \param filter      An interrupt filter handler.  Specify NULL
  *                    to always dispatch to the ithread handler.
  * \param handler     An interrupt ithread handler.  Optional (can
  *                    specify NULL) if all necessary event actions
  *                    are performed by filter.
  * \param arg         Argument to present to both filter and handler.
  * \param flags       Interrupt handler flags.  See sys/bus.h.
  * \param port_handlep Pointer to an opaque handle used to manage this
  *                    registration.
  *
  * \returns  0 on success, otherwise an errno.
  */
 static int
 xen_intr_bind_isrc(struct xenisrc **isrcp, evtchn_port_t local_port,
     enum evtchn_type type, const char *intr_owner, driver_filter_t filter,
     driver_intr_t handler, void *arg, enum intr_type flags,
     xen_intr_handle_t *port_handlep)
 {
 	struct xenisrc *isrc;
 	int error;
 
 	*isrcp = NULL;
 	if (port_handlep == NULL) {
 		printf("%s: xen_intr_bind_isrc: Bad event handle\n",
 		    intr_owner);
 		return (EINVAL);
 	}
 
 	mtx_lock(&xen_intr_isrc_lock);
 	isrc = xen_intr_find_unused_isrc(type);
 	if (isrc == NULL) {
 		isrc = xen_intr_alloc_isrc(type, XEN_ALLOCATE_VECTOR);
 		if (isrc == NULL) {
 			mtx_unlock(&xen_intr_isrc_lock);
 			return (ENOSPC);
 		}
 	}
 	isrc->xi_port = local_port;
 	xen_intr_port_to_isrc[local_port] = isrc;
 	refcount_init(&isrc->xi_refcount, 1);
 	mtx_unlock(&xen_intr_isrc_lock);
 
 	/* Assign the opaque handler (the event channel port) */
 	*port_handlep = &isrc->xi_vector;
 
 #ifdef SMP
 	if (type == EVTCHN_TYPE_PORT) {
 		/*
 		 * By default all interrupts are assigned to vCPU#0
 		 * unless specified otherwise, so shuffle them to balance
 		 * the interrupt load.
 		 */
-		xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu());
+		xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu(0));
 	}
 #endif
 
 	if (filter == NULL && handler == NULL) {
 		/*
 		 * No filter/handler provided, leave the event channel
 		 * masked and without a valid handler, the caller is
 		 * in charge of setting that up.
 		 */
 		*isrcp = isrc;
 		return (0);
 	}
 
 	error = xen_intr_add_handler(intr_owner, filter, handler, arg, flags,
 	    *port_handlep);
 	if (error != 0) {
 		xen_intr_release_isrc(isrc);
 		return (error);
 	}
 	*isrcp = isrc;
 	return (0);
 }
 
 /**
  * Translate an interrupt binding handle into its Xen interrupt source
  * object.
  *
  * \param handle  A handle initialized by a previous call to
  *                xen_intr_bind_isrc().  It is read as a pointer to the
  *                source's vector number.
  *
  * \returns  A pointer to the Xen interrupt source object associated
  *           with the given interrupt handle.  NULL if the handle is
  *           NULL or no association currently exists.
  */
 static struct xenisrc *
 xen_intr_isrc(xen_intr_handle_t handle)
 {
 	int vec;
 
 	if (handle == NULL)
 		return (NULL);
 
 	vec = *(int *)handle;
 	KASSERT(vec >= FIRST_EVTCHN_INT &&
 	    vec < (FIRST_EVTCHN_INT + xen_intr_auto_vector_count),
 	    ("Xen interrupt vector is out of range"));
 
 	return ((struct xenisrc *)intr_lookup_source(vec));
 }
 
 /**
  * Determine the event channel ports at the given section of the
  * event port bitmap which have pending events for the given cpu.
  * 
  * \param pcpu  The Xen interrupt pcpu data for the cpu being queried.
  * \param sh    The Xen shared info area.
  * \param idx   The index of the section of the event channel bitmap to
  *              inspect.
  *
  * \returns  A u_long with bits set for every event channel with pending
  *           events.
  */
 static inline u_long
 xen_intr_active_ports(struct xen_intr_pcpu_data *pcpu, shared_info_t *sh,
     u_int idx)
 {
 
 	/*
 	 * The three bitmaps are combined word by word below, so they must
 	 * agree in both element size and total size.
 	 */
 	CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(sh->evtchn_pending[0]));
 	CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(pcpu->evtchn_enabled[0]));
 	CTASSERT(sizeof(sh->evtchn_mask) == sizeof(sh->evtchn_pending));
 	CTASSERT(sizeof(sh->evtchn_mask) == sizeof(pcpu->evtchn_enabled));
 	/* Active = pending, not globally masked, and enabled on this CPU. */
 	return (sh->evtchn_pending[idx]
 	      & ~sh->evtchn_mask[idx]
 	      & pcpu->evtchn_enabled[idx]);
 }
 
 /**
  * Interrupt handler for processing all Xen event channel events.
  *
  * The pending selector word is read and cleared once, and the ports are
  * then scanned in two levels: l1i selects a word of the pending bitmap,
  * l2i a bit inside that word.  Scanning resumes just after the
  * position recorded for the last port whose handlers were run.
  * 
  * \param trap_frame  The trap frame context for the current interrupt.
  */
 void
 xen_intr_handle_upcall(struct trapframe *trap_frame)
 {
 	u_int l1i, l2i, port, cpu;
 	u_long masked_l1, masked_l2;
 	struct xenisrc *isrc;
 	shared_info_t *s;
 	vcpu_info_t *v;
 	struct xen_intr_pcpu_data *pc;
 	u_long l1, l2;
 
 	/*
 	 * Disable preemption in order to always check and fire events
 	 * on the right vCPU
 	 */
 	critical_enter();
 
 	cpu = PCPU_GET(cpuid);
 	pc  = DPCPU_PTR(xen_intr_pcpu);
 	s   = HYPERVISOR_shared_info;
 	v   = DPCPU_GET(vcpu_info);
 
 	if (xen_hvm_domain() && !xen_vector_callback_enabled) {
 		KASSERT((cpu == 0), ("Fired PCI event callback on wrong CPU"));
 	}
 
 	v->evtchn_upcall_pending = 0;
 
 #if 0
 #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
 	/* Clear master flag /before/ clearing selector flag. */
 	wmb();
 #endif
 #endif
 
 	/* Atomically fetch and clear the set of bitmap words to scan. */
 	l1 = atomic_readandclear_long(&v->evtchn_pending_sel);
 
 	l1i = pc->last_processed_l1i;
 	l2i = pc->last_processed_l2i;
 	(*pc->evtchn_intrcnt)++;
 
 	while (l1 != 0) {
 
 		/* Only consider selector bits at or above the next index. */
 		l1i = (l1i + 1) % LONG_BIT;
 		masked_l1 = l1 & ((~0UL) << l1i);
 
 		if (masked_l1 == 0) {
 			/*
 			 * if we masked out all events, wrap around
 			 * to the beginning.
 			 */
 			l1i = LONG_BIT - 1;
 			l2i = LONG_BIT - 1;
 			continue;
 		}
 		l1i = ffsl(masked_l1) - 1;
 
 		do {
 			/* Re-read the active ports on every pass. */
 			l2 = xen_intr_active_ports(pc, s, l1i);
 
 			l2i = (l2i + 1) % LONG_BIT;
 			masked_l2 = l2 & ((~0UL) << l2i);
 
 			if (masked_l2 == 0) {
 				/* if we masked out all events, move on */
 				l2i = LONG_BIT - 1;
 				break;
 			}
 			l2i = ffsl(masked_l2) - 1;
 
 			/* process port */
 			port = (l1i * LONG_BIT) + l2i;
 			synch_clear_bit(port, &s->evtchn_pending[0]);
 
 			/*
 			 * A port without an interrupt source is skipped; the
 			 * continue jumps to the loop condition below.
 			 */
 			isrc = xen_intr_port_to_isrc[port];
 			if (__predict_false(isrc == NULL))
 				continue;
 
 			/* Make sure we are firing on the right vCPU */
 			KASSERT((isrc->xi_cpu == PCPU_GET(cpuid)),
 				("Received unexpected event on vCPU#%d, event bound to vCPU#%d",
 				PCPU_GET(cpuid), isrc->xi_cpu));
 
 			intr_execute_handlers(&isrc->xi_intsrc, trap_frame);
 
 			/*
 			 * If this is the final port processed,
 			 * we'll pick up here+1 next time.
 			 */
 			pc->last_processed_l1i = l1i;
 			pc->last_processed_l2i = l2i;
 
 		} while (l2i != LONG_BIT - 1);
 
 		l2 = xen_intr_active_ports(pc, s, l1i);
 		if (l2 == 0) {
 			/*
 			 * We handled all ports, so we can clear the
 			 * selector bit.
 			 */
 			l1 &= ~(1UL << l1i);
 		}
 	}
 	critical_exit();
 }
 
 /*
  * One-time setup of the Xen event channel interrupt code, run through
  * SYSINIT.  Does nothing when not running as a Xen domain.
  *
  * \param dummy  Unused SYSINIT argument.
  *
  * \returns  Always 0.
  */
 static int
 xen_intr_init(void *dummy __unused)
 {
 	shared_info_t *s = HYPERVISOR_shared_info;
 	struct xen_intr_pcpu_data *pcpu;
 	struct physdev_pirq_eoi_gmfn eoi_gmfn;
 	int i, rc;
 
 	if (!xen_domain())
 		return (0);
 
 	mtx_init(&xen_intr_isrc_lock, "xen-irq-lock", NULL, MTX_DEF);
 
 	/*
 	 * Register interrupt count manually as we aren't
 	 * guaranteed to see a call to xen_intr_assign_cpu()
 	 * before our first interrupt. Also set the per-cpu
 	 * mask of CPU#0 to enable all, since by default
 	 * all event channels are bound to CPU#0.
 	 */
 	CPU_FOREACH(i) {
 		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
 		memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0,
 		    sizeof(pcpu->evtchn_enabled));
 		xen_intr_intrcnt_add(i);
 	}
 
 	/* Start with every event channel masked in the shared info page. */
 	for (i = 0; i < nitems(s->evtchn_mask); i++)
 		atomic_store_rel_long(&s->evtchn_mask[i], ~0);
 
 	/* Try to register PIRQ EOI map */
 	xen_intr_pirq_eoi_map = malloc(PAGE_SIZE, M_XENINTR, M_WAITOK | M_ZERO);
 	eoi_gmfn.gmfn = atop(vtophys(xen_intr_pirq_eoi_map));
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn);
 	if (rc != 0) {
 		/*
 		 * Registration failed: the map must stay disabled whether
 		 * or not the boot is verbose, so that
 		 * xen_intr_pirq_enable_intr() queries the EOI requirement
 		 * of each PIRQ itself.
 		 */
 		if (bootverbose)
 			printf("Xen interrupts: unable to register PIRQ EOI map\n");
 	} else
 		xen_intr_pirq_eoi_map_enabled = true;
 
 	intr_register_pic(&xen_intr_pic);
 	intr_register_pic(&xen_intr_pirq_pic);
 
 	if (bootverbose)
 		printf("Xen interrupt system initialized\n");
 
 	return (0);
 }
 SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL);
 
 /*--------------------------- Common PIC Functions ---------------------------*/
 /**
  * Prepare this PIC for system suspension.
  *
  * Intentionally empty: no work is done here at suspend time.
  *
  * \param unused  The PIC being suspended (ignored).
  */
 static void
 xen_intr_suspend(struct pic *unused)
 {
 }
 
 /*
  * Bind a fresh IPI event channel for an existing interrupt source.
  *
  * The new port is bound on the vCPU of the CPU the source was assigned
  * to, recorded in the port -> isrc table, reassigned to that CPU and
  * finally unmasked.  Every failure panics.  On kernels built without
  * SMP, calling this function panics.
  *
  * \param isrc  The IPI interrupt source to rebind.
  */
 static void
 xen_rebind_ipi(struct xenisrc *isrc)
 {
 #ifdef SMP
 	const int target_cpu = isrc->xi_cpu;
 	const int target_vcpu = pcpu_find(target_cpu)->pc_vcpu_id;
 	struct evtchn_bind_ipi op = { .vcpu = target_vcpu };
 	int rc;
 
 	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &op);
 	if (rc != 0)
 		panic("unable to rebind xen IPI: %d", rc);
 
 	/* Adopt the new port; xi_cpu is reset to 0 before reassignment. */
 	isrc->xi_port = op.port;
 	isrc->xi_cpu = 0;
 	xen_intr_port_to_isrc[op.port] = isrc;
 
 	rc = xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[target_cpu]);
 	if (rc != 0)
 		panic("unable to bind xen IPI to CPU#%d: %d",
 		      target_cpu, rc);
 
 	evtchn_unmask_port(op.port);
 #else
 	panic("Resume IPI event channel on UP");
 #endif
 }
 
 static void
 xen_rebind_virq(struct xenisrc *isrc)
 {
 	int cpu = isrc->xi_cpu;
 	int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
 	int error;
 	struct evtchn_bind_virq bind_virq = { .virq = isrc->xi_virq,
 	                                      .vcpu = vcpu_id };
 
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
 	                                    &bind_virq);
 	if (error != 0)
 		panic("unable to rebind xen VIRQ#%d: %d", isrc->xi_virq, error);
 
 	isrc->xi_port = bind_virq.port;
 	isrc->xi_cpu = 0;
 	xen_intr_port_to_isrc[bind_virq.port] = isrc;
 
 #ifdef SMP
 	error = xen_intr_assign_cpu(&isrc->xi_intsrc,
 	                            cpu_apic_ids[cpu]);
 	if (error)
 		panic("unable to bind xen VIRQ#%d to CPU#%d: %d",
 		      isrc->xi_virq, cpu, error);
 #endif
 
 	evtchn_unmask_port(bind_virq.port);
 }
 
 /**
  * Return this PIC to service after being suspended.
  *
  * Nothing is done when the suspend was cancelled.  Otherwise the
  * per-CPU enable masks are reset (all ports enabled on CPU#0, none
  * elsewhere), every event channel is masked, the port -> isrc table
  * is wiped, and each automatically allocated source has its port
  * cleared; IPI and VIRQ sources are then rebound.
  *
  * \param unused             The PIC being resumed (ignored).
  * \param suspend_cancelled  True when the suspend did not take place.
  */
 static void
 xen_intr_resume(struct pic *unused, bool suspend_cancelled)
 {
 	shared_info_t *shinfo = HYPERVISOR_shared_info;
 	struct xen_intr_pcpu_data *pcpu;
 	struct xenisrc *isrc;
 	u_int idx;
 	int i;
 
 	if (suspend_cancelled)
 		return;
 
 	/* Per-CPU masks: CPU#0 accepts everything, other CPUs nothing. */
 	CPU_FOREACH(i) {
 		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
 		memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0,
 		    sizeof(pcpu->evtchn_enabled));
 	}
 
 	/* Mask every event channel in the shared info page. */
 	for (i = 0; i < nitems(shinfo->evtchn_mask); i++)
 		atomic_store_rel_long(&shinfo->evtchn_mask[i], ~0);
 
 	/* Drop all port -> isrc mappings. */
 	memset(xen_intr_port_to_isrc, 0, sizeof(xen_intr_port_to_isrc));
 
 	/* Clear the port of every source; rebind only IPIs and VIRQs. */
 	for (idx = 0; idx < xen_intr_auto_vector_count; idx++) {
 		isrc = (struct xenisrc *)intr_lookup_source(
 		    FIRST_EVTCHN_INT + idx);
 		if (isrc == NULL)
 			continue;
 
 		isrc->xi_port = 0;
 		if (isrc->xi_type == EVTCHN_TYPE_IPI)
 			xen_rebind_ipi(isrc);
 		else if (isrc->xi_type == EVTCHN_TYPE_VIRQ)
 			xen_rebind_virq(isrc);
 	}
 }
 
 /**
  * Disable a Xen interrupt source by masking its event channel port.
  *
  * \param base_isrc  The interrupt source to disable.
  */
 static void
 xen_intr_disable_intr(struct intsrc *base_isrc)
 {
 
 	evtchn_mask_port(((struct xenisrc *)base_isrc)->xi_port);
 }
 
 /**
  * Report the global interrupt vector number of a Xen interrupt source.
  *
  * \param base_isrc  The interrupt source to query.
  *
  * \return  The vector number stored in the given interrupt source.
  */
 static int
 xen_intr_vector(struct intsrc *base_isrc)
 {
 
 	return (((struct xenisrc *)base_isrc)->xi_vector);
 }
 
 /**
  * Determine whether or not interrupt events are pending on
  * the given interrupt source.
  *
  * \param isrc  The interrupt source to query (not inspected).
  *
  * \returns  Always 0: this implementation never reports pending events.
  */
 static int
 xen_intr_source_pending(struct intsrc *isrc)
 {
 	/*
 	 * EventChannels are edge triggered and never masked.
 	 * There can be no pending events.
 	 */
 	return (0);
 }
 
 /**
  * Perform configuration of an interrupt source.
  *
  * Trigger mode and polarity cannot be changed through this method,
  * so every request is refused.
  *
  * \param isrc  The interrupt source to configure (ignored).
  * \param trig  Edge or level (ignored).
  * \param pol   Active high or low (ignored).
  *
  * \returns  Always ENODEV.
  */
 static int
 xen_intr_config_intr(struct intsrc *isrc, enum intr_trigger trig,
     enum intr_polarity pol)
 {
 	/* Configuration is only possible via the evtchn apis. */
 	return (ENODEV);
 }
 
 /**
  * Configure CPU affinity for interrupt source event delivery.
  *
  * \param base_isrc  The interrupt source to configure.
  * \param apic_id    The apic id of the CPU for handling future events.
  *
  * \returns  EOPNOTSUPP on non-SMP kernels or when the vector callback
  *           is not enabled, EINVAL when the source has no valid port,
  *           and 0 otherwise.
  *
  * NOTE(review): the result of the EVTCHNOP_bind_vcpu hypercall is not
  * propagated; the function returns 0 even if that hypercall fails, in
  * which case the per-CPU masks and xi_cpu are left untouched.
  */
 static int
 xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id)
 {
 #ifdef SMP
 	struct evtchn_bind_vcpu bind_vcpu;
 	struct xenisrc *isrc;
 	u_int to_cpu, vcpu_id;
 	int error, masked;
 
 	if (xen_vector_callback_enabled == 0)
 		return (EOPNOTSUPP);
 
 	/* Translate the APIC id into a CPU id and its Xen vCPU id. */
 	to_cpu = apic_cpuid(apic_id);
 	vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id;
 	/* Done before the port is validated, i.e. also on the EINVAL path. */
 	xen_intr_intrcnt_add(to_cpu);
 
 	mtx_lock(&xen_intr_isrc_lock);
 	isrc = (struct xenisrc *)base_isrc;
 	if (!is_valid_evtchn(isrc->xi_port)) {
 		mtx_unlock(&xen_intr_isrc_lock);
 		return (EINVAL);
 	}
 
 	/*
 	 * Mask the event channel while binding it to prevent interrupt
 	 * delivery with an inconsistent state in isrc->xi_cpu.
 	 */
 	/* `masked' remembers the previous mask state for restoring at `out'. */
 	masked = evtchn_test_and_set_mask(isrc->xi_port);
 	if ((isrc->xi_type == EVTCHN_TYPE_VIRQ) ||
 		(isrc->xi_type == EVTCHN_TYPE_IPI)) {
 		/*
 		 * Virtual IRQs are associated with a cpu by
 		 * the Hypervisor at evtchn_bind_virq time, so
 		 * all we need to do is update the per-CPU masks.
 		 */
 		evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
 		isrc->xi_cpu = to_cpu;
 		evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
 		/* `error' is never assigned on this path (and never read). */
 		goto out;
 	}
 
 	bind_vcpu.port = isrc->xi_port;
 	bind_vcpu.vcpu = vcpu_id;
 
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu);
 	if (isrc->xi_cpu != to_cpu) {
 		if (error == 0) {
 			/* Commit to new binding by removing the old one. */
 			evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
 			isrc->xi_cpu = to_cpu;
 			evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
 		}
 	}
 
 out:
 	/* Only unmask if the port was not already masked on entry. */
 	if (masked == 0)
 		evtchn_unmask_port(isrc->xi_port);
 	mtx_unlock(&xen_intr_isrc_lock);
 	return (0);
 #else
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*------------------- Virtual Interrupt Source PIC Functions -----------------*/
 /*
  * Mask an interrupt source while it is being serviced.
  *
  * The port is masked unconditionally and the mask state found
  * beforehand is saved in xi_masked.
  *
  * \param base_isrc  The interrupt source to mask.
  * \param eoi        End-of-interrupt request (not used here).
  */
 static void
 xen_intr_disable_source(struct intsrc *base_isrc, int eoi)
 {
 	struct xenisrc *isrc = (struct xenisrc *)base_isrc;
 
 	/*
 	 * Remember whether the port was already masked.  The event
 	 * channel user-space device masks ports from its filter as part
 	 * of normal operation and unmasks them itself when needed, so
 	 * such ports must not be unmasked automatically by the generic
 	 * interrupt code.
 	 */
 	isrc->xi_masked = (evtchn_test_and_set_mask(isrc->xi_port) != 0);
 }
 
 /*
  * Unmask an interrupt source after it has been serviced.
  *
  * The port is left alone when xi_masked records that it was already
  * masked before xen_intr_disable_source() ran.
  *
  * \param base_isrc  The interrupt source to unmask (if necessary).
  */
 static void
 xen_intr_enable_source(struct intsrc *base_isrc)
 {
 	struct xenisrc *isrc = (struct xenisrc *)base_isrc;
 
 	if (isrc->xi_masked != 0)
 		return;
 
 	evtchn_unmask_port(isrc->xi_port);
 }
 
 /*
  * Perform any necessary end-of-interrupt acknowledgements.
  *
  * Intentionally empty: this method performs no acknowledgement.
  *
  * \param base_isrc  The interrupt source to EOI (ignored).
  */
 static void
 xen_intr_eoi_source(struct intsrc *base_isrc)
 {
 }
 
 /*
  * Enable the interrupt source by unmasking its event channel port.
  *
  * \param base_isrc  The interrupt source to enable.
  */
 static void
 xen_intr_enable_intr(struct intsrc *base_isrc)
 {
 
 	evtchn_unmask_port(((struct xenisrc *)base_isrc)->xi_port);
 }
 
 /*------------------ Physical Interrupt Source PIC Functions -----------------*/
 /*
  * Mask a physical interrupt source while it is being serviced.
  *
  * Only sources that are not edge triggered are masked.
  *
  * \param base_isrc  The interrupt source to mask (if necessary).
  * \param eoi        When equal to PIC_EOI, the source is also EOI'd.
  */
 static void
 xen_intr_pirq_disable_source(struct intsrc *base_isrc, int eoi)
 {
 	struct xenisrc *isrc = (struct xenisrc *)base_isrc;
 
 	if (!isrc->xi_edgetrigger)
 		evtchn_mask_port(isrc->xi_port);
 
 	if (eoi == PIC_EOI)
 		xen_intr_pirq_eoi_source(base_isrc);
 }
 
 /*
  * Unmask a physical interrupt source after it has been serviced.
  *
  * Edge triggered sources are left untouched.
  *
  * \param base_isrc  The interrupt source to unmask (if necessary).
  */
 static void
 xen_intr_pirq_enable_source(struct intsrc *base_isrc)
 {
 	struct xenisrc *isrc = (struct xenisrc *)base_isrc;
 
 	if (isrc->xi_edgetrigger != 0)
 		return;
 
 	evtchn_unmask_port(isrc->xi_port);
 }
 
 /*
  * Perform any necessary end-of-interrupt acknowledgements.
  *
  * \param isrc  The interrupt source to EOI.
  */
 static void
 xen_intr_pirq_eoi_source(struct intsrc *base_isrc)
 {
 	struct xenisrc *isrc;
 	int error;
 
 	isrc = (struct xenisrc *)base_isrc;
 
 	if (xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)) {
 		struct physdev_eoi eoi = { .irq = isrc->xi_pirq };
 
 		error = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
 		if (error != 0)
 			panic("Unable to EOI PIRQ#%d: %d\n",
 			    isrc->xi_pirq, error);
 	}
 }
 
 /*
  * Enable and unmask the interrupt source.
  *
  * Binds the PIRQ to a new event channel port, records the
  * port -> isrc mapping and unmasks the port.  Any hypercall failure
  * panics.
  *
  * \param base_isrc  The interrupt source to enable.
  */
 static void
 xen_intr_pirq_enable_intr(struct intsrc *base_isrc)
 {
 	struct xenisrc *isrc;
 	struct evtchn_bind_pirq bind_pirq;
 	struct physdev_irq_status_query irq_status;
 	int error;
 
 	isrc = (struct xenisrc *)base_isrc;
 
 	/*
 	 * When the EOI map is not flagged as enabled, ask the hypervisor
 	 * for this IRQ's status and set the map bit by hand if it
 	 * reports that an EOI is needed.
 	 */
 	if (!xen_intr_pirq_eoi_map_enabled) {
 		irq_status.irq = isrc->xi_pirq;
 		error = HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query,
 		    &irq_status);
 		if (error)
 			panic("unable to get status of IRQ#%d", isrc->xi_pirq);
 
 		if (irq_status.flags & XENIRQSTAT_needs_eoi) {
 			/*
 			 * Since the dynamic PIRQ EOI map is not available
 			 * mark the PIRQ as needing EOI unconditionally.
 			 */
 			xen_set_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map);
 		}
 	}
 
 	bind_pirq.pirq = isrc->xi_pirq;
 	/* Sources that are not edge triggered are bound with WILL_SHARE. */
 	bind_pirq.flags = isrc->xi_edgetrigger ? 0 : BIND_PIRQ__WILL_SHARE;
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
 	if (error)
 		panic("unable to bind IRQ#%d", isrc->xi_pirq);
 
 	isrc->xi_port = bind_pirq.port;
 
 	/* Publish the port -> isrc mapping under the isrc lock. */
 	mtx_lock(&xen_intr_isrc_lock);
 	KASSERT((xen_intr_port_to_isrc[bind_pirq.port] == NULL),
 	    ("trying to override an already setup event channel port"));
 	xen_intr_port_to_isrc[bind_pirq.port] = isrc;
 	mtx_unlock(&xen_intr_isrc_lock);
 
 	/* Unmask only once the mapping is in place. */
 	evtchn_unmask_port(isrc->xi_port);
 }
 
 /*
  * Disable an interrupt source.
  *
  * \param isrc  The interrupt source to disable.
  */
 static void
 xen_intr_pirq_disable_intr(struct intsrc *base_isrc)
 {
 	struct xenisrc *isrc;
 	struct evtchn_close close;
 	int error;
 
 	isrc = (struct xenisrc *)base_isrc;
 
 	evtchn_mask_port(isrc->xi_port);
 
 	close.port = isrc->xi_port;
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
 	if (error)
 		panic("unable to close event channel %d IRQ#%d",
 		    isrc->xi_port, isrc->xi_pirq);
 
 	mtx_lock(&xen_intr_isrc_lock);
 	xen_intr_port_to_isrc[isrc->xi_port] = NULL;
 	mtx_unlock(&xen_intr_isrc_lock);
 
 	isrc->xi_port = 0;
 }
 
 /**
  * Configure trigger mode and polarity of a physical interrupt source.
  *
  * The GSI is set up through the hypervisor.  If the hypervisor reports
  * that the GSI already exists, this is accepted unless the source is
  * recorded as edge triggered or active high and the request differs,
  * which panics.  Any other hypercall failure panics as well.
  *
  * \param base_isrc  The interrupt source to configure.
  * \param trig       Edge or level.
  * \param pol        Active high or low.
  *
  * \returns  0 (all failures panic).
  */
 static int
 xen_intr_pirq_config_intr(struct intsrc *base_isrc, enum intr_trigger trig,
     enum intr_polarity pol)
 {
 	struct xenisrc *isrc = (struct xenisrc *)base_isrc;
 	struct physdev_setup_gsi setup_gsi;
 	const int want_edge = (trig == INTR_TRIGGER_EDGE);
 	const int want_high = (pol == INTR_POLARITY_HIGH);
 	int rc;
 
 	KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM),
 	    ("%s: Conforming trigger or polarity\n", __func__));
 
 	/* The hypercall takes 0 for edge / active high, 1 otherwise. */
 	setup_gsi.gsi = isrc->xi_pirq;
 	setup_gsi.triggering = !want_edge;
 	setup_gsi.polarity = !want_high;
 
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
 	if (rc == -XEN_EEXIST) {
 		/* Already set up: tolerated unless the settings conflict. */
 		if ((isrc->xi_edgetrigger && !want_edge) ||
 		    (isrc->xi_activehi && !want_high))
 			panic("unable to reconfigure interrupt IRQ#%d",
 			    isrc->xi_pirq);
 	} else if (rc != 0)
 		panic("unable to configure IRQ#%d\n", isrc->xi_pirq);
 
 	isrc->xi_activehi = want_high;
 	isrc->xi_edgetrigger = want_edge;
 
 	return (0);
 }
 
 /*--------------------------- Public Functions -------------------------------*/
 /*------- API comments for these methods can be found in xen/xenintr.h -------*/
 int
 xen_intr_bind_local_port(device_t dev, evtchn_port_t local_port,
     driver_filter_t filter, driver_intr_t handler, void *arg,
     enum intr_type flags, xen_intr_handle_t *port_handlep)
 {
 	struct xenisrc *src;
 	int rc;
 
 	/* Attach an interrupt source to the caller-supplied port. */
 	rc = xen_intr_bind_isrc(&src, local_port, EVTCHN_TYPE_PORT,
 	    device_get_nameunit(dev), filter, handler, arg, flags,
 	    port_handlep);
 	if (rc == 0) {
 		/*
 		 * The port was opened by the caller rather than by the
 		 * Event Channel API, so unbind must not close it.
 		 */
 		src->xi_close = 0;
 	}
 
 	return (rc);
 }
 
 int
 xen_intr_alloc_and_bind_local_port(device_t dev, u_int remote_domain,
     driver_filter_t filter, driver_intr_t handler, void *arg,
     enum intr_type flags, xen_intr_handle_t *port_handlep)
 {
 	struct evtchn_alloc_unbound alloc_op;
 	struct xenisrc *src;
 	int rc;
 
 	/* Ask the hypervisor for an unbound port open to remote_domain. */
 	alloc_op.dom        = DOMID_SELF;
 	alloc_op.remote_dom = remote_domain;
 	rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
 		    &alloc_op);
 	if (rc != 0) {
 		/*
 		 * XXX Trap Hypercall error code Linuxisms in
 		 *     the HYPERCALL layer.
 		 */
 		return (-rc);
 	}
 
 	rc = xen_intr_bind_isrc(&src, alloc_op.port, EVTCHN_TYPE_PORT,
 	    device_get_nameunit(dev), filter, handler, arg, flags,
 	    port_handlep);
 	if (rc != 0) {
 		/* Binding failed: give the freshly allocated port back. */
 		evtchn_close_t close_op = { .port = alloc_op.port };
 
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close_op))
 			panic("EVTCHNOP_close failed");
 		return (rc);
 	}
 
 	/* The port was opened here, so unbind has to close it. */
 	src->xi_close = 1;
 	return (0);
 }
 
 int 
 xen_intr_bind_remote_port(device_t dev, u_int remote_domain,
     u_int remote_port, driver_filter_t filter, driver_intr_t handler,
     void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep)
 {
 	struct evtchn_bind_interdomain bind_op;
 	struct xenisrc *src;
 	int rc;
 
 	/* Connect a new local port to the given remote domain/port. */
 	bind_op.remote_dom  = remote_domain;
 	bind_op.remote_port = remote_port;
 	rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
 					    &bind_op);
 	if (rc != 0) {
 		/*
 		 * XXX Trap Hypercall error code Linuxisms in
 		 *     the HYPERCALL layer.
 		 */
 		return (-rc);
 	}
 
 	rc = xen_intr_bind_isrc(&src, bind_op.local_port,
 	    EVTCHN_TYPE_PORT, device_get_nameunit(dev), filter, handler, arg,
 	    flags, port_handlep);
 	if (rc != 0) {
 		/* Binding failed: close the local port opened above. */
 		evtchn_close_t close_op = { .port = bind_op.local_port };
 
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close_op))
 			panic("EVTCHNOP_close failed");
 		return (rc);
 	}
 
 	/*
 	 * The local port was opened by the Event Channel API, which
 	 * makes it responsible for closing the port on unbind.
 	 */
 	src->xi_close = 1;
 	return (0);
 }
 
 /*
  * Bind a Xen virtual IRQ on the given CPU and attach a handler to it.
  *
  * \param dev           Device whose name labels the interrupt.
  * \param virq          The virtual IRQ number to bind.
  * \param cpu           The CPU on which the VIRQ is to be delivered.
  * \param filter        Interrupt filter, passed to xen_intr_bind_isrc().
  * \param handler       Interrupt handler, passed to xen_intr_bind_isrc().
  * \param arg           Argument for filter/handler.
  * \param flags         Interrupt type flags.
  * \param port_handlep  Where the opaque handle of the binding is stored.
  *
  * \returns  0 on success, the negated hypercall result if the VIRQ
  *           cannot be bound, or the error of the failing bind step.
  */
 int 
 xen_intr_bind_virq(device_t dev, u_int virq, u_int cpu,
     driver_filter_t filter, driver_intr_t handler, void *arg,
     enum intr_type flags, xen_intr_handle_t *port_handlep)
 {
 	int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
 	struct xenisrc *isrc;
 	struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id };
 	int error;
 
 	/* Ensure the target CPU is ready to handle evtchn interrupts. */
 	xen_intr_intrcnt_add(cpu);
 
 	isrc = NULL;
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq);
 	if (error != 0) {
 		/*
 		 * XXX Trap Hypercall error code Linuxisms in
 		 *     the HYPERCALL layer.
 		 */
 		return (-error);
 	}
 
 	error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ,
 	    device_get_nameunit(dev), filter, handler, arg, flags,
 	    port_handlep);
 
 #ifdef SMP
 	if (error == 0)
 		error = intr_event_bind(isrc->xi_intsrc.is_event, cpu);
 #endif
 
 	if (error != 0) {
 		evtchn_close_t close = { .port = bind_virq.port };
 
 		/*
 		 * xen_intr_unbind() takes a pointer to the handle, not
 		 * the handle itself; passing the handle would make it
 		 * dereference and overwrite the wrong memory.
 		 * NOTE(review): assumes xen_intr_bind_isrc() leaves
 		 * *port_handlep usable on failure -- confirm.
 		 */
 		xen_intr_unbind(port_handlep);
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
 			panic("EVTCHNOP_close failed");
 		return (error);
 	}
 
 #ifdef SMP
 	if (isrc->xi_cpu != cpu) {
 		/*
 		 * Too early in the boot process for the generic interrupt
 		 * code to perform the binding.  Update our event channel
 		 * masks manually so events can't fire on the wrong cpu
 		 * during AP startup.
 		 */
 		xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]);
 	}
 #endif
 
 	/*
 	 * The Event Channel API opened this port, so it is
 	 * responsible for closing it automatically on unbind.
 	 */
 	isrc->xi_close = 1;
 	isrc->xi_virq = virq;
 
 	return (0);
 }
 
 /*
  * Allocate an IPI event channel for the given CPU and attach a filter.
  *
  * \param cpu           The CPU that will receive the IPI.
  * \param filter        Interrupt filter run for the IPI.
  * \param flags         Interrupt type flags.
  * \param port_handlep  Where the opaque handle of the binding is stored.
  *
  * \returns  0 on success, the negated hypercall result if the IPI
  *           cannot be bound, the error from xen_intr_bind_isrc(), or
  *           EOPNOTSUPP on kernels built without SMP.
  */
 int
 xen_intr_alloc_and_bind_ipi(u_int cpu, driver_filter_t filter,
     enum intr_type flags, xen_intr_handle_t *port_handlep)
 {
 #ifdef SMP
 	int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
 	struct xenisrc *isrc;
 	struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id };
 	/* Same size as the one used by intr_handler->ih_name. */
 	char name[MAXCOMLEN + 1];
 	int error;
 
 	/* Ensure the target CPU is ready to handle evtchn interrupts. */
 	xen_intr_intrcnt_add(cpu);
 
 	isrc = NULL;
 	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi);
 	if (error != 0) {
 		/*
 		 * XXX Trap Hypercall error code Linuxisms in
 		 *     the HYPERCALL layer.
 		 */
 		return (-error);
 	}
 
 	snprintf(name, sizeof(name), "cpu%u", cpu);
 
 	error = xen_intr_bind_isrc(&isrc, bind_ipi.port, EVTCHN_TYPE_IPI,
 	    name, filter, NULL, NULL, flags, port_handlep);
 	if (error != 0) {
 		evtchn_close_t close = { .port = bind_ipi.port };
 
 		/*
 		 * xen_intr_unbind() takes a pointer to the handle, not
 		 * the handle itself; passing the handle would make it
 		 * dereference and overwrite the wrong memory.
 		 * NOTE(review): assumes xen_intr_bind_isrc() leaves
 		 * *port_handlep usable on failure -- confirm.
 		 */
 		xen_intr_unbind(port_handlep);
 		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
 			panic("EVTCHNOP_close failed");
 		return (error);
 	}
 
 	if (isrc->xi_cpu != cpu) {
 		/*
 		 * Too early in the boot process for the generic interrupt
 		 * code to perform the binding.  Update our event channel
 		 * masks manually so events can't fire on the wrong cpu
 		 * during AP startup.
 		 */
 		xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]);
 	}
 
 	/*
 	 * The Event Channel API opened this port, so it is
 	 * responsible for closing it automatically on unbind.
 	 */
 	isrc->xi_close = 1;
 	return (0);
 #else
 	return (EOPNOTSUPP);
 #endif
 }
 
 int
 xen_register_pirq(int vector, enum intr_trigger trig, enum intr_polarity pol)
 {
 	struct physdev_map_pirq map_pirq;
 	struct xenisrc *isrc;
 	int error;
 
 	if (vector == 0)
 		return (EINVAL);
 
 	if (bootverbose)
 		printf("xen: register IRQ#%d\n", vector);
 
 	map_pirq.domid = DOMID_SELF;
 	map_pirq.type = MAP_PIRQ_TYPE_GSI;
 	map_pirq.index = vector;
 	map_pirq.pirq = vector;
 
 	error = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_pirq);
 	if (error) {
 		printf("xen: unable to map IRQ#%d\n", vector);
 		return (error);
 	}
 
 	mtx_lock(&xen_intr_isrc_lock);
 	isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector);
 	mtx_unlock(&xen_intr_isrc_lock);
 	KASSERT((isrc != NULL), ("xen: unable to allocate isrc for interrupt"));
 	isrc->xi_pirq = vector;
 	isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0;
 	isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0;
 
 	return (0);
 }
 
 /*
  * Map MSI vectors of a PCI device as Xen PIRQs and allocate one
  * interrupt source per vector.
  *
  * \param dev     The PCI device requesting the MSI vectors.
  * \param vector  First interrupt vector to use for the new sources.
  * \param count   Number of MSI vectors requested.
  *
  * \returns  0 on success or the hypercall result if mapping fails.
  *           Panics if fewer vectors than requested were set up.
  */
 int
 xen_register_msi(device_t dev, int vector, int count)
 {
 	struct physdev_map_pirq map_op;
 	struct xenisrc *src;
 	int idx, rc;
 
 	memset(&map_op, 0, sizeof(map_op));
 	map_op.domid = DOMID_SELF;
 	if (count == 1)
 		map_op.type = MAP_PIRQ_TYPE_MSI_SEG;
 	else
 		map_op.type = MAP_PIRQ_TYPE_MULTI_MSI;
 	map_op.index = -1;
 	map_op.pirq = -1;
 	/* PCI domain goes in the bits above the bus number. */
 	map_op.bus = pci_get_bus(dev) | (pci_get_domain(dev) << 16);
 	map_op.devfn = (pci_get_slot(dev) << 3) | pci_get_function(dev);
 	map_op.entry_nr = count;
 
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_op);
 	if (rc != 0)
 		return (rc);
 	if (count != map_op.entry_nr) {
 		panic("unable to setup all requested MSI vectors "
 		    "(expected %d got %d)", count, map_op.entry_nr);
 	}
 
 	/* One source per vector, using consecutive PIRQ numbers. */
 	mtx_lock(&xen_intr_isrc_lock);
 	for (idx = 0; idx < count; idx++) {
 		src = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector + idx);
 		KASSERT(src != NULL,
 		    ("xen: unable to allocate isrc for interrupt"));
 		src->xi_pirq = map_op.pirq + idx;
 		/* MSI interrupts are always edge triggered */
 		src->xi_edgetrigger = 1;
 	}
 	mtx_unlock(&xen_intr_isrc_lock);
 
 	return (0);
 }
 
 int
 xen_release_msi(int vector)
 {
 	struct physdev_unmap_pirq unmap;
 	struct xenisrc *isrc;
 	int ret;
 
 	isrc = (struct xenisrc *)intr_lookup_source(vector);
 	if (isrc == NULL)
 		return (ENXIO);
 
 	unmap.pirq = isrc->xi_pirq;
 	ret = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap);
 	if (ret != 0)
 		return (ret);
 
 	xen_intr_release_isrc(isrc);
 
 	return (0);
 }
 
 int
 xen_intr_describe(xen_intr_handle_t port_handle, const char *fmt, ...)
 {
 	struct xenisrc *src;
 	char text[MAXCOMLEN + 1];
 	va_list args;
 
 	src = xen_intr_isrc(port_handle);
 	if (src == NULL)
 		return (EINVAL);
 
 	/* Format the description, truncated to the size of the buffer. */
 	va_start(args, fmt);
 	vsnprintf(text, sizeof(text), fmt, args);
 	va_end(args);
 
 	return (intr_describe(src->xi_vector, src->xi_cookie, text));
 }
 
 void
 xen_intr_unbind(xen_intr_handle_t *port_handlep)
 {
 	struct xenisrc *src;
 	int last_ref;
 
 	KASSERT(port_handlep != NULL,
 	    ("NULL xen_intr_handle_t passed to xen_intr_unbind"));
 
 	/* The caller's handle is always cleared, even if it was stale. */
 	src = xen_intr_isrc(*port_handlep);
 	*port_handlep = NULL;
 	if (src == NULL)
 		return;
 
 	/* Drop one reference; only the last holder tears the source down. */
 	mtx_lock(&xen_intr_isrc_lock);
 	last_ref = (refcount_release(&src->xi_refcount) != 0);
 	mtx_unlock(&xen_intr_isrc_lock);
 	if (!last_ref)
 		return;
 
 	if (src->xi_cookie != NULL)
 		intr_remove_handler(src->xi_cookie);
 	xen_intr_release_isrc(src);
 }
 
 void
 xen_intr_signal(xen_intr_handle_t handle)
 {
 	struct xenisrc *src;
 
 	/* Handles that do not resolve to a source are silently ignored. */
 	src = xen_intr_isrc(handle);
 	if (src == NULL)
 		return;
 
 	KASSERT(src->xi_type == EVTCHN_TYPE_PORT ||
 		src->xi_type == EVTCHN_TYPE_IPI,
 		("evtchn_signal on something other than a local port"));
 
 	/* The result of the send hypercall is deliberately discarded. */
 	struct evtchn_send send_op = { .port = src->xi_port };
 	(void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send_op);
 }
 
 evtchn_port_t
 xen_intr_port(xen_intr_handle_t handle)
 {
 	struct xenisrc *src = xen_intr_isrc(handle);
 
 	/* 0 is returned for handles that do not resolve to a source. */
 	if (src != NULL)
 		return (src->xi_port);
 
 	return (0);
 }
 
 int
 xen_intr_add_handler(const char *name, driver_filter_t filter,
     driver_intr_t handler, void *arg, enum intr_type flags,
     xen_intr_handle_t handle)
 {
 	struct xenisrc *isrc;
 	int error;
 
 	/*
 	 * Refuse handles that do not resolve to a source, and sources
 	 * that already hold a handler cookie.
 	 */
 	isrc = xen_intr_isrc(handle);
 	if (isrc == NULL || isrc->xi_cookie != NULL)
 		return (EINVAL);
 
 	/* INTR_EXCL is always ORed into the caller's flags. */
 	error = intr_add_handler(name, isrc->xi_vector,filter, handler, arg,
-	    flags|INTR_EXCL, &isrc->xi_cookie);
+	    flags|INTR_EXCL, &isrc->xi_cookie, 0);
 	if (error != 0) {
 		/* The failure is logged and then returned to the caller. */
 		printf(
 		    "%s: xen_intr_add_handler: intr_add_handler failed: %d\n",
 		    name, error);
 	}
 
 	return (error);
 }
 
 /*
  * Look up the interrupt source bound to an event channel port, take a
  * reference on it and hand back its opaque handle.
  *
  * \param port     The event channel port to look up.
  * \param handlep  Where the handle is stored on success.
  *
  * \returns  0 on success, or EINVAL if the port is invalid or out of
  *           range, handlep is NULL, or no source is bound to the port.
  */
 int
 xen_intr_get_evtchn_from_port(evtchn_port_t port, xen_intr_handle_t *handlep)
 {
 	struct xenisrc *isrc;
 
 	if (!is_valid_evtchn(port) || port >= NR_EVENT_CHANNELS)
 		return (EINVAL);
 
 	if (handlep == NULL) {
 		return (EINVAL);
 	}
 
 	/*
 	 * Read the table entry exactly once, under the lock, and use
 	 * that same pointer for both the reference and the handle.
 	 * Re-reading the table after the unlock could observe a
 	 * different (or NULL) entry than the one that was referenced.
 	 */
 	mtx_lock(&xen_intr_isrc_lock);
 	isrc = xen_intr_port_to_isrc[port];
 	if (isrc == NULL) {
 		mtx_unlock(&xen_intr_isrc_lock);
 		return (EINVAL);
 	}
 	refcount_acquire(&isrc->xi_refcount);
 	mtx_unlock(&xen_intr_isrc_lock);
 
 	/* Assign the opaque handler (the event channel port) */
 	*handlep = &isrc->xi_vector;
 
 	return (0);
 }
 
 #ifdef DDB
 /*
  * Translate an event channel type into a printable name.
  *
  * \param type  The event channel type.
  *
  * \returns  The table entry for the type, or "UNKNOWN" when the value
  *           is EVTCHN_TYPE_COUNT or greater.
  */
 static const char *
 xen_intr_print_type(enum evtchn_type type)
 {
 	static const char *type_names[EVTCHN_TYPE_COUNT] = {
 		[EVTCHN_TYPE_UNBOUND]	= "UNBOUND",
 		[EVTCHN_TYPE_PIRQ]	= "PIRQ",
 		[EVTCHN_TYPE_VIRQ]	= "VIRQ",
 		[EVTCHN_TYPE_IPI]	= "IPI",
 		[EVTCHN_TYPE_PORT]	= "PORT",
 	};
 
 	if (type < EVTCHN_TYPE_COUNT)
 		return (type_names[type]);
 
 	return ("UNKNOWN");
 }
 
 /*
  * Print the state of one event channel source on the debugger console:
  * port and type, PIRQ/VIRQ details where applicable, the global
  * mask/pending bits and the per-CPU enable bit of every CPU.
  *
  * \param isrc  The interrupt source to dump.
  */
 static void
 xen_intr_dump_port(struct xenisrc *isrc)
 {
 	shared_info_t *shinfo = HYPERVISOR_shared_info;
 	struct xen_intr_pcpu_data *pcpu;
 	int cpu;
 
 	db_printf("Port %d Type: %s\n",
 	    isrc->xi_port, xen_intr_print_type(isrc->xi_type));
 
 	/* Type-specific details. */
 	if (isrc->xi_type == EVTCHN_TYPE_PIRQ) {
 		db_printf("\tPirq: %d ActiveHi: %d EdgeTrigger: %d "
 		    "NeedsEOI: %d\n",
 		    isrc->xi_pirq, isrc->xi_activehi, isrc->xi_edgetrigger,
 		    !!xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map));
 	}
 	if (isrc->xi_type == EVTCHN_TYPE_VIRQ)
 		db_printf("\tVirq: %d\n", isrc->xi_virq);
 
 	/* Global state from the shared info page. */
 	db_printf("\tMasked: %d Pending: %d\n",
 	    !!xen_test_bit(isrc->xi_port, &shinfo->evtchn_mask[0]),
 	    !!xen_test_bit(isrc->xi_port, &shinfo->evtchn_pending[0]));
 
 	/* Whether each CPU has this port enabled in its private mask. */
 	db_printf("\tPer-CPU Masks: ");
 	CPU_FOREACH(cpu) {
 		pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu);
 		db_printf("cpu#%d: %d ", cpu,
 		    !!xen_test_bit(isrc->xi_port, pcpu->evtchn_enabled));
 	}
 	db_printf("\n");
 }
 
 /*
  * Debugger command handler that dumps every event channel port which
  * has an interrupt source bound to it.  Prints a notice and does
  * nothing else when not running as a Xen domain.
  */
 DB_SHOW_COMMAND(xen_evtchn, db_show_xen_evtchn)
 {
 	struct xenisrc *isrc;
 	int port;
 
 	if (!xen_domain()) {
 		db_printf("Only available on Xen guests\n");
 		return;
 	}
 
 	for (port = 0; port < NR_EVENT_CHANNELS; port++) {
 		isrc = xen_intr_port_to_isrc[port];
 		if (isrc != NULL)
 			xen_intr_dump_port(isrc);
 	}
 }
 #endif /* DDB */