Index: user/jeff/numa/sys/amd64/include/intr_machdep.h =================================================================== --- user/jeff/numa/sys/amd64/include/intr_machdep.h (revision 329848) +++ user/jeff/numa/sys/amd64/include/intr_machdep.h (revision 329849) @@ -1,198 +1,199 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_INTR_MACHDEP_H__ #define __MACHINE_INTR_MACHDEP_H__ #ifdef _KERNEL /* * The maximum number of I/O interrupts we allow. This number is rather * arbitrary as it is just the maximum IRQ resource value. The interrupt * source for a given IRQ maps that I/O interrupt to device interrupt * source whether it be a pin on an interrupt controller or an MSI interrupt. * The 16 ISA IRQs are assigned fixed IDT vectors, but all other device * interrupts allocate IDT vectors on demand. Currently we have 191 IDT * vectors available for device interrupts. On many systems with I/O APICs, * a lot of the IRQs are not used, so this number can be much larger than * 191 and still be safe since only interrupt sources in actual use will * allocate IDT vectors. * * The first 255 IRQs (0 - 254) are reserved for ISA IRQs and PCI intline IRQs. * IRQ values from 256 to 767 are used by MSI. When running under the Xen * Hypervisor, IRQ values from 768 to 4863 are available for binding to * event channel events. We leave 255 unused to avoid confusion since 255 is * used in PCI to indicate an invalid IRQ. */ #define NUM_MSI_INTS 512 #define FIRST_MSI_INT 256 #ifdef XENHVM #include #include #define NUM_EVTCHN_INTS NR_EVENT_CHANNELS #define FIRST_EVTCHN_INT \ (FIRST_MSI_INT + NUM_MSI_INTS) #define LAST_EVTCHN_INT \ (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1) #else #define NUM_EVTCHN_INTS 0 #endif #define NUM_IO_INTS (FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS) /* * Default base address for MSI messages on x86 platforms. */ #define MSI_INTEL_ADDR_BASE 0xfee00000 /* * - 1 ??? dummy counter. * - 2 counters for each I/O interrupt. * - 1 counter for each CPU for lapic timer. * - 8 counters for each CPU for IPI counters for SMP. 
*/ #ifdef SMP #define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + (1 + 8) * MAXCPU) #else #define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + 1) #endif #ifndef LOCORE typedef void inthand_t(void); #define IDTVEC(name) __CONCAT(X,name) struct intsrc; /* * Methods that a PIC provides to mask/unmask a given interrupt source, * "turn on" the interrupt on the CPU side by setting up an IDT entry, and * return the vector associated with this source. */ struct pic { void (*pic_enable_source)(struct intsrc *); void (*pic_disable_source)(struct intsrc *, int); void (*pic_eoi_source)(struct intsrc *); void (*pic_enable_intr)(struct intsrc *); void (*pic_disable_intr)(struct intsrc *); int (*pic_vector)(struct intsrc *); int (*pic_source_pending)(struct intsrc *); void (*pic_suspend)(struct pic *); void (*pic_resume)(struct pic *, bool suspend_cancelled); int (*pic_config_intr)(struct intsrc *, enum intr_trigger, enum intr_polarity); int (*pic_assign_cpu)(struct intsrc *, u_int apic_id); void (*pic_reprogram_pin)(struct intsrc *); TAILQ_ENTRY(pic) pics; }; /* Flags for pic_disable_source() */ enum { PIC_EOI, PIC_NO_EOI, }; /* * An interrupt source. The upper-layer code uses the PIC methods to * control a given source. The lower-layer PIC drivers can store additional * private data in a given interrupt source such as an interrupt pin number * or an I/O APIC pointer. */ struct intsrc { struct pic *is_pic; struct intr_event *is_event; u_long *is_count; u_long *is_straycount; u_int is_index; u_int is_handlers; + u_int is_domain; u_int is_cpu; }; struct trapframe; /* * The following data structure holds per-cpu data, and is placed just * above the top of the space used for the NMI and MC# stacks. */ struct nmi_pcpu { register_t np_pcpu; register_t __padding; /* pad to 16 bytes */ }; #ifdef SMP extern cpuset_t intr_cpus; #endif extern struct mtx icu_lock; extern int elcr_found; #ifdef SMP extern int msix_disable_migration; #endif #ifndef DEV_ATPIC void atpic_reset(void); #endif /* XXX: The elcr_* prototypes probably belong somewhere else. 
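[Editor's note] The header above carves the IRQ number space into fixed ranges: ISA and PCI intline IRQs below 255, MSI starting at FIRST_MSI_INT, and (under XENHVM) Xen event channels above the MSI range, with 255 left unused. A minimal, hedged sketch of how a consumer could classify an IRQ against those constants; the helper name is hypothetical and not part of this change.

static inline const char *
example_irq_range(int irq)
{
	if (irq >= 0 && irq < 255)
		return ("ISA/PCI intline");
	if (irq >= FIRST_MSI_INT && irq < FIRST_MSI_INT + NUM_MSI_INTS)
		return ("MSI");
#ifdef XENHVM
	if (irq >= FIRST_EVTCHN_INT && irq <= LAST_EVTCHN_INT)
		return ("Xen event channel");
#endif
	return ("unused (255) or out of range");
}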
*/ int elcr_probe(void); enum intr_trigger elcr_read_trigger(u_int irq); void elcr_resume(void); void elcr_write_trigger(u_int irq, enum intr_trigger trigger); #ifdef SMP void intr_add_cpu(u_int cpu); #endif int intr_add_handler(const char *name, int vector, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, - void **cookiep); + void **cookiep, int domain); #ifdef SMP int intr_bind(u_int vector, u_char cpu); #endif int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol); int intr_describe(u_int vector, void *ih, const char *descr); void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame); -u_int intr_next_cpu(void); +u_int intr_next_cpu(int domain); struct intsrc *intr_lookup_source(int vector); int intr_register_pic(struct pic *pic); int intr_register_source(struct intsrc *isrc); int intr_remove_handler(void *cookie); void intr_resume(bool suspend_cancelled); void intr_suspend(void); void intr_reprogram(void); void intrcnt_add(const char *name, u_long **countp); void nexus_add_irq(u_long irq); int msi_alloc(device_t dev, int count, int maxcount, int *irqs); void msi_init(void); int msi_map(int irq, uint64_t *addr, uint32_t *data); int msi_release(int *irqs, int count); int msix_alloc(device_t dev, int *irq); int msix_release(int irq); #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* !__MACHINE_INTR_MACHDEP_H__ */ Index: user/jeff/numa/sys/i386/include/intr_machdep.h =================================================================== --- user/jeff/numa/sys/i386/include/intr_machdep.h (revision 329848) +++ user/jeff/numa/sys/i386/include/intr_machdep.h (revision 329849) @@ -1,188 +1,190 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_INTR_MACHDEP_H__ #define __MACHINE_INTR_MACHDEP_H__ #ifdef _KERNEL /* * The maximum number of I/O interrupts we allow. This number is rather * arbitrary as it is just the maximum IRQ resource value. The interrupt * source for a given IRQ maps that I/O interrupt to device interrupt * source whether it be a pin on an interrupt controller or an MSI interrupt. 
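[Editor's note] The prototypes changed above extend the MD intr_add_handler() with a trailing NUMA domain argument and make intr_next_cpu() domain-aware. A minimal sketch of a caller passing the domain through, assuming the usual sys/bus.h types; the wrapper name and the INTR_TYPE_MISC choice are placeholders, not part of the diff.

static int
example_register(const char *name, int vector, driver_filter_t filt,
    driver_intr_t hand, void *arg, int domain)
{
	void *cookie;

	/* The new trailing argument names the device's NUMA domain. */
	return (intr_add_handler(name, vector, filt, hand, arg,
	    INTR_TYPE_MISC, &cookie, domain));
}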
* The 16 ISA IRQs are assigned fixed IDT vectors, but all other device * interrupts allocate IDT vectors on demand. Currently we have 191 IDT * vectors available for device interrupts. On many systems with I/O APICs, * a lot of the IRQs are not used, so this number can be much larger than * 191 and still be safe since only interrupt sources in actual use will * allocate IDT vectors. * * The first 255 IRQs (0 - 254) are reserved for ISA IRQs and PCI intline IRQs. * IRQ values from 256 to 767 are used by MSI. When running under the Xen * Hypervisor, IRQ values from 768 to 4863 are available for binding to * event channel events. We leave 255 unused to avoid confusion since 255 is * used in PCI to indicate an invalid IRQ. */ #define NUM_MSI_INTS 512 #define FIRST_MSI_INT 256 #ifdef XENHVM #include #include #define NUM_EVTCHN_INTS NR_EVENT_CHANNELS #define FIRST_EVTCHN_INT \ (FIRST_MSI_INT + NUM_MSI_INTS) #define LAST_EVTCHN_INT \ (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1) #else /* !XENHVM */ #define NUM_EVTCHN_INTS 0 #endif #define NUM_IO_INTS (FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS) /* * Default base address for MSI messages on x86 platforms. */ #define MSI_INTEL_ADDR_BASE 0xfee00000 /* * - 1 ??? dummy counter. * - 2 counters for each I/O interrupt. * - 1 counter for each CPU for lapic timer. * - 9 counters for each CPU for IPI counters for SMP. */ #ifdef SMP #define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + (1 + 9) * MAXCPU) #else #define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + 1) #endif #ifndef LOCORE typedef void inthand_t(void); #define IDTVEC(name) __CONCAT(X,name) struct intsrc; /* * Methods that a PIC provides to mask/unmask a given interrupt source, * "turn on" the interrupt on the CPU side by setting up an IDT entry, and * return the vector associated with this source. */ struct pic { void (*pic_enable_source)(struct intsrc *); void (*pic_disable_source)(struct intsrc *, int); void (*pic_eoi_source)(struct intsrc *); void (*pic_enable_intr)(struct intsrc *); void (*pic_disable_intr)(struct intsrc *); int (*pic_vector)(struct intsrc *); int (*pic_source_pending)(struct intsrc *); void (*pic_suspend)(struct pic *); void (*pic_resume)(struct pic *, bool suspend_cancelled); int (*pic_config_intr)(struct intsrc *, enum intr_trigger, enum intr_polarity); int (*pic_assign_cpu)(struct intsrc *, u_int apic_id); void (*pic_reprogram_pin)(struct intsrc *); TAILQ_ENTRY(pic) pics; }; /* Flags for pic_disable_source() */ enum { PIC_EOI, PIC_NO_EOI, }; /* * An interrupt source. The upper-layer code uses the PIC methods to * control a given source. The lower-layer PIC drivers can store additional * private data in a given interrupt source such as an interrupt pin number * or an I/O APIC pointer. */ struct intsrc { struct pic *is_pic; struct intr_event *is_event; u_long *is_count; u_long *is_straycount; u_int is_index; u_int is_handlers; + u_int is_domain; u_int is_cpu; }; struct trapframe; #ifdef SMP extern cpuset_t intr_cpus; #endif extern struct mtx icu_lock; extern int elcr_found; #ifdef SMP extern int msix_disable_migration; #endif #ifndef DEV_ATPIC void atpic_reset(void); #endif /* XXX: The elcr_* prototypes probably belong somewhere else. 
*/ int elcr_probe(void); enum intr_trigger elcr_read_trigger(u_int irq); void elcr_resume(void); void elcr_write_trigger(u_int irq, enum intr_trigger trigger); #ifdef SMP void intr_add_cpu(u_int cpu); #endif int intr_add_handler(const char *name, int vector, driver_filter_t filter, - driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep); + driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep, + int domain); #ifdef SMP int intr_bind(u_int vector, u_char cpu); #endif int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol); int intr_describe(u_int vector, void *ih, const char *descr); void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame); -u_int intr_next_cpu(void); +u_int intr_next_cpu(int domain); struct intsrc *intr_lookup_source(int vector); int intr_register_pic(struct pic *pic); int intr_register_source(struct intsrc *isrc); int intr_remove_handler(void *cookie); void intr_resume(bool suspend_cancelled); void intr_suspend(void); void intr_reprogram(void); void intrcnt_add(const char *name, u_long **countp); void nexus_add_irq(u_long irq); int msi_alloc(device_t dev, int count, int maxcount, int *irqs); void msi_init(void); int msi_map(int irq, uint64_t *addr, uint32_t *data); int msi_release(int* irqs, int count); int msix_alloc(device_t dev, int *irq); int msix_release(int irq); #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* !__MACHINE_INTR_MACHDEP_H__ */ Index: user/jeff/numa/sys/sys/smp.h =================================================================== --- user/jeff/numa/sys/sys/smp.h (revision 329848) +++ user/jeff/numa/sys/sys/smp.h (revision 329849) @@ -1,277 +1,278 @@ /*- * SPDX-License-Identifier: Beerware * * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ */ #ifndef _SYS_SMP_H_ #define _SYS_SMP_H_ #ifdef _KERNEL #ifndef LOCORE #include #include /* * Types of nodes in the topological tree. */ typedef enum { /* No node has this type; can be used in topo API calls. */ TOPO_TYPE_DUMMY, /* Processing unit aka computing unit aka logical CPU. */ TOPO_TYPE_PU, /* Physical subdivision of a package. */ TOPO_TYPE_CORE, /* CPU L1/L2/L3 cache. */ TOPO_TYPE_CACHE, /* Package aka chip, equivalent to socket. */ TOPO_TYPE_PKG, /* NUMA node. */ TOPO_TYPE_NODE, /* Other logical or physical grouping of PUs. */ /* E.g. PUs on the same dye, or PUs sharing an FPU. */ TOPO_TYPE_GROUP, /* The whole system. */ TOPO_TYPE_SYSTEM } topo_node_type; /* Hardware indenitifier of a topology component. */ typedef unsigned int hwid_t; /* Logical CPU idenitifier. */ typedef int cpuid_t; /* A node in the topology. */ struct topo_node { struct topo_node *parent; TAILQ_HEAD(topo_children, topo_node) children; TAILQ_ENTRY(topo_node) siblings; cpuset_t cpuset; topo_node_type type; uintptr_t subtype; hwid_t hwid; cpuid_t id; int nchildren; int cpu_count; }; /* * Scheduling topology of a NUMA or SMP system. * * The top level topology is an array of pointers to groups. Each group * contains a bitmask of cpus in its group or subgroups. It may also * contain a pointer to an array of child groups. 
* * The bitmasks at non leaf groups may be used by consumers who support * a smaller depth than the hardware provides. * * The topology may be omitted by systems where all CPUs are equal. */ struct cpu_group { struct cpu_group *cg_parent; /* Our parent group. */ struct cpu_group *cg_child; /* Optional children groups. */ cpuset_t cg_mask; /* Mask of cpus in this group. */ int32_t cg_count; /* Count of cpus in this group. */ int16_t cg_children; /* Number of children groups. */ int8_t cg_level; /* Shared cache level. */ int8_t cg_flags; /* Traversal modifiers. */ }; typedef struct cpu_group *cpu_group_t; /* * Defines common resources for CPUs in the group. The highest level * resource should be used when multiple are shared. */ #define CG_SHARE_NONE 0 #define CG_SHARE_L1 1 #define CG_SHARE_L2 2 #define CG_SHARE_L3 3 #define MAX_CACHE_LEVELS CG_SHARE_L3 /* * Behavior modifiers for load balancing and affinity. */ #define CG_FLAG_HTT 0x01 /* Schedule the alternate core last. */ #define CG_FLAG_SMT 0x02 /* New age htt, less crippled. */ #define CG_FLAG_THREAD (CG_FLAG_HTT | CG_FLAG_SMT) /* Any threading. */ /* * Convenience routines for building and traversing topologies. */ #ifdef SMP void topo_init_node(struct topo_node *node); void topo_init_root(struct topo_node *root); struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype); struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype); void topo_promote_child(struct topo_node *child); struct topo_node * topo_next_node(struct topo_node *top, struct topo_node *node); struct topo_node * topo_next_nonchild_node(struct topo_node *top, struct topo_node *node); void topo_set_pu_id(struct topo_node *node, cpuid_t id); enum topo_level { TOPO_LEVEL_PKG = 0, /* * Some systems have useful sub-package core organizations. On these, * a package has one or more subgroups. Each subgroup contains one or * more cache groups (cores that share a last level cache). */ TOPO_LEVEL_GROUP, TOPO_LEVEL_CACHEGROUP, TOPO_LEVEL_CORE, TOPO_LEVEL_THREAD, TOPO_LEVEL_COUNT /* Must be last */ }; struct topo_analysis { int entities[TOPO_LEVEL_COUNT]; }; int topo_analyze(struct topo_node *topo_root, int all, struct topo_analysis *results); #define TOPO_FOREACH(i, root) \ for (i = root; i != NULL; i = topo_next_node(root, i)) struct cpu_group *smp_topo(void); struct cpu_group *smp_topo_alloc(u_int count); struct cpu_group *smp_topo_none(void); struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_find(struct cpu_group *top, int cpu); extern void (*cpustop_restartfunc)(void); extern int smp_cpus; /* The suspend/resume cpusets are x86 only, but minimize ifdefs. 
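[Editor's note] As a quick illustration of the traversal helpers declared above, a hedged sketch that walks the topology tree with TOPO_FOREACH() and counts processing units; the function name is hypothetical.

static int
example_count_pus(struct topo_node *root)
{
	struct topo_node *node;
	int pus;

	pus = 0;
	TOPO_FOREACH(node, root)
		if (node->type == TOPO_TYPE_PU)
			pus++;
	return (pus);
}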
*/ extern volatile cpuset_t resuming_cpus; /* woken up cpus in suspend pen */ extern volatile cpuset_t started_cpus; /* cpus to let out of stop pen */ extern volatile cpuset_t stopped_cpus; /* cpus in stop pen */ extern volatile cpuset_t suspended_cpus; /* cpus [near] sleeping in susp pen */ extern volatile cpuset_t toresume_cpus; /* cpus to let out of suspend pen */ extern cpuset_t hlt_cpus_mask; /* XXX 'mask' is detail in old impl */ extern cpuset_t logical_cpus_mask; #endif /* SMP */ extern u_int mp_maxid; extern int mp_maxcpus; extern int mp_ncpus; extern volatile int smp_started; +extern int vm_ndomains; extern cpuset_t all_cpus; extern cpuset_t cpuset_domain[MAXMEMDOM]; /* CPUs in each NUMA domain. */ /* * Macro allowing us to determine whether a CPU is absent at any given * time, thus permitting us to configure sparse maps of cpuid-dependent * (per-CPU) structures. */ #define CPU_ABSENT(x_cpu) (!CPU_ISSET(x_cpu, &all_cpus)) /* * Macros to iterate over non-absent CPUs. CPU_FOREACH() takes an * integer iterator and iterates over the available set of CPUs. * CPU_FIRST() returns the id of the first non-absent CPU. CPU_NEXT() * returns the id of the next non-absent CPU. It will wrap back to * CPU_FIRST() once the end of the list is reached. The iterators are * currently implemented via inline functions. */ #define CPU_FOREACH(i) \ for ((i) = 0; (i) <= mp_maxid; (i)++) \ if (!CPU_ABSENT((i))) static __inline int cpu_first(void) { int i; for (i = 0;; i++) if (!CPU_ABSENT(i)) return (i); } static __inline int cpu_next(int i) { for (;;) { i++; if (i > mp_maxid) i = 0; if (!CPU_ABSENT(i)) return (i); } } #define CPU_FIRST() cpu_first() #define CPU_NEXT(i) cpu_next((i)) #ifdef SMP /* * Machine dependent functions used to initialize MP support. * * The cpu_mp_probe() should check to see if MP support is present and return * zero if it is not or non-zero if it is. If MP support is present, then * cpu_mp_start() will be called so that MP can be enabled. This function * should do things such as startup secondary processors. It should also * setup mp_ncpus, all_cpus, and smp_cpus. It should also ensure that * smp_started is initialized at the appropriate time. * Once cpu_mp_start() returns, machine independent MP startup code will be * executed and a simple message will be output to the console. Finally, * cpu_mp_announce() will be called so that machine dependent messages about * the MP support may be output to the console if desired. * * The cpu_setmaxid() function is called very early during the boot process * so that the MD code may set mp_maxid to provide an upper bound on CPU IDs * that other subsystems may use. If a platform is not able to determine * the exact maximum ID that early, then it may set mp_maxid to MAXCPU - 1. 
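[Editor's note] The additions above export vm_ndomains alongside cpuset_domain[], so callers can reason about which CPUs belong to each NUMA domain. A minimal sketch, assuming a kernel context where these symbols are visible; the function name is hypothetical.

static void
example_count_domain_cpus(void)
{
	int cpu, dom, count;

	for (dom = 0; dom < vm_ndomains; dom++) {
		count = 0;
		CPU_FOREACH(cpu)
			if (CPU_ISSET(cpu, &cpuset_domain[dom]))
				count++;
		printf("domain %d: %d CPUs\n", dom, count);
	}
}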
*/ struct thread; struct cpu_group *cpu_topo(void); void cpu_mp_announce(void); int cpu_mp_probe(void); void cpu_mp_setmaxid(void); void cpu_mp_start(void); void forward_signal(struct thread *); int restart_cpus(cpuset_t); int stop_cpus(cpuset_t); int stop_cpus_hard(cpuset_t); #if defined(__amd64__) || defined(__i386__) int suspend_cpus(cpuset_t); int resume_cpus(cpuset_t); #endif void smp_rendezvous_action(void); extern struct mtx smp_ipi_mtx; #endif /* SMP */ int quiesce_all_cpus(const char *, int); int quiesce_cpus(cpuset_t, const char *, int); void smp_no_rendezvous_barrier(void *); void smp_rendezvous(void (*)(void *), void (*)(void *), void (*)(void *), void *arg); void smp_rendezvous_cpus(cpuset_t, void (*)(void *), void (*)(void *), void (*)(void *), void *arg); #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* _SYS_SMP_H_ */ Index: user/jeff/numa/sys/vm/vm_phys.h =================================================================== --- user/jeff/numa/sys/vm/vm_phys.h (revision 329848) +++ user/jeff/numa/sys/vm/vm_phys.h (revision 329849) @@ -1,122 +1,121 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Physical memory system definitions */ #ifndef _VM_PHYS_H_ #define _VM_PHYS_H_ #ifdef _KERNEL /* Domains must be dense (non-sparse) and zero-based. */ struct mem_affinity { vm_paddr_t start; vm_paddr_t end; int domain; }; #ifdef NUMA extern struct mem_affinity *mem_affinity; extern int *mem_locality; #endif -extern int vm_ndomains; struct vm_freelist { struct pglist pl; int lcnt; }; struct vm_phys_seg { vm_paddr_t start; vm_paddr_t end; vm_page_t first_page; int domain; struct vm_freelist (*free_queues)[VM_NFREEPOOL][VM_NFREEORDER]; }; extern struct vm_phys_seg vm_phys_segs[]; extern int vm_phys_nsegs; /* * The following functions are only to be used by the virtual memory system. 
*/ void vm_phys_add_seg(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_alloc_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary); vm_page_t vm_phys_alloc_freelist_pages(int domain, int freelist, int pool, int order); vm_page_t vm_phys_alloc_pages(int domain, int pool, int order); int vm_phys_alloc_npages(int domain, int pool, vm_page_t *m, int cnt); int vm_phys_domain_match(int prefer, vm_paddr_t low, vm_paddr_t high); int vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end, vm_memattr_t memattr); void vm_phys_fictitious_unreg_range(vm_paddr_t start, vm_paddr_t end); vm_page_t vm_phys_fictitious_to_vm_page(vm_paddr_t pa); void vm_phys_free_contig(vm_page_t m, u_long npages); void vm_phys_free_pages(vm_page_t m, int order); void vm_phys_init(void); vm_page_t vm_phys_paddr_to_vm_page(vm_paddr_t pa); vm_page_t vm_phys_scan_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, int options); void vm_phys_set_pool(int pool, vm_page_t m, int order); boolean_t vm_phys_unfree_page(vm_page_t m); int vm_phys_mem_affinity(int f, int t); /* * * vm_phys_domain: * * Return the index of the domain the page belongs to. */ static inline int vm_phys_domain(vm_page_t m) { #ifdef NUMA int domn, segind; /* XXXKIB try to assert that the page is managed */ segind = m->segind; KASSERT(segind < vm_phys_nsegs, ("segind %d m %p", segind, m)); domn = vm_phys_segs[segind].domain; KASSERT(domn < vm_ndomains, ("domain %d m %p", domn, m)); return (domn); #else return (0); #endif } #endif /* _KERNEL */ #endif /* !_VM_PHYS_H_ */ Index: user/jeff/numa/sys/vm/vm_reserv.c =================================================================== --- user/jeff/numa/sys/vm/vm_reserv.c (revision 329848) +++ user/jeff/numa/sys/vm/vm_reserv.c (revision 329849) @@ -1,1367 +1,1368 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2006 Rice University * Copyright (c) 2007-2011 Alan L. Cox * All rights reserved. * * This software was developed for the FreeBSD Project by Alan L. Cox, * Olivier Crameri, Peter Druschel, Sitaram Iyer, and Juan Navarro. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/ /* * Superpage reservation management module * * Any external functions defined by this module are only to be used by the * virtual memory system. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include /* * The reservation system supports the speculative allocation of large physical * pages ("superpages"). Speculative allocation enables the fully automatic * utilization of superpages by the virtual memory system. In other words, no * programmatic directives are required to use superpages. */ #if VM_NRESERVLEVEL > 0 /* * The number of small pages that are contained in a level 0 reservation */ #define VM_LEVEL_0_NPAGES (1 << VM_LEVEL_0_ORDER) /* * The number of bits by which a physical address is shifted to obtain the * reservation number */ #define VM_LEVEL_0_SHIFT (VM_LEVEL_0_ORDER + PAGE_SHIFT) /* * The size of a level 0 reservation in bytes */ #define VM_LEVEL_0_SIZE (1 << VM_LEVEL_0_SHIFT) /* * Computes the index of the small page underlying the given (object, pindex) * within the reservation's array of small pages. */ #define VM_RESERV_INDEX(object, pindex) \ (((object)->pg_color + (pindex)) & (VM_LEVEL_0_NPAGES - 1)) /* * The size of a population map entry */ typedef u_long popmap_t; /* * The number of bits in a population map entry */ #define NBPOPMAP (NBBY * sizeof(popmap_t)) /* * The number of population map entries in a reservation */ #define NPOPMAP howmany(VM_LEVEL_0_NPAGES, NBPOPMAP) /* * Clear a bit in the population map. */ static __inline void popmap_clear(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] &= ~(1UL << (i % NBPOPMAP)); } /* * Set a bit in the population map. */ static __inline void popmap_set(popmap_t popmap[], int i) { popmap[i / NBPOPMAP] |= 1UL << (i % NBPOPMAP); } /* * Is a bit in the population map clear? */ static __inline boolean_t popmap_is_clear(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) == 0); } /* * Is a bit in the population map set? */ static __inline boolean_t popmap_is_set(popmap_t popmap[], int i) { return ((popmap[i / NBPOPMAP] & (1UL << (i % NBPOPMAP))) != 0); } /* * The reservation structure * * A reservation structure is constructed whenever a large physical page is * speculatively allocated to an object. The reservation provides the small * physical pages for the range [pindex, pindex + VM_LEVEL_0_NPAGES) of offsets * within that object. The reservation's "popcnt" tracks the number of these * small physical pages that are in use at any given time. When and if the * reservation is not fully utilized, it appears in the queue of partially * populated reservations. The reservation always appears on the containing * object's list of reservations. * * A partially populated reservation can be broken and reclaimed at any time. * * f - vm_domain_free_lock * o - vm_reserv_object_lock * c - constant after boot */ struct vm_reserv { TAILQ_ENTRY(vm_reserv) partpopq; /* (f) per-domain queue. */ LIST_ENTRY(vm_reserv) objq; /* (o, f) object queue */ vm_object_t object; /* (o, f) containing object */ vm_pindex_t pindex; /* (o, f) offset in object */ vm_page_t pages; /* (c) first page */ int domain; /* (c) NUMA domain. 
*/ int popcnt; /* (f) # of pages in use */ char inpartpopq; /* (f) */ popmap_t popmap[NPOPMAP]; /* (f) bit vector, used pages */ }; /* * The reservation array * * This array is analoguous in function to vm_page_array. It differs in the * respect that it may contain a greater number of useful reservation * structures than there are (physical) superpages. These "invalid" * reservation structures exist to trade-off space for time in the * implementation of vm_reserv_from_page(). Invalid reservation structures are * distinguishable from "valid" reservation structures by inspecting the * reservation's "pages" field. Invalid reservation structures have a NULL * "pages" field. * * vm_reserv_from_page() maps a small (physical) page to an element of this * array by computing a physical reservation number from the page's physical * address. The physical reservation number is used as the array index. * * An "active" reservation is a valid reservation structure that has a non-NULL * "object" field and a non-zero "popcnt" field. In other words, every active * reservation belongs to a particular object. Moreover, every active * reservation has an entry in the containing object's list of reservations. */ static vm_reserv_t vm_reserv_array; /* * The partially populated reservation queue * * This queue enables the fast recovery of an unused free small page from a * partially populated reservation. The reservation at the head of this queue * is the least recently changed, partially populated reservation. * * Access to this queue is synchronized by the free page queue lock. */ static TAILQ_HEAD(, vm_reserv) vm_rvq_partpop[MAXMEMDOM]; static SYSCTL_NODE(_vm, OID_AUTO, reserv, CTLFLAG_RD, 0, "Reservation Info"); static long vm_reserv_broken; SYSCTL_LONG(_vm_reserv, OID_AUTO, broken, CTLFLAG_RD, &vm_reserv_broken, 0, "Cumulative number of broken reservations"); static long vm_reserv_freed; SYSCTL_LONG(_vm_reserv, OID_AUTO, freed, CTLFLAG_RD, &vm_reserv_freed, 0, "Cumulative number of freed reservations"); static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vm_reserv, OID_AUTO, fullpop, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vm_reserv_fullpop, "I", "Current number of full reservations"); static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS); SYSCTL_OID(_vm_reserv, OID_AUTO, partpopq, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, sysctl_vm_reserv_partpopq, "A", "Partially populated reservation queues"); static long vm_reserv_reclaimed; SYSCTL_LONG(_vm_reserv, OID_AUTO, reclaimed, CTLFLAG_RD, &vm_reserv_reclaimed, 0, "Cumulative number of reclaimed reservations"); /* * The object lock pool is used to synchronize the rvq. We can not use a * pool mutex because it is required before malloc works. * * The "hash" function could be made faster without divide and modulo. 
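[Editor's note] The comment above observes that the object-lock "hash" could avoid the divide and modulo. One way to read that hint is a masked index with a power-of-two pool size; the sketch below is illustrative only and not part of this change (the name and size are hypothetical).

#define	EXAMPLE_OBJ_LOCK_COUNT	64	/* hypothetical power-of-two pool size */
#define	example_obj_lock_idx(object)					\
	((((uintptr_t)(object)) / sizeof(*(object))) &			\
	    (EXAMPLE_OBJ_LOCK_COUNT - 1))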
*/ #define VM_RESERV_OBJ_LOCK_COUNT MAXCPU struct mtx_padalign vm_reserv_object_mtx[VM_RESERV_OBJ_LOCK_COUNT]; #define vm_reserv_object_lock_idx(object) \ (((uintptr_t)object / sizeof(*object)) % VM_RESERV_OBJ_LOCK_COUNT) #define vm_reserv_object_lock_ptr(object) \ &vm_reserv_object_mtx[vm_reserv_object_lock_idx((object))] #define vm_reserv_object_lock(object) \ mtx_lock(vm_reserv_object_lock_ptr((object))) #define vm_reserv_object_unlock(object) \ mtx_unlock(vm_reserv_object_lock_ptr((object))) static void vm_reserv_break(vm_reserv_t rv); static void vm_reserv_depopulate(vm_reserv_t rv, int index); static vm_reserv_t vm_reserv_from_page(vm_page_t m); static boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex); static void vm_reserv_populate(vm_reserv_t rv, int index); static void vm_reserv_reclaim(vm_reserv_t rv); /* * Returns the current number of full reservations. * * Since the number of full reservations is computed without acquiring the * free page queue lock, the returned value may be inexact. */ static int sysctl_vm_reserv_fullpop(SYSCTL_HANDLER_ARGS) { vm_paddr_t paddr; struct vm_phys_seg *seg; vm_reserv_t rv; int fullpop, segind; fullpop = 0; for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { rv = &vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT]; fullpop += rv->popcnt == VM_LEVEL_0_NPAGES; paddr += VM_LEVEL_0_SIZE; } } return (sysctl_handle_int(oidp, &fullpop, 0, req)); } /* * Describes the current state of the partially populated reservation queue. */ static int sysctl_vm_reserv_partpopq(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; vm_reserv_t rv; int counter, error, domain, level, unused_pages; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sbuf_printf(&sbuf, "\nDOMAIN LEVEL SIZE NUMBER\n\n"); for (domain = 0; domain < vm_ndomains; domain++) { for (level = -1; level <= VM_NRESERVLEVEL - 2; level++) { counter = 0; unused_pages = 0; vm_domain_free_lock(VM_DOMAIN(domain)); TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) { counter++; unused_pages += VM_LEVEL_0_NPAGES - rv->popcnt; } vm_domain_free_unlock(VM_DOMAIN(domain)); sbuf_printf(&sbuf, "%6d, %7d, %6dK, %6d\n", domain, level, unused_pages * ((int)PAGE_SIZE / 1024), counter); } } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } /* * Remove a reservation from the object's objq. */ static void vm_reserv_remove(vm_reserv_t rv) { vm_object_t object; KASSERT(rv->object != NULL, ("vm_reserv_remove: reserv %p is free", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_remove: reserv %p's inpartpopq is TRUE", rv)); object = rv->object; vm_reserv_object_lock(object); LIST_REMOVE(rv, objq); rv->object = NULL; vm_reserv_object_unlock(object); } /* * Insert a new reservation into the object's objq. 
*/ static void vm_reserv_insert(vm_reserv_t rv, vm_object_t object, vm_pindex_t pindex) { int i; KASSERT(rv->object == NULL, ("vm_reserv_insert: reserv %p isn't free", rv)); KASSERT(rv->popcnt == 0, ("vm_reserv_insert: reserv %p's popcnt is corrupted", rv)); KASSERT(!rv->inpartpopq, ("vm_reserv_insert: reserv %p's inpartpopq is TRUE", rv)); for (i = 0; i < NPOPMAP; i++) KASSERT(rv->popmap[i] == 0, ("vm_reserv_insert: reserv %p's popmap is corrupted", rv)); vm_reserv_object_lock(object); rv->pindex = pindex; rv->object = object; LIST_INSERT_HEAD(&object->rvq, rv, objq); vm_reserv_object_unlock(object); } /* * Reduces the given reservation's population count. If the population count * becomes zero, the reservation is destroyed. Additionally, moves the * reservation to the tail of the partially populated reservation queue if the * population count is non-zero. * * The free page queue lock must be held. */ static void vm_reserv_depopulate(vm_reserv_t rv, int index) { vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); KASSERT(rv->object != NULL, ("vm_reserv_depopulate: reserv %p is free", rv)); KASSERT(popmap_is_set(rv->popmap, index), ("vm_reserv_depopulate: reserv %p's popmap[%d] is clear", rv, index)); KASSERT(rv->popcnt > 0, ("vm_reserv_depopulate: reserv %p's popcnt is corrupted", rv)); KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains, ("vm_reserv_depopulate: reserv %p's domain is corrupted %d", rv, rv->domain)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq); rv->inpartpopq = FALSE; } else { KASSERT(rv->pages->psind == 1, ("vm_reserv_depopulate: reserv %p is already demoted", rv)); rv->pages->psind = 0; } popmap_clear(rv->popmap, index); rv->popcnt--; if (rv->popcnt == 0) { vm_reserv_remove(rv); vm_phys_free_pages(rv->pages, VM_LEVEL_0_ORDER); vm_reserv_freed++; } else { rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq); } } /* * Returns the reservation to which the given page might belong. */ static __inline vm_reserv_t vm_reserv_from_page(vm_page_t m) { return (&vm_reserv_array[VM_PAGE_TO_PHYS(m) >> VM_LEVEL_0_SHIFT]); } /* * Returns an existing reservation or NULL and initialized successor pointer. */ static vm_reserv_t vm_reserv_from_object(vm_object_t object, vm_pindex_t pindex, vm_page_t mpred, vm_page_t *msuccp) { vm_reserv_t rv; vm_page_t msucc; msucc = NULL; if (mpred != NULL) { KASSERT(mpred->object == object, ("vm_reserv_from_object: object doesn't contain mpred")); KASSERT(mpred->pindex < pindex, ("vm_reserv_from_object: mpred doesn't precede pindex")); rv = vm_reserv_from_page(mpred); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; msucc = TAILQ_NEXT(mpred, listq); } else msucc = TAILQ_FIRST(&object->memq); if (msucc != NULL) { KASSERT(msucc->pindex > pindex, ("vm_reserv_from_object: msucc doesn't succeed pindex")); rv = vm_reserv_from_page(msucc); if (rv->object == object && vm_reserv_has_pindex(rv, pindex)) goto found; } rv = NULL; found: *msuccp = msucc; return (rv); } /* * Returns TRUE if the given reservation contains the given page index and * FALSE otherwise. */ static __inline boolean_t vm_reserv_has_pindex(vm_reserv_t rv, vm_pindex_t pindex) { return (((pindex - rv->pindex) & ~(VM_LEVEL_0_NPAGES - 1)) == 0); } /* * Increases the given reservation's population count. Moves the reservation * to the tail of the partially populated reservation queue. * * The free page queue must be locked. 
*/ static void vm_reserv_populate(vm_reserv_t rv, int index) { vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); KASSERT(rv->object != NULL, ("vm_reserv_populate: reserv %p is free", rv)); KASSERT(popmap_is_clear(rv->popmap, index), ("vm_reserv_populate: reserv %p's popmap[%d] is set", rv, index)); KASSERT(rv->popcnt < VM_LEVEL_0_NPAGES, ("vm_reserv_populate: reserv %p is already full", rv)); KASSERT(rv->pages->psind == 0, ("vm_reserv_populate: reserv %p is already promoted", rv)); KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains, ("vm_reserv_populate: reserv %p's domain is corrupted %d", rv, rv->domain)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq); rv->inpartpopq = FALSE; } popmap_set(rv->popmap, index); rv->popcnt++; if (rv->popcnt < VM_LEVEL_0_NPAGES) { rv->inpartpopq = TRUE; TAILQ_INSERT_TAIL(&vm_rvq_partpop[rv->domain], rv, partpopq); } else rv->pages->psind = 1; } /* * Allocates a contiguous set of physical pages of the given size "npages" * from existing or newly created reservations. All of the physical pages * must be at or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross any * physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object and free page queue must be locked. */ vm_page_t vm_reserv_extend_contig(int req, vm_object_t object, vm_pindex_t pindex, int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_page_t mpred) { struct vm_domain *vmd; vm_paddr_t pa, size; vm_page_t m, msucc; vm_reserv_t rv; int i, index; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0")); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex + npages > object->size || object->resident_page_count == 0) return (NULL); /* * All reservations of a particular size have the same alignment. * Assuming that the first page is allocated from a reservation, the * least significant bits of its physical address can be determined * from its offset from the beginning of the reservation and the size * of the reservation. * * Could the specified index within a reservation of the smallest * possible size satisfy the alignment and boundary requirements? */ pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT; if ((pa & (alignment - 1)) != 0) return (NULL); size = npages << PAGE_SHIFT; if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) return (NULL); /* * Look for an existing reservation. */ rv = vm_reserv_from_object(object, pindex, mpred, &msucc); if (rv == NULL) return (NULL); KASSERT(object != kernel_object || rv->domain == domain, ("vm_reserv_extend_contig: Domain mismatch from reservation.")); index = VM_RESERV_INDEX(object, pindex); /* Does the allocation fit within the reservation? 
*/ if (index + npages > VM_LEVEL_0_NPAGES) return (NULL); domain = rv->domain; vmd = VM_DOMAIN(domain); vm_domain_free_lock(vmd); if (rv->object != object || !vm_domain_available(vmd, req, npages)) { m = NULL; goto out; } m = &rv->pages[index]; pa = VM_PAGE_TO_PHYS(m); if (pa < low || pa + size > high || (pa & (alignment - 1)) != 0 || ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { m = NULL; goto out; } /* Handle vm_page_rename(m, new_object, ...). */ for (i = 0; i < npages; i++) { if (popmap_is_set(rv->popmap, index + i)) { m = NULL; goto out; } } for (i = 0; i < npages; i++) vm_reserv_populate(rv, index + i); vm_domain_freecnt_adj(vmd, -npages); out: vm_domain_free_unlock(vmd); return (m); } /* * Allocates a contiguous set of physical pages of the given size "npages" * from existing or newly created reservations. All of the physical pages * must be at or above the given physical address "low" and below the given * physical address "high". The given value "alignment" determines the * alignment of the first physical page in the set. If the given value * "boundary" is non-zero, then the set of physical pages cannot cross any * physical address boundary that is a multiple of that value. Both * "alignment" and "boundary" must be a power of two. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object and free page queue must be locked. */ vm_page_t vm_reserv_alloc_contig(vm_object_t object, vm_pindex_t pindex, int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary, vm_page_t mpred) { vm_paddr_t pa, size; vm_page_t m, m_ret, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; u_long allocpages, maxpages, minpages; int i, index, n; vm_domain_free_assert_locked(VM_DOMAIN(domain)); VM_OBJECT_ASSERT_WLOCKED(object); KASSERT(npages != 0, ("vm_reserv_alloc_contig: npages is 0")); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex + npages > object->size) return (NULL); /* * All reservations of a particular size have the same alignment. * Assuming that the first page is allocated from a reservation, the * least significant bits of its physical address can be determined * from its offset from the beginning of the reservation and the size * of the reservation. * * Could the specified index within a reservation of the smallest * possible size satisfy the alignment and boundary requirements? */ pa = VM_RESERV_INDEX(object, pindex) << PAGE_SHIFT; if ((pa & (alignment - 1)) != 0) return (NULL); size = npages << PAGE_SHIFT; if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) return (NULL); /* * Callers should've extended an existing reservation prior to * calling this function. If a reservation exists it is * incompatible with the allocation. */ rv = vm_reserv_from_object(object, pindex, mpred, &msucc); if (rv != NULL) return (NULL); /* * Could at least one reservation fit between the first index to the * left that can be used ("leftcap") and the first index to the right * that cannot be used ("rightcap")? * * We must synchronize with the reserv object lock to protect the * pindex/object of the resulting reservations against rename while * we are inspecting. 
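 * (Illustrative note, not part of this change: the boundary test used a few
 * lines above, ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0, is nonzero
 * exactly when the first and last bytes of the run land in different
 * boundary-aligned windows.  For example, with boundary = 0x200000 (2MB),
 * pa = 0x1ff000 and size = 0x2000, the last byte is at 0x200fff;
 * 0x1ff000 ^ 0x200fff = 0x3fffff, and 0x3fffff & ~0x1fffff = 0x200000,
 * which is nonzero, so the run crosses a 2MB boundary and the allocation
 * is rejected.)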
*/ first = pindex - VM_RESERV_INDEX(object, pindex); minpages = VM_RESERV_INDEX(object, pindex) + npages; maxpages = roundup2(minpages, VM_LEVEL_0_NPAGES); allocpages = maxpages; vm_reserv_object_lock(object); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) { vm_reserv_object_unlock(object); return (NULL); } } if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + maxpages > rightcap) { if (maxpages == VM_LEVEL_0_NPAGES) { vm_reserv_object_unlock(object); return (NULL); } /* * At least one reservation will fit between "leftcap" * and "rightcap". However, a reservation for the * last of the requested pages will not fit. Reduce * the size of the upcoming allocation accordingly. */ allocpages = minpages; } } vm_reserv_object_unlock(object); /* * Would the last new reservation extend past the end of the object? */ if (first + maxpages > object->size) { /* * Don't allocate the last new reservation if the object is a * vnode or backed by another object that is a vnode. */ if (object->type == OBJT_VNODE || (object->backing_object != NULL && object->backing_object->type == OBJT_VNODE)) { if (maxpages == VM_LEVEL_0_NPAGES) return (NULL); allocpages = minpages; } /* Speculate that the object may grow. */ } /* * Allocate the physical pages. The alignment and boundary specified * for this allocation may be different from the alignment and * boundary specified for the requested pages. For instance, the * specified index may not be the first page within the first new * reservation. */ m = vm_phys_alloc_contig(domain, allocpages, low, high, ulmax(alignment, VM_LEVEL_0_SIZE), boundary > VM_LEVEL_0_SIZE ? boundary : 0); if (m == NULL) return (NULL); KASSERT(vm_phys_domain(m) == domain, ("vm_reserv_alloc_contig: Page domain does not match requested.")); /* * The allocated physical pages always begin at a reservation * boundary, but they do not always end at a reservation boundary. * Initialize every reservation that is completely covered by the * allocated physical pages. */ m_ret = NULL; index = VM_RESERV_INDEX(object, pindex); do { rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_contig: reserv %p's pages is corrupted", rv)); vm_reserv_insert(rv, object, first); n = ulmin(VM_LEVEL_0_NPAGES - index, npages); for (i = 0; i < n; i++) vm_reserv_populate(rv, index + i); npages -= n; if (m_ret == NULL) { m_ret = &rv->pages[index]; index = 0; } m += VM_LEVEL_0_NPAGES; first += VM_LEVEL_0_NPAGES; allocpages -= VM_LEVEL_0_NPAGES; } while (allocpages >= VM_LEVEL_0_NPAGES); return (m_ret); } /* * Attempts to extend an existing reservation and allocate the page to the * object. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object must be locked. */ vm_page_t vm_reserv_extend(int req, vm_object_t object, vm_pindex_t pindex, int domain, vm_page_t mpred) { struct vm_domain *vmd; vm_page_t m, msucc; vm_reserv_t rv; int index, free_count; VM_OBJECT_ASSERT_WLOCKED(object); /* * Could a reservation currently exist? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex >= object->size || object->resident_page_count == 0) return (NULL); /* * Look for an existing reservation. 
*/ rv = vm_reserv_from_object(object, pindex, mpred, &msucc); if (rv == NULL) return (NULL); KASSERT(object != kernel_object || rv->domain == domain, ("vm_reserv_extend: Domain mismatch from reservation.")); domain = rv->domain; vmd = VM_DOMAIN(domain); index = VM_RESERV_INDEX(object, pindex); m = &rv->pages[index]; vm_domain_free_lock(vmd); if (vm_domain_available(vmd, req, 1) == 0 || /* Handle reclaim race. */ rv->object != object || /* Handle vm_page_rename(m, new_object, ...). */ popmap_is_set(rv->popmap, index)) m = NULL; if (m != NULL) { vm_reserv_populate(rv, index); free_count = vm_domain_freecnt_adj(vmd, -1); } else free_count = vmd->vmd_free_count; vm_domain_free_unlock(vmd); if (vm_paging_needed(vmd, free_count)) pagedaemon_wakeup(domain); return (m); } /* * Allocates a page from an existing reservation. * * The page "mpred" must immediately precede the offset "pindex" within the * specified object. * * The object and free page queue must be locked. */ vm_page_t vm_reserv_alloc_page(vm_object_t object, vm_pindex_t pindex, int domain, vm_page_t mpred) { vm_page_t m, msucc; vm_pindex_t first, leftcap, rightcap; vm_reserv_t rv; int index; vm_domain_free_assert_locked(VM_DOMAIN(domain)); VM_OBJECT_ASSERT_WLOCKED(object); /* * Is a reservation fundamentally impossible? */ if (pindex < VM_RESERV_INDEX(object, pindex) || pindex >= object->size) return (NULL); /* * Callers should've extended an existing reservation prior to * calling this function. If a reservation exists it is * incompatible with the allocation. */ rv = vm_reserv_from_object(object, pindex, mpred, &msucc); if (rv != NULL) return (NULL); /* * Could a reservation fit between the first index to the left that * can be used and the first index to the right that cannot be used? * * We must synchronize with the reserv object lock to protect the * pindex/object of the resulting reservations against rename while * we are inspecting. */ first = pindex - VM_RESERV_INDEX(object, pindex); vm_reserv_object_lock(object); if (mpred != NULL) { if ((rv = vm_reserv_from_page(mpred))->object != object) leftcap = mpred->pindex + 1; else leftcap = rv->pindex + VM_LEVEL_0_NPAGES; if (leftcap > first) { vm_reserv_object_unlock(object); return (NULL); } } if (msucc != NULL) { if ((rv = vm_reserv_from_page(msucc))->object != object) rightcap = msucc->pindex; else rightcap = rv->pindex; if (first + VM_LEVEL_0_NPAGES > rightcap) { vm_reserv_object_unlock(object); return (NULL); } } vm_reserv_object_unlock(object); /* * Would a new reservation extend past the end of the object? */ if (first + VM_LEVEL_0_NPAGES > object->size) { /* * Don't allocate a new reservation if the object is a vnode or * backed by another object that is a vnode. */ if (object->type == OBJT_VNODE || (object->backing_object != NULL && object->backing_object->type == OBJT_VNODE)) return (NULL); /* Speculate that the object may grow. */ } /* * Allocate and populate the new reservation. */ m = vm_phys_alloc_pages(domain, VM_FREEPOOL_DEFAULT, VM_LEVEL_0_ORDER); if (m == NULL) return (NULL); rv = vm_reserv_from_page(m); KASSERT(rv->pages == m, ("vm_reserv_alloc_page: reserv %p's pages is corrupted", rv)); vm_reserv_insert(rv, object, first); index = VM_RESERV_INDEX(object, pindex); vm_reserv_populate(rv, index); return (&rv->pages[index]); } /* * Breaks the given reservation. All free pages in the reservation * are returned to the physical memory allocator. The reservation's * population count and map are reset to their initial state. 
* * The given reservation must not be in the partially populated reservation * queue. The free page queue lock must be held. */ static void vm_reserv_break(vm_reserv_t rv) { int begin_zeroes, hi, i, lo; vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); vm_reserv_remove(rv); rv->pages->psind = 0; i = hi = 0; do { /* Find the next 0 bit. Any previous 0 bits are < "hi". */ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); if (lo == 0) { /* Redundantly clears bits < "hi". */ rv->popmap[i] = 0; rv->popcnt -= NBPOPMAP - hi; while (++i < NPOPMAP) { lo = ffsl(~rv->popmap[i]); if (lo == 0) { rv->popmap[i] = 0; rv->popcnt -= NBPOPMAP; } else break; } if (i == NPOPMAP) break; hi = 0; } KASSERT(lo > 0, ("vm_reserv_break: lo is %d", lo)); /* Convert from ffsl() to ordinary bit numbering. */ lo--; if (lo > 0) { /* Redundantly clears bits < "hi". */ rv->popmap[i] &= ~((1UL << lo) - 1); rv->popcnt -= lo - hi; } begin_zeroes = NBPOPMAP * i + lo; /* Find the next 1 bit. */ do hi = ffsl(rv->popmap[i]); while (hi == 0 && ++i < NPOPMAP); if (i != NPOPMAP) /* Convert from ffsl() to ordinary bit numbering. */ hi--; vm_phys_free_contig(&rv->pages[begin_zeroes], NBPOPMAP * i + hi - begin_zeroes); } while (i < NPOPMAP); KASSERT(rv->popcnt == 0, ("vm_reserv_break: reserv %p's popcnt is corrupted", rv)); vm_reserv_broken++; } /* * Breaks all reservations belonging to the given object. */ void vm_reserv_break_all(vm_object_t object) { vm_reserv_t rv; struct vm_domain *vmd; /* * This access of object->rvq is unsynchronized so that the * object rvq lock can nest after the domain_free lock. We * must check for races in the results. However, the object * lock prevents new additions, so we are guaranteed that when * it returns NULL the object is properly empty. */ vmd = NULL; while ((rv = LIST_FIRST(&object->rvq)) != NULL) { if (vmd != VM_DOMAIN(rv->domain)) { if (vmd != NULL) vm_domain_free_unlock(vmd); vmd = VM_DOMAIN(rv->domain); vm_domain_free_lock(vmd); } /* Reclaim race. */ if (rv->object != object) continue; KASSERT(rv->object == object, ("vm_reserv_break_all: reserv %p is corrupted", rv)); if (rv->inpartpopq) { TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq); rv->inpartpopq = FALSE; } vm_reserv_break(rv); } if (vmd != NULL) vm_domain_free_unlock(vmd); } /* * Frees the given page if it belongs to a reservation. Returns TRUE if the * page is freed and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_free_page(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); if (rv->object == NULL) return (FALSE); vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); vm_reserv_depopulate(rv, m - rv->pages); return (TRUE); } /* * Initializes the reservation management system. Specifically, initializes * the reservation array. * * Requires that vm_page_array and first_page are initialized! */ void vm_reserv_init(void) { vm_paddr_t paddr; struct vm_phys_seg *seg; int i, segind; /* * Initialize the reservation array. Specifically, initialize the * "pages" field for every element that has an underlying superpage. 
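[Editor's note] The popmap scan in vm_reserv_break() above relies on ffsl(), which returns a 1-based bit position and 0 when no bit is set, hence the repeated "convert from ffsl() to ordinary bit numbering" steps. A minimal, hedged sketch of that conversion for a single popmap word; the helper name is hypothetical.

static int
example_first_free_bit(popmap_t word)
{
	int lo;

	lo = ffsl(~word);	/* 1-based position of the first clear bit */
	if (lo == 0)
		return (-1);	/* every page in this word is populated */
	return (lo - 1);	/* convert to ordinary (0-based) numbering */
}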
*/ for (segind = 0; segind < vm_phys_nsegs; segind++) { seg = &vm_phys_segs[segind]; paddr = roundup2(seg->start, VM_LEVEL_0_SIZE); while (paddr + VM_LEVEL_0_SIZE <= seg->end) { vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].pages = PHYS_TO_VM_PAGE(paddr); vm_reserv_array[paddr >> VM_LEVEL_0_SHIFT].domain = seg->domain; paddr += VM_LEVEL_0_SIZE; } } for (i = 0; i < MAXMEMDOM; i++) TAILQ_INIT(&vm_rvq_partpop[i]); } /* * Returns true if the given page belongs to a reservation and that page is * free. Otherwise, returns false. */ bool vm_reserv_is_page_free(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); if (rv->object == NULL) return (false); vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); return (popmap_is_clear(rv->popmap, m - rv->pages)); } /* * If the given page belongs to a reservation, returns the level of that * reservation. Otherwise, returns -1. */ int vm_reserv_level(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->object != NULL ? 0 : -1); } /* * Returns a reservation level if the given page belongs to a fully populated * reservation and -1 otherwise. */ int vm_reserv_level_iffullpop(vm_page_t m) { vm_reserv_t rv; rv = vm_reserv_from_page(m); return (rv->popcnt == VM_LEVEL_0_NPAGES ? 0 : -1); } /* * Breaks the given partially populated reservation, releasing its free pages * to the physical memory allocator. * * The free page queue lock must be held. */ static void vm_reserv_reclaim(vm_reserv_t rv) { vm_domain_free_assert_locked(VM_DOMAIN(rv->domain)); KASSERT(rv->inpartpopq, ("vm_reserv_reclaim: reserv %p's inpartpopq is FALSE", rv)); KASSERT(rv->domain >= 0 && rv->domain < vm_ndomains, ("vm_reserv_reclaim: reserv %p's domain is corrupted %d", rv, rv->domain)); TAILQ_REMOVE(&vm_rvq_partpop[rv->domain], rv, partpopq); rv->inpartpopq = FALSE; vm_reserv_break(rv); vm_reserv_reclaimed++; } /* * Breaks the reservation at the head of the partially populated reservation * queue, releasing its free pages to the physical memory allocator. Returns * TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_reclaim_inactive(int domain) { vm_reserv_t rv; vm_domain_free_assert_locked(VM_DOMAIN(domain)); if ((rv = TAILQ_FIRST(&vm_rvq_partpop[domain])) != NULL) { vm_reserv_reclaim(rv); return (TRUE); } return (FALSE); } /* * Searches the partially populated reservation queue for the least recently * changed reservation with free pages that satisfy the given request for * contiguous physical memory. If a satisfactory reservation is found, it is * broken. Returns TRUE if a reservation is broken and FALSE otherwise. * * The free page queue lock must be held. */ boolean_t vm_reserv_reclaim_contig(int domain, u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment, vm_paddr_t boundary) { vm_paddr_t pa, size; vm_reserv_t rv; int hi, i, lo, low_index, next_free; vm_domain_free_assert_locked(VM_DOMAIN(domain)); if (npages > VM_LEVEL_0_NPAGES - 1) return (FALSE); size = npages << PAGE_SHIFT; TAILQ_FOREACH(rv, &vm_rvq_partpop[domain], partpopq) { pa = VM_PAGE_TO_PHYS(&rv->pages[VM_LEVEL_0_NPAGES - 1]); if (pa + PAGE_SIZE - size < low) { /* This entire reservation is too low; go to next. */ continue; } pa = VM_PAGE_TO_PHYS(&rv->pages[0]); if (pa + size > high) { /* This entire reservation is too high; go to next. */ continue; } if (pa < low) { /* Start the search for free pages at "low". 
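 *
 * Worked example (hypothetical numbers): with 4KB pages, a reservation
 * whose first page is at pa = 0x200000 and a caller bound of
 * low = 0x205800, the line below computes
 * low_index = (0x205800 + 0xfff - 0x200000) >> 12 = 6, so with 64-bit
 * popmap words the scan starts at word i = 0 and bit hi = 6, i.e. at
 * the first page that lies wholly at or above "low".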
*/ low_index = (low + PAGE_MASK - pa) >> PAGE_SHIFT; i = low_index / NBPOPMAP; hi = low_index % NBPOPMAP; } else i = hi = 0; do { /* Find the next free page. */ lo = ffsl(~(((1UL << hi) - 1) | rv->popmap[i])); while (lo == 0 && ++i < NPOPMAP) lo = ffsl(~rv->popmap[i]); if (i == NPOPMAP) break; /* Convert from ffsl() to ordinary bit numbering. */ lo--; next_free = NBPOPMAP * i + lo; pa = VM_PAGE_TO_PHYS(&rv->pages[next_free]); KASSERT(pa >= low, ("vm_reserv_reclaim_contig: pa is too low")); if (pa + size > high) { /* The rest of this reservation is too high. */ break; } else if ((pa & (alignment - 1)) != 0 || ((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0) { /* * The current page doesn't meet the alignment * and/or boundary requirements. Continue * searching this reservation until the rest * of its free pages are either excluded or * exhausted. */ hi = lo + 1; if (hi >= NBPOPMAP) { hi = 0; i++; } continue; } /* Find the next used page. */ hi = ffsl(rv->popmap[i] & ~((1UL << lo) - 1)); while (hi == 0 && ++i < NPOPMAP) { if ((NBPOPMAP * i - next_free) * PAGE_SIZE >= size) { vm_reserv_reclaim(rv); return (TRUE); } hi = ffsl(rv->popmap[i]); } /* Convert from ffsl() to ordinary bit numbering. */ if (i != NPOPMAP) hi--; if ((NBPOPMAP * i + hi - next_free) * PAGE_SIZE >= size) { vm_reserv_reclaim(rv); return (TRUE); } } while (i < NPOPMAP); } return (FALSE); } /* * Transfers the reservation underlying the given page to a new object. * * The object must be locked. */ void vm_reserv_rename(vm_page_t m, vm_object_t new_object, vm_object_t old_object, vm_pindex_t old_object_offset) { vm_reserv_t rv; VM_OBJECT_ASSERT_WLOCKED(new_object); rv = vm_reserv_from_page(m); if (rv->object == old_object) { vm_domain_free_lock(VM_DOMAIN(rv->domain)); if (rv->object == old_object) { vm_reserv_object_lock(old_object); rv->object = NULL; LIST_REMOVE(rv, objq); vm_reserv_object_unlock(old_object); vm_reserv_object_lock(new_object); rv->object = new_object; rv->pindex -= old_object_offset; LIST_INSERT_HEAD(&new_object->rvq, rv, objq); vm_reserv_object_unlock(new_object); } vm_domain_free_unlock(VM_DOMAIN(rv->domain)); } } /* * Returns the size (in bytes) of a reservation of the specified level. */ int vm_reserv_size(int level) { switch (level) { case 0: return (VM_LEVEL_0_SIZE); case -1: return (PAGE_SIZE); default: return (0); } } /* * Allocates the virtual and physical memory required by the reservation * management system's data structures, in particular, the reservation array. */ vm_paddr_t vm_reserv_startup(vm_offset_t *vaddr, vm_paddr_t end, vm_paddr_t high_water) { vm_paddr_t new_end; size_t size; int i; /* * Calculate the size (in bytes) of the reservation array. Round up * from "high_water" because every small page is mapped to an element * in the reservation array based on its physical address. Thus, the * number of elements in the reservation array can be greater than the * number of superpages. */ size = howmany(high_water, VM_LEVEL_0_SIZE) * sizeof(struct vm_reserv); /* * Allocate and map the physical memory for the reservation array. The * next available virtual address is returned by reference. */ new_end = end - round_page(size); vm_reserv_array = (void *)(uintptr_t)pmap_map(vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero(vm_reserv_array, size); for (i = 0; i < VM_RESERV_OBJ_LOCK_COUNT; i++) mtx_init(&vm_reserv_object_mtx[i], "resv obj lock", NULL, MTX_DEF); /* * Return the next available physical address. */ return (new_end); } /* * Returns the superpage containing the given page. 
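 *
 * Illustrative sketch (not part of this change): the per-page alignment
 * and boundary screen in vm_reserv_reclaim_contig() above can be read as
 * the hypothetical predicate below, assuming "alignment" is a power of
 * two >= 1 and "boundary" is a power of two (or 0 for no restriction).
 * For example it accepts pa = 0x10000, size = 0x4000 with a 64KB
 * boundary, but rejects pa = 0x1e000 with the same size because that run
 * would straddle the 64KB line at 0x20000.
 */
static int
range_satisfies(unsigned long pa, unsigned long size, unsigned long alignment,
    unsigned long boundary)
{

	/* The start of the candidate run must be suitably aligned... */
	if ((pa & (alignment - 1)) != 0)
		return (0);
	/* ...and the run must not cross a "boundary"-sized line. */
	if (((pa ^ (pa + size - 1)) & ~(boundary - 1)) != 0)
		return (0);
	return (1);
}
/*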
*/ vm_page_t vm_reserv_to_superpage(vm_page_t m) { vm_reserv_t rv; VM_OBJECT_ASSERT_LOCKED(m->object); rv = vm_reserv_from_page(m); if (rv->object == m->object && rv->popcnt == VM_LEVEL_0_NPAGES) m = rv->pages; else m = NULL; return (m); } #endif /* VM_NRESERVLEVEL > 0 */ Index: user/jeff/numa/sys/x86/x86/intr_machdep.c =================================================================== --- user/jeff/numa/sys/x86/x86/intr_machdep.c (revision 329848) +++ user/jeff/numa/sys/x86/x86/intr_machdep.c (revision 329849) @@ -1,741 +1,746 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Machine dependent interrupt code for x86. For x86, we have to * deal with different PICs. Thus, we use the passed in vector to lookup * an interrupt source associated with that vector. The interrupt source * describes which PIC the source belongs to and includes methods to handle * that source. */ #include "opt_atpic.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifndef DEV_ATPIC #include #include #include #include #include #endif #define MAX_STRAY_LOG 5 typedef void (*mask_fn)(void *); static int intrcnt_index; static struct intsrc *interrupt_sources[NUM_IO_INTS]; #ifdef SMP static struct intsrc *interrupt_sorted[NUM_IO_INTS]; CTASSERT(sizeof(interrupt_sources) == sizeof(interrupt_sorted)); static int intrbalance; SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RW, &intrbalance, 0, "Interrupt auto-balance interval (seconds). 
Zero disables."); static struct timeout_task intrbalance_task; #endif static struct sx intrsrc_lock; static struct mtx intrpic_lock; static struct mtx intrcnt_lock; static TAILQ_HEAD(pics_head, pic) pics; #if defined(SMP) && !defined(EARLY_AP_STARTUP) static int assign_cpu; #endif u_long intrcnt[INTRCNT_COUNT]; char intrnames[INTRCNT_COUNT * (MAXCOMLEN + 1)]; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); static int intr_assign_cpu(void *arg, int cpu); static void intr_disable_src(void *arg); static void intr_init(void *__dummy); static int intr_pic_registered(struct pic *pic); static void intrcnt_setname(const char *name, int index); static void intrcnt_updatename(struct intsrc *is); static void intrcnt_register(struct intsrc *is); static int intr_pic_registered(struct pic *pic) { struct pic *p; TAILQ_FOREACH(p, &pics, pics) { if (p == pic) return (1); } return (0); } /* * Register a new interrupt controller (PIC). This is to support suspend * and resume where we suspend/resume controllers rather than individual * sources. This also allows controllers with no active sources (such as * 8259As in a system using the APICs) to participate in suspend and resume. */ int intr_register_pic(struct pic *pic) { int error; mtx_lock(&intrpic_lock); if (intr_pic_registered(pic)) error = EBUSY; else { TAILQ_INSERT_TAIL(&pics, pic, pics); error = 0; } mtx_unlock(&intrpic_lock); return (error); } /* * Register a new interrupt source with the global interrupt system. * The global interrupts need to be disabled when this function is * called. */ int intr_register_source(struct intsrc *isrc) { int error, vector; KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC")); vector = isrc->is_pic->pic_vector(isrc); if (interrupt_sources[vector] != NULL) return (EEXIST); error = intr_event_create(&isrc->is_event, isrc, 0, vector, intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source, (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:", vector); if (error) return (error); sx_xlock(&intrsrc_lock); if (interrupt_sources[vector] != NULL) { sx_xunlock(&intrsrc_lock); intr_event_destroy(isrc->is_event); return (EEXIST); } intrcnt_register(isrc); interrupt_sources[vector] = isrc; isrc->is_handlers = 0; sx_xunlock(&intrsrc_lock); return (0); } struct intsrc * intr_lookup_source(int vector) { return (interrupt_sources[vector]); } int intr_add_handler(const char *name, int vector, driver_filter_t filter, - driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep) + driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep, + int domain) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_add_handler(isrc->is_event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); if (error == 0) { sx_xlock(&intrsrc_lock); intrcnt_updatename(isrc); isrc->is_handlers++; if (isrc->is_handlers == 1) { + isrc->is_domain = domain; isrc->is_pic->pic_enable_intr(isrc); isrc->is_pic->pic_enable_source(isrc); } sx_xunlock(&intrsrc_lock); } return (error); } int intr_remove_handler(void *cookie) { struct intsrc *isrc; int error; isrc = intr_handler_source(cookie); error = intr_event_remove_handler(cookie); if (error == 0) { sx_xlock(&intrsrc_lock); isrc->is_handlers--; if (isrc->is_handlers == 0) { isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI); isrc->is_pic->pic_disable_intr(isrc); } intrcnt_updatename(isrc); sx_xunlock(&intrsrc_lock); } return (error); } int intr_config_intr(int 
vector, enum intr_trigger trig, enum intr_polarity pol) { struct intsrc *isrc; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); return (isrc->is_pic->pic_config_intr(isrc, trig, pol)); } static void intr_disable_src(void *arg) { struct intsrc *isrc; isrc = arg; isrc->is_pic->pic_disable_source(isrc, PIC_EOI); } void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame) { struct intr_event *ie; int vector; /* * We count software interrupts when we process them. The * code here follows previous practice, but there's an * argument for counting hardware interrupts when they're * processed too. */ (*isrc->is_count)++; VM_CNT_INC(v_intr); ie = isrc->is_event; /* * XXX: We assume that IRQ 0 is only used for the ISA timer * device (clk). */ vector = isrc->is_pic->pic_vector(isrc); if (vector == 0) clkintr_pending = 1; /* * For stray interrupts, mask and EOI the source, bump the * stray count, and log the condition. */ if (intr_event_handle(ie, frame) != 0) { isrc->is_pic->pic_disable_source(isrc, PIC_EOI); (*isrc->is_straycount)++; if (*isrc->is_straycount < MAX_STRAY_LOG) log(LOG_ERR, "stray irq%d\n", vector); else if (*isrc->is_straycount == MAX_STRAY_LOG) log(LOG_CRIT, "too many stray irq %d's: not logging anymore\n", vector); } } void intr_resume(bool suspend_cancelled) { struct pic *pic; #ifndef DEV_ATPIC atpic_reset(); #endif mtx_lock(&intrpic_lock); TAILQ_FOREACH(pic, &pics, pics) { if (pic->pic_resume != NULL) pic->pic_resume(pic, suspend_cancelled); } mtx_unlock(&intrpic_lock); } void intr_suspend(void) { struct pic *pic; mtx_lock(&intrpic_lock); TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) { if (pic->pic_suspend != NULL) pic->pic_suspend(pic); } mtx_unlock(&intrpic_lock); } static int intr_assign_cpu(void *arg, int cpu) { #ifdef SMP struct intsrc *isrc; int error; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); /* Nothing to do if there is only a single CPU. */ if (mp_ncpus > 1 && cpu != NOCPU) { #else /* * Don't do anything during early boot. We will pick up the * assignment once the APs are started. 
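 *
 * "assign_cpu" is only set (by intr_shuffle_irqs() at SI_SUB_SMP below),
 * so a bind request that arrives before the APs are running is recorded
 * in the intr_event but the source itself stays on the BSP; the shuffle
 * pass later honors the recorded ie_cpu and programs the source then.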
*/ if (assign_cpu && cpu != NOCPU) { #endif isrc = arg; sx_xlock(&intrsrc_lock); error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]); if (error == 0) isrc->is_cpu = cpu; sx_xunlock(&intrsrc_lock); } else error = 0; return (error); #else return (EOPNOTSUPP); #endif } static void intrcnt_setname(const char *name, int index) { snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s", MAXCOMLEN, name); } static void intrcnt_updatename(struct intsrc *is) { intrcnt_setname(is->is_event->ie_fullname, is->is_index); } static void intrcnt_register(struct intsrc *is) { char straystr[MAXCOMLEN + 1]; KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__)); mtx_lock_spin(&intrcnt_lock); is->is_index = intrcnt_index; intrcnt_index += 2; snprintf(straystr, MAXCOMLEN + 1, "stray irq%d", is->is_pic->pic_vector(is)); intrcnt_updatename(is); is->is_count = &intrcnt[is->is_index]; intrcnt_setname(straystr, is->is_index + 1); is->is_straycount = &intrcnt[is->is_index + 1]; mtx_unlock_spin(&intrcnt_lock); } void intrcnt_add(const char *name, u_long **countp) { mtx_lock_spin(&intrcnt_lock); *countp = &intrcnt[intrcnt_index]; intrcnt_setname(name, intrcnt_index); intrcnt_index++; mtx_unlock_spin(&intrcnt_lock); } static void intr_init(void *dummy __unused) { intrcnt_setname("???", 0); intrcnt_index = 1; TAILQ_INIT(&pics); mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF); sx_init(&intrsrc_lock, "intrsrc"); mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN); } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); static void intr_init_final(void *dummy __unused) { /* * Enable interrupts on the BSP after all of the interrupt * controllers are initialized. Device interrupts are still * disabled in the interrupt controllers until interrupt * handlers are registered. Interrupts are enabled on each AP * after their first context switch. */ enable_intr(); } SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL); #ifndef DEV_ATPIC /* Initialize the two 8259A's to a known-good shutdown state. */ void atpic_reset(void) { outb(IO_ICU1, ICW1_RESET | ICW1_IC4); outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS); outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID)); outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE); outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU1, OCW3_SEL | OCW3_RR); outb(IO_ICU2, ICW1_RESET | ICW1_IC4); outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8); outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID); outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE); outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU2, OCW3_SEL | OCW3_RR); } #endif /* Add a description to an active interrupt handler. 
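 *
 * Note on the counter layout above: each registered source owns two
 * consecutive intrcnt[] slots, the live count at is_index and the stray
 * count at is_index + 1, and the matching names are packed into
 * intrnames[] at a fixed stride of MAXCOMLEN + 1 bytes.  Assuming the
 * usual MAXCOMLEN of 19, the names for slot n therefore start at byte
 * 20 * n, which is what intrcnt_setname() relies on.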
*/ int intr_describe(u_int vector, void *ih, const char *descr) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_describe_handler(isrc->is_event, ih, descr); if (error) return (error); intrcnt_updatename(isrc); return (0); } void intr_reprogram(void) { struct intsrc *is; int v; sx_xlock(&intrsrc_lock); for (v = 0; v < NUM_IO_INTS; v++) { is = interrupt_sources[v]; if (is == NULL) continue; if (is->is_pic->pic_reprogram_pin != NULL) is->is_pic->pic_reprogram_pin(is); } sx_xunlock(&intrsrc_lock); } #ifdef DDB /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(irqs, db_show_irqs) { struct intsrc **isrc; int i, verbose; if (strcmp(modif, "v") == 0) verbose = 1; else verbose = 0; isrc = interrupt_sources; for (i = 0; i < NUM_IO_INTS && !db_pager_quit; i++, isrc++) if (*isrc != NULL) db_dump_intr_event((*isrc)->is_event, verbose); } #endif #ifdef SMP /* * Support for balancing interrupt sources across CPUs. For now we just * allocate CPUs round-robin. */ cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1); -static int current_cpu; +static int current_cpu[MAXMEMDOM]; /* * Return the CPU that the next interrupt source should use. For now * this just returns the next local APIC according to round-robin. */ u_int -intr_next_cpu(void) +intr_next_cpu(int domain) { u_int apic_id; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); if (mp_ncpus == 1) return (PCPU_GET(apic_id)); #else /* Leave all interrupts on the BSP during boot. */ if (!assign_cpu) return (PCPU_GET(apic_id)); #endif mtx_lock_spin(&icu_lock); - apic_id = cpu_apic_ids[current_cpu]; + apic_id = cpu_apic_ids[current_cpu[domain]]; do { - current_cpu++; - if (current_cpu > mp_maxid) - current_cpu = 0; - } while (!CPU_ISSET(current_cpu, &intr_cpus)); + current_cpu[domain]++; + if (current_cpu[domain] > mp_maxid) + current_cpu[domain] = 0; + } while (!CPU_ISSET(current_cpu[domain], &intr_cpus) || + !CPU_ISSET(current_cpu[domain], &cpuset_domain[domain])); mtx_unlock_spin(&icu_lock); return (apic_id); } /* Attempt to bind the specified IRQ to the specified CPU. */ int intr_bind(u_int vector, u_char cpu) { struct intsrc *isrc; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); return (intr_event_bind(isrc->is_event, cpu)); } /* * Add a CPU to our mask of valid CPUs that can be destinations of * interrupts. */ void intr_add_cpu(u_int cpu) { if (cpu >= MAXCPU) panic("%s: Invalid CPU ID", __func__); if (bootverbose) printf("INTR: Adding local APIC %d as a target\n", cpu_apic_ids[cpu]); CPU_SET(cpu, &intr_cpus); } #ifndef EARLY_AP_STARTUP /* * Distribute all the interrupt sources among the available CPUs once the * AP's have been launched. */ static void intr_shuffle_irqs(void *arg __unused) { struct intsrc *isrc; u_int cpu; int i; /* Don't bother on UP. */ if (mp_ncpus == 1) return; /* Round-robin assign a CPU to each enabled source. */ sx_xlock(&intrsrc_lock); assign_cpu = 1; for (i = 0; i < NUM_IO_INTS; i++) { isrc = interrupt_sources[i]; if (isrc != NULL && isrc->is_handlers > 0) { /* * If this event is already bound to a CPU, * then assign the source to that CPU instead * of picking one via round-robin. Note that * this is careful to only advance the * round-robin if the CPU assignment succeeds. 
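 *
 * With this change the cursor consulted here is per-domain
 * (current_cpu[isrc->is_domain]), so sources whose devices sit in
 * different NUMA domains advance independent round-robin positions, and
 * an explicitly bound event (ie_cpu != NOCPU) neither consumes nor
 * advances its domain's cursor.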
*/ cpu = isrc->is_event->ie_cpu; if (cpu == NOCPU) - cpu = current_cpu; + cpu = current_cpu[isrc->is_domain]; if (isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]) == 0) { isrc->is_cpu = cpu; if (isrc->is_event->ie_cpu == NOCPU) - intr_next_cpu(); + intr_next_cpu(isrc->is_domain); } } } sx_xunlock(&intrsrc_lock); } SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs, NULL); #endif /* * TODO: Export this information in a non-MD fashion, integrate with vmstat -i. */ static int sysctl_hw_intrs(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct intsrc *isrc; int error; int i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sx_slock(&intrsrc_lock); for (i = 0; i < NUM_IO_INTS; i++) { isrc = interrupt_sources[i]; if (isrc == NULL) continue; - sbuf_printf(&sbuf, "%s:%d @%d: %ld\n", + sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n", isrc->is_event->ie_fullname, isrc->is_index, isrc->is_cpu, + isrc->is_domain, *isrc->is_count); } sx_sunlock(&intrsrc_lock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } SYSCTL_PROC(_hw, OID_AUTO, intrs, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_hw_intrs, "A", "interrupt:number @cpu: count"); /* * Compare two, possibly NULL, entries in the interrupt source array * by load. */ static int intrcmp(const void *one, const void *two) { const struct intsrc *i1, *i2; i1 = *(const struct intsrc * const *)one; i2 = *(const struct intsrc * const *)two; if (i1 != NULL && i2 != NULL) return (*i1->is_count - *i2->is_count); if (i1 != NULL) return (1); if (i2 != NULL) return (-1); return (0); } /* * Balance IRQs across available CPUs according to load. */ static void intr_balance(void *dummy __unused, int pending __unused) { struct intsrc *isrc; int interval; u_int cpu; int i; interval = intrbalance; if (interval == 0) goto out; /* * Sort interrupts according to count. */ sx_xlock(&intrsrc_lock); memcpy(interrupt_sorted, interrupt_sources, sizeof(interrupt_sorted)); qsort(interrupt_sorted, NUM_IO_INTS, sizeof(interrupt_sorted[0]), intrcmp); /* * Restart the scan from the same location to avoid moving in the * common case. */ - current_cpu = 0; + for (i = 0; i < vm_ndomains; i++) + current_cpu[i] = 0; /* * Assign round-robin from most loaded to least. */ for (i = NUM_IO_INTS - 1; i >= 0; i--) { isrc = interrupt_sorted[i]; if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU) continue; - cpu = current_cpu; - intr_next_cpu(); + cpu = current_cpu[isrc->is_domain]; + intr_next_cpu(isrc->is_domain); if (isrc->is_cpu != cpu && isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]) == 0) isrc->is_cpu = cpu; } sx_xunlock(&intrsrc_lock); out: taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, interval ? hz * interval : hz * 60); } static void intr_balance_init(void *dummy __unused) { TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance, NULL); taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz); } SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL); #else /* * Always route interrupts to the current processor in the UP case. 
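 *
 * Illustrative sketch (not part of this change): the SMP intr_next_cpu()
 * above now keeps one round-robin cursor per domain and skips CPUs that
 * are either not valid interrupt targets or not in the requested domain.
 * The hypothetical helper below captures that selection loop in plain C;
 * intr_ok[] and cpu_domain[] stand in for intr_cpus and cpuset_domain[],
 * and every domain is assumed to contain at least one usable CPU.
 */
static int
next_cpu_in_domain(int *cursor, int ncpu, int domain,
    const int *cpu_domain, const int *intr_ok)
{
	int cpu;

	cpu = *cursor;				/* CPU handed out this call */
	do {
		*cursor = (*cursor + 1) % ncpu;	/* advance for the next call */
	} while (intr_ok[*cursor] == 0 || cpu_domain[*cursor] != domain);
	return (cpu);
}
/* Callers keep one cursor per domain, e.g. next_cpu_in_domain(&cur[dom], ...). */
/*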
*/ u_int -intr_next_cpu(void) +intr_next_cpu(int domain) { return (PCPU_GET(apic_id)); } #endif Index: user/jeff/numa/sys/x86/x86/io_apic.c =================================================================== --- user/jeff/numa/sys/x86/x86/io_apic.c (revision 329848) +++ user/jeff/numa/sys/x86/x86/io_apic.c (revision 329849) @@ -1,1234 +1,1234 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IOAPIC_ISA_INTS 16 #define IOAPIC_MEM_REGION 32 #define IOAPIC_REDTBL_LO(i) (IOAPIC_REDTBL + (i) * 2) #define IOAPIC_REDTBL_HI(i) (IOAPIC_REDTBL_LO(i) + 1) static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures"); /* * I/O APIC interrupt source driver. Each pin is assigned an IRQ cookie * as laid out in the ACPI System Interrupt number model where each I/O * APIC has a contiguous chunk of the System Interrupt address space. * We assume that IRQs 1 - 15 behave like ISA IRQs and that all other * IRQs behave as PCI IRQs by default. We also assume that the pin for * IRQ 0 is actually an ExtINT pin. The apic enumerators override the * configuration of individual pins as indicated by their tables. * * Documentation for the I/O APIC: "82093AA I/O Advanced Programmable * Interrupt Controller (IOAPIC)", May 1996, Intel Corp. 
* ftp://download.intel.com/design/chipsets/datashts/29056601.pdf */ struct ioapic_intsrc { struct intsrc io_intsrc; u_int io_irq; u_int io_intpin:8; u_int io_vector:8; u_int io_cpu; u_int io_activehi:1; u_int io_edgetrigger:1; u_int io_masked:1; int io_bus:4; uint32_t io_lowreg; u_int io_remap_cookie; }; struct ioapic { struct pic io_pic; u_int io_id:8; /* logical ID */ u_int io_apic_id:4; u_int io_intbase:8; /* System Interrupt base */ u_int io_numintr:8; u_int io_haseoi:1; volatile ioapic_t *io_addr; /* XXX: should use bus_space */ vm_paddr_t io_paddr; STAILQ_ENTRY(ioapic) io_next; device_t pci_dev; /* matched pci device, if found */ struct resource *pci_wnd; /* BAR 0, should be same or alias to io_paddr */ struct ioapic_intsrc io_pins[0]; }; static u_int ioapic_read(volatile ioapic_t *apic, int reg); static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val); static const char *ioapic_bus_string(int bus_type); static void ioapic_print_irq(struct ioapic_intsrc *intpin); static void ioapic_enable_source(struct intsrc *isrc); static void ioapic_disable_source(struct intsrc *isrc, int eoi); static void ioapic_eoi_source(struct intsrc *isrc); static void ioapic_enable_intr(struct intsrc *isrc); static void ioapic_disable_intr(struct intsrc *isrc); static int ioapic_vector(struct intsrc *isrc); static int ioapic_source_pending(struct intsrc *isrc); static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static void ioapic_resume(struct pic *pic, bool suspend_cancelled); static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id); static void ioapic_program_intpin(struct ioapic_intsrc *intpin); static void ioapic_reprogram_intpin(struct intsrc *isrc); static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list); struct pic ioapic_template = { .pic_enable_source = ioapic_enable_source, .pic_disable_source = ioapic_disable_source, .pic_eoi_source = ioapic_eoi_source, .pic_enable_intr = ioapic_enable_intr, .pic_disable_intr = ioapic_disable_intr, .pic_vector = ioapic_vector, .pic_source_pending = ioapic_source_pending, .pic_suspend = NULL, .pic_resume = ioapic_resume, .pic_config_intr = ioapic_config_intr, .pic_assign_cpu = ioapic_assign_cpu, .pic_reprogram_pin = ioapic_reprogram_intpin, }; static int next_ioapic_base; static u_int next_id; static int enable_extint; SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0, "Enable the ExtINT pin in the first I/O APIC"); static void _ioapic_eoi_source(struct intsrc *isrc, int locked) { struct ioapic_intsrc *src; struct ioapic *io; volatile uint32_t *apic_eoi; uint32_t low1; lapic_eoi(); if (!lapic_eoi_suppression) return; src = (struct ioapic_intsrc *)isrc; if (src->io_edgetrigger) return; io = (struct ioapic *)isrc->is_pic; /* * Handle targeted EOI for level-triggered pins, if broadcast * EOI suppression is supported by LAPICs. */ if (io->io_haseoi) { /* * If IOAPIC has EOI Register, simply write vector * number into the reg. */ apic_eoi = (volatile uint32_t *)((volatile char *) io->io_addr + IOAPIC_EOIR); *apic_eoi = src->io_vector; } else { /* * Otherwise, if IO-APIC is too old to provide EOIR, * do what Intel did for the Linux kernel. Temporary * switch the pin to edge-trigger and back, masking * the pin during the trick. 
*/ if (!locked) mtx_lock_spin(&icu_lock); low1 = src->io_lowreg; low1 &= ~IOART_TRGRLVL; low1 |= IOART_TRGREDG | IOART_INTMSET; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin), low1); ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin), src->io_lowreg); if (!locked) mtx_unlock_spin(&icu_lock); } } static u_int ioapic_read(volatile ioapic_t *apic, int reg) { mtx_assert(&icu_lock, MA_OWNED); apic->ioregsel = reg; return (apic->iowin); } static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val) { mtx_assert(&icu_lock, MA_OWNED); apic->ioregsel = reg; apic->iowin = val; } static const char * ioapic_bus_string(int bus_type) { switch (bus_type) { case APIC_BUS_ISA: return ("ISA"); case APIC_BUS_EISA: return ("EISA"); case APIC_BUS_PCI: return ("PCI"); default: return ("unknown"); } } static void ioapic_print_irq(struct ioapic_intsrc *intpin) { switch (intpin->io_irq) { case IRQ_DISABLED: printf("disabled"); break; case IRQ_EXTINT: printf("ExtINT"); break; case IRQ_NMI: printf("NMI"); break; case IRQ_SMI: printf("SMI"); break; default: printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus), intpin->io_irq); } } static void ioapic_enable_source(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; uint32_t flags; mtx_lock_spin(&icu_lock); if (intpin->io_masked) { flags = intpin->io_lowreg & ~IOART_INTMASK; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), flags); intpin->io_masked = 0; } mtx_unlock_spin(&icu_lock); } static void ioapic_disable_source(struct intsrc *isrc, int eoi) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; uint32_t flags; mtx_lock_spin(&icu_lock); if (!intpin->io_masked && !intpin->io_edgetrigger) { flags = intpin->io_lowreg | IOART_INTMSET; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), flags); intpin->io_masked = 1; } if (eoi == PIC_EOI) _ioapic_eoi_source(isrc, 1); mtx_unlock_spin(&icu_lock); } static void ioapic_eoi_source(struct intsrc *isrc) { _ioapic_eoi_source(isrc, 0); } /* * Completely program an intpin based on the data in its interrupt source * structure. */ static void ioapic_program_intpin(struct ioapic_intsrc *intpin) { struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic; uint32_t low, high; #ifdef ACPI_DMAR int error; #endif /* * If a pin is completely invalid or if it is valid but hasn't * been enabled yet, just ensure that the pin is masked. */ mtx_assert(&icu_lock, MA_OWNED); if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS && intpin->io_vector == 0)) { low = ioapic_read(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin)); if ((low & IOART_INTMASK) == IOART_INTMCLR) ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low | IOART_INTMSET); #ifdef ACPI_DMAR mtx_unlock_spin(&icu_lock); iommu_unmap_ioapic_intr(io->io_apic_id, &intpin->io_remap_cookie); mtx_lock_spin(&icu_lock); #endif return; } #ifdef ACPI_DMAR mtx_unlock_spin(&icu_lock); error = iommu_map_ioapic_intr(io->io_apic_id, intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger, intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie, &high, &low); mtx_lock_spin(&icu_lock); if (error == 0) { ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), high); intpin->io_lowreg = low; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low); return; } else if (error != EOPNOTSUPP) { return; } #endif /* * Set the destination. 
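 *
 * Worked example (assuming the standard 82093AA redirection-entry bit
 * layout): a level-triggered, active-low, unmasked PCI pin routed to
 * local APIC ID 2 with vector 0x60 ends up with
 * low = IOART_TRGRLVL | IOART_INTALO | IOART_DELFIXED | 0x60 = 0x0000a060
 * and high = 2 << APIC_ID_SHIFT = 0x02000000.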
Note that with Intel interrupt remapping, * the previously reserved bits 55:48 now have a purpose so ensure * these are zero. */ low = IOART_DESTPHY; high = intpin->io_cpu << APIC_ID_SHIFT; /* Program the rest of the low word. */ if (intpin->io_edgetrigger) low |= IOART_TRGREDG; else low |= IOART_TRGRLVL; if (intpin->io_activehi) low |= IOART_INTAHI; else low |= IOART_INTALO; if (intpin->io_masked) low |= IOART_INTMSET; switch (intpin->io_irq) { case IRQ_EXTINT: KASSERT(intpin->io_edgetrigger, ("ExtINT not edge triggered")); low |= IOART_DELEXINT; break; case IRQ_NMI: KASSERT(intpin->io_edgetrigger, ("NMI not edge triggered")); low |= IOART_DELNMI; break; case IRQ_SMI: KASSERT(intpin->io_edgetrigger, ("SMI not edge triggered")); low |= IOART_DELSMI; break; default: KASSERT(intpin->io_vector != 0, ("No vector for IRQ %u", intpin->io_irq)); low |= IOART_DELFIXED | intpin->io_vector; } /* Write the values to the APIC. */ ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), high); intpin->io_lowreg = low; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low); } static void ioapic_reprogram_intpin(struct intsrc *isrc) { mtx_lock_spin(&icu_lock); ioapic_program_intpin((struct ioapic_intsrc *)isrc); mtx_unlock_spin(&icu_lock); } static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; u_int old_vector, new_vector; u_int old_id; /* * On Hyper-V: * - Stick to the first cpu for all I/O APIC pins. * - And don't allow destination cpu changes. */ if (vm_guest == VM_GUEST_HV) { if (intpin->io_vector) return (EINVAL); else apic_id = 0; } /* * keep 1st core as the destination for NMI */ if (intpin->io_irq == IRQ_NMI) apic_id = 0; /* * Set us up to free the old irq. */ old_vector = intpin->io_vector; old_id = intpin->io_cpu; if (old_vector && apic_id == old_id) return (0); /* * Allocate an APIC vector for this interrupt pin. Once * we have a vector we program the interrupt pin. */ new_vector = apic_alloc_vector(apic_id, intpin->io_irq); if (new_vector == 0) return (ENOSPC); /* * Mask the old intpin if it is enabled while it is migrated. * * At least some level-triggered interrupts seem to need the * extra DELAY() to avoid being stuck in a non-EOI'd state. */ mtx_lock_spin(&icu_lock); if (!intpin->io_masked && !intpin->io_edgetrigger) { ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), intpin->io_lowreg | IOART_INTMSET); mtx_unlock_spin(&icu_lock); DELAY(100); mtx_lock_spin(&icu_lock); } intpin->io_cpu = apic_id; intpin->io_vector = new_vector; if (isrc->is_handlers > 0) apic_enable_vector(intpin->io_cpu, intpin->io_vector); if (bootverbose) { printf("ioapic%u: routing intpin %u (", io->io_id, intpin->io_intpin); ioapic_print_irq(intpin); printf(") to lapic %u vector %u\n", intpin->io_cpu, intpin->io_vector); } ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); /* * Free the old vector after the new one is established. This is done * to prevent races where we could miss an interrupt. 
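 *
 * The migration order is therefore: allocate a vector on the target CPU,
 * mask a level-triggered pin and let any in-flight delivery drain (the
 * DELAY(100) above), switch io_cpu/io_vector, enable the new vector,
 * reprogram the pin, and only then disable and release the old vector,
 * so the IRQ always has at least one vector capable of receiving it.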
*/ if (old_vector) { if (isrc->is_handlers > 0) apic_disable_vector(old_id, old_vector); apic_free_vector(old_id, old_vector, intpin->io_irq); } return (0); } static void ioapic_enable_intr(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; if (intpin->io_vector == 0) - if (ioapic_assign_cpu(isrc, intr_next_cpu()) != 0) + if (ioapic_assign_cpu(isrc, intr_next_cpu(isrc->is_domain)) != 0) panic("Couldn't find an APIC vector for IRQ %d", intpin->io_irq); apic_enable_vector(intpin->io_cpu, intpin->io_vector); } static void ioapic_disable_intr(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; u_int vector; if (intpin->io_vector != 0) { /* Mask this interrupt pin and free its APIC vector. */ vector = intpin->io_vector; apic_disable_vector(intpin->io_cpu, vector); mtx_lock_spin(&icu_lock); intpin->io_masked = 1; intpin->io_vector = 0; ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); apic_free_vector(intpin->io_cpu, vector, intpin->io_irq); } } static int ioapic_vector(struct intsrc *isrc) { struct ioapic_intsrc *pin; pin = (struct ioapic_intsrc *)isrc; return (pin->io_irq); } static int ioapic_source_pending(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; if (intpin->io_vector == 0) return 0; return (lapic_intr_pending(intpin->io_vector)); } static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; int changed; KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM), ("%s: Conforming trigger or polarity\n", __func__)); /* * EISA interrupts always use active high polarity, so don't allow * them to be set to active low. * * XXX: Should we write to the ELCR if the trigger mode changes for * an EISA IRQ or an ISA IRQ with the ELCR present? */ mtx_lock_spin(&icu_lock); if (intpin->io_bus == APIC_BUS_EISA) pol = INTR_POLARITY_HIGH; changed = 0; if (intpin->io_edgetrigger != (trig == INTR_TRIGGER_EDGE)) { if (bootverbose) printf("ioapic%u: Changing trigger for pin %u to %s\n", io->io_id, intpin->io_intpin, trig == INTR_TRIGGER_EDGE ? "edge" : "level"); intpin->io_edgetrigger = (trig == INTR_TRIGGER_EDGE); changed++; } if (intpin->io_activehi != (pol == INTR_POLARITY_HIGH)) { if (bootverbose) printf("ioapic%u: Changing polarity for pin %u to %s\n", io->io_id, intpin->io_intpin, pol == INTR_POLARITY_HIGH ? "high" : "low"); intpin->io_activehi = (pol == INTR_POLARITY_HIGH); changed++; } if (changed) ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); return (0); } static void ioapic_resume(struct pic *pic, bool suspend_cancelled) { struct ioapic *io = (struct ioapic *)pic; int i; mtx_lock_spin(&icu_lock); for (i = 0; i < io->io_numintr; i++) ioapic_program_intpin(&io->io_pins[i]); mtx_unlock_spin(&icu_lock); } /* * Create a plain I/O APIC object. */ void * ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase) { struct ioapic *io; struct ioapic_intsrc *intpin; volatile ioapic_t *apic; u_int numintr, i; uint32_t value; /* Map the register window so we can access the device. */ apic = pmap_mapdev(addr, IOAPIC_MEM_REGION); mtx_lock_spin(&icu_lock); value = ioapic_read(apic, IOAPIC_VER); mtx_unlock_spin(&icu_lock); /* If it's version register doesn't seem to work, punt. 
*/ if (value == 0xffffffff) { pmap_unmapdev((vm_offset_t)apic, IOAPIC_MEM_REGION); return (NULL); } /* Determine the number of vectors and set the APIC ID. */ numintr = ((value & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) + 1; io = malloc(sizeof(struct ioapic) + numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK); io->io_pic = ioapic_template; io->pci_dev = NULL; io->pci_wnd = NULL; mtx_lock_spin(&icu_lock); io->io_id = next_id++; io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT; if (apic_id != -1 && io->io_apic_id != apic_id) { ioapic_write(apic, IOAPIC_ID, apic_id << APIC_ID_SHIFT); mtx_unlock_spin(&icu_lock); io->io_apic_id = apic_id; printf("ioapic%u: Changing APIC ID to %d\n", io->io_id, apic_id); } else mtx_unlock_spin(&icu_lock); if (intbase == -1) { intbase = next_ioapic_base; printf("ioapic%u: Assuming intbase of %d\n", io->io_id, intbase); } else if (intbase != next_ioapic_base && bootverbose) printf("ioapic%u: WARNING: intbase %d != expected base %d\n", io->io_id, intbase, next_ioapic_base); io->io_intbase = intbase; next_ioapic_base = intbase + numintr; io->io_numintr = numintr; io->io_addr = apic; io->io_paddr = addr; if (bootverbose) { printf("ioapic%u: ver 0x%02x maxredir 0x%02x\n", io->io_id, (value & IOART_VER_VERSION), (value & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT); } /* * The summary information about IO-APIC versions is taken from * the Linux kernel source: * 0Xh 82489DX * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant * 2Xh I/O(x)APIC which is PCI 2.2 Compliant * 30h-FFh Reserved * IO-APICs with version >= 0x20 have working EOIR register. */ io->io_haseoi = (value & IOART_VER_VERSION) >= 0x20; /* * Initialize pins. Start off with interrupts disabled. Default * to active-hi and edge-triggered for ISA interrupts and active-lo * and level-triggered for all others. */ bzero(io->io_pins, sizeof(struct ioapic_intsrc) * numintr); mtx_lock_spin(&icu_lock); for (i = 0, intpin = io->io_pins; i < numintr; i++, intpin++) { intpin->io_intsrc.is_pic = (struct pic *)io; intpin->io_intpin = i; intpin->io_irq = intbase + i; /* * Assume that pin 0 on the first I/O APIC is an ExtINT pin. * Assume that pins 1-15 are ISA interrupts and that all * other pins are PCI interrupts. */ if (intpin->io_irq == 0) ioapic_set_extint(io, i); else if (intpin->io_irq < IOAPIC_ISA_INTS) { intpin->io_bus = APIC_BUS_ISA; intpin->io_activehi = 1; intpin->io_edgetrigger = 1; intpin->io_masked = 1; } else { intpin->io_bus = APIC_BUS_PCI; intpin->io_activehi = 0; intpin->io_edgetrigger = 0; intpin->io_masked = 1; } /* * Route interrupts to the BSP by default. Interrupts may * be routed to other CPUs later after they are enabled. 
*/ intpin->io_cpu = PCPU_GET(apic_id); value = ioapic_read(apic, IOAPIC_REDTBL_LO(i)); ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET); #ifdef ACPI_DMAR /* dummy, but sets cookie */ mtx_unlock_spin(&icu_lock); iommu_map_ioapic_intr(io->io_apic_id, intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger, intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie, NULL, NULL); mtx_lock_spin(&icu_lock); #endif } mtx_unlock_spin(&icu_lock); return (io); } int ioapic_get_vector(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (-1); return (io->io_pins[pin].io_irq); } int ioapic_disable_pin(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_DISABLED) return (EINVAL); io->io_pins[pin].io_irq = IRQ_DISABLED; if (bootverbose) printf("ioapic%u: intpin %d disabled\n", io->io_id, pin); return (0); } int ioapic_remap_vector(void *cookie, u_int pin, int vector) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || vector < 0) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_irq = vector; if (bootverbose) printf("ioapic%u: Routing IRQ %d -> intpin %d\n", io->io_id, vector, pin); return (0); } int ioapic_set_bus(void *cookie, u_int pin, int bus_type) { struct ioapic *io; if (bus_type < 0 || bus_type > APIC_BUS_MAX) return (EINVAL); io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); if (io->io_pins[pin].io_bus == bus_type) return (0); io->io_pins[pin].io_bus = bus_type; if (bootverbose) printf("ioapic%u: intpin %d bus %s\n", io->io_id, pin, ioapic_bus_string(bus_type)); return (0); } int ioapic_set_nmi(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_NMI) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_NMI; io->io_pins[pin].io_masked = 0; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing NMI -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_smi(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_SMI) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_SMI; io->io_pins[pin].io_masked = 0; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing SMI -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_extint(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_EXTINT) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_EXTINT; if (enable_extint) io->io_pins[pin].io_masked = 0; else io->io_pins[pin].io_masked = 1; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing external 8259A's -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_polarity(void *cookie, u_int pin, enum 
intr_polarity pol) { struct ioapic *io; int activehi; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); activehi = (pol == INTR_POLARITY_HIGH); if (io->io_pins[pin].io_activehi == activehi) return (0); io->io_pins[pin].io_activehi = activehi; if (bootverbose) printf("ioapic%u: intpin %d polarity: %s\n", io->io_id, pin, pol == INTR_POLARITY_HIGH ? "high" : "low"); return (0); } int ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger) { struct ioapic *io; int edgetrigger; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (io->io_pins[pin].io_edgetrigger == edgetrigger) return (0); io->io_pins[pin].io_edgetrigger = edgetrigger; if (bootverbose) printf("ioapic%u: intpin %d trigger: %s\n", io->io_id, pin, trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); return (0); } /* * Register a complete I/O APIC object with the interrupt subsystem. */ void ioapic_register(void *cookie) { struct ioapic_intsrc *pin; struct ioapic *io; volatile ioapic_t *apic; uint32_t flags; int i; io = (struct ioapic *)cookie; apic = io->io_addr; mtx_lock_spin(&icu_lock); flags = ioapic_read(apic, IOAPIC_VER) & IOART_VER_VERSION; STAILQ_INSERT_TAIL(&ioapic_list, io, io_next); mtx_unlock_spin(&icu_lock); printf("ioapic%u irqs %u-%u on motherboard\n", io->io_id, flags >> 4, flags & 0xf, io->io_intbase, io->io_intbase + io->io_numintr - 1); /* * Reprogram pins to handle special case pins (such as NMI and * SMI) and register valid pins as interrupt sources. */ intr_register_pic(&io->io_pic); for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) { ioapic_reprogram_intpin(&pin->io_intsrc); if (pin->io_irq < NUM_IO_INTS) intr_register_source(&pin->io_intsrc); } } /* A simple new-bus driver to consume PCI I/O APIC devices. */ static int ioapic_pci_probe(device_t dev) { if (pci_get_class(dev) == PCIC_BASEPERIPH && pci_get_subclass(dev) == PCIS_BASEPERIPH_PIC) { switch (pci_get_progif(dev)) { case PCIP_BASEPERIPH_PIC_IO_APIC: device_set_desc(dev, "IO APIC"); break; case PCIP_BASEPERIPH_PIC_IOX_APIC: device_set_desc(dev, "IO(x) APIC"); break; default: return (ENXIO); } device_quiet(dev); return (-10000); } return (ENXIO); } static int ioapic_pci_attach(device_t dev) { struct resource *res; volatile ioapic_t *apic; struct ioapic *io; int rid; u_int apic_id; /* * Try to match the enumerated ioapic. Match BAR start * against io_paddr. Due to a fear that PCI window is not the * same as the MADT reported io window, but an alias, read the * APIC ID from the mapped BAR and match against it. 
*/ rid = PCIR_BAR(0); res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE | RF_SHAREABLE); if (res == NULL) { if (bootverbose) device_printf(dev, "cannot activate BAR0\n"); return (ENXIO); } apic = (volatile ioapic_t *)rman_get_virtual(res); if (rman_get_size(res) < IOAPIC_WND_SIZE) { if (bootverbose) device_printf(dev, "BAR0 too small (%jd) for IOAPIC window\n", (uintmax_t)rman_get_size(res)); goto fail; } mtx_lock_spin(&icu_lock); apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT; /* First match by io window address */ STAILQ_FOREACH(io, &ioapic_list, io_next) { if (io->io_paddr == (vm_paddr_t)rman_get_start(res)) goto found; } /* Then by apic id */ STAILQ_FOREACH(io, &ioapic_list, io_next) { if (io->io_apic_id == apic_id) goto found; } mtx_unlock_spin(&icu_lock); if (bootverbose) device_printf(dev, "cannot match pci bar apic id %d against MADT\n", apic_id); fail: bus_release_resource(dev, SYS_RES_MEMORY, rid, res); return (ENXIO); found: KASSERT(io->pci_dev == NULL, ("ioapic %d pci_dev not NULL", io->io_id)); KASSERT(io->pci_wnd == NULL, ("ioapic %d pci_wnd not NULL", io->io_id)); io->pci_dev = dev; io->pci_wnd = res; if (bootverbose && (io->io_paddr != (vm_paddr_t)rman_get_start(res) || io->io_apic_id != apic_id)) { device_printf(dev, "pci%d:%d:%d:%d pci BAR0@%jx id %d " "MADT id %d paddr@%jx\n", pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev), (uintmax_t)rman_get_start(res), apic_id, io->io_apic_id, (uintmax_t)io->io_paddr); } mtx_unlock_spin(&icu_lock); return (0); } static device_method_t ioapic_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ioapic_pci_probe), DEVMETHOD(device_attach, ioapic_pci_attach), { 0, 0 } }; DEFINE_CLASS_0(ioapic, ioapic_pci_driver, ioapic_pci_methods, 0); static devclass_t ioapic_devclass; DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0); int ioapic_get_rid(u_int apic_id, uint16_t *ridp) { struct ioapic *io; uintptr_t rid; int error; mtx_lock_spin(&icu_lock); STAILQ_FOREACH(io, &ioapic_list, io_next) { if (io->io_apic_id == apic_id) break; } mtx_unlock_spin(&icu_lock); if (io == NULL || io->pci_dev == NULL) return (EINVAL); error = pci_get_id(io->pci_dev, PCI_ID_RID, &rid); if (error != 0) return (error); *ridp = rid; return (0); } /* * A new-bus driver to consume the memory resources associated with * the APICs in the system. On some systems ACPI or PnPBIOS system * resource devices may already claim these resources. To keep from * breaking those devices, we attach ourself to the nexus device after * legacy0 and acpi0 and ignore any allocation failures. */ static void apic_identify(driver_t *driver, device_t parent) { /* * Add at order 12. acpi0 is probed at order 10 and legacy0 * is probed at order 11. */ if (lapic_paddr != 0) BUS_ADD_CHILD(parent, 12, "apic", 0); } static int apic_probe(device_t dev) { device_set_desc(dev, "APIC resources"); device_quiet(dev); return (0); } static void apic_add_resource(device_t dev, int rid, vm_paddr_t base, size_t length) { int error; error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length); if (error) panic("apic_add_resource: resource %d failed set with %d", rid, error); bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_SHAREABLE); } static int apic_attach(device_t dev) { struct ioapic *io; int i; /* Reserve the local APIC. 
*/ apic_add_resource(dev, 0, lapic_paddr, LAPIC_MEM_REGION); i = 1; STAILQ_FOREACH(io, &ioapic_list, io_next) { apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION); i++; } return (0); } static device_method_t apic_methods[] = { /* Device interface */ DEVMETHOD(device_identify, apic_identify), DEVMETHOD(device_probe, apic_probe), DEVMETHOD(device_attach, apic_attach), { 0, 0 } }; DEFINE_CLASS_0(apic, apic_driver, apic_methods, 0); static devclass_t apic_devclass; DRIVER_MODULE(apic, nexus, apic_driver, apic_devclass, 0, 0); #include "opt_ddb.h" #ifdef DDB #include static const char * ioapic_delivery_mode(uint32_t mode) { switch (mode) { case IOART_DELFIXED: return ("fixed"); case IOART_DELLOPRI: return ("lowestpri"); case IOART_DELSMI: return ("SMI"); case IOART_DELRSV1: return ("rsrvd1"); case IOART_DELNMI: return ("NMI"); case IOART_DELINIT: return ("INIT"); case IOART_DELRSV2: return ("rsrvd2"); case IOART_DELEXINT: return ("ExtINT"); default: return (""); } } static u_int db_ioapic_read(volatile ioapic_t *apic, int reg) { apic->ioregsel = reg; return (apic->iowin); } static void db_show_ioapic_one(volatile ioapic_t *io_addr) { uint32_t r, lo, hi; int mre, i; r = db_ioapic_read(io_addr, IOAPIC_VER); mre = (r & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT; db_printf("Id 0x%08x Ver 0x%02x MRE %d\n", db_ioapic_read(io_addr, IOAPIC_ID), r & IOART_VER_VERSION, mre); for (i = 0; i < mre; i++) { lo = db_ioapic_read(io_addr, IOAPIC_REDTBL_LO(i)); hi = db_ioapic_read(io_addr, IOAPIC_REDTBL_HI(i)); db_printf(" pin %d Dest %s/%x %smasked Trig %s RemoteIRR %d " "Polarity %s Status %s DeliveryMode %s Vec %d\n", i, (lo & IOART_DESTMOD) == IOART_DESTLOG ? "log" : "phy", (hi & IOART_DEST) >> 24, (lo & IOART_INTMASK) == IOART_INTMSET ? "" : "not", (lo & IOART_TRGRMOD) == IOART_TRGRLVL ? "lvl" : "edge", (lo & IOART_REM_IRR) == IOART_REM_IRR ? 1 : 0, (lo & IOART_INTPOL) == IOART_INTALO ? "low" : "high", (lo & IOART_DELIVS) == IOART_DELIVS ? "pend" : "idle", ioapic_delivery_mode(lo & IOART_DELMOD), (lo & IOART_INTVEC)); } } DB_SHOW_COMMAND(ioapic, db_show_ioapic) { struct ioapic *ioapic; int idx, i; if (!have_addr) { db_printf("usage: show ioapic index\n"); return; } idx = (int)addr; i = 0; STAILQ_FOREACH(ioapic, &ioapic_list, io_next) { if (idx == i) { db_show_ioapic_one(ioapic->io_addr); break; } i++; } } DB_SHOW_ALL_COMMAND(ioapics, db_show_all_ioapics) { struct ioapic *ioapic; STAILQ_FOREACH(ioapic, &ioapic_list, io_next) db_show_ioapic_one(ioapic->io_addr); } #endif Index: user/jeff/numa/sys/x86/x86/msi.c =================================================================== --- user/jeff/numa/sys/x86/x86/msi.c (revision 329848) +++ user/jeff/numa/sys/x86/x86/msi.c (revision 329849) @@ -1,732 +1,738 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2006 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for PCI Message Signalled Interrupts (MSI). MSI interrupts on * x86 are basically APIC messages that the northbridge delivers directly * to the local APICs as if they had come from an I/O APIC. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Fields in address for Intel MSI messages. */ #define MSI_INTEL_ADDR_DEST 0x000ff000 #define MSI_INTEL_ADDR_RH 0x00000008 # define MSI_INTEL_ADDR_RH_ON 0x00000008 # define MSI_INTEL_ADDR_RH_OFF 0x00000000 #define MSI_INTEL_ADDR_DM 0x00000004 # define MSI_INTEL_ADDR_DM_PHYSICAL 0x00000000 # define MSI_INTEL_ADDR_DM_LOGICAL 0x00000004 /* Fields in data for Intel MSI messages. */ #define MSI_INTEL_DATA_TRGRMOD IOART_TRGRMOD /* Trigger mode. */ # define MSI_INTEL_DATA_TRGREDG IOART_TRGREDG # define MSI_INTEL_DATA_TRGRLVL IOART_TRGRLVL #define MSI_INTEL_DATA_LEVEL 0x00004000 /* Polarity. */ # define MSI_INTEL_DATA_DEASSERT 0x00000000 # define MSI_INTEL_DATA_ASSERT 0x00004000 #define MSI_INTEL_DATA_DELMOD IOART_DELMOD /* Delivery mode. */ # define MSI_INTEL_DATA_DELFIXED IOART_DELFIXED # define MSI_INTEL_DATA_DELLOPRI IOART_DELLOPRI # define MSI_INTEL_DATA_DELSMI IOART_DELSMI # define MSI_INTEL_DATA_DELNMI IOART_DELNMI # define MSI_INTEL_DATA_DELINIT IOART_DELINIT # define MSI_INTEL_DATA_DELEXINT IOART_DELEXINT #define MSI_INTEL_DATA_INTVEC IOART_INTVEC /* Interrupt vector. */ /* * Build Intel MSI message and data values from a source. AMD64 systems * seem to be compatible, so we use the same function for both. */ #define INTEL_ADDR(msi) \ (MSI_INTEL_ADDR_BASE | (msi)->msi_cpu << 12 | \ MSI_INTEL_ADDR_RH_OFF | MSI_INTEL_ADDR_DM_PHYSICAL) #define INTEL_DATA(msi) \ (MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED | (msi)->msi_vector) static MALLOC_DEFINE(M_MSI, "msi", "PCI MSI"); /* * MSI sources are bunched into groups. This is because MSI forces * all of the messages to share the address and data registers and * thus certain properties (such as the local APIC ID target on x86). * Each group has a 'first' source that contains information global to * the group. These fields are marked with (g) below. * * Note that local APIC ID is kind of special. Each message will be * assigned an ID by the system; however, a group will use the ID from * the first message. * * For MSI-X, each message is isolated. */ struct msi_intsrc { struct intsrc msi_intsrc; device_t msi_dev; /* Owning device. (g) */ struct msi_intsrc *msi_first; /* First source in group. */ u_int msi_irq; /* IRQ cookie. */ u_int msi_msix; /* MSI-X message. */ u_int msi_vector:8; /* IDT vector. */ u_int msi_cpu; /* Local APIC ID. 
(g) */ u_int msi_count:8; /* Messages in this group. (g) */ u_int msi_maxcount:8; /* Alignment for this group. (g) */ int *msi_irqs; /* Group's IRQ list. (g) */ u_int msi_remap_cookie; }; static void msi_create_source(void); static void msi_enable_source(struct intsrc *isrc); static void msi_disable_source(struct intsrc *isrc, int eoi); static void msi_eoi_source(struct intsrc *isrc); static void msi_enable_intr(struct intsrc *isrc); static void msi_disable_intr(struct intsrc *isrc); static int msi_vector(struct intsrc *isrc); static int msi_source_pending(struct intsrc *isrc); static int msi_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id); struct pic msi_pic = { .pic_enable_source = msi_enable_source, .pic_disable_source = msi_disable_source, .pic_eoi_source = msi_eoi_source, .pic_enable_intr = msi_enable_intr, .pic_disable_intr = msi_disable_intr, .pic_vector = msi_vector, .pic_source_pending = msi_source_pending, .pic_suspend = NULL, .pic_resume = NULL, .pic_config_intr = msi_config_intr, .pic_assign_cpu = msi_assign_cpu, .pic_reprogram_pin = NULL, }; #ifdef SMP /** * Xen hypervisors prior to 4.6.0 do not properly handle updates to * enabled MSI-X table entries. Allow migration of MSI-X interrupts * to be disabled via a tunable. Values have the following meaning: * * -1: automatic detection by FreeBSD * 0: enable migration * 1: disable migration */ int msix_disable_migration = -1; SYSCTL_INT(_machdep, OID_AUTO, disable_msix_migration, CTLFLAG_RDTUN, &msix_disable_migration, 0, "Disable migration of MSI-X interrupts between CPUs"); #endif static int msi_enabled; static int msi_last_irq; static struct mtx msi_lock; static void msi_enable_source(struct intsrc *isrc) { } static void msi_disable_source(struct intsrc *isrc, int eoi) { if (eoi == PIC_EOI) lapic_eoi(); } static void msi_eoi_source(struct intsrc *isrc) { lapic_eoi(); } static void msi_enable_intr(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; apic_enable_vector(msi->msi_cpu, msi->msi_vector); } static void msi_disable_intr(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; apic_disable_vector(msi->msi_cpu, msi->msi_vector); } static int msi_vector(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; return (msi->msi_irq); } static int msi_source_pending(struct intsrc *isrc) { return (0); } static int msi_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { return (ENODEV); } static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id) { struct msi_intsrc *sib, *msi = (struct msi_intsrc *)isrc; int old_vector; u_int old_id; int i, vector; /* * Only allow CPUs to be assigned to the first message for an * MSI group. */ if (msi->msi_first != msi) return (EINVAL); #ifdef SMP if (msix_disable_migration && msi->msi_msix) return (EINVAL); #endif /* Store information to free existing irq. */ old_vector = msi->msi_vector; old_id = msi->msi_cpu; if (old_id == apic_id) return (0); /* Allocate IDT vectors on this cpu. 
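 * A multi-message group needs a contiguous block of 'count' vectors with
 * 'maxcount' alignment, so it goes through apic_alloc_vectors(); a single
 * message only needs one vector from apic_alloc_vector().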
*/ if (msi->msi_count > 1) { KASSERT(msi->msi_msix == 0, ("MSI-X message group")); vector = apic_alloc_vectors(apic_id, msi->msi_irqs, msi->msi_count, msi->msi_maxcount); } else vector = apic_alloc_vector(apic_id, msi->msi_irq); if (vector == 0) return (ENOSPC); msi->msi_cpu = apic_id; msi->msi_vector = vector; if (msi->msi_intsrc.is_handlers > 0) apic_enable_vector(msi->msi_cpu, msi->msi_vector); if (bootverbose) printf("msi: Assigning %s IRQ %d to local APIC %u vector %u\n", msi->msi_msix ? "MSI-X" : "MSI", msi->msi_irq, msi->msi_cpu, msi->msi_vector); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); sib->msi_cpu = apic_id; sib->msi_vector = vector + i; if (sib->msi_intsrc.is_handlers > 0) apic_enable_vector(sib->msi_cpu, sib->msi_vector); if (bootverbose) printf( "msi: Assigning MSI IRQ %d to local APIC %u vector %u\n", sib->msi_irq, sib->msi_cpu, sib->msi_vector); } BUS_REMAP_INTR(device_get_parent(msi->msi_dev), msi->msi_dev, msi->msi_irq); /* * Free the old vector after the new one is established. This is done * to prevent races where we could miss an interrupt. */ if (msi->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector); apic_free_vector(old_id, old_vector, msi->msi_irq); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); if (sib->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector + i); apic_free_vector(old_id, old_vector + i, msi->msi_irqs[i]); } return (0); } void msi_init(void) { /* Check if we have a supported CPU. */ switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: case CPU_VENDOR_AMD: break; case CPU_VENDOR_CENTAUR: if (CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf) break; /* FALLTHROUGH */ default: return; } #ifdef SMP if (msix_disable_migration == -1) { /* The default is to allow migration of MSI-X interrupts. */ msix_disable_migration = 0; } #endif msi_enabled = 1; intr_register_pic(&msi_pic); mtx_init(&msi_lock, "msi", NULL, MTX_DEF); } static void msi_create_source(void) { struct msi_intsrc *msi; u_int irq; mtx_lock(&msi_lock); if (msi_last_irq >= NUM_MSI_INTS) { mtx_unlock(&msi_lock); return; } irq = msi_last_irq + FIRST_MSI_INT; msi_last_irq++; mtx_unlock(&msi_lock); msi = malloc(sizeof(struct msi_intsrc), M_MSI, M_WAITOK | M_ZERO); msi->msi_intsrc.is_pic = &msi_pic; msi->msi_irq = irq; intr_register_source(&msi->msi_intsrc); nexus_add_irq(irq); } /* * Try to allocate 'count' interrupt sources with contiguous IDT values. */ int msi_alloc(device_t dev, int count, int maxcount, int *irqs) { struct msi_intsrc *msi, *fsrc; - u_int cpu; + u_int cpu, domain; int cnt, i, *mirqs, vector; #ifdef ACPI_DMAR u_int cookies[count]; int error; #endif if (!msi_enabled) return (ENXIO); + if (bus_get_domain(dev, &domain) != 0) + domain = 0; + if (count > 1) mirqs = malloc(count * sizeof(*mirqs), M_MSI, M_WAITOK); else mirqs = NULL; again: mtx_lock(&msi_lock); /* Try to find 'count' free IRQs. */ cnt = 0; for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ if (msi == NULL) break; /* If this is a free one, save its IRQ in the array. */ if (msi->msi_dev == NULL) { irqs[cnt] = i; cnt++; if (cnt == count) break; } } /* Do we need to create some new sources? */ if (cnt < count) { /* If we would exceed the max, give up. 
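 * MSI IRQ numbers live in [FIRST_MSI_INT, FIRST_MSI_INT + NUM_MSI_INTS);
 * sources are created lazily, so walking past the end of that range means
 * the MSI IRQ space is exhausted.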
*/ if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENXIO); } mtx_unlock(&msi_lock); /* We need count - cnt more sources. */ while (cnt < count) { msi_create_source(); cnt++; } goto again; } /* Ok, we now have the IRQs allocated. */ KASSERT(cnt == count, ("count mismatch")); /* Allocate 'count' IDT vectors. */ - cpu = intr_next_cpu(); + cpu = intr_next_cpu(domain); vector = apic_alloc_vectors(cpu, irqs, count, maxcount); if (vector == 0) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENOSPC); } #ifdef ACPI_DMAR mtx_unlock(&msi_lock); error = iommu_alloc_msi_intr(dev, cookies, count); mtx_lock(&msi_lock); if (error == EOPNOTSUPP) error = 0; if (error != 0) { for (i = 0; i < count; i++) apic_free_vector(cpu, vector + i, irqs[i]); free(mirqs, M_MSI); return (error); } for (i = 0; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); msi->msi_remap_cookie = cookies[i]; } #endif /* Assign IDT vectors and make these messages owned by 'dev'. */ fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]); for (i = 0; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); msi->msi_cpu = cpu; msi->msi_dev = dev; msi->msi_vector = vector + i; if (bootverbose) printf( "msi: routing MSI IRQ %d to local APIC %u vector %u\n", msi->msi_irq, msi->msi_cpu, msi->msi_vector); msi->msi_first = fsrc; KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI has handlers")); } fsrc->msi_count = count; fsrc->msi_maxcount = maxcount; if (count > 1) bcopy(irqs, mirqs, count * sizeof(*mirqs)); fsrc->msi_irqs = mirqs; mtx_unlock(&msi_lock); return (0); } int msi_release(int *irqs, int count) { struct msi_intsrc *msi, *first; int i; mtx_lock(&msi_lock); first = (struct msi_intsrc *)intr_lookup_source(irqs[0]); if (first == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this isn't an MSI-X message. */ if (first->msi_msix) { mtx_unlock(&msi_lock); return (EINVAL); } /* Make sure this message is allocated to a group. */ if (first->msi_first == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } /* * Make sure this is the start of a group and that we are releasing * the entire group. */ if (first->msi_first != first || first->msi_count != count) { mtx_unlock(&msi_lock); return (EINVAL); } KASSERT(first->msi_dev != NULL, ("unowned group")); /* Clear all the extra messages in the group. */ for (i = 1; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); KASSERT(msi->msi_first == first, ("message not in group")); KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch")); #ifdef ACPI_DMAR iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie); #endif msi->msi_first = NULL; msi->msi_dev = NULL; apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq); msi->msi_vector = 0; } /* Clear out the first message. 
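 * The first message also carries the group bookkeeping (count, maxcount
 * and the IRQ array), all of which is released here; with ACPI_DMAR the
 * remapping entry is torn down with msi_lock dropped around the call.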
*/ #ifdef ACPI_DMAR mtx_unlock(&msi_lock); iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie); mtx_lock(&msi_lock); #endif first->msi_first = NULL; first->msi_dev = NULL; apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq); first->msi_vector = 0; first->msi_count = 0; first->msi_maxcount = 0; free(first->msi_irqs, M_MSI); first->msi_irqs = NULL; mtx_unlock(&msi_lock); return (0); } int msi_map(int irq, uint64_t *addr, uint32_t *data) { struct msi_intsrc *msi; int error; #ifdef ACPI_DMAR struct msi_intsrc *msi1; int i, k; #endif mtx_lock(&msi_lock); msi = (struct msi_intsrc *)intr_lookup_source(irq); if (msi == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this message is allocated to a device. */ if (msi->msi_dev == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } /* * If this message isn't an MSI-X message, make sure it's part * of a group, and switch to the first message in the * group. */ if (!msi->msi_msix) { if (msi->msi_first == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } msi = msi->msi_first; } #ifdef ACPI_DMAR if (!msi->msi_msix) { for (k = msi->msi_count - 1, i = FIRST_MSI_INT; k > 0 && i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { if (i == msi->msi_irq) continue; msi1 = (struct msi_intsrc *)intr_lookup_source(i); if (!msi1->msi_msix && msi1->msi_first == msi) { mtx_unlock(&msi_lock); iommu_map_msi_intr(msi1->msi_dev, msi1->msi_cpu, msi1->msi_vector, msi1->msi_remap_cookie, NULL, NULL); k--; mtx_lock(&msi_lock); } } } mtx_unlock(&msi_lock); error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu, msi->msi_vector, msi->msi_remap_cookie, addr, data); #else mtx_unlock(&msi_lock); error = EOPNOTSUPP; #endif if (error == EOPNOTSUPP) { *addr = INTEL_ADDR(msi); *data = INTEL_DATA(msi); error = 0; } return (error); } int msix_alloc(device_t dev, int *irq) { struct msi_intsrc *msi; - u_int cpu; + u_int cpu, domain; int i, vector; #ifdef ACPI_DMAR u_int cookie; int error; #endif if (!msi_enabled) return (ENXIO); + if (bus_get_domain(dev, &domain) != 0) + domain = 0; + again: mtx_lock(&msi_lock); /* Find a free IRQ. */ for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ if (msi == NULL) break; /* Stop at the first free source. */ if (msi->msi_dev == NULL) break; } /* Do we need to create a new source? */ if (msi == NULL) { /* If we would exceed the max, give up. */ if (i + 1 > FIRST_MSI_INT + NUM_MSI_INTS) { mtx_unlock(&msi_lock); return (ENXIO); } mtx_unlock(&msi_lock); /* Create a new source. */ msi_create_source(); goto again; } /* Allocate an IDT vector. */ - cpu = intr_next_cpu(); + cpu = intr_next_cpu(domain); vector = apic_alloc_vector(cpu, i); if (vector == 0) { mtx_unlock(&msi_lock); return (ENOSPC); } msi->msi_dev = dev; #ifdef ACPI_DMAR mtx_unlock(&msi_lock); error = iommu_alloc_msi_intr(dev, &cookie, 1); mtx_lock(&msi_lock); if (error == EOPNOTSUPP) error = 0; if (error != 0) { msi->msi_dev = NULL; apic_free_vector(cpu, vector, i); return (error); } msi->msi_remap_cookie = cookie; #endif if (bootverbose) printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n", msi->msi_irq, cpu, vector); /* Setup source. 
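 * An MSI-X message is always a group of one: it is its own msi_first,
 * with msi_count and msi_maxcount of 1 and no sibling IRQ array.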
*/ msi->msi_cpu = cpu; msi->msi_first = msi; msi->msi_vector = vector; msi->msi_msix = 1; msi->msi_count = 1; msi->msi_maxcount = 1; msi->msi_irqs = NULL; KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI-X has handlers")); mtx_unlock(&msi_lock); *irq = i; return (0); } int msix_release(int irq) { struct msi_intsrc *msi; mtx_lock(&msi_lock); msi = (struct msi_intsrc *)intr_lookup_source(irq); if (msi == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this is an MSI-X message. */ if (!msi->msi_msix) { mtx_unlock(&msi_lock); return (EINVAL); } KASSERT(msi->msi_dev != NULL, ("unowned message")); /* Clear out the message. */ #ifdef ACPI_DMAR mtx_unlock(&msi_lock); iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie); mtx_lock(&msi_lock); #endif msi->msi_first = NULL; msi->msi_dev = NULL; apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq); msi->msi_vector = 0; msi->msi_msix = 0; msi->msi_count = 0; msi->msi_maxcount = 0; mtx_unlock(&msi_lock); return (0); } Index: user/jeff/numa/sys/x86/x86/nexus.c =================================================================== --- user/jeff/numa/sys/x86/x86/nexus.c (revision 329848) +++ user/jeff/numa/sys/x86/x86/nexus.c (revision 329849) @@ -1,905 +1,907 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * This code implements a `root nexus' for Intel Architecture * machines. The function of the root nexus is to serve as an * attachment point for both processors and buses, and to manage * resources which are common to all of them. In particular, * this code implements the core resource managers for interrupt * requests, DMA requests (which rightfully should be a part of the * ISA code but it's easier to do it here for now), I/O port addresses, * and I/O memory address space. 
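 * On amd64 the APIC code is always compiled in, so DEV_APIC is defined
 * unconditionally below; i386 picks it up from "opt_apic.h" instead.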
*/ #ifdef __amd64__ #define DEV_APIC #else #include "opt_apic.h" #endif #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include "pcib_if.h" #endif #ifdef DEV_ISA #include #include #endif #include #define ELF_KERN_STR ("elf"__XSTRING(__ELF_WORD_SIZE)" kernel") static MALLOC_DEFINE(M_NEXUSDEV, "nexusdev", "Nexus device"); #define DEVTONX(dev) ((struct nexus_device *)device_get_ivars(dev)) struct rman irq_rman, drq_rman, port_rman, mem_rman; static int nexus_probe(device_t); static int nexus_attach(device_t); static int nexus_print_all_resources(device_t dev); static int nexus_print_child(device_t, device_t); static device_t nexus_add_child(device_t bus, u_int order, const char *name, int unit); static struct resource *nexus_alloc_resource(device_t, device_t, int, int *, rman_res_t, rman_res_t, rman_res_t, u_int); static int nexus_adjust_resource(device_t, device_t, int, struct resource *, rman_res_t, rman_res_t); #ifdef SMP static int nexus_bind_intr(device_t, device_t, struct resource *, int); #endif static int nexus_config_intr(device_t, int, enum intr_trigger, enum intr_polarity); static int nexus_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr); static int nexus_activate_resource(device_t, device_t, int, int, struct resource *); static int nexus_deactivate_resource(device_t, device_t, int, int, struct resource *); static int nexus_map_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map_request *argsp, struct resource_map *map); static int nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map *map); static int nexus_release_resource(device_t, device_t, int, int, struct resource *); static int nexus_setup_intr(device_t, device_t, struct resource *, int flags, driver_filter_t filter, void (*)(void *), void *, void **); static int nexus_teardown_intr(device_t, device_t, struct resource *, void *); static struct resource_list *nexus_get_reslist(device_t dev, device_t child); static int nexus_set_resource(device_t, device_t, int, int, rman_res_t, rman_res_t); static int nexus_get_resource(device_t, device_t, int, int, rman_res_t *, rman_res_t *); static void nexus_delete_resource(device_t, device_t, int, int); static int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t, cpuset_t *); #ifdef DEV_APIC static int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs); static int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs); static int nexus_alloc_msix(device_t pcib, device_t dev, int *irq); static int nexus_release_msix(device_t pcib, device_t dev, int irq); static int nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data); #endif static device_method_t nexus_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nexus_probe), DEVMETHOD(device_attach, nexus_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), /* Bus interface */ DEVMETHOD(bus_print_child, nexus_print_child), DEVMETHOD(bus_add_child, nexus_add_child), DEVMETHOD(bus_alloc_resource, nexus_alloc_resource), DEVMETHOD(bus_adjust_resource, nexus_adjust_resource), DEVMETHOD(bus_release_resource, nexus_release_resource), 
DEVMETHOD(bus_activate_resource, nexus_activate_resource), DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource), DEVMETHOD(bus_map_resource, nexus_map_resource), DEVMETHOD(bus_unmap_resource, nexus_unmap_resource), DEVMETHOD(bus_setup_intr, nexus_setup_intr), DEVMETHOD(bus_teardown_intr, nexus_teardown_intr), #ifdef SMP DEVMETHOD(bus_bind_intr, nexus_bind_intr), #endif DEVMETHOD(bus_config_intr, nexus_config_intr), DEVMETHOD(bus_describe_intr, nexus_describe_intr), DEVMETHOD(bus_get_resource_list, nexus_get_reslist), DEVMETHOD(bus_set_resource, nexus_set_resource), DEVMETHOD(bus_get_resource, nexus_get_resource), DEVMETHOD(bus_delete_resource, nexus_delete_resource), DEVMETHOD(bus_get_cpus, nexus_get_cpus), /* pcib interface */ #ifdef DEV_APIC DEVMETHOD(pcib_alloc_msi, nexus_alloc_msi), DEVMETHOD(pcib_release_msi, nexus_release_msi), DEVMETHOD(pcib_alloc_msix, nexus_alloc_msix), DEVMETHOD(pcib_release_msix, nexus_release_msix), DEVMETHOD(pcib_map_msi, nexus_map_msi), #endif { 0, 0 } }; DEFINE_CLASS_0(nexus, nexus_driver, nexus_methods, 1); static devclass_t nexus_devclass; DRIVER_MODULE(nexus, root, nexus_driver, nexus_devclass, 0, 0); static int nexus_probe(device_t dev) { device_quiet(dev); /* suppress attach message for neatness */ return (BUS_PROBE_GENERIC); } void nexus_init_resources(void) { int irq; /* * XXX working notes: * * - IRQ resource creation should be moved to the PIC/APIC driver. * - DRQ resource creation should be moved to the DMAC driver. * - The above should be sorted to probe earlier than any child buses. * * - Leave I/O and memory creation here, as child probes may need them. * (especially eg. ACPI) */ /* * IRQ's are on the mainboard on old systems, but on the ISA part * of PCI->ISA bridges. There would be multiple sets of IRQs on * multi-ISA-bus systems. PCI interrupts are routed to the ISA * component, so in a way, PCI can be a partial child of an ISA bus(!). * APIC interrupts are global though. */ irq_rman.rm_start = 0; irq_rman.rm_type = RMAN_ARRAY; irq_rman.rm_descr = "Interrupt request lines"; irq_rman.rm_end = NUM_IO_INTS - 1; if (rman_init(&irq_rman)) panic("nexus_init_resources irq_rman"); /* * We search for regions of existing IRQs and add those to the IRQ * resource manager. */ for (irq = 0; irq < NUM_IO_INTS; irq++) if (intr_lookup_source(irq) != NULL) if (rman_manage_region(&irq_rman, irq, irq) != 0) panic("nexus_init_resources irq_rman add"); /* * ISA DMA on PCI systems is implemented in the ISA part of each * PCI->ISA bridge and the channels can be duplicated if there are * multiple bridges. (eg: laptops with docking stations) */ drq_rman.rm_start = 0; drq_rman.rm_end = 7; drq_rman.rm_type = RMAN_ARRAY; drq_rman.rm_descr = "DMA request lines"; /* XXX drq 0 not available on some machines */ if (rman_init(&drq_rman) || rman_manage_region(&drq_rman, drq_rman.rm_start, drq_rman.rm_end)) panic("nexus_init_resources drq_rman"); /* * However, IO ports and Memory truely are global at this level, * as are APIC interrupts (however many IO APICS there turn out * to be on large systems..) 
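 * The port rman covers the full 16-bit I/O space (0-0xffff), and the
 * memory rman runs from 0 up to BUS_SPACE_MAXADDR, or up to the CPU's
 * physical address limit when PAE is enabled.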
*/ port_rman.rm_start = 0; port_rman.rm_end = 0xffff; port_rman.rm_type = RMAN_ARRAY; port_rman.rm_descr = "I/O ports"; if (rman_init(&port_rman) || rman_manage_region(&port_rman, 0, 0xffff)) panic("nexus_init_resources port_rman"); mem_rman.rm_start = 0; #ifndef PAE mem_rman.rm_end = BUS_SPACE_MAXADDR; #else mem_rman.rm_end = ((1ULL << cpu_maxphyaddr) - 1); #endif mem_rman.rm_type = RMAN_ARRAY; mem_rman.rm_descr = "I/O memory addresses"; if (rman_init(&mem_rman) || rman_manage_region(&mem_rman, 0, mem_rman.rm_end)) panic("nexus_init_resources mem_rman"); } static int nexus_attach(device_t dev) { nexus_init_resources(); bus_generic_probe(dev); /* * Explicitly add the legacy0 device here. Other platform * types (such as ACPI), use their own nexus(4) subclass * driver to override this routine and add their own root bus. */ if (BUS_ADD_CHILD(dev, 10, "legacy", 0) == NULL) panic("legacy: could not attach"); bus_generic_attach(dev); return 0; } static int nexus_print_all_resources(device_t dev) { struct nexus_device *ndev = DEVTONX(dev); struct resource_list *rl = &ndev->nx_resources; int retval = 0; if (STAILQ_FIRST(rl)) retval += printf(" at"); retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx"); retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx"); retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd"); return retval; } static int nexus_print_child(device_t bus, device_t child) { int retval = 0; retval += bus_print_child_header(bus, child); retval += nexus_print_all_resources(child); if (device_get_flags(child)) retval += printf(" flags %#x", device_get_flags(child)); retval += printf(" on motherboard\n"); /* XXX "motherboard", ick */ return (retval); } static device_t nexus_add_child(device_t bus, u_int order, const char *name, int unit) { device_t child; struct nexus_device *ndev; ndev = malloc(sizeof(struct nexus_device), M_NEXUSDEV, M_NOWAIT|M_ZERO); if (!ndev) return(0); resource_list_init(&ndev->nx_resources); child = device_add_child_ordered(bus, order, name, unit); /* should we free this in nexus_child_detached? */ device_set_ivars(child, ndev); return(child); } static struct rman * nexus_rman(int type) { switch (type) { case SYS_RES_IRQ: return (&irq_rman); case SYS_RES_DRQ: return (&drq_rman); case SYS_RES_IOPORT: return (&port_rman); case SYS_RES_MEMORY: return (&mem_rman); default: return (NULL); } } /* * Allocate a resource on behalf of child. NB: child is usually going to be a * child of one of our descendants, not a direct child of nexus0. * (Exceptions include npx.) */ static struct resource * nexus_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct nexus_device *ndev = DEVTONX(child); struct resource *rv; struct resource_list_entry *rle; struct rman *rm; int needactivate = flags & RF_ACTIVE; /* * If this is an allocation of the "default" range for a given * RID, and we know what the resources for this device are * (ie. they aren't maintained by a child bus), then work out * the start/end values. 
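 * The values come from the resource list hung off the child's ivars; if
 * the child is not a direct child of this bus or no matching entry
 * exists, the allocation fails.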
*/ if (RMAN_IS_DEFAULT_RANGE(start, end) && (count == 1)) { if (device_get_parent(child) != bus || ndev == NULL) return(NULL); rle = resource_list_find(&ndev->nx_resources, type, *rid); if (rle == NULL) return(NULL); start = rle->start; end = rle->end; count = rle->count; } flags &= ~RF_ACTIVE; rm = nexus_rman(type); if (rm == NULL) return (NULL); rv = rman_reserve_resource(rm, start, end, count, flags, child); if (rv == NULL) return 0; rman_set_rid(rv, *rid); if (needactivate) { if (bus_activate_resource(child, type, *rid, rv)) { rman_release_resource(rv); return 0; } } return rv; } static int nexus_adjust_resource(device_t bus, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { struct rman *rm; rm = nexus_rman(type); if (rm == NULL) return (ENXIO); if (!rman_is_region_manager(r, rm)) return (EINVAL); return (rman_adjust_resource(r, start, end)); } static int nexus_activate_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { struct resource_map map; int error; error = rman_activate_resource(r); if (error != 0) return (error); if (!(rman_get_flags(r) & RF_UNMAPPED) && (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) { error = nexus_map_resource(bus, child, type, r, NULL, &map); if (error) { rman_deactivate_resource(r); return (error); } rman_set_mapping(r,&map); } return (0); } static int nexus_deactivate_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { struct resource_map map; int error; error = rman_deactivate_resource(r); if (error) return (error); if (!(rman_get_flags(r) & RF_UNMAPPED) && (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) { rman_get_mapping(r, &map); nexus_unmap_resource(bus, child, type, r, &map); } return (0); } static int nexus_map_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map_request *argsp, struct resource_map *map) { struct resource_map_request args; rman_res_t end, length, start; /* Resources must be active to be mapped. */ if (!(rman_get_flags(r) & RF_ACTIVE)) return (ENXIO); /* Mappings are only supported on I/O and memory resources. */ switch (type) { case SYS_RES_IOPORT: case SYS_RES_MEMORY: break; default: return (EINVAL); } resource_init_map_request(&args); if (argsp != NULL) bcopy(argsp, &args, imin(argsp->size, args.size)); start = rman_get_start(r) + args.offset; if (args.length == 0) length = rman_get_size(r); else length = args.length; end = start + length - 1; if (start > rman_get_end(r) || start < rman_get_start(r)) return (EINVAL); if (end > rman_get_end(r) || end < start) return (EINVAL); /* * If this is a memory resource, map it into the kernel. */ switch (type) { case SYS_RES_IOPORT: map->r_bushandle = start; map->r_bustag = X86_BUS_SPACE_IO; map->r_size = length; map->r_vaddr = NULL; break; case SYS_RES_MEMORY: map->r_vaddr = pmap_mapdev_attr(start, length, args.memattr); map->r_bustag = X86_BUS_SPACE_MEM; map->r_size = length; /* * The handle is the virtual address. */ map->r_bushandle = (bus_space_handle_t)map->r_vaddr; break; } return (0); } static int nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map *map) { /* * If this is a memory resource, unmap it. 
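 * Memory mappings were established with pmap_mapdev_attr() and are
 * released with pmap_unmapdev(); I/O port "mappings" are just the port
 * number, so they need no teardown.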
*/ switch (type) { case SYS_RES_MEMORY: pmap_unmapdev((vm_offset_t)map->r_vaddr, map->r_size); /* FALLTHROUGH */ case SYS_RES_IOPORT: break; default: return (EINVAL); } return (0); } static int nexus_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { if (rman_get_flags(r) & RF_ACTIVE) { int error = bus_deactivate_resource(child, type, rid, r); if (error) return error; } return (rman_release_resource(r)); } /* * Currently this uses the really grody interface from kern/kern_intr.c * (which really doesn't belong in kern/anything.c). Eventually, all of * the code in kern_intr.c and machdep_intr.c should get moved here, since * this is going to be the official interface. */ static int nexus_setup_intr(device_t bus, device_t child, struct resource *irq, int flags, driver_filter_t filter, void (*ihand)(void *), void *arg, void **cookiep) { - int error; + int error, domain; /* somebody tried to setup an irq that failed to allocate! */ if (irq == NULL) panic("nexus_setup_intr: NULL irq resource!"); *cookiep = NULL; if ((rman_get_flags(irq) & RF_SHAREABLE) == 0) flags |= INTR_EXCL; /* * We depend here on rman_activate_resource() being idempotent. */ error = rman_activate_resource(irq); if (error) return (error); + if (bus_get_domain(child, &domain) != 0) + domain = 0; error = intr_add_handler(device_get_nameunit(child), - rman_get_start(irq), filter, ihand, arg, flags, cookiep); + rman_get_start(irq), filter, ihand, arg, flags, cookiep, domain); return (error); } static int nexus_teardown_intr(device_t dev, device_t child, struct resource *r, void *ih) { return (intr_remove_handler(ih)); } #ifdef SMP static int nexus_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu) { return (intr_bind(rman_get_start(irq), cpu)); } #endif static int nexus_config_intr(device_t dev, int irq, enum intr_trigger trig, enum intr_polarity pol) { return (intr_config_intr(irq, trig, pol)); } static int nexus_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr) { return (intr_describe(rman_get_start(irq), cookie, descr)); } static struct resource_list * nexus_get_reslist(device_t dev, device_t child) { struct nexus_device *ndev = DEVTONX(child); return (&ndev->nx_resources); } static int nexus_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; /* XXX this should return a success/failure indicator */ resource_list_add(rl, type, rid, start, start + count - 1, count); return(0); } static int nexus_get_resource(device_t dev, device_t child, int type, int rid, rman_res_t *startp, rman_res_t *countp) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (!rle) return(ENOENT); if (startp) *startp = rle->start; if (countp) *countp = rle->count; return(0); } static void nexus_delete_resource(device_t dev, device_t child, int type, int rid) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; resource_list_delete(rl, type, rid); } static int nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { switch (op) { #ifdef SMP case INTR_CPUS: if (setsize != sizeof(cpuset_t)) return (EINVAL); *cpuset = intr_cpus; return (0); #endif default: return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); } 
} /* Called from the MSI code to add new IRQs to the IRQ rman. */ void nexus_add_irq(u_long irq) { if (rman_manage_region(&irq_rman, irq, irq) != 0) panic("%s: failed", __func__); } #ifdef DEV_APIC static int nexus_alloc_msix(device_t pcib, device_t dev, int *irq) { return (msix_alloc(dev, irq)); } static int nexus_release_msix(device_t pcib, device_t dev, int irq) { return (msix_release(irq)); } static int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs) { return (msi_alloc(dev, count, maxcount, irqs)); } static int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs) { return (msi_release(irqs, count)); } static int nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data) { return (msi_map(irq, addr, data)); } #endif /* Placeholder for system RAM. */ static void ram_identify(driver_t *driver, device_t parent) { if (resource_disabled("ram", 0)) return; if (BUS_ADD_CHILD(parent, 0, "ram", 0) == NULL) panic("ram_identify"); } static int ram_probe(device_t dev) { device_quiet(dev); device_set_desc(dev, "System RAM"); return (0); } static int ram_attach(device_t dev) { struct bios_smap *smapbase, *smap, *smapend; struct resource *res; vm_paddr_t *p; caddr_t kmdp; uint32_t smapsize; int error, rid; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type(ELF_KERN_STR); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase != NULL) { smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); rid = 0; for (smap = smapbase; smap < smapend; smap++) { if (smap->type != SMAP_TYPE_MEMORY || smap->length == 0) continue; #ifdef __i386__ /* * Resources use long's to track resources, so * we can't include memory regions above 4GB. */ if (smap->base > ~0ul) continue; #endif error = bus_set_resource(dev, SYS_RES_MEMORY, rid, smap->base, smap->length); if (error) panic( "ram_attach: resource %d failed set with %d", rid, error); res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0); if (res == NULL) panic("ram_attach: resource %d failed to attach", rid); rid++; } return (0); } /* * If the system map is not available, fall back to using * dump_avail[]. We use the dump_avail[] array rather than * phys_avail[] for the memory map as phys_avail[] contains * holes for kernel memory, page 0, the message buffer, and * the dcons buffer. We test the end address in the loop * instead of the start since the start address for the first * segment is 0. */ for (rid = 0, p = dump_avail; p[1] != 0; rid++, p += 2) { #ifdef PAE /* * Resources use long's to track resources, so we can't * include memory regions above 4GB. 
*/ if (p[0] > ~0ul) break; #endif error = bus_set_resource(dev, SYS_RES_MEMORY, rid, p[0], p[1] - p[0]); if (error) panic("ram_attach: resource %d failed set with %d", rid, error); res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0); if (res == NULL) panic("ram_attach: resource %d failed to attach", rid); } return (0); } static device_method_t ram_methods[] = { /* Device interface */ DEVMETHOD(device_identify, ram_identify), DEVMETHOD(device_probe, ram_probe), DEVMETHOD(device_attach, ram_attach), { 0, 0 } }; static driver_t ram_driver = { "ram", ram_methods, 1, /* no softc */ }; static devclass_t ram_devclass; DRIVER_MODULE(ram, nexus, ram_driver, ram_devclass, 0, 0); #ifdef DEV_ISA /* * Placeholder which claims PnP 'devices' which describe system * resources. */ static struct isa_pnp_id sysresource_ids[] = { { 0x010cd041 /* PNP0c01 */, "System Memory" }, { 0x020cd041 /* PNP0c02 */, "System Resource" }, { 0 } }; static int sysresource_probe(device_t dev) { int result; if ((result = ISA_PNP_PROBE(device_get_parent(dev), dev, sysresource_ids)) <= 0) { device_quiet(dev); } return(result); } static int sysresource_attach(device_t dev) { return(0); } static device_method_t sysresource_methods[] = { /* Device interface */ DEVMETHOD(device_probe, sysresource_probe), DEVMETHOD(device_attach, sysresource_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), { 0, 0 } }; static driver_t sysresource_driver = { "sysresource", sysresource_methods, 1, /* no softc */ }; static devclass_t sysresource_devclass; DRIVER_MODULE(sysresource, isa, sysresource_driver, sysresource_devclass, 0, 0); ISA_PNP_INFO(sysresource_ids); #endif /* DEV_ISA */ Index: user/jeff/numa/sys/x86/xen/xen_intr.c =================================================================== --- user/jeff/numa/sys/x86/xen/xen_intr.c (revision 329848) +++ user/jeff/numa/sys/x86/xen/xen_intr.c (revision 329849) @@ -1,1668 +1,1668 @@ /****************************************************************************** * xen_intr.c * * Xen event and interrupt services for x86 HVM guests. * * Copyright (c) 2002-2005, K A Fraser * Copyright (c) 2005, Intel Corporation * Copyright (c) 2012, Spectra Logic Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static MALLOC_DEFINE(M_XENINTR, "xen_intr", "Xen Interrupt Services"); /** * Per-cpu event channel processing state. */ struct xen_intr_pcpu_data { /** * The last event channel bitmap section (level one bit) processed. * This is used to ensure we scan all ports before * servicing an already servied port again. */ u_int last_processed_l1i; /** * The last event channel processed within the event channel * bitmap being scanned. */ u_int last_processed_l2i; /** Pointer to this CPU's interrupt statistic counter. */ u_long *evtchn_intrcnt; /** * A bitmap of ports that can be serviced from this CPU. * A set bit means interrupt handling is enabled. */ u_long evtchn_enabled[sizeof(u_long) * 8]; }; /* * Start the scan at port 0 by initializing the last scanned * location as the highest numbered event channel port. */ static DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = { .last_processed_l1i = LONG_BIT - 1, .last_processed_l2i = LONG_BIT - 1 }; DPCPU_DECLARE(struct vcpu_info *, vcpu_info); #define XEN_EEXIST 17 /* Xen "already exists" error */ #define XEN_ALLOCATE_VECTOR 0 /* Allocate a vector for this event channel */ #define XEN_INVALID_EVTCHN 0 /* Invalid event channel */ #define is_valid_evtchn(x) ((x) != XEN_INVALID_EVTCHN) struct xenisrc { struct intsrc xi_intsrc; enum evtchn_type xi_type; int xi_cpu; /* VCPU for delivery. */ int xi_vector; /* Global isrc vector number. */ evtchn_port_t xi_port; int xi_pirq; int xi_virq; void *xi_cookie; u_int xi_close:1; /* close on unbind? */ u_int xi_activehi:1; u_int xi_edgetrigger:1; u_int xi_masked:1; volatile u_int xi_refcount; }; static void xen_intr_suspend(struct pic *); static void xen_intr_resume(struct pic *, bool suspend_cancelled); static void xen_intr_enable_source(struct intsrc *isrc); static void xen_intr_disable_source(struct intsrc *isrc, int eoi); static void xen_intr_eoi_source(struct intsrc *isrc); static void xen_intr_enable_intr(struct intsrc *isrc); static void xen_intr_disable_intr(struct intsrc *isrc); static int xen_intr_vector(struct intsrc *isrc); static int xen_intr_source_pending(struct intsrc *isrc); static int xen_intr_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static int xen_intr_assign_cpu(struct intsrc *isrc, u_int apic_id); static void xen_intr_pirq_enable_source(struct intsrc *isrc); static void xen_intr_pirq_disable_source(struct intsrc *isrc, int eoi); static void xen_intr_pirq_eoi_source(struct intsrc *isrc); static void xen_intr_pirq_enable_intr(struct intsrc *isrc); static void xen_intr_pirq_disable_intr(struct intsrc *isrc); static int xen_intr_pirq_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); /** * PIC interface for all event channel port types except physical IRQs. 
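 * Event channels behave like edge-triggered sources with no EOI cycle:
 * xen_intr_eoi_source() is a no-op and xen_intr_source_pending() always
 * reports no pending events.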
*/ struct pic xen_intr_pic = { .pic_enable_source = xen_intr_enable_source, .pic_disable_source = xen_intr_disable_source, .pic_eoi_source = xen_intr_eoi_source, .pic_enable_intr = xen_intr_enable_intr, .pic_disable_intr = xen_intr_disable_intr, .pic_vector = xen_intr_vector, .pic_source_pending = xen_intr_source_pending, .pic_suspend = xen_intr_suspend, .pic_resume = xen_intr_resume, .pic_config_intr = xen_intr_config_intr, .pic_assign_cpu = xen_intr_assign_cpu }; /** * PIC interface for all event channel representing * physical interrupt sources. */ struct pic xen_intr_pirq_pic = { .pic_enable_source = xen_intr_pirq_enable_source, .pic_disable_source = xen_intr_pirq_disable_source, .pic_eoi_source = xen_intr_pirq_eoi_source, .pic_enable_intr = xen_intr_pirq_enable_intr, .pic_disable_intr = xen_intr_pirq_disable_intr, .pic_vector = xen_intr_vector, .pic_source_pending = xen_intr_source_pending, .pic_config_intr = xen_intr_pirq_config_intr, .pic_assign_cpu = xen_intr_assign_cpu }; static struct mtx xen_intr_isrc_lock; static int xen_intr_auto_vector_count; static struct xenisrc *xen_intr_port_to_isrc[NR_EVENT_CHANNELS]; static u_long *xen_intr_pirq_eoi_map; static boolean_t xen_intr_pirq_eoi_map_enabled; /*------------------------- Private Functions --------------------------------*/ /** * Disable signal delivery for an event channel port on the * specified CPU. * * \param port The event channel port to mask. * * This API is used to manage the port<=>CPU binding of event * channel handlers. * * \note This operation does not preclude reception of an event * for this event channel on another CPU. To mask the * event channel globally, use evtchn_mask(). */ static inline void evtchn_cpu_mask_port(u_int cpu, evtchn_port_t port) { struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); xen_clear_bit(port, pcpu->evtchn_enabled); } /** * Enable signal delivery for an event channel port on the * specified CPU. * * \param port The event channel port to unmask. * * This API is used to manage the port<=>CPU binding of event * channel handlers. * * \note This operation does not guarantee that event delivery * is enabled for this event channel port. The port must * also be globally enabled. See evtchn_unmask(). */ static inline void evtchn_cpu_unmask_port(u_int cpu, evtchn_port_t port) { struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); xen_set_bit(port, pcpu->evtchn_enabled); } /** * Allocate and register a per-cpu Xen upcall interrupt counter. * * \param cpu The cpu for which to register this interrupt count. */ static void xen_intr_intrcnt_add(u_int cpu) { char buf[MAXCOMLEN + 1]; struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); if (pcpu->evtchn_intrcnt != NULL) return; snprintf(buf, sizeof(buf), "cpu%d:xen", cpu); intrcnt_add(buf, &pcpu->evtchn_intrcnt); } /** * Search for an already allocated but currently unused Xen interrupt * source object. * * \param type Restrict the search to interrupt sources of the given * type. * * \return A pointer to a free Xen interrupt source object or NULL. 
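 *
 * \note The caller must hold xen_intr_isrc_lock; the function asserts
 *       this before scanning the allocated vectors.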
*/ static struct xenisrc * xen_intr_find_unused_isrc(enum evtchn_type type) { int isrc_idx; KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn isrc lock not held")); for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx ++) { struct xenisrc *isrc; u_int vector; vector = FIRST_EVTCHN_INT + isrc_idx; isrc = (struct xenisrc *)intr_lookup_source(vector); if (isrc != NULL && isrc->xi_type == EVTCHN_TYPE_UNBOUND) { KASSERT(isrc->xi_intsrc.is_handlers == 0, ("Free evtchn still has handlers")); isrc->xi_type = type; return (isrc); } } return (NULL); } /** * Allocate a Xen interrupt source object. * * \param type The type of interrupt source to create. * * \return A pointer to a newly allocated Xen interrupt source * object or NULL. */ static struct xenisrc * xen_intr_alloc_isrc(enum evtchn_type type, int vector) { static int warned; struct xenisrc *isrc; KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn alloc lock not held")); if (xen_intr_auto_vector_count > NR_EVENT_CHANNELS) { if (!warned) { warned = 1; printf("xen_intr_alloc: Event channels exhausted.\n"); } return (NULL); } if (type != EVTCHN_TYPE_PIRQ) { vector = FIRST_EVTCHN_INT + xen_intr_auto_vector_count; xen_intr_auto_vector_count++; } KASSERT((intr_lookup_source(vector) == NULL), ("Trying to use an already allocated vector")); mtx_unlock(&xen_intr_isrc_lock); isrc = malloc(sizeof(*isrc), M_XENINTR, M_WAITOK | M_ZERO); isrc->xi_intsrc.is_pic = (type == EVTCHN_TYPE_PIRQ) ? &xen_intr_pirq_pic : &xen_intr_pic; isrc->xi_vector = vector; isrc->xi_type = type; intr_register_source(&isrc->xi_intsrc); mtx_lock(&xen_intr_isrc_lock); return (isrc); } /** * Attempt to free an active Xen interrupt source object. * * \param isrc The interrupt source object to release. * * \returns EBUSY if the source is still in use, otherwise 0. */ static int xen_intr_release_isrc(struct xenisrc *isrc) { mtx_lock(&xen_intr_isrc_lock); KASSERT(isrc->xi_intsrc.is_handlers == 0, ("Release called, but xenisrc still in use")); evtchn_mask_port(isrc->xi_port); evtchn_clear_port(isrc->xi_port); /* Rebind port to CPU 0. */ evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port); evtchn_cpu_unmask_port(0, isrc->xi_port); if (isrc->xi_close != 0 && is_valid_evtchn(isrc->xi_port)) { struct evtchn_close close = { .port = isrc->xi_port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); } xen_intr_port_to_isrc[isrc->xi_port] = NULL; isrc->xi_cpu = 0; isrc->xi_type = EVTCHN_TYPE_UNBOUND; isrc->xi_port = 0; isrc->xi_cookie = NULL; mtx_unlock(&xen_intr_isrc_lock); return (0); } /** * Associate an interrupt handler with an already allocated local Xen * event channel port. * * \param isrcp The returned Xen interrupt object associated with * the specified local port. * \param local_port The event channel to bind. * \param type The event channel type of local_port. * \param intr_owner The device making this bind request. * \param filter An interrupt filter handler. Specify NULL * to always dispatch to the ithread handler. * \param handler An interrupt ithread handler. Optional (can * specify NULL) if all necessary event actions * are performed by filter. * \param arg Argument to present to both filter and handler. * \param irqflags Interrupt handler flags. See sys/bus.h. * \param handlep Pointer to an opaque handle used to manage this * registration. * * \returns 0 on success, otherwise an errno. 
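 *
 * \note Under SMP, newly bound EVTCHN_TYPE_PORT channels are spread
 *       across CPUs via intr_next_cpu() instead of all remaining bound
 *       to vCPU#0.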
*/ static int xen_intr_bind_isrc(struct xenisrc **isrcp, evtchn_port_t local_port, enum evtchn_type type, const char *intr_owner, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; int error; *isrcp = NULL; if (port_handlep == NULL) { printf("%s: xen_intr_bind_isrc: Bad event handle\n", intr_owner); return (EINVAL); } mtx_lock(&xen_intr_isrc_lock); isrc = xen_intr_find_unused_isrc(type); if (isrc == NULL) { isrc = xen_intr_alloc_isrc(type, XEN_ALLOCATE_VECTOR); if (isrc == NULL) { mtx_unlock(&xen_intr_isrc_lock); return (ENOSPC); } } isrc->xi_port = local_port; xen_intr_port_to_isrc[local_port] = isrc; refcount_init(&isrc->xi_refcount, 1); mtx_unlock(&xen_intr_isrc_lock); /* Assign the opaque handler (the event channel port) */ *port_handlep = &isrc->xi_vector; #ifdef SMP if (type == EVTCHN_TYPE_PORT) { /* * By default all interrupts are assigned to vCPU#0 * unless specified otherwise, so shuffle them to balance * the interrupt load. */ - xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu()); + xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu(0)); } #endif if (filter == NULL && handler == NULL) { /* * No filter/handler provided, leave the event channel * masked and without a valid handler, the caller is * in charge of setting that up. */ *isrcp = isrc; return (0); } error = xen_intr_add_handler(intr_owner, filter, handler, arg, flags, *port_handlep); if (error != 0) { xen_intr_release_isrc(isrc); return (error); } *isrcp = isrc; return (0); } /** * Lookup a Xen interrupt source object given an interrupt binding handle. * * \param handle A handle initialized by a previous call to * xen_intr_bind_isrc(). * * \returns A pointer to the Xen interrupt source object associated * with the given interrupt handle. NULL if no association * currently exists. */ static struct xenisrc * xen_intr_isrc(xen_intr_handle_t handle) { int vector; if (handle == NULL) return (NULL); vector = *(int *)handle; KASSERT(vector >= FIRST_EVTCHN_INT && vector < (FIRST_EVTCHN_INT + xen_intr_auto_vector_count), ("Xen interrupt vector is out of range")); return ((struct xenisrc *)intr_lookup_source(vector)); } /** * Determine the event channel ports at the given section of the * event port bitmap which have pending events for the given cpu. * * \param pcpu The Xen interrupt pcpu data for the cpu being querried. * \param sh The Xen shared info area. * \param idx The index of the section of the event channel bitmap to * inspect. * * \returns A u_long with bits set for every event channel with pending * events. */ static inline u_long xen_intr_active_ports(struct xen_intr_pcpu_data *pcpu, shared_info_t *sh, u_int idx) { CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(sh->evtchn_pending[0])); CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(pcpu->evtchn_enabled[0])); CTASSERT(sizeof(sh->evtchn_mask) == sizeof(sh->evtchn_pending)); CTASSERT(sizeof(sh->evtchn_mask) == sizeof(pcpu->evtchn_enabled)); return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx] & pcpu->evtchn_enabled[idx]); } /** * Interrupt handler for processing all Xen event channel events. * * \param trap_frame The trap frame context for the current interrupt. 
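 *
 * The handler runs in a critical section and scans the two-level pending
 * bitmap starting just past the last level-1/level-2 positions serviced,
 * so every port is visited before any port is serviced a second time.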
*/ void xen_intr_handle_upcall(struct trapframe *trap_frame) { u_int l1i, l2i, port, cpu; u_long masked_l1, masked_l2; struct xenisrc *isrc; shared_info_t *s; vcpu_info_t *v; struct xen_intr_pcpu_data *pc; u_long l1, l2; /* * Disable preemption in order to always check and fire events * on the right vCPU */ critical_enter(); cpu = PCPU_GET(cpuid); pc = DPCPU_PTR(xen_intr_pcpu); s = HYPERVISOR_shared_info; v = DPCPU_GET(vcpu_info); if (xen_hvm_domain() && !xen_vector_callback_enabled) { KASSERT((cpu == 0), ("Fired PCI event callback on wrong CPU")); } v->evtchn_upcall_pending = 0; #if 0 #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ /* Clear master flag /before/ clearing selector flag. */ wmb(); #endif #endif l1 = atomic_readandclear_long(&v->evtchn_pending_sel); l1i = pc->last_processed_l1i; l2i = pc->last_processed_l2i; (*pc->evtchn_intrcnt)++; while (l1 != 0) { l1i = (l1i + 1) % LONG_BIT; masked_l1 = l1 & ((~0UL) << l1i); if (masked_l1 == 0) { /* * if we masked out all events, wrap around * to the beginning. */ l1i = LONG_BIT - 1; l2i = LONG_BIT - 1; continue; } l1i = ffsl(masked_l1) - 1; do { l2 = xen_intr_active_ports(pc, s, l1i); l2i = (l2i + 1) % LONG_BIT; masked_l2 = l2 & ((~0UL) << l2i); if (masked_l2 == 0) { /* if we masked out all events, move on */ l2i = LONG_BIT - 1; break; } l2i = ffsl(masked_l2) - 1; /* process port */ port = (l1i * LONG_BIT) + l2i; synch_clear_bit(port, &s->evtchn_pending[0]); isrc = xen_intr_port_to_isrc[port]; if (__predict_false(isrc == NULL)) continue; /* Make sure we are firing on the right vCPU */ KASSERT((isrc->xi_cpu == PCPU_GET(cpuid)), ("Received unexpected event on vCPU#%d, event bound to vCPU#%d", PCPU_GET(cpuid), isrc->xi_cpu)); intr_execute_handlers(&isrc->xi_intsrc, trap_frame); /* * If this is the final port processed, * we'll pick up here+1 next time. */ pc->last_processed_l1i = l1i; pc->last_processed_l2i = l2i; } while (l2i != LONG_BIT - 1); l2 = xen_intr_active_ports(pc, s, l1i); if (l2 == 0) { /* * We handled all ports, so we can clear the * selector bit. */ l1 &= ~(1UL << l1i); } } critical_exit(); } static int xen_intr_init(void *dummy __unused) { shared_info_t *s = HYPERVISOR_shared_info; struct xen_intr_pcpu_data *pcpu; struct physdev_pirq_eoi_gmfn eoi_gmfn; int i, rc; if (!xen_domain()) return (0); mtx_init(&xen_intr_isrc_lock, "xen-irq-lock", NULL, MTX_DEF); /* * Register interrupt count manually as we aren't * guaranteed to see a call to xen_intr_assign_cpu() * before our first interrupt. Also set the per-cpu * mask of CPU#0 to enable all, since by default * all event channels are bound to CPU#0. */ CPU_FOREACH(i) { pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); memset(pcpu->evtchn_enabled, i == 0 ? 
~0 : 0, sizeof(pcpu->evtchn_enabled)); xen_intr_intrcnt_add(i); } for (i = 0; i < nitems(s->evtchn_mask); i++) atomic_store_rel_long(&s->evtchn_mask[i], ~0); /* Try to register PIRQ EOI map */ xen_intr_pirq_eoi_map = malloc(PAGE_SIZE, M_XENINTR, M_WAITOK | M_ZERO); eoi_gmfn.gmfn = atop(vtophys(xen_intr_pirq_eoi_map)); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); if (rc != 0 && bootverbose) printf("Xen interrupts: unable to register PIRQ EOI map\n"); else xen_intr_pirq_eoi_map_enabled = true; intr_register_pic(&xen_intr_pic); intr_register_pic(&xen_intr_pirq_pic); if (bootverbose) printf("Xen interrupt system initialized\n"); return (0); } SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL); /*--------------------------- Common PIC Functions ---------------------------*/ /** * Prepare this PIC for system suspension. */ static void xen_intr_suspend(struct pic *unused) { } static void xen_rebind_ipi(struct xenisrc *isrc) { #ifdef SMP int cpu = isrc->xi_cpu; int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; int error; struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id }; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi); if (error != 0) panic("unable to rebind xen IPI: %d", error); isrc->xi_port = bind_ipi.port; isrc->xi_cpu = 0; xen_intr_port_to_isrc[bind_ipi.port] = isrc; error = xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); if (error) panic("unable to bind xen IPI to CPU#%d: %d", cpu, error); evtchn_unmask_port(bind_ipi.port); #else panic("Resume IPI event channel on UP"); #endif } static void xen_rebind_virq(struct xenisrc *isrc) { int cpu = isrc->xi_cpu; int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; int error; struct evtchn_bind_virq bind_virq = { .virq = isrc->xi_virq, .vcpu = vcpu_id }; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq); if (error != 0) panic("unable to rebind xen VIRQ#%d: %d", isrc->xi_virq, error); isrc->xi_port = bind_virq.port; isrc->xi_cpu = 0; xen_intr_port_to_isrc[bind_virq.port] = isrc; #ifdef SMP error = xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); if (error) panic("unable to bind xen VIRQ#%d to CPU#%d: %d", isrc->xi_virq, cpu, error); #endif evtchn_unmask_port(bind_virq.port); } /** * Return this PIC to service after being suspended. */ static void xen_intr_resume(struct pic *unused, bool suspend_cancelled) { shared_info_t *s = HYPERVISOR_shared_info; struct xenisrc *isrc; u_int isrc_idx; int i; if (suspend_cancelled) return; /* Reset the per-CPU masks */ CPU_FOREACH(i) { struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled)); } /* Mask all event channels. */ for (i = 0; i < nitems(s->evtchn_mask); i++) atomic_store_rel_long(&s->evtchn_mask[i], ~0); /* Remove port -> isrc mappings */ memset(xen_intr_port_to_isrc, 0, sizeof(xen_intr_port_to_isrc)); /* Free unused isrcs and rebind VIRQs and IPIs */ for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx++) { u_int vector; vector = FIRST_EVTCHN_INT + isrc_idx; isrc = (struct xenisrc *)intr_lookup_source(vector); if (isrc != NULL) { isrc->xi_port = 0; switch (isrc->xi_type) { case EVTCHN_TYPE_IPI: xen_rebind_ipi(isrc); break; case EVTCHN_TYPE_VIRQ: xen_rebind_virq(isrc); break; default: break; } } } } /** * Disable a Xen interrupt source. * * \param isrc The interrupt source to disable. 
 */
static void
xen_intr_disable_intr(struct intsrc *base_isrc)
{
	struct xenisrc *isrc = (struct xenisrc *)base_isrc;

	evtchn_mask_port(isrc->xi_port);
}

/**
 * Determine the global interrupt vector number for
 * a Xen interrupt source.
 *
 * \param isrc  The interrupt source to query.
 *
 * \return  The vector number corresponding to the given interrupt source.
 */
static int
xen_intr_vector(struct intsrc *base_isrc)
{
	struct xenisrc *isrc = (struct xenisrc *)base_isrc;

	return (isrc->xi_vector);
}

/**
 * Determine whether or not interrupt events are pending on the
 * given interrupt source.
 *
 * \param isrc  The interrupt source to query.
 *
 * \returns  0 if no events are pending, otherwise non-zero.
 */
static int
xen_intr_source_pending(struct intsrc *isrc)
{
	/*
	 * EventChannels are edge triggered and never masked.
	 * There can be no pending events.
	 */
	return (0);
}

/**
 * Perform configuration of an interrupt source.
 *
 * \param isrc  The interrupt source to configure.
 * \param trig  Edge or level.
 * \param pol   Active high or low.
 *
 * \returns  0 on success, otherwise an errno.
 */
static int
xen_intr_config_intr(struct intsrc *isrc, enum intr_trigger trig,
    enum intr_polarity pol)
{
	/* Configuration is only possible via the evtchn apis. */
	return (ENODEV);
}

/**
 * Configure CPU affinity for interrupt source event delivery.
 *
 * \param isrc     The interrupt source to configure.
 * \param apic_id  The apic id of the CPU for handling future events.
 *
 * \returns  0 if successful, otherwise an errno.
 */
static int
xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id)
{
#ifdef SMP
	struct evtchn_bind_vcpu bind_vcpu;
	struct xenisrc *isrc;
	u_int to_cpu, vcpu_id;
	int error, masked;

	if (xen_vector_callback_enabled == 0)
		return (EOPNOTSUPP);

	to_cpu = apic_cpuid(apic_id);
	vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id;
	xen_intr_intrcnt_add(to_cpu);

	mtx_lock(&xen_intr_isrc_lock);
	isrc = (struct xenisrc *)base_isrc;
	if (!is_valid_evtchn(isrc->xi_port)) {
		mtx_unlock(&xen_intr_isrc_lock);
		return (EINVAL);
	}

	/*
	 * Mask the event channel while binding it to prevent interrupt
	 * delivery with an inconsistent state in isrc->xi_cpu.
	 */
	masked = evtchn_test_and_set_mask(isrc->xi_port);
	if ((isrc->xi_type == EVTCHN_TYPE_VIRQ) ||
		(isrc->xi_type == EVTCHN_TYPE_IPI)) {
		/*
		 * Virtual IRQs are associated with a cpu by
		 * the Hypervisor at evtchn_bind_virq time, so
		 * all we need to do is update the per-CPU masks.
		 */
		evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
		isrc->xi_cpu = to_cpu;
		evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
		goto out;
	}

	bind_vcpu.port = isrc->xi_port;
	bind_vcpu.vcpu = vcpu_id;

	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu);
	if (isrc->xi_cpu != to_cpu) {
		if (error == 0) {
			/* Commit to new binding by removing the old one. */
			evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port);
			isrc->xi_cpu = to_cpu;
			evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port);
		}
	}

out:
	if (masked == 0)
		evtchn_unmask_port(isrc->xi_port);
	mtx_unlock(&xen_intr_isrc_lock);
	return (0);
#else
	return (EOPNOTSUPP);
#endif
}

/*------------------- Virtual Interrupt Source PIC Functions -----------------*/
/*
 * Mask a level triggered interrupt source.
 *
 * \param isrc  The interrupt source to mask (if necessary).
 * \param eoi   If non-zero, perform any necessary end-of-interrupt
 *              acknowledgements.
 */
static void
xen_intr_disable_source(struct intsrc *base_isrc, int eoi)
{
	struct xenisrc *isrc;

	isrc = (struct xenisrc *)base_isrc;

	/*
	 * NB: checking if the event channel is already masked is
	 * needed because the event channel user-space device
	 * masks event channels on its filter as part of its
	 * normal operation, and those shouldn't be automatically
	 * unmasked by the generic interrupt code.  The event channel
	 * device will unmask them when needed.
	 */
	isrc->xi_masked = !!evtchn_test_and_set_mask(isrc->xi_port);
}

/*
 * Unmask a level triggered interrupt source.
 *
 * \param isrc  The interrupt source to unmask (if necessary).
 */
static void
xen_intr_enable_source(struct intsrc *base_isrc)
{
	struct xenisrc *isrc;

	isrc = (struct xenisrc *)base_isrc;

	if (isrc->xi_masked == 0)
		evtchn_unmask_port(isrc->xi_port);
}

/*
 * Perform any necessary end-of-interrupt acknowledgements.
 *
 * \param isrc  The interrupt source to EOI.
 */
static void
xen_intr_eoi_source(struct intsrc *base_isrc)
{
}

/*
 * Enable and unmask the interrupt source.
 *
 * \param isrc  The interrupt source to enable.
 */
static void
xen_intr_enable_intr(struct intsrc *base_isrc)
{
	struct xenisrc *isrc = (struct xenisrc *)base_isrc;

	evtchn_unmask_port(isrc->xi_port);
}

/*------------------ Physical Interrupt Source PIC Functions -----------------*/
/*
 * Mask a level triggered interrupt source.
 *
 * \param isrc  The interrupt source to mask (if necessary).
 * \param eoi   If non-zero, perform any necessary end-of-interrupt
 *              acknowledgements.
 */
static void
xen_intr_pirq_disable_source(struct intsrc *base_isrc, int eoi)
{
	struct xenisrc *isrc;

	isrc = (struct xenisrc *)base_isrc;

	if (isrc->xi_edgetrigger == 0)
		evtchn_mask_port(isrc->xi_port);
	if (eoi == PIC_EOI)
		xen_intr_pirq_eoi_source(base_isrc);
}

/*
 * Unmask a level triggered interrupt source.
 *
 * \param isrc  The interrupt source to unmask (if necessary).
 */
static void
xen_intr_pirq_enable_source(struct intsrc *base_isrc)
{
	struct xenisrc *isrc;

	isrc = (struct xenisrc *)base_isrc;

	if (isrc->xi_edgetrigger == 0)
		evtchn_unmask_port(isrc->xi_port);
}

/*
 * Perform any necessary end-of-interrupt acknowledgements.
 *
 * \param isrc  The interrupt source to EOI.
 */
static void
xen_intr_pirq_eoi_source(struct intsrc *base_isrc)
{
	struct xenisrc *isrc;
	int error;

	isrc = (struct xenisrc *)base_isrc;

	if (xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)) {
		struct physdev_eoi eoi = { .irq = isrc->xi_pirq };

		error = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi);
		if (error != 0)
			panic("Unable to EOI PIRQ#%d: %d\n",
			    isrc->xi_pirq, error);
	}
}

/*
 * Enable and unmask the interrupt source.
 *
 * \param isrc  The interrupt source to enable.
 */
static void
xen_intr_pirq_enable_intr(struct intsrc *base_isrc)
{
	struct xenisrc *isrc;
	struct evtchn_bind_pirq bind_pirq;
	struct physdev_irq_status_query irq_status;
	int error;

	isrc = (struct xenisrc *)base_isrc;

	if (!xen_intr_pirq_eoi_map_enabled) {
		irq_status.irq = isrc->xi_pirq;
		error = HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query,
		    &irq_status);
		if (error)
			panic("unable to get status of IRQ#%d",
			    isrc->xi_pirq);

		if (irq_status.flags & XENIRQSTAT_needs_eoi) {
			/*
			 * Since the dynamic PIRQ EOI map is not available
			 * mark the PIRQ as needing EOI unconditionally.
			 */
			xen_set_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map);
		}
	}

	bind_pirq.pirq = isrc->xi_pirq;
	bind_pirq.flags = isrc->xi_edgetrigger ? 0 : BIND_PIRQ__WILL_SHARE;
	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq);
	if (error)
		panic("unable to bind IRQ#%d", isrc->xi_pirq);

	isrc->xi_port = bind_pirq.port;

	mtx_lock(&xen_intr_isrc_lock);
	KASSERT((xen_intr_port_to_isrc[bind_pirq.port] == NULL),
	    ("trying to override an already setup event channel port"));
	xen_intr_port_to_isrc[bind_pirq.port] = isrc;
	mtx_unlock(&xen_intr_isrc_lock);

	evtchn_unmask_port(isrc->xi_port);
}

/*
 * Disable an interrupt source.
 *
 * \param isrc  The interrupt source to disable.
 */
static void
xen_intr_pirq_disable_intr(struct intsrc *base_isrc)
{
	struct xenisrc *isrc;
	struct evtchn_close close;
	int error;

	isrc = (struct xenisrc *)base_isrc;

	evtchn_mask_port(isrc->xi_port);

	close.port = isrc->xi_port;
	error = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
	if (error)
		panic("unable to close event channel %d IRQ#%d",
		    isrc->xi_port, isrc->xi_pirq);

	mtx_lock(&xen_intr_isrc_lock);
	xen_intr_port_to_isrc[isrc->xi_port] = NULL;
	mtx_unlock(&xen_intr_isrc_lock);

	isrc->xi_port = 0;
}

/**
 * Perform configuration of an interrupt source.
 *
 * \param isrc  The interrupt source to configure.
 * \param trig  Edge or level.
 * \param pol   Active high or low.
 *
 * \returns  0 on success, otherwise an errno.
 */
static int
xen_intr_pirq_config_intr(struct intsrc *base_isrc, enum intr_trigger trig,
    enum intr_polarity pol)
{
	struct xenisrc *isrc = (struct xenisrc *)base_isrc;
	struct physdev_setup_gsi setup_gsi;
	int error;

	KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM),
	    ("%s: Conforming trigger or polarity\n", __func__));

	setup_gsi.gsi = isrc->xi_pirq;
	setup_gsi.triggering = trig == INTR_TRIGGER_EDGE ? 0 : 1;
	setup_gsi.polarity = pol == INTR_POLARITY_HIGH ? 0 : 1;

	error = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
	if (error == -XEN_EEXIST) {
		if ((isrc->xi_edgetrigger && (trig != INTR_TRIGGER_EDGE)) ||
		    (isrc->xi_activehi && (pol != INTR_POLARITY_HIGH)))
			panic("unable to reconfigure interrupt IRQ#%d",
			    isrc->xi_pirq);
		error = 0;
	}
	if (error)
		panic("unable to configure IRQ#%d\n", isrc->xi_pirq);

	isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0;
	isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0;

	return (0);
}

/*--------------------------- Public Functions -------------------------------*/
/*------- API comments for these methods can be found in xen/xenintr.h -------*/
int
xen_intr_bind_local_port(device_t dev, evtchn_port_t local_port,
    driver_filter_t filter, driver_intr_t handler, void *arg,
    enum intr_type flags, xen_intr_handle_t *port_handlep)
{
	struct xenisrc *isrc;
	int error;

	error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT,
	    device_get_nameunit(dev), filter, handler, arg, flags,
	    port_handlep);
	if (error != 0)
		return (error);

	/*
	 * The Event Channel API didn't open this port, so it is not
	 * responsible for closing it automatically on unbind.
	 */
	isrc->xi_close = 0;
	return (0);
}

int
xen_intr_alloc_and_bind_local_port(device_t dev, u_int remote_domain,
    driver_filter_t filter, driver_intr_t handler, void *arg,
    enum intr_type flags, xen_intr_handle_t *port_handlep)
{
	struct xenisrc *isrc;
	struct evtchn_alloc_unbound alloc_unbound;
	int error;

	alloc_unbound.dom        = DOMID_SELF;
	alloc_unbound.remote_dom = remote_domain;
	error = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
	    &alloc_unbound);
	if (error != 0) {
		/*
		 * XXX Trap Hypercall error code Linuxisms in
		 * the HYPERCALL layer.
		 */
		return (-error);
	}

	error = xen_intr_bind_isrc(&isrc, alloc_unbound.port, EVTCHN_TYPE_PORT,
	    device_get_nameunit(dev), filter, handler, arg, flags,
	    port_handlep);
	if (error != 0) {
		evtchn_close_t close = { .port = alloc_unbound.port };
		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
			panic("EVTCHNOP_close failed");
		return (error);
	}

	isrc->xi_close = 1;
	return (0);
}

int
xen_intr_bind_remote_port(device_t dev, u_int remote_domain,
    u_int remote_port, driver_filter_t filter, driver_intr_t handler,
    void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep)
{
	struct xenisrc *isrc;
	struct evtchn_bind_interdomain bind_interdomain;
	int error;

	bind_interdomain.remote_dom  = remote_domain;
	bind_interdomain.remote_port = remote_port;
	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
	    &bind_interdomain);
	if (error != 0) {
		/*
		 * XXX Trap Hypercall error code Linuxisms in
		 * the HYPERCALL layer.
		 */
		return (-error);
	}

	error = xen_intr_bind_isrc(&isrc, bind_interdomain.local_port,
	    EVTCHN_TYPE_PORT, device_get_nameunit(dev), filter, handler, arg,
	    flags, port_handlep);
	if (error) {
		evtchn_close_t close = { .port = bind_interdomain.local_port };
		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
			panic("EVTCHNOP_close failed");
		return (error);
	}

	/*
	 * The Event Channel API opened this port, so it is
	 * responsible for closing it automatically on unbind.
	 */
	isrc->xi_close = 1;
	return (0);
}

int
xen_intr_bind_virq(device_t dev, u_int virq, u_int cpu,
    driver_filter_t filter, driver_intr_t handler, void *arg,
    enum intr_type flags, xen_intr_handle_t *port_handlep)
{
	int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
	struct xenisrc *isrc;
	struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id };
	int error;

	/* Ensure the target CPU is ready to handle evtchn interrupts. */
	xen_intr_intrcnt_add(cpu);

	isrc = NULL;
	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq);
	if (error != 0) {
		/*
		 * XXX Trap Hypercall error code Linuxisms in
		 * the HYPERCALL layer.
		 */
		return (-error);
	}

	error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ,
	    device_get_nameunit(dev), filter, handler, arg, flags,
	    port_handlep);

#ifdef SMP
	if (error == 0)
		error = intr_event_bind(isrc->xi_intsrc.is_event, cpu);
#endif

	if (error != 0) {
		evtchn_close_t close = { .port = bind_virq.port };

		xen_intr_unbind(*port_handlep);
		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
			panic("EVTCHNOP_close failed");
		return (error);
	}

#ifdef SMP
	if (isrc->xi_cpu != cpu) {
		/*
		 * Too early in the boot process for the generic interrupt
		 * code to perform the binding.  Update our event channel
		 * masks manually so events can't fire on the wrong cpu
		 * during AP startup.
		 */
		xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]);
	}
#endif

	/*
	 * The Event Channel API opened this port, so it is
	 * responsible for closing it automatically on unbind.
	 */
	isrc->xi_close = 1;
	isrc->xi_virq = virq;

	return (0);
}

int
xen_intr_alloc_and_bind_ipi(u_int cpu, driver_filter_t filter,
    enum intr_type flags, xen_intr_handle_t *port_handlep)
{
#ifdef SMP
	int vcpu_id = pcpu_find(cpu)->pc_vcpu_id;
	struct xenisrc *isrc;
	struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id };
	/* Same size as the one used by intr_handler->ih_name. */
	char name[MAXCOMLEN + 1];
	int error;

	/* Ensure the target CPU is ready to handle evtchn interrupts. */
	xen_intr_intrcnt_add(cpu);

	isrc = NULL;
	error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi);
	if (error != 0) {
		/*
		 * XXX Trap Hypercall error code Linuxisms in
		 * the HYPERCALL layer.
		 */
		return (-error);
	}

	snprintf(name, sizeof(name), "cpu%u", cpu);

	error = xen_intr_bind_isrc(&isrc, bind_ipi.port, EVTCHN_TYPE_IPI,
	    name, filter, NULL, NULL, flags, port_handlep);
	if (error != 0) {
		evtchn_close_t close = { .port = bind_ipi.port };

		xen_intr_unbind(*port_handlep);
		if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close))
			panic("EVTCHNOP_close failed");
		return (error);
	}

	if (isrc->xi_cpu != cpu) {
		/*
		 * Too early in the boot process for the generic interrupt
		 * code to perform the binding.  Update our event channel
		 * masks manually so events can't fire on the wrong cpu
		 * during AP startup.
		 */
		xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]);
	}

	/*
	 * The Event Channel API opened this port, so it is
	 * responsible for closing it automatically on unbind.
	 */
	isrc->xi_close = 1;
	return (0);
#else
	return (EOPNOTSUPP);
#endif
}

int
xen_register_pirq(int vector, enum intr_trigger trig, enum intr_polarity pol)
{
	struct physdev_map_pirq map_pirq;
	struct xenisrc *isrc;
	int error;

	if (vector == 0)
		return (EINVAL);

	if (bootverbose)
		printf("xen: register IRQ#%d\n", vector);

	map_pirq.domid = DOMID_SELF;
	map_pirq.type = MAP_PIRQ_TYPE_GSI;
	map_pirq.index = vector;
	map_pirq.pirq = vector;

	error = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_pirq);
	if (error) {
		printf("xen: unable to map IRQ#%d\n", vector);
		return (error);
	}

	mtx_lock(&xen_intr_isrc_lock);
	isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector);
	mtx_unlock(&xen_intr_isrc_lock);
	KASSERT((isrc != NULL), ("xen: unable to allocate isrc for interrupt"));
	isrc->xi_pirq = vector;
	isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0;
	isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0;

	return (0);
}

int
xen_register_msi(device_t dev, int vector, int count)
{
	struct physdev_map_pirq msi_irq;
	struct xenisrc *isrc;
	int ret;

	memset(&msi_irq, 0, sizeof(msi_irq));
	msi_irq.domid = DOMID_SELF;
	msi_irq.type = count == 1 ?
	    MAP_PIRQ_TYPE_MSI_SEG : MAP_PIRQ_TYPE_MULTI_MSI;
	msi_irq.index = -1;
	msi_irq.pirq = -1;
	msi_irq.bus = pci_get_bus(dev) | (pci_get_domain(dev) << 16);
	msi_irq.devfn = (pci_get_slot(dev) << 3) | pci_get_function(dev);
	msi_irq.entry_nr = count;

	ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &msi_irq);
	if (ret != 0)
		return (ret);
	if (count != msi_irq.entry_nr) {
		panic("unable to setup all requested MSI vectors "
		    "(expected %d got %d)", count, msi_irq.entry_nr);
	}

	mtx_lock(&xen_intr_isrc_lock);
	for (int i = 0; i < count; i++) {
		isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector + i);
		KASSERT(isrc != NULL,
		    ("xen: unable to allocate isrc for interrupt"));
		isrc->xi_pirq = msi_irq.pirq + i;
		/* MSI interrupts are always edge triggered */
		isrc->xi_edgetrigger = 1;
	}
	mtx_unlock(&xen_intr_isrc_lock);

	return (0);
}

int
xen_release_msi(int vector)
{
	struct physdev_unmap_pirq unmap;
	struct xenisrc *isrc;
	int ret;

	isrc = (struct xenisrc *)intr_lookup_source(vector);
	if (isrc == NULL)
		return (ENXIO);

	unmap.pirq = isrc->xi_pirq;
	ret = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap);
	if (ret != 0)
		return (ret);

	xen_intr_release_isrc(isrc);

	return (0);
}

int
xen_intr_describe(xen_intr_handle_t port_handle, const char *fmt, ...)
{
	char descr[MAXCOMLEN + 1];
	struct xenisrc *isrc;
	va_list ap;

	isrc = xen_intr_isrc(port_handle);
	if (isrc == NULL)
		return (EINVAL);

	va_start(ap, fmt);
	vsnprintf(descr, sizeof(descr), fmt, ap);
	va_end(ap);
	return (intr_describe(isrc->xi_vector, isrc->xi_cookie, descr));
}

void
xen_intr_unbind(xen_intr_handle_t *port_handlep)
{
	struct xenisrc *isrc;

	KASSERT(port_handlep != NULL,
	    ("NULL xen_intr_handle_t passed to xen_intr_unbind"));

	isrc = xen_intr_isrc(*port_handlep);
	*port_handlep = NULL;
	if (isrc == NULL)
		return;

	mtx_lock(&xen_intr_isrc_lock);
	if (refcount_release(&isrc->xi_refcount) == 0) {
		mtx_unlock(&xen_intr_isrc_lock);
		return;
	}
	mtx_unlock(&xen_intr_isrc_lock);

	if (isrc->xi_cookie != NULL)
		intr_remove_handler(isrc->xi_cookie);
	xen_intr_release_isrc(isrc);
}

void
xen_intr_signal(xen_intr_handle_t handle)
{
	struct xenisrc *isrc;

	isrc = xen_intr_isrc(handle);
	if (isrc != NULL) {
		KASSERT(isrc->xi_type == EVTCHN_TYPE_PORT ||
			isrc->xi_type == EVTCHN_TYPE_IPI,
			("evtchn_signal on something other than a local port"));
		struct evtchn_send send = { .port = isrc->xi_port };
		(void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
	}
}

evtchn_port_t
xen_intr_port(xen_intr_handle_t handle)
{
	struct xenisrc *isrc;

	isrc = xen_intr_isrc(handle);
	if (isrc == NULL)
		return (0);

	return (isrc->xi_port);
}

int
xen_intr_add_handler(const char *name, driver_filter_t filter,
    driver_intr_t handler, void *arg, enum intr_type flags,
    xen_intr_handle_t handle)
{
	struct xenisrc *isrc;
	int error;

	isrc = xen_intr_isrc(handle);
	if (isrc == NULL || isrc->xi_cookie != NULL)
		return (EINVAL);

	error = intr_add_handler(name, isrc->xi_vector,filter, handler, arg,
-	    flags|INTR_EXCL, &isrc->xi_cookie);
+	    flags|INTR_EXCL, &isrc->xi_cookie, 0);
	if (error != 0) {
		printf(
		    "%s: xen_intr_add_handler: intr_add_handler failed: %d\n",
		    name, error);
	}

	return (error);
}

int
xen_intr_get_evtchn_from_port(evtchn_port_t port, xen_intr_handle_t *handlep)
{

	if (!is_valid_evtchn(port) || port >= NR_EVENT_CHANNELS)
		return (EINVAL);

	if (handlep == NULL) {
		return (EINVAL);
	}

	mtx_lock(&xen_intr_isrc_lock);
	if (xen_intr_port_to_isrc[port] == NULL) {
		mtx_unlock(&xen_intr_isrc_lock);
		return (EINVAL);
	}
	refcount_acquire(&xen_intr_port_to_isrc[port]->xi_refcount);
	mtx_unlock(&xen_intr_isrc_lock);

	/* Assign the opaque handler (the event channel port) */
	*handlep = &xen_intr_port_to_isrc[port]->xi_vector;

	return (0);
}

#ifdef DDB
static const char *
xen_intr_print_type(enum evtchn_type type)
{
	static const char *evtchn_type_to_string[EVTCHN_TYPE_COUNT] = {
		[EVTCHN_TYPE_UNBOUND]	= "UNBOUND",
		[EVTCHN_TYPE_PIRQ]	= "PIRQ",
		[EVTCHN_TYPE_VIRQ]	= "VIRQ",
		[EVTCHN_TYPE_IPI]	= "IPI",
		[EVTCHN_TYPE_PORT]	= "PORT",
	};

	if (type >= EVTCHN_TYPE_COUNT)
		return ("UNKNOWN");

	return (evtchn_type_to_string[type]);
}

static void
xen_intr_dump_port(struct xenisrc *isrc)
{
	struct xen_intr_pcpu_data *pcpu;
	shared_info_t *s = HYPERVISOR_shared_info;
	int i;

	db_printf("Port %d Type: %s\n",
	    isrc->xi_port, xen_intr_print_type(isrc->xi_type));
	if (isrc->xi_type == EVTCHN_TYPE_PIRQ) {
		db_printf("\tPirq: %d ActiveHi: %d EdgeTrigger: %d "
		    "NeedsEOI: %d\n",
		    isrc->xi_pirq, isrc->xi_activehi, isrc->xi_edgetrigger,
		    !!xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map));
	}
	if (isrc->xi_type == EVTCHN_TYPE_VIRQ)
		db_printf("\tVirq: %d\n", isrc->xi_virq);

	db_printf("\tMasked: %d Pending: %d\n",
	    !!xen_test_bit(isrc->xi_port, &s->evtchn_mask[0]),
	    !!xen_test_bit(isrc->xi_port, &s->evtchn_pending[0]));

	db_printf("\tPer-CPU Masks: ");
	CPU_FOREACH(i) {
		pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu);
		db_printf("cpu#%d: %d ", i,
		    !!xen_test_bit(isrc->xi_port, pcpu->evtchn_enabled));
	}
	db_printf("\n");
}

DB_SHOW_COMMAND(xen_evtchn, db_show_xen_evtchn)
{
	int i;

	if (!xen_domain()) {
		db_printf("Only available on Xen guests\n");
		return;
	}

	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
		struct xenisrc *isrc;

		isrc = xen_intr_port_to_isrc[i];
		if (isrc == NULL)
			continue;

		xen_intr_dump_port(isrc);
	}
}
#endif /* DDB */
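
For context, the sketch below shows how a consumer might drive the public API above: allocate an unbound port for a backend domain, attach a filter at bind time, name the handler, signal the peer, and unbind on detach (which also closes the port, since xi_close is set for API-opened ports). The device, function names, domain id and interrupt type used here (example_attach, backend_domid, INTR_TYPE_NET) are hypothetical, and error handling is trimmed to the essentials.

/* Illustrative sketch only; not part of the file above. */
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>

#include <xen/xen-os.h>
#include <xen/xen_intr.h>

static xen_intr_handle_t example_handle;

static int
example_filter(void *arg)
{
	/* Event acknowledged; a real driver would queue work here. */
	return (FILTER_HANDLED);
}

static int
example_attach(device_t dev, u_int backend_domid)
{
	int error;

	/* Allocate an unbound port the backend domain may later bind to. */
	error = xen_intr_alloc_and_bind_local_port(dev, backend_domid,
	    example_filter, NULL, NULL, INTR_TYPE_NET, &example_handle);
	if (error != 0)
		return (error);

	xen_intr_describe(example_handle, "example");

	/* Notify the remote end that the channel is live. */
	xen_intr_signal(example_handle);
	return (0);
}

static void
example_detach(void)
{
	/* Unbind; the port is closed too because the API opened it. */
	xen_intr_unbind(&example_handle);
}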