Index: head/sys/amd64/include/intr_machdep.h =================================================================== --- head/sys/amd64/include/intr_machdep.h (revision 331697) +++ head/sys/amd64/include/intr_machdep.h (revision 331698) @@ -1,198 +1,199 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_INTR_MACHDEP_H__ #define __MACHINE_INTR_MACHDEP_H__ #ifdef _KERNEL /* * The maximum number of I/O interrupts we allow. This number is rather * arbitrary as it is just the maximum IRQ resource value. The interrupt * source for a given IRQ maps that I/O interrupt to device interrupt * source whether it be a pin on an interrupt controller or an MSI interrupt. * The 16 ISA IRQs are assigned fixed IDT vectors, but all other device * interrupts allocate IDT vectors on demand. Currently we have 191 IDT * vectors available for device interrupts. On many systems with I/O APICs, * a lot of the IRQs are not used, so this number can be much larger than * 191 and still be safe since only interrupt sources in actual use will * allocate IDT vectors. * * The first 255 IRQs (0 - 254) are reserved for ISA IRQs and PCI intline IRQs. * IRQ values from 256 to 767 are used by MSI. When running under the Xen * Hypervisor, IRQ values from 768 to 4863 are available for binding to * event channel events. We leave 255 unused to avoid confusion since 255 is * used in PCI to indicate an invalid IRQ. */ #define NUM_MSI_INTS 512 #define FIRST_MSI_INT 256 #ifdef XENHVM #include #include #define NUM_EVTCHN_INTS NR_EVENT_CHANNELS #define FIRST_EVTCHN_INT \ (FIRST_MSI_INT + NUM_MSI_INTS) #define LAST_EVTCHN_INT \ (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1) #else #define NUM_EVTCHN_INTS 0 #endif #define NUM_IO_INTS (FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS) /* * Default base address for MSI messages on x86 platforms. */ #define MSI_INTEL_ADDR_BASE 0xfee00000 /* * - 1 ??? dummy counter. * - 2 counters for each I/O interrupt. * - 1 counter for each CPU for lapic timer. * - 8 counters for each CPU for IPI counters for SMP. */ #ifdef SMP #define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + (1 + 8) * MAXCPU) #else #define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + 1) #endif #ifndef LOCORE typedef void inthand_t(void); #define IDTVEC(name) __CONCAT(X,name) struct intsrc; /* * Methods that a PIC provides to mask/unmask a given interrupt source, * "turn on" the interrupt on the CPU side by setting up an IDT entry, and * return the vector associated with this source. */ struct pic { void (*pic_enable_source)(struct intsrc *); void (*pic_disable_source)(struct intsrc *, int); void (*pic_eoi_source)(struct intsrc *); void (*pic_enable_intr)(struct intsrc *); void (*pic_disable_intr)(struct intsrc *); int (*pic_vector)(struct intsrc *); int (*pic_source_pending)(struct intsrc *); void (*pic_suspend)(struct pic *); void (*pic_resume)(struct pic *, bool suspend_cancelled); int (*pic_config_intr)(struct intsrc *, enum intr_trigger, enum intr_polarity); int (*pic_assign_cpu)(struct intsrc *, u_int apic_id); void (*pic_reprogram_pin)(struct intsrc *); TAILQ_ENTRY(pic) pics; }; /* Flags for pic_disable_source() */ enum { PIC_EOI, PIC_NO_EOI, }; /* * An interrupt source. The upper-layer code uses the PIC methods to * control a given source. The lower-layer PIC drivers can store additional * private data in a given interrupt source such as an interrupt pin number * or an I/O APIC pointer. */ struct intsrc { struct pic *is_pic; struct intr_event *is_event; u_long *is_count; u_long *is_straycount; u_int is_index; u_int is_handlers; + u_int is_domain; u_int is_cpu; }; struct trapframe; /* * The following data structure holds per-cpu data, and is placed just * above the top of the space used for the NMI and MC# stacks. */ struct nmi_pcpu { register_t np_pcpu; register_t __padding; /* pad to 16 bytes */ }; #ifdef SMP extern cpuset_t intr_cpus; #endif extern struct mtx icu_lock; extern int elcr_found; #ifdef SMP extern int msix_disable_migration; #endif #ifndef DEV_ATPIC void atpic_reset(void); #endif /* XXX: The elcr_* prototypes probably belong somewhere else. */ int elcr_probe(void); enum intr_trigger elcr_read_trigger(u_int irq); void elcr_resume(void); void elcr_write_trigger(u_int irq, enum intr_trigger trigger); #ifdef SMP void intr_add_cpu(u_int cpu); #endif int intr_add_handler(const char *name, int vector, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, - void **cookiep); + void **cookiep, int domain); #ifdef SMP int intr_bind(u_int vector, u_char cpu); #endif int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol); int intr_describe(u_int vector, void *ih, const char *descr); void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame); -u_int intr_next_cpu(void); +u_int intr_next_cpu(int domain); struct intsrc *intr_lookup_source(int vector); int intr_register_pic(struct pic *pic); int intr_register_source(struct intsrc *isrc); int intr_remove_handler(void *cookie); void intr_resume(bool suspend_cancelled); void intr_suspend(void); void intr_reprogram(void); void intrcnt_add(const char *name, u_long **countp); void nexus_add_irq(u_long irq); int msi_alloc(device_t dev, int count, int maxcount, int *irqs); void msi_init(void); int msi_map(int irq, uint64_t *addr, uint32_t *data); int msi_release(int *irqs, int count); int msix_alloc(device_t dev, int *irq); int msix_release(int irq); #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* !__MACHINE_INTR_MACHDEP_H__ */ Index: head/sys/i386/include/intr_machdep.h =================================================================== --- head/sys/i386/include/intr_machdep.h (revision 331697) +++ head/sys/i386/include/intr_machdep.h (revision 331698) @@ -1,188 +1,190 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __MACHINE_INTR_MACHDEP_H__ #define __MACHINE_INTR_MACHDEP_H__ #ifdef _KERNEL /* * The maximum number of I/O interrupts we allow. This number is rather * arbitrary as it is just the maximum IRQ resource value. The interrupt * source for a given IRQ maps that I/O interrupt to device interrupt * source whether it be a pin on an interrupt controller or an MSI interrupt. * The 16 ISA IRQs are assigned fixed IDT vectors, but all other device * interrupts allocate IDT vectors on demand. Currently we have 191 IDT * vectors available for device interrupts. On many systems with I/O APICs, * a lot of the IRQs are not used, so this number can be much larger than * 191 and still be safe since only interrupt sources in actual use will * allocate IDT vectors. * * The first 255 IRQs (0 - 254) are reserved for ISA IRQs and PCI intline IRQs. * IRQ values from 256 to 767 are used by MSI. When running under the Xen * Hypervisor, IRQ values from 768 to 4863 are available for binding to * event channel events. We leave 255 unused to avoid confusion since 255 is * used in PCI to indicate an invalid IRQ. */ #define NUM_MSI_INTS 512 #define FIRST_MSI_INT 256 #ifdef XENHVM #include #include #define NUM_EVTCHN_INTS NR_EVENT_CHANNELS #define FIRST_EVTCHN_INT \ (FIRST_MSI_INT + NUM_MSI_INTS) #define LAST_EVTCHN_INT \ (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1) #else /* !XENHVM */ #define NUM_EVTCHN_INTS 0 #endif #define NUM_IO_INTS (FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS) /* * Default base address for MSI messages on x86 platforms. */ #define MSI_INTEL_ADDR_BASE 0xfee00000 /* * - 1 ??? dummy counter. * - 2 counters for each I/O interrupt. * - 1 counter for each CPU for lapic timer. * - 9 counters for each CPU for IPI counters for SMP. */ #ifdef SMP #define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + (1 + 9) * MAXCPU) #else #define INTRCNT_COUNT (1 + NUM_IO_INTS * 2 + 1) #endif #ifndef LOCORE typedef void inthand_t(void); #define IDTVEC(name) __CONCAT(X,name) struct intsrc; /* * Methods that a PIC provides to mask/unmask a given interrupt source, * "turn on" the interrupt on the CPU side by setting up an IDT entry, and * return the vector associated with this source. */ struct pic { void (*pic_enable_source)(struct intsrc *); void (*pic_disable_source)(struct intsrc *, int); void (*pic_eoi_source)(struct intsrc *); void (*pic_enable_intr)(struct intsrc *); void (*pic_disable_intr)(struct intsrc *); int (*pic_vector)(struct intsrc *); int (*pic_source_pending)(struct intsrc *); void (*pic_suspend)(struct pic *); void (*pic_resume)(struct pic *, bool suspend_cancelled); int (*pic_config_intr)(struct intsrc *, enum intr_trigger, enum intr_polarity); int (*pic_assign_cpu)(struct intsrc *, u_int apic_id); void (*pic_reprogram_pin)(struct intsrc *); TAILQ_ENTRY(pic) pics; }; /* Flags for pic_disable_source() */ enum { PIC_EOI, PIC_NO_EOI, }; /* * An interrupt source. The upper-layer code uses the PIC methods to * control a given source. The lower-layer PIC drivers can store additional * private data in a given interrupt source such as an interrupt pin number * or an I/O APIC pointer. */ struct intsrc { struct pic *is_pic; struct intr_event *is_event; u_long *is_count; u_long *is_straycount; u_int is_index; u_int is_handlers; + u_int is_domain; u_int is_cpu; }; struct trapframe; #ifdef SMP extern cpuset_t intr_cpus; #endif extern struct mtx icu_lock; extern int elcr_found; #ifdef SMP extern int msix_disable_migration; #endif #ifndef DEV_ATPIC void atpic_reset(void); #endif /* XXX: The elcr_* prototypes probably belong somewhere else. */ int elcr_probe(void); enum intr_trigger elcr_read_trigger(u_int irq); void elcr_resume(void); void elcr_write_trigger(u_int irq, enum intr_trigger trigger); #ifdef SMP void intr_add_cpu(u_int cpu); #endif int intr_add_handler(const char *name, int vector, driver_filter_t filter, - driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep); + driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep, + int domain); #ifdef SMP int intr_bind(u_int vector, u_char cpu); #endif int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol); int intr_describe(u_int vector, void *ih, const char *descr); void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame); -u_int intr_next_cpu(void); +u_int intr_next_cpu(int domain); struct intsrc *intr_lookup_source(int vector); int intr_register_pic(struct pic *pic); int intr_register_source(struct intsrc *isrc); int intr_remove_handler(void *cookie); void intr_resume(bool suspend_cancelled); void intr_suspend(void); void intr_reprogram(void); void intrcnt_add(const char *name, u_long **countp); void nexus_add_irq(u_long irq); int msi_alloc(device_t dev, int count, int maxcount, int *irqs); void msi_init(void); int msi_map(int irq, uint64_t *addr, uint32_t *data); int msi_release(int* irqs, int count); int msix_alloc(device_t dev, int *irq); int msix_release(int irq); #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* !__MACHINE_INTR_MACHDEP_H__ */ Index: head/sys/kern/kern_cpuset.c =================================================================== --- head/sys/kern/kern_cpuset.c (revision 331697) +++ head/sys/kern/kern_cpuset.c (revision 331698) @@ -1,2192 +1,2175 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008, Jeffrey Roberson * All rights reserved. * * Copyright (c) 2008 Nokia Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif /* DDB */ /* * cpusets provide a mechanism for creating and manipulating sets of * processors for the purpose of constraining the scheduling of threads to * specific processors. * * Each process belongs to an identified set, by default this is set 1. Each * thread may further restrict the cpus it may run on to a subset of this * named set. This creates an anonymous set which other threads and processes * may not join by number. * * The named set is referred to herein as the 'base' set to avoid ambiguity. * This set is usually a child of a 'root' set while the anonymous set may * simply be referred to as a mask. In the syscall api these are referred to * as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here. * * Threads inherit their set from their creator whether it be anonymous or * not. This means that anonymous sets are immutable because they may be * shared. To modify an anonymous set a new set is created with the desired * mask and the same parent as the existing anonymous set. This gives the * illusion of each thread having a private mask. * * Via the syscall apis a user may ask to retrieve or modify the root, base, * or mask that is discovered via a pid, tid, or setid. Modifying a set * modifies all numbered and anonymous child sets to comply with the new mask. * Modifying a pid or tid's mask applies only to that tid but must still * exist within the assigned parent set. * * A thread may not be assigned to a group separate from other threads in * the process. This is to remove ambiguity when the setid is queried with * a pid argument. There is no other technical limitation. * * This somewhat complex arrangement is intended to make it easy for * applications to query available processors and bind their threads to * specific processors while also allowing administrators to dynamically * reprovision by changing sets which apply to groups of processes. * * A simple application should not concern itself with sets at all and * rather apply masks to its own threads via CPU_WHICH_TID and a -1 id * meaning 'curthread'. It may query available cpus for that tid with a * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...). */ static uma_zone_t cpuset_zone; static uma_zone_t domainset_zone; static struct mtx cpuset_lock; static struct setlist cpuset_ids; static struct domainlist cpuset_domains; static struct unrhdr *cpuset_unr; static struct cpuset *cpuset_zero, *cpuset_default; /* Return the size of cpuset_t at the kernel level */ SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD, SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)"); cpuset_t *cpuset_root; cpuset_t cpuset_domain[MAXMEMDOM]; static int domainset_valid(const struct domainset *, const struct domainset *); /* * Find the first non-anonymous set starting from 'set'. */ static struct cpuset * cpuset_getbase(struct cpuset *set) { if (set->cs_id == CPUSET_INVALID) set = set->cs_parent; return (set); } /* * Walks up the tree from 'set' to find the root. */ static struct cpuset * cpuset_getroot(struct cpuset *set) { while ((set->cs_flags & CPU_SET_ROOT) == 0 && set->cs_parent != NULL) set = set->cs_parent; return (set); } /* * Acquire a reference to a cpuset, all pointers must be tracked with refs. */ struct cpuset * cpuset_ref(struct cpuset *set) { refcount_acquire(&set->cs_ref); return (set); } /* * Walks up the tree from 'set' to find the root. Returns the root * referenced. */ static struct cpuset * cpuset_refroot(struct cpuset *set) { return (cpuset_ref(cpuset_getroot(set))); } /* * Find the first non-anonymous set starting from 'set'. Returns this set * referenced. May return the passed in set with an extra ref if it is * not anonymous. */ static struct cpuset * cpuset_refbase(struct cpuset *set) { return (cpuset_ref(cpuset_getbase(set))); } /* * Release a reference in a context where it is safe to allocate. */ void cpuset_rel(struct cpuset *set) { cpusetid_t id; if (refcount_release(&set->cs_ref) == 0) return; mtx_lock_spin(&cpuset_lock); LIST_REMOVE(set, cs_siblings); id = set->cs_id; if (id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); mtx_unlock_spin(&cpuset_lock); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); if (id != CPUSET_INVALID) free_unr(cpuset_unr, id); } /* * Deferred release must be used when in a context that is not safe to * allocate/free. This places any unreferenced sets on the list 'head'. */ static void cpuset_rel_defer(struct setlist *head, struct cpuset *set) { if (refcount_release(&set->cs_ref) == 0) return; mtx_lock_spin(&cpuset_lock); LIST_REMOVE(set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_REMOVE(set, cs_link); LIST_INSERT_HEAD(head, set, cs_link); mtx_unlock_spin(&cpuset_lock); } /* * Complete a deferred release. Removes the set from the list provided to * cpuset_rel_defer. */ static void cpuset_rel_complete(struct cpuset *set) { LIST_REMOVE(set, cs_link); cpuset_rel(set->cs_parent); uma_zfree(cpuset_zone, set); } /* * Find a set based on an id. Returns it with a ref. */ static struct cpuset * cpuset_lookup(cpusetid_t setid, struct thread *td) { struct cpuset *set; if (setid == CPUSET_INVALID) return (NULL); mtx_lock_spin(&cpuset_lock); LIST_FOREACH(set, &cpuset_ids, cs_link) if (set->cs_id == setid) break; if (set) cpuset_ref(set); mtx_unlock_spin(&cpuset_lock); KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__)); if (set != NULL && jailed(td->td_ucred)) { struct cpuset *jset, *tset; jset = td->td_ucred->cr_prison->pr_cpuset; for (tset = set; tset != NULL; tset = tset->cs_parent) if (tset == jset) break; if (tset == NULL) { cpuset_rel(set); set = NULL; } } return (set); } /* * Create a set in the space provided in 'set' with the provided parameters. * The set is returned with a single ref. May return EDEADLK if the set * will have no valid cpu based on restrictions from the parent. */ static int _cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask, struct domainset *domain, cpusetid_t id) { if (domain == NULL) domain = parent->cs_domain; if (mask == NULL) mask = &parent->cs_mask; if (!CPU_OVERLAP(&parent->cs_mask, mask)) return (EDEADLK); /* The domain must be prepared ahead of time. */ if (!domainset_valid(parent->cs_domain, domain)) return (EDEADLK); CPU_COPY(mask, &set->cs_mask); LIST_INIT(&set->cs_children); refcount_init(&set->cs_ref, 1); set->cs_flags = 0; mtx_lock_spin(&cpuset_lock); set->cs_domain = domain; CPU_AND(&set->cs_mask, &parent->cs_mask); set->cs_id = id; set->cs_parent = cpuset_ref(parent); LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings); if (set->cs_id != CPUSET_INVALID) LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); mtx_unlock_spin(&cpuset_lock); return (0); } /* * Create a new non-anonymous set with the requested parent and mask. May * return failures if the mask is invalid or a new number can not be * allocated. */ static int cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask) { struct cpuset *set; cpusetid_t id; int error; id = alloc_unr(cpuset_unr); if (id == -1) return (ENFILE); *setp = set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = _cpuset_create(set, parent, mask, NULL, id); if (error == 0) return (0); free_unr(cpuset_unr, id); uma_zfree(cpuset_zone, set); return (error); } static void cpuset_freelist_add(struct setlist *list, int count) { struct cpuset *set; int i; for (i = 0; i < count; i++) { set = uma_zalloc(cpuset_zone, M_ZERO | M_WAITOK); LIST_INSERT_HEAD(list, set, cs_link); } } static void cpuset_freelist_init(struct setlist *list, int count) { LIST_INIT(list); cpuset_freelist_add(list, count); } static void cpuset_freelist_free(struct setlist *list) { struct cpuset *set; while ((set = LIST_FIRST(list)) != NULL) { LIST_REMOVE(set, cs_link); uma_zfree(cpuset_zone, set); } } static void domainset_freelist_add(struct domainlist *list, int count) { struct domainset *set; int i; for (i = 0; i < count; i++) { set = uma_zalloc(domainset_zone, M_ZERO | M_WAITOK); LIST_INSERT_HEAD(list, set, ds_link); } } static void domainset_freelist_init(struct domainlist *list, int count) { LIST_INIT(list); domainset_freelist_add(list, count); } static void domainset_freelist_free(struct domainlist *list) { struct domainset *set; while ((set = LIST_FIRST(list)) != NULL) { LIST_REMOVE(set, ds_link); uma_zfree(domainset_zone, set); } } /* Copy a domainset preserving mask and policy. */ static void domainset_copy(const struct domainset *from, struct domainset *to) { DOMAINSET_COPY(&from->ds_mask, &to->ds_mask); to->ds_policy = from->ds_policy; to->ds_prefer = from->ds_prefer; } /* Return 1 if mask and policy are equal, otherwise 0. */ static int domainset_equal(const struct domainset *one, const struct domainset *two) { return (DOMAINSET_CMP(&one->ds_mask, &two->ds_mask) == 0 && one->ds_policy == two->ds_policy && one->ds_prefer == two->ds_prefer); } /* Return 1 if child is a valid subset of parent. */ static int domainset_valid(const struct domainset *parent, const struct domainset *child) { if (child->ds_policy != DOMAINSET_POLICY_PREFER) return (DOMAINSET_SUBSET(&parent->ds_mask, &child->ds_mask)); return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); } static int domainset_restrict(const struct domainset *parent, const struct domainset *child) { if (child->ds_policy != DOMAINSET_POLICY_PREFER) return (DOMAINSET_OVERLAP(&parent->ds_mask, &child->ds_mask)); return (DOMAINSET_ISSET(child->ds_prefer, &parent->ds_mask)); } /* * Lookup or create a domainset. The key is provided in ds_mask and * ds_policy. If the domainset does not yet exist the storage in * 'domain' is used to insert. Otherwise this storage is freed to the * domainset_zone and the existing domainset is returned. */ static struct domainset * _domainset_create(struct domainset *domain, struct domainlist *freelist) { struct domainset *ndomain; mtx_lock_spin(&cpuset_lock); LIST_FOREACH(ndomain, &cpuset_domains, ds_link) if (domainset_equal(ndomain, domain)) break; /* * If the domain does not yet exist we insert it and initialize * various iteration helpers which are not part of the key. */ if (ndomain == NULL) { LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link); domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask); domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1; } mtx_unlock_spin(&cpuset_lock); if (ndomain == NULL) return (domain); if (freelist != NULL) LIST_INSERT_HEAD(freelist, domain, ds_link); else uma_zfree(domainset_zone, domain); return (ndomain); } /* * Create or lookup a domainset based on the key held in 'domain'. */ static struct domainset * domainset_create(const struct domainset *domain) { struct domainset *ndomain; ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO); domainset_copy(domain, ndomain); return _domainset_create(ndomain, NULL); } /* * Update thread domainset pointers. */ static void domainset_notify(void) { struct thread *td; struct proc *p; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); td->td_domain.dr_policy = td->td_cpuset->cs_domain; thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); kernel_object->domain.dr_policy = cpuset_default->cs_domain; } /* * Create a new set that is a subset of a parent. */ static struct domainset * domainset_shadow(const struct domainset *pdomain, const struct domainset *domain, struct domainlist *freelist) { struct domainset *ndomain; ndomain = LIST_FIRST(freelist); LIST_REMOVE(ndomain, ds_link); /* * Initialize the key from the request. */ domainset_copy(domain, ndomain); /* * Restrict the key by the parent. */ DOMAINSET_AND(&ndomain->ds_mask, &pdomain->ds_mask); return _domainset_create(ndomain, freelist); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. */ static int cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask) { struct cpuset *nset; cpuset_t newmask; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); if (check_mask) { if (!CPU_OVERLAP(&set->cs_mask, mask)) return (EDEADLK); CPU_COPY(&set->cs_mask, &newmask); CPU_AND(&newmask, mask); } else CPU_COPY(mask, &newmask); error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update(struct cpuset *set, cpuset_t *mask) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); CPU_AND(&set->cs_mask, mask); LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update(nset, &set->cs_mask); return; } /* * Modify the set 'set' to use a copy of the mask provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify(struct cpuset *set, cpuset_t *mask) { struct cpuset *root; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets. */ if (jailed(curthread->td_ucred) && set->cs_flags & CPU_SET_ROOT) return (EPERM); /* * Verify that we have access to this set of * cpus. */ root = cpuset_getroot(set); mtx_lock_spin(&cpuset_lock); if (root && !CPU_SUBSET(&root->cs_mask, mask)) { error = EINVAL; goto out; } error = cpuset_testupdate(set, mask, 0); if (error) goto out; CPU_COPY(mask, &set->cs_mask); cpuset_update(set, mask); out: mtx_unlock_spin(&cpuset_lock); return (error); } /* * Recursively check for errors that would occur from applying mask to * the tree of sets starting at 'set'. Checks for sets that would become * empty as well as RDONLY flags. */ static int cpuset_testupdate_domain(struct cpuset *set, struct domainset *dset, struct domainset *orig, int *count, int check_mask) { struct cpuset *nset; struct domainset *domain; struct domainset newset; int error; mtx_assert(&cpuset_lock, MA_OWNED); if (set->cs_flags & CPU_SET_RDONLY) return (EPERM); domain = set->cs_domain; domainset_copy(domain, &newset); if (!domainset_equal(domain, orig)) { if (!domainset_restrict(domain, dset)) return (EDEADLK); DOMAINSET_AND(&newset.ds_mask, &dset->ds_mask); /* Count the number of domains that are changing. */ (*count)++; } error = 0; LIST_FOREACH(nset, &set->cs_children, cs_siblings) if ((error = cpuset_testupdate_domain(nset, &newset, domain, count, 1)) != 0) break; return (error); } /* * Applies the mask 'mask' without checking for empty sets or permissions. */ static void cpuset_update_domain(struct cpuset *set, struct domainset *domain, struct domainset *orig, struct domainlist *domains) { struct cpuset *nset; mtx_assert(&cpuset_lock, MA_OWNED); /* * If this domainset has changed from the parent we must calculate * a new set. Otherwise it simply inherits from the parent. When * we inherit from the parent we get a new mask and policy. If the * set is modified from the parent we keep the policy and only * update the mask. */ if (set->cs_domain != orig) { orig = set->cs_domain; set->cs_domain = domainset_shadow(domain, orig, domains); } else set->cs_domain = domain; LIST_FOREACH(nset, &set->cs_children, cs_siblings) cpuset_update_domain(nset, set->cs_domain, orig, domains); return; } /* * Modify the set 'set' to use a copy the domainset provided. Apply this new * mask to restrict all children in the tree. Checks for validity before * applying the changes. */ static int cpuset_modify_domain(struct cpuset *set, struct domainset *domain) { struct domainlist domains; struct domainset temp; struct domainset *dset; struct cpuset *root; int ndomains, needed; int error; error = priv_check(curthread, PRIV_SCHED_CPUSET); if (error) return (error); /* * In case we are called from within the jail * we do not allow modifying the dedicated root * cpuset of the jail but may still allow to * change child sets. */ if (jailed(curthread->td_ucred) && set->cs_flags & CPU_SET_ROOT) return (EPERM); domainset_freelist_init(&domains, 0); domain = domainset_create(domain); ndomains = needed = 0; do { if (ndomains < needed) { domainset_freelist_add(&domains, needed - ndomains); ndomains = needed; } root = cpuset_getroot(set); mtx_lock_spin(&cpuset_lock); dset = root->cs_domain; /* * Verify that we have access to this set of domains. */ if (root && !domainset_valid(dset, domain)) { error = EINVAL; goto out; } /* * If applying prefer we keep the current set as the fallback. */ if (domain->ds_policy == DOMAINSET_POLICY_PREFER) DOMAINSET_COPY(&set->cs_domain->ds_mask, &domain->ds_mask); /* * Determine whether we can apply this set of domains and * how many new domain structures it will require. */ domainset_copy(domain, &temp); needed = 0; error = cpuset_testupdate_domain(set, &temp, set->cs_domain, &needed, 0); if (error) goto out; } while (ndomains < needed); dset = set->cs_domain; cpuset_update_domain(set, domain, dset, &domains); out: mtx_unlock_spin(&cpuset_lock); domainset_freelist_free(&domains); if (error == 0) domainset_notify(); return (error); } /* * Resolve the 'which' parameter of several cpuset apis. * * For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also * checks for permission via p_cansched(). * * For WHICH_SET returns a valid set with a new reference. * * -1 may be supplied for any argument to mean the current proc/thread or * the base set of the current thread. May fail with ESRCH/EPERM. */ int cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, struct cpuset **setp) { struct cpuset *set; struct thread *td; struct proc *p; int error; *pp = p = NULL; *tdp = td = NULL; *setp = set = NULL; switch (which) { case CPU_WHICH_PID: if (id == -1) { PROC_LOCK(curproc); p = curproc; break; } if ((p = pfind(id)) == NULL) return (ESRCH); break; case CPU_WHICH_TID: if (id == -1) { PROC_LOCK(curproc); p = curproc; td = curthread; break; } td = tdfind(id, -1); if (td == NULL) return (ESRCH); p = td->td_proc; break; case CPU_WHICH_CPUSET: if (id == -1) { thread_lock(curthread); set = cpuset_refbase(curthread->td_cpuset); thread_unlock(curthread); } else set = cpuset_lookup(id, curthread); if (set) { *setp = set; return (0); } return (ESRCH); case CPU_WHICH_JAIL: { /* Find `set' for prison with given id. */ struct prison *pr; sx_slock(&allprison_lock); pr = prison_find_child(curthread->td_ucred->cr_prison, id); sx_sunlock(&allprison_lock); if (pr == NULL) return (ESRCH); cpuset_ref(pr->pr_cpuset); *setp = pr->pr_cpuset; mtx_unlock(&pr->pr_mtx); return (0); } case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (0); default: return (EINVAL); } error = p_cansched(curthread, p); if (error) { PROC_UNLOCK(p); return (error); } if (td == NULL) td = FIRST_THREAD_IN_PROC(p); *pp = p; *tdp = td; return (0); } static int cpuset_testshadow(struct cpuset *set, const cpuset_t *mask, const struct domainset *domain) { struct cpuset *parent; struct domainset *dset; parent = cpuset_getbase(set); /* * If we are restricting a cpu mask it must be a subset of the * parent or invalid CPUs have been specified. */ if (mask != NULL && !CPU_SUBSET(&parent->cs_mask, mask)) return (EINVAL); /* * If we are restricting a domain mask it must be a subset of the * parent or invalid domains have been specified. */ dset = parent->cs_domain; if (domain != NULL && !domainset_valid(dset, domain)) return (EINVAL); return (0); } /* * Create an anonymous set with the provided mask in the space provided by * 'nset'. If the passed in set is anonymous we use its parent otherwise * the new set is a child of 'set'. */ static int cpuset_shadow(struct cpuset *set, struct cpuset **nsetp, const cpuset_t *mask, const struct domainset *domain, struct setlist *cpusets, struct domainlist *domains) { struct cpuset *parent; struct cpuset *nset; struct domainset *dset; struct domainset *d; int error; error = cpuset_testshadow(set, mask, domain); if (error) return (error); parent = cpuset_getbase(set); dset = parent->cs_domain; if (mask == NULL) mask = &set->cs_mask; if (domain != NULL) d = domainset_shadow(dset, domain, domains); else d = set->cs_domain; nset = LIST_FIRST(cpusets); error = _cpuset_create(nset, parent, mask, d, CPUSET_INVALID); if (error == 0) { LIST_REMOVE(nset, cs_link); *nsetp = nset; } return (error); } static struct cpuset * cpuset_update_thread(struct thread *td, struct cpuset *nset) { struct cpuset *tdset; tdset = td->td_cpuset; td->td_cpuset = nset; td->td_domain.dr_policy = nset->cs_domain; sched_affinity(td); return (tdset); } static int cpuset_setproc_test_maskthread(struct cpuset *tdset, cpuset_t *mask, struct domainset *domain) { struct cpuset *parent; parent = cpuset_getbase(tdset); if (mask == NULL) mask = &tdset->cs_mask; if (domain == NULL) domain = tdset->cs_domain; return cpuset_testshadow(parent, mask, domain); } static int cpuset_setproc_maskthread(struct cpuset *tdset, cpuset_t *mask, struct domainset *domain, struct cpuset **nsetp, struct setlist *freelist, struct domainlist *domainlist) { struct cpuset *parent; parent = cpuset_getbase(tdset); if (mask == NULL) mask = &tdset->cs_mask; if (domain == NULL) domain = tdset->cs_domain; return cpuset_shadow(parent, nsetp, mask, domain, freelist, domainlist); } static int cpuset_setproc_setthread_mask(struct cpuset *tdset, struct cpuset *set, cpuset_t *mask, struct domainset *domain) { struct cpuset *parent; parent = cpuset_getbase(tdset); /* * If the thread restricted its mask then apply that same * restriction to the new set, otherwise take it wholesale. */ if (CPU_CMP(&tdset->cs_mask, &parent->cs_mask) != 0) { CPU_COPY(&tdset->cs_mask, mask); CPU_AND(mask, &set->cs_mask); } else CPU_COPY(&set->cs_mask, mask); /* * If the thread restricted the domain then we apply the * restriction to the new set but retain the policy. */ if (tdset->cs_domain != parent->cs_domain) { domainset_copy(tdset->cs_domain, domain); DOMAINSET_AND(&domain->ds_mask, &set->cs_domain->ds_mask); } else domainset_copy(set->cs_domain, domain); if (CPU_EMPTY(mask) || DOMAINSET_EMPTY(&domain->ds_mask)) return (EDEADLK); return (0); } static int cpuset_setproc_test_setthread(struct cpuset *tdset, struct cpuset *set) { struct domainset domain; cpuset_t mask; if (tdset->cs_id != CPUSET_INVALID) return (0); return cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); } static int cpuset_setproc_setthread(struct cpuset *tdset, struct cpuset *set, struct cpuset **nsetp, struct setlist *freelist, struct domainlist *domainlist) { struct domainset domain; cpuset_t mask; int error; /* * If we're replacing on a thread that has not constrained the * original set we can simply accept the new set. */ if (tdset->cs_id != CPUSET_INVALID) { *nsetp = cpuset_ref(set); return (0); } error = cpuset_setproc_setthread_mask(tdset, set, &mask, &domain); if (error) return (error); return cpuset_shadow(tdset, nsetp, &mask, &domain, freelist, domainlist); } /* * Handle three cases for updating an entire process. * * 1) Set is non-null. This reparents all anonymous sets to the provided * set and replaces all non-anonymous td_cpusets with the provided set. * 2) Mask is non-null. This replaces or creates anonymous sets for every * thread with the existing base as a parent. * 3) domain is non-null. This creates anonymous sets for every thread * and replaces the domain set. * * This is overly complicated because we can't allocate while holding a * spinlock and spinlocks must be held while changing and examining thread * state. */ static int cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask, struct domainset *domain) { struct setlist freelist; struct setlist droplist; struct domainlist domainlist; struct cpuset *nset; struct thread *td; struct proc *p; int threads; int nfree; int error; /* * The algorithm requires two passes due to locking considerations. * * 1) Lookup the process and acquire the locks in the required order. * 2) If enough cpusets have not been allocated release the locks and * allocate them. Loop. */ cpuset_freelist_init(&freelist, 1); domainset_freelist_init(&domainlist, 1); nfree = 1; LIST_INIT(&droplist); nfree = 0; for (;;) { error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset); if (error) goto out; if (nfree >= p->p_numthreads) break; threads = p->p_numthreads; PROC_UNLOCK(p); if (nfree < threads) { cpuset_freelist_add(&freelist, threads - nfree); domainset_freelist_add(&domainlist, threads - nfree); nfree = threads; } } PROC_LOCK_ASSERT(p, MA_OWNED); /* * Now that the appropriate locks are held and we have enough cpusets, * make sure the operation will succeed before applying changes. The * proc lock prevents td_cpuset from changing between calls. */ error = 0; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (set != NULL) error = cpuset_setproc_test_setthread(td->td_cpuset, set); else error = cpuset_setproc_test_maskthread(td->td_cpuset, mask, domain); thread_unlock(td); if (error) goto unlock_out; } /* * Replace each thread's cpuset while using deferred release. We * must do this because the thread lock must be held while operating * on the thread and this limits the type of operations allowed. */ FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (set != NULL) error = cpuset_setproc_setthread(td->td_cpuset, set, &nset, &freelist, &domainlist); else error = cpuset_setproc_maskthread(td->td_cpuset, mask, domain, &nset, &freelist, &domainlist); if (error) { thread_unlock(td); break; } cpuset_rel_defer(&droplist, cpuset_update_thread(td, nset)); thread_unlock(td); } unlock_out: PROC_UNLOCK(p); out: while ((nset = LIST_FIRST(&droplist)) != NULL) cpuset_rel_complete(nset); cpuset_freelist_free(&freelist); domainset_freelist_free(&domainlist); return (error); } /* * Return a string representing a valid layout for a cpuset_t object. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ char * cpusetobj_strprint(char *buf, const cpuset_t *set) { char *tbuf; size_t i, bytesp, bufsiz; tbuf = buf; bytesp = 0; bufsiz = CPUSETBUFSIZ; for (i = 0; i < (_NCPUWORDS - 1); i++) { bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]); bufsiz -= bytesp; tbuf += bytesp; } snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]); return (buf); } /* * Build a valid cpuset_t object from a string representation. * It expects an incoming buffer at least sized as CPUSETBUFSIZ. */ int cpusetobj_strscan(cpuset_t *set, const char *buf) { u_int nwords; int i, ret; if (strlen(buf) > CPUSETBUFSIZ - 1) return (-1); /* Allow to pass a shorter version of the mask when necessary. */ nwords = 1; for (i = 0; buf[i] != '\0'; i++) if (buf[i] == ',') nwords++; if (nwords > _NCPUWORDS) return (-1); CPU_ZERO(set); for (i = 0; i < (nwords - 1); i++) { ret = sscanf(buf, "%lx,", &set->__bits[i]); if (ret == 0 || ret == -1) return (-1); buf = strstr(buf, ","); if (buf == NULL) return (-1); buf++; } ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]); if (ret == 0 || ret == -1) return (-1); return (0); } /* * Apply an anonymous mask or a domain to a single thread. */ static int _cpuset_setthread(lwpid_t id, cpuset_t *mask, struct domainset *domain) { struct setlist cpusets; struct domainlist domainlist; struct cpuset *nset; struct cpuset *set; struct thread *td; struct proc *p; int error; cpuset_freelist_init(&cpusets, 1); domainset_freelist_init(&domainlist, domain != NULL); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set); if (error) goto out; set = NULL; thread_lock(td); error = cpuset_shadow(td->td_cpuset, &nset, mask, domain, &cpusets, &domainlist); if (error == 0) set = cpuset_update_thread(td, nset); thread_unlock(td); PROC_UNLOCK(p); if (set) cpuset_rel(set); out: cpuset_freelist_free(&cpusets); domainset_freelist_free(&domainlist); return (error); } /* * Apply an anonymous mask to a single thread. */ int cpuset_setthread(lwpid_t id, cpuset_t *mask) { return _cpuset_setthread(id, mask, NULL); } /* * Apply new cpumask to the ithread. */ int cpuset_setithread(lwpid_t id, int cpu) { struct setlist cpusets; struct cpuset *nset, *rset; struct cpuset *parent, *old_set; struct thread *td; struct proc *p; cpusetid_t cs_id; cpuset_t mask; int error; cpuset_freelist_init(&cpusets, 1); rset = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); cs_id = CPUSET_INVALID; CPU_ZERO(&mask); if (cpu == NOCPU) CPU_COPY(cpuset_root, &mask); else CPU_SET(cpu, &mask); error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set); if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID)) goto out; /* cpuset_which() returns with PROC_LOCK held. */ old_set = td->td_cpuset; if (cpu == NOCPU) { nset = LIST_FIRST(&cpusets); LIST_REMOVE(nset, cs_link); /* * roll back to default set. We're not using cpuset_shadow() * here because we can fail CPU_SUBSET() check. This can happen * if default set does not contain all CPUs. */ error = _cpuset_create(nset, cpuset_default, &mask, NULL, CPUSET_INVALID); goto applyset; } if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID && old_set->cs_parent->cs_id == 1)) { /* * Current set is either default (1) or * shadowed version of default set. * * Allocate new root set to be able to shadow it * with any mask. */ error = _cpuset_create(rset, cpuset_zero, &cpuset_zero->cs_mask, NULL, cs_id); if (error != 0) { PROC_UNLOCK(p); goto out; } rset->cs_flags |= CPU_SET_ROOT; parent = rset; rset = NULL; cs_id = CPUSET_INVALID; } else { /* Assume existing set was already allocated by previous call */ parent = old_set; old_set = NULL; } error = cpuset_shadow(parent, &nset, &mask, NULL, &cpusets, NULL); applyset: if (error == 0) { thread_lock(td); old_set = cpuset_update_thread(td, nset); thread_unlock(td); } else old_set = NULL; PROC_UNLOCK(p); if (old_set != NULL) cpuset_rel(old_set); out: cpuset_freelist_free(&cpusets); if (rset != NULL) uma_zfree(cpuset_zone, rset); if (cs_id != CPUSET_INVALID) free_unr(cpuset_unr, cs_id); return (error); } static struct domainset domainset0; void domainset_zero(void) { struct domainset *dset; int i; mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE); dset = &domainset0; DOMAINSET_ZERO(&dset->ds_mask); for (i = 0; i < vm_ndomains; i++) DOMAINSET_SET(i, &dset->ds_mask); dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN; dset->ds_prefer = -1; curthread->td_domain.dr_policy = _domainset_create(dset, NULL); kernel_object->domain.dr_policy = curthread->td_domain.dr_policy; } /* * Creates system-wide cpusets and the cpuset for thread0 including two * sets: * * 0 - The root set which should represent all valid processors in the * system. It is initially created with a mask of all processors * because we don't know what processors are valid until cpuset_init() * runs. This set is immutable. * 1 - The default set which all processes are a member of until changed. * This allows an administrator to move all threads off of given cpus to * dedicate them to high priority tasks or save power etc. */ struct cpuset * cpuset_thread0(void) { struct cpuset *set; int error; + int i; cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); domainset_zone = uma_zcreate("domainset", sizeof(struct domainset), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Create the root system set for the whole machine. Doesn't use * cpuset_create() due to NULL parent. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); - CPU_FILL(&set->cs_mask); + CPU_COPY(&all_cpus, &set->cs_mask); LIST_INIT(&set->cs_children); LIST_INSERT_HEAD(&cpuset_ids, set, cs_link); set->cs_ref = 1; - set->cs_flags = CPU_SET_ROOT; + set->cs_flags = CPU_SET_ROOT | CPU_SET_RDONLY; set->cs_domain = &domainset0; cpuset_zero = set; cpuset_root = &set->cs_mask; /* * Now derive a default, modifiable set from that to give out. */ set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO); error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1); KASSERT(error == 0, ("Error creating default set: %d\n", error)); cpuset_default = set; /* * Initialize the unit allocator. 0 and 1 are allocated above. */ cpuset_unr = new_unrhdr(2, INT_MAX, NULL); + /* + * If MD code has not initialized per-domain cpusets, place all + * CPUs in domain 0. + */ + for (i = 0; i < MAXMEMDOM; i++) + if (!CPU_EMPTY(&cpuset_domain[i])) + goto domains_set; + CPU_COPY(&all_cpus, &cpuset_domain[0]); +domains_set: + return (set); } /* * Create a cpuset, which would be cpuset_create() but * mark the new 'set' as root. * * We are not going to reparent the td to it. Use cpuset_setproc_update_set() * for that. * * In case of no error, returns the set in *setp locked with a reference. */ int cpuset_create_root(struct prison *pr, struct cpuset **setp) { struct cpuset *set; int error; KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__)); KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__)); error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask); if (error) return (error); KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data", __func__, __LINE__)); /* Mark the set as root. */ set = *setp; set->cs_flags |= CPU_SET_ROOT; return (0); } int cpuset_setproc_update_set(struct proc *p, struct cpuset *set) { int error; KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__)); KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__)); cpuset_ref(set); error = cpuset_setproc(p->p_pid, set, NULL, NULL); if (error) return (error); cpuset_rel(set); return (0); } - -/* - * This is called once the final set of system cpus is known. Modifies - * the root set and all children and mark the root read-only. - */ -static void -cpuset_init(void *arg) -{ - cpuset_t mask; - int i; - - mask = all_cpus; - if (cpuset_modify(cpuset_zero, &mask)) - panic("Can't set initial cpuset mask.\n"); - cpuset_zero->cs_flags |= CPU_SET_RDONLY; - - /* - * If MD code has not initialized per-domain cpusets, place all - * CPUs in domain 0. - */ - for (i = 0; i < MAXMEMDOM; i++) - if (!CPU_EMPTY(&cpuset_domain[i])) - goto domains_set; - CPU_COPY(&all_cpus, &cpuset_domain[0]); -domains_set: - return; -} -SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL); #ifndef _SYS_SYSPROTO_H_ struct cpuset_args { cpusetid_t *setid; }; #endif int sys_cpuset(struct thread *td, struct cpuset_args *uap) { struct cpuset *root; struct cpuset *set; int error; thread_lock(td); root = cpuset_refroot(td->td_cpuset); thread_unlock(td); error = cpuset_create(&set, root, &root->cs_mask); cpuset_rel(root); if (error) return (error); error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id)); if (error == 0) error = cpuset_setproc(-1, set, NULL, NULL); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setid_args { cpuwhich_t which; id_t id; cpusetid_t setid; }; #endif int sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap) { return (kern_cpuset_setid(td, uap->which, uap->id, uap->setid)); } int kern_cpuset_setid(struct thread *td, cpuwhich_t which, id_t id, cpusetid_t setid) { struct cpuset *set; int error; /* * Presently we only support per-process sets. */ if (which != CPU_WHICH_PID) return (EINVAL); set = cpuset_lookup(setid, td); if (set == NULL) return (ESRCH); error = cpuset_setproc(id, set, NULL, NULL); cpuset_rel(set); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getid_args { cpulevel_t level; cpuwhich_t which; id_t id; cpusetid_t *setid; }; #endif int sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap) { return (kern_cpuset_getid(td, uap->level, uap->which, uap->id, uap->setid)); } int kern_cpuset_getid(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, cpusetid_t *setid) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpusetid_t tmpid; int error; if (level == CPU_LEVEL_WHICH && which != CPU_WHICH_CPUSET) return (EINVAL); error = cpuset_which(which, id, &p, &ttd, &set); if (error) return (error); switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_refbase(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_DOMAIN: return (EINVAL); } switch (level) { case CPU_LEVEL_ROOT: nset = cpuset_refroot(set); cpuset_rel(set); set = nset; break; case CPU_LEVEL_CPUSET: break; case CPU_LEVEL_WHICH: break; } tmpid = set->cs_id; cpuset_rel(set); if (error == 0) error = copyout(&tmpid, setid, sizeof(tmpid)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; cpuset_t *mask; }; #endif int sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap) { return (kern_cpuset_getaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask)); } int kern_cpuset_getaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, cpuset_t *maskp) { struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct proc *p; cpuset_t *mask; int error; size_t size; if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY) return (ERANGE); /* In Capability mode, you can only get your own CPU set. */ if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1) return (ECAPMODE); } size = cpusetsize; mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO); error = cpuset_which(which, id, &p, &ttd, &set); if (error) goto out; switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); CPU_COPY(&nset->cs_mask, mask); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); CPU_COPY(&ttd->td_cpuset->cs_mask, mask); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); CPU_OR(mask, &ttd->td_cpuset->cs_mask); thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: CPU_COPY(&set->cs_mask, mask); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_getaffinity(id, which, mask); break; case CPU_WHICH_DOMAIN: if (id < 0 || id >= MAXMEMDOM) error = ESRCH; else CPU_COPY(&cpuset_domain[id], mask); break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); if (error == 0) error = copyout(mask, maskp, size); out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setaffinity_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t cpusetsize; const cpuset_t *mask; }; #endif int sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap) { return (kern_cpuset_setaffinity(td, uap->level, uap->which, uap->id, uap->cpusetsize, uap->mask)); } int kern_cpuset_setaffinity(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t cpusetsize, const cpuset_t *maskp) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; cpuset_t *mask; int error; if (cpusetsize < sizeof(cpuset_t) || cpusetsize > CPU_MAXSIZE / NBBY) return (ERANGE); /* In Capability mode, you can only set your own CPU set. */ if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1) return (ECAPMODE); } mask = malloc(cpusetsize, M_TEMP, M_WAITOK | M_ZERO); error = copyin(maskp, mask, cpusetsize); if (error) goto out; /* * Verify that no high bits are set. */ if (cpusetsize > sizeof(cpuset_t)) { char *end; char *cp; end = cp = (char *)&mask->__bits; end += cpusetsize; cp += sizeof(cpuset_t); while (cp != end) if (*cp++ != 0) { error = EINVAL; goto out; } } switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify(nset, mask); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: error = cpuset_setthread(id, mask); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, mask, NULL); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify(set, mask); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: error = intr_setaffinity(id, which, mask); break; default: error = EINVAL; break; } break; default: error = EINVAL; break; } out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_getdomain_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t domainsetsize; domainset_t *mask; int *policy; }; #endif int sys_cpuset_getdomain(struct thread *td, struct cpuset_getdomain_args *uap) { return (kern_cpuset_getdomain(td, uap->level, uap->which, uap->id, uap->domainsetsize, uap->mask, uap->policy)); } int kern_cpuset_getdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *maskp, int *policyp) { struct domainset outset; struct thread *ttd; struct cpuset *nset; struct cpuset *set; struct domainset *dset; struct proc *p; domainset_t *mask; int error; if (domainsetsize < sizeof(domainset_t) || domainsetsize > DOMAINSET_MAXSIZE / NBBY) return (ERANGE); /* In Capability mode, you can only get your own domain set. */ if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1) return (ECAPMODE); } mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); bzero(&outset, sizeof(outset)); error = cpuset_which(which, id, &p, &ttd, &set); if (error) goto out; switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); domainset_copy(nset->cs_domain, &outset); cpuset_rel(nset); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: thread_lock(ttd); domainset_copy(ttd->td_cpuset->cs_domain, &outset); thread_unlock(ttd); break; case CPU_WHICH_PID: FOREACH_THREAD_IN_PROC(p, ttd) { thread_lock(ttd); dset = ttd->td_cpuset->cs_domain; /* Show all domains in the proc. */ DOMAINSET_OR(&outset.ds_mask, &dset->ds_mask); /* Last policy wins. */ outset.ds_policy = dset->ds_policy; outset.ds_prefer = dset->ds_prefer; thread_unlock(ttd); } break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: domainset_copy(set->cs_domain, &outset); break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; break; } break; default: error = EINVAL; break; } if (set) cpuset_rel(set); if (p) PROC_UNLOCK(p); /* * Translate prefer into a set containing only the preferred domain, * not the entire fallback set. */ if (outset.ds_policy == DOMAINSET_POLICY_PREFER) { DOMAINSET_ZERO(&outset.ds_mask); DOMAINSET_SET(outset.ds_prefer, &outset.ds_mask); } DOMAINSET_COPY(&outset.ds_mask, mask); if (error == 0) error = copyout(mask, maskp, domainsetsize); if (error == 0) if (suword32(policyp, outset.ds_policy) != 0) error = EFAULT; out: free(mask, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct cpuset_setdomain_args { cpulevel_t level; cpuwhich_t which; id_t id; size_t domainsetsize; domainset_t *mask; int policy; }; #endif int sys_cpuset_setdomain(struct thread *td, struct cpuset_setdomain_args *uap) { return (kern_cpuset_setdomain(td, uap->level, uap->which, uap->id, uap->domainsetsize, uap->mask, uap->policy)); } int kern_cpuset_setdomain(struct thread *td, cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, const domainset_t *maskp, int policy) { struct cpuset *nset; struct cpuset *set; struct thread *ttd; struct proc *p; struct domainset domain; domainset_t *mask; int error; if (domainsetsize < sizeof(domainset_t) || domainsetsize > DOMAINSET_MAXSIZE / NBBY) return (ERANGE); /* In Capability mode, you can only set your own CPU set. */ if (IN_CAPABILITY_MODE(td)) { if (level != CPU_LEVEL_WHICH) return (ECAPMODE); if (which != CPU_WHICH_TID && which != CPU_WHICH_PID) return (ECAPMODE); if (id != -1) return (ECAPMODE); } memset(&domain, 0, sizeof(domain)); mask = malloc(domainsetsize, M_TEMP, M_WAITOK | M_ZERO); error = copyin(maskp, mask, domainsetsize); if (error) goto out; /* * Verify that no high bits are set. */ if (domainsetsize > sizeof(domainset_t)) { char *end; char *cp; end = cp = (char *)&mask->__bits; end += domainsetsize; cp += sizeof(domainset_t); while (cp != end) if (*cp++ != 0) { error = EINVAL; goto out; } } DOMAINSET_COPY(mask, &domain.ds_mask); domain.ds_policy = policy; if (policy <= DOMAINSET_POLICY_INVALID || policy > DOMAINSET_POLICY_MAX) return (EINVAL); /* Translate preferred policy into a mask and fallback. */ if (policy == DOMAINSET_POLICY_PREFER) { /* Only support a single preferred domain. */ if (DOMAINSET_COUNT(&domain.ds_mask) != 1) return (EINVAL); domain.ds_prefer = DOMAINSET_FFS(&domain.ds_mask) - 1; /* This will be constrained by domainset_shadow(). */ DOMAINSET_FILL(&domain.ds_mask); } switch (level) { case CPU_LEVEL_ROOT: case CPU_LEVEL_CPUSET: error = cpuset_which(which, id, &p, &ttd, &set); if (error) break; switch (which) { case CPU_WHICH_TID: case CPU_WHICH_PID: thread_lock(ttd); set = cpuset_ref(ttd->td_cpuset); thread_unlock(ttd); PROC_UNLOCK(p); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: case CPU_WHICH_DOMAIN: error = EINVAL; goto out; } if (level == CPU_LEVEL_ROOT) nset = cpuset_refroot(set); else nset = cpuset_refbase(set); error = cpuset_modify_domain(nset, &domain); cpuset_rel(nset); cpuset_rel(set); break; case CPU_LEVEL_WHICH: switch (which) { case CPU_WHICH_TID: error = _cpuset_setthread(id, NULL, &domain); break; case CPU_WHICH_PID: error = cpuset_setproc(id, NULL, NULL, &domain); break; case CPU_WHICH_CPUSET: case CPU_WHICH_JAIL: error = cpuset_which(which, id, &p, &ttd, &set); if (error == 0) { error = cpuset_modify_domain(set, &domain); cpuset_rel(set); } break; case CPU_WHICH_IRQ: case CPU_WHICH_INTRHANDLER: case CPU_WHICH_ITHREAD: default: error = EINVAL; break; } break; default: error = EINVAL; break; } out: free(mask, M_TEMP); return (error); } #ifdef DDB BITSET_DEFINE(bitset, 1); static void ddb_display_bitset(const struct bitset *set, int size) { int bit, once; for (once = 0, bit = 0; bit < size; bit++) { if (CPU_ISSET(bit, set)) { if (once == 0) { db_printf("%d", bit); once = 1; } else db_printf(",%d", bit); } } if (once == 0) db_printf(""); } void ddb_display_cpuset(const cpuset_t *set) { ddb_display_bitset((const struct bitset *)set, CPU_SETSIZE); } static void ddb_display_domainset(const domainset_t *set) { ddb_display_bitset((const struct bitset *)set, DOMAINSET_SETSIZE); } DB_SHOW_COMMAND(cpusets, db_show_cpusets) { struct cpuset *set; LIST_FOREACH(set, &cpuset_ids, cs_link) { db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n", set, set->cs_id, set->cs_ref, set->cs_flags, (set->cs_parent != NULL) ? set->cs_parent->cs_id : 0); db_printf(" cpu mask="); ddb_display_cpuset(&set->cs_mask); db_printf("\n"); db_printf(" domain policy %d prefer %d mask=", set->cs_domain->ds_policy, set->cs_domain->ds_prefer); ddb_display_domainset(&set->cs_domain->ds_mask); db_printf("\n"); if (db_pager_quit) break; } } DB_SHOW_COMMAND(domainsets, db_show_domainsets) { struct domainset *set; LIST_FOREACH(set, &cpuset_domains, ds_link) { db_printf("set=%p policy %d prefer %d cnt %d max %d\n", set, set->ds_policy, set->ds_prefer, set->ds_cnt, set->ds_max); db_printf(" mask ="); ddb_display_domainset(&set->ds_mask); db_printf("\n"); } } #endif /* DDB */ Index: head/sys/x86/x86/intr_machdep.c =================================================================== --- head/sys/x86/x86/intr_machdep.c (revision 331697) +++ head/sys/x86/x86/intr_machdep.c (revision 331698) @@ -1,743 +1,774 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Machine dependent interrupt code for x86. For x86, we have to * deal with different PICs. Thus, we use the passed in vector to lookup * an interrupt source associated with that vector. The interrupt source * describes which PIC the source belongs to and includes methods to handle * that source. */ #include "opt_atpic.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifndef DEV_ATPIC #include #include #include #include #include #endif +#include + #define MAX_STRAY_LOG 5 typedef void (*mask_fn)(void *); static int intrcnt_index; static struct intsrc *interrupt_sources[NUM_IO_INTS]; #ifdef SMP static struct intsrc *interrupt_sorted[NUM_IO_INTS]; CTASSERT(sizeof(interrupt_sources) == sizeof(interrupt_sorted)); static int intrbalance; SYSCTL_INT(_hw, OID_AUTO, intrbalance, CTLFLAG_RW, &intrbalance, 0, "Interrupt auto-balance interval (seconds). Zero disables."); static struct timeout_task intrbalance_task; #endif static struct sx intrsrc_lock; static struct mtx intrpic_lock; static struct mtx intrcnt_lock; static TAILQ_HEAD(pics_head, pic) pics; #if defined(SMP) && !defined(EARLY_AP_STARTUP) static int assign_cpu; #endif u_long intrcnt[INTRCNT_COUNT]; char intrnames[INTRCNT_COUNT * (MAXCOMLEN + 1)]; size_t sintrcnt = sizeof(intrcnt); size_t sintrnames = sizeof(intrnames); static int intr_assign_cpu(void *arg, int cpu); static void intr_disable_src(void *arg); static void intr_init(void *__dummy); static int intr_pic_registered(struct pic *pic); static void intrcnt_setname(const char *name, int index); static void intrcnt_updatename(struct intsrc *is); static void intrcnt_register(struct intsrc *is); static int intr_pic_registered(struct pic *pic) { struct pic *p; TAILQ_FOREACH(p, &pics, pics) { if (p == pic) return (1); } return (0); } /* * Register a new interrupt controller (PIC). This is to support suspend * and resume where we suspend/resume controllers rather than individual * sources. This also allows controllers with no active sources (such as * 8259As in a system using the APICs) to participate in suspend and resume. */ int intr_register_pic(struct pic *pic) { int error; mtx_lock(&intrpic_lock); if (intr_pic_registered(pic)) error = EBUSY; else { TAILQ_INSERT_TAIL(&pics, pic, pics); error = 0; } mtx_unlock(&intrpic_lock); return (error); } /* * Register a new interrupt source with the global interrupt system. * The global interrupts need to be disabled when this function is * called. */ int intr_register_source(struct intsrc *isrc) { int error, vector; KASSERT(intr_pic_registered(isrc->is_pic), ("unregistered PIC")); vector = isrc->is_pic->pic_vector(isrc); if (interrupt_sources[vector] != NULL) return (EEXIST); error = intr_event_create(&isrc->is_event, isrc, 0, vector, intr_disable_src, (mask_fn)isrc->is_pic->pic_enable_source, (mask_fn)isrc->is_pic->pic_eoi_source, intr_assign_cpu, "irq%d:", vector); if (error) return (error); sx_xlock(&intrsrc_lock); if (interrupt_sources[vector] != NULL) { sx_xunlock(&intrsrc_lock); intr_event_destroy(isrc->is_event); return (EEXIST); } intrcnt_register(isrc); interrupt_sources[vector] = isrc; isrc->is_handlers = 0; sx_xunlock(&intrsrc_lock); return (0); } struct intsrc * intr_lookup_source(int vector) { if (vector < 0 || vector >= nitems(interrupt_sources)) return (NULL); return (interrupt_sources[vector]); } int intr_add_handler(const char *name, int vector, driver_filter_t filter, - driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep) + driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep, + int domain) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_add_handler(isrc->is_event, name, filter, handler, arg, intr_priority(flags), flags, cookiep); if (error == 0) { sx_xlock(&intrsrc_lock); intrcnt_updatename(isrc); isrc->is_handlers++; if (isrc->is_handlers == 1) { + isrc->is_domain = domain; isrc->is_pic->pic_enable_intr(isrc); isrc->is_pic->pic_enable_source(isrc); } sx_xunlock(&intrsrc_lock); } return (error); } int intr_remove_handler(void *cookie) { struct intsrc *isrc; int error; isrc = intr_handler_source(cookie); error = intr_event_remove_handler(cookie); if (error == 0) { sx_xlock(&intrsrc_lock); isrc->is_handlers--; if (isrc->is_handlers == 0) { isrc->is_pic->pic_disable_source(isrc, PIC_NO_EOI); isrc->is_pic->pic_disable_intr(isrc); } intrcnt_updatename(isrc); sx_xunlock(&intrsrc_lock); } return (error); } int intr_config_intr(int vector, enum intr_trigger trig, enum intr_polarity pol) { struct intsrc *isrc; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); return (isrc->is_pic->pic_config_intr(isrc, trig, pol)); } static void intr_disable_src(void *arg) { struct intsrc *isrc; isrc = arg; isrc->is_pic->pic_disable_source(isrc, PIC_EOI); } void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame) { struct intr_event *ie; int vector; /* * We count software interrupts when we process them. The * code here follows previous practice, but there's an * argument for counting hardware interrupts when they're * processed too. */ (*isrc->is_count)++; VM_CNT_INC(v_intr); ie = isrc->is_event; /* * XXX: We assume that IRQ 0 is only used for the ISA timer * device (clk). */ vector = isrc->is_pic->pic_vector(isrc); if (vector == 0) clkintr_pending = 1; /* * For stray interrupts, mask and EOI the source, bump the * stray count, and log the condition. */ if (intr_event_handle(ie, frame) != 0) { isrc->is_pic->pic_disable_source(isrc, PIC_EOI); (*isrc->is_straycount)++; if (*isrc->is_straycount < MAX_STRAY_LOG) log(LOG_ERR, "stray irq%d\n", vector); else if (*isrc->is_straycount == MAX_STRAY_LOG) log(LOG_CRIT, "too many stray irq %d's: not logging anymore\n", vector); } } void intr_resume(bool suspend_cancelled) { struct pic *pic; #ifndef DEV_ATPIC atpic_reset(); #endif mtx_lock(&intrpic_lock); TAILQ_FOREACH(pic, &pics, pics) { if (pic->pic_resume != NULL) pic->pic_resume(pic, suspend_cancelled); } mtx_unlock(&intrpic_lock); } void intr_suspend(void) { struct pic *pic; mtx_lock(&intrpic_lock); TAILQ_FOREACH_REVERSE(pic, &pics, pics_head, pics) { if (pic->pic_suspend != NULL) pic->pic_suspend(pic); } mtx_unlock(&intrpic_lock); } static int intr_assign_cpu(void *arg, int cpu) { #ifdef SMP struct intsrc *isrc; int error; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); /* Nothing to do if there is only a single CPU. */ if (mp_ncpus > 1 && cpu != NOCPU) { #else /* * Don't do anything during early boot. We will pick up the * assignment once the APs are started. */ if (assign_cpu && cpu != NOCPU) { #endif isrc = arg; sx_xlock(&intrsrc_lock); error = isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]); if (error == 0) isrc->is_cpu = cpu; sx_xunlock(&intrsrc_lock); } else error = 0; return (error); #else return (EOPNOTSUPP); #endif } static void intrcnt_setname(const char *name, int index) { snprintf(intrnames + (MAXCOMLEN + 1) * index, MAXCOMLEN + 1, "%-*s", MAXCOMLEN, name); } static void intrcnt_updatename(struct intsrc *is) { intrcnt_setname(is->is_event->ie_fullname, is->is_index); } static void intrcnt_register(struct intsrc *is) { char straystr[MAXCOMLEN + 1]; KASSERT(is->is_event != NULL, ("%s: isrc with no event", __func__)); mtx_lock_spin(&intrcnt_lock); is->is_index = intrcnt_index; intrcnt_index += 2; snprintf(straystr, MAXCOMLEN + 1, "stray irq%d", is->is_pic->pic_vector(is)); intrcnt_updatename(is); is->is_count = &intrcnt[is->is_index]; intrcnt_setname(straystr, is->is_index + 1); is->is_straycount = &intrcnt[is->is_index + 1]; mtx_unlock_spin(&intrcnt_lock); } void intrcnt_add(const char *name, u_long **countp) { mtx_lock_spin(&intrcnt_lock); *countp = &intrcnt[intrcnt_index]; intrcnt_setname(name, intrcnt_index); intrcnt_index++; mtx_unlock_spin(&intrcnt_lock); } static void intr_init(void *dummy __unused) { intrcnt_setname("???", 0); intrcnt_index = 1; TAILQ_INIT(&pics); mtx_init(&intrpic_lock, "intrpic", NULL, MTX_DEF); sx_init(&intrsrc_lock, "intrsrc"); mtx_init(&intrcnt_lock, "intrcnt", NULL, MTX_SPIN); } SYSINIT(intr_init, SI_SUB_INTR, SI_ORDER_FIRST, intr_init, NULL); static void intr_init_final(void *dummy __unused) { /* * Enable interrupts on the BSP after all of the interrupt * controllers are initialized. Device interrupts are still * disabled in the interrupt controllers until interrupt * handlers are registered. Interrupts are enabled on each AP * after their first context switch. */ enable_intr(); } SYSINIT(intr_init_final, SI_SUB_INTR, SI_ORDER_ANY, intr_init_final, NULL); #ifndef DEV_ATPIC /* Initialize the two 8259A's to a known-good shutdown state. */ void atpic_reset(void) { outb(IO_ICU1, ICW1_RESET | ICW1_IC4); outb(IO_ICU1 + ICU_IMR_OFFSET, IDT_IO_INTS); outb(IO_ICU1 + ICU_IMR_OFFSET, IRQ_MASK(ICU_SLAVEID)); outb(IO_ICU1 + ICU_IMR_OFFSET, MASTER_MODE); outb(IO_ICU1 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU1, OCW3_SEL | OCW3_RR); outb(IO_ICU2, ICW1_RESET | ICW1_IC4); outb(IO_ICU2 + ICU_IMR_OFFSET, IDT_IO_INTS + 8); outb(IO_ICU2 + ICU_IMR_OFFSET, ICU_SLAVEID); outb(IO_ICU2 + ICU_IMR_OFFSET, SLAVE_MODE); outb(IO_ICU2 + ICU_IMR_OFFSET, 0xff); outb(IO_ICU2, OCW3_SEL | OCW3_RR); } #endif /* Add a description to an active interrupt handler. */ int intr_describe(u_int vector, void *ih, const char *descr) { struct intsrc *isrc; int error; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); error = intr_event_describe_handler(isrc->is_event, ih, descr); if (error) return (error); intrcnt_updatename(isrc); return (0); } void intr_reprogram(void) { struct intsrc *is; int v; sx_xlock(&intrsrc_lock); for (v = 0; v < NUM_IO_INTS; v++) { is = interrupt_sources[v]; if (is == NULL) continue; if (is->is_pic->pic_reprogram_pin != NULL) is->is_pic->pic_reprogram_pin(is); } sx_xunlock(&intrsrc_lock); } #ifdef DDB /* * Dump data about interrupt handlers */ DB_SHOW_COMMAND(irqs, db_show_irqs) { struct intsrc **isrc; int i, verbose; if (strcmp(modif, "v") == 0) verbose = 1; else verbose = 0; isrc = interrupt_sources; for (i = 0; i < NUM_IO_INTS && !db_pager_quit; i++, isrc++) if (*isrc != NULL) db_dump_intr_event((*isrc)->is_event, verbose); } #endif #ifdef SMP /* * Support for balancing interrupt sources across CPUs. For now we just * allocate CPUs round-robin. */ cpuset_t intr_cpus = CPUSET_T_INITIALIZER(0x1); -static int current_cpu; +static int current_cpu[MAXMEMDOM]; +static void +intr_init_cpus(void) +{ + int i; + + for (i = 0; i < vm_ndomains; i++) { + current_cpu[i] = 0; + if (!CPU_ISSET(current_cpu[i], &intr_cpus) || + !CPU_ISSET(current_cpu[i], &cpuset_domain[i])) + intr_next_cpu(i); + } +} + /* * Return the CPU that the next interrupt source should use. For now * this just returns the next local APIC according to round-robin. */ u_int -intr_next_cpu(void) +intr_next_cpu(int domain) { u_int apic_id; #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); if (mp_ncpus == 1) return (PCPU_GET(apic_id)); #else /* Leave all interrupts on the BSP during boot. */ if (!assign_cpu) return (PCPU_GET(apic_id)); #endif mtx_lock_spin(&icu_lock); - apic_id = cpu_apic_ids[current_cpu]; + apic_id = cpu_apic_ids[current_cpu[domain]]; do { - current_cpu++; - if (current_cpu > mp_maxid) - current_cpu = 0; - } while (!CPU_ISSET(current_cpu, &intr_cpus)); + current_cpu[domain]++; + if (current_cpu[domain] > mp_maxid) + current_cpu[domain] = 0; + } while (!CPU_ISSET(current_cpu[domain], &intr_cpus) || + !CPU_ISSET(current_cpu[domain], &cpuset_domain[domain])); mtx_unlock_spin(&icu_lock); return (apic_id); } /* Attempt to bind the specified IRQ to the specified CPU. */ int intr_bind(u_int vector, u_char cpu) { struct intsrc *isrc; isrc = intr_lookup_source(vector); if (isrc == NULL) return (EINVAL); return (intr_event_bind(isrc->is_event, cpu)); } /* * Add a CPU to our mask of valid CPUs that can be destinations of * interrupts. */ void intr_add_cpu(u_int cpu) { if (cpu >= MAXCPU) panic("%s: Invalid CPU ID", __func__); if (bootverbose) printf("INTR: Adding local APIC %d as a target\n", cpu_apic_ids[cpu]); CPU_SET(cpu, &intr_cpus); } -#ifndef EARLY_AP_STARTUP +#ifdef EARLY_AP_STARTUP +static void +intr_smp_startup(void *arg __unused) +{ + + intr_init_cpus(); + return; +} +SYSINIT(intr_smp_startup, SI_SUB_SMP, SI_ORDER_SECOND, intr_smp_startup, + NULL); + +#else /* * Distribute all the interrupt sources among the available CPUs once the * AP's have been launched. */ static void intr_shuffle_irqs(void *arg __unused) { struct intsrc *isrc; u_int cpu; int i; + intr_init_cpus(); /* Don't bother on UP. */ if (mp_ncpus == 1) return; /* Round-robin assign a CPU to each enabled source. */ sx_xlock(&intrsrc_lock); assign_cpu = 1; for (i = 0; i < NUM_IO_INTS; i++) { isrc = interrupt_sources[i]; if (isrc != NULL && isrc->is_handlers > 0) { /* * If this event is already bound to a CPU, * then assign the source to that CPU instead * of picking one via round-robin. Note that * this is careful to only advance the * round-robin if the CPU assignment succeeds. */ cpu = isrc->is_event->ie_cpu; if (cpu == NOCPU) - cpu = current_cpu; + cpu = current_cpu[isrc->is_domain]; if (isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]) == 0) { isrc->is_cpu = cpu; if (isrc->is_event->ie_cpu == NOCPU) - intr_next_cpu(); + intr_next_cpu(isrc->is_domain); } } } sx_xunlock(&intrsrc_lock); } SYSINIT(intr_shuffle_irqs, SI_SUB_SMP, SI_ORDER_SECOND, intr_shuffle_irqs, NULL); #endif /* * TODO: Export this information in a non-MD fashion, integrate with vmstat -i. */ static int sysctl_hw_intrs(SYSCTL_HANDLER_ARGS) { struct sbuf sbuf; struct intsrc *isrc; int error; int i; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sbuf, NULL, 128, req); sx_slock(&intrsrc_lock); for (i = 0; i < NUM_IO_INTS; i++) { isrc = interrupt_sources[i]; if (isrc == NULL) continue; - sbuf_printf(&sbuf, "%s:%d @%d: %ld\n", + sbuf_printf(&sbuf, "%s:%d @cpu%d(domain%d): %ld\n", isrc->is_event->ie_fullname, isrc->is_index, isrc->is_cpu, + isrc->is_domain, *isrc->is_count); } sx_sunlock(&intrsrc_lock); error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } SYSCTL_PROC(_hw, OID_AUTO, intrs, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_hw_intrs, "A", "interrupt:number @cpu: count"); /* * Compare two, possibly NULL, entries in the interrupt source array * by load. */ static int intrcmp(const void *one, const void *two) { const struct intsrc *i1, *i2; i1 = *(const struct intsrc * const *)one; i2 = *(const struct intsrc * const *)two; if (i1 != NULL && i2 != NULL) return (*i1->is_count - *i2->is_count); if (i1 != NULL) return (1); if (i2 != NULL) return (-1); return (0); } /* * Balance IRQs across available CPUs according to load. */ static void intr_balance(void *dummy __unused, int pending __unused) { struct intsrc *isrc; int interval; u_int cpu; int i; interval = intrbalance; if (interval == 0) goto out; /* * Sort interrupts according to count. */ sx_xlock(&intrsrc_lock); memcpy(interrupt_sorted, interrupt_sources, sizeof(interrupt_sorted)); qsort(interrupt_sorted, NUM_IO_INTS, sizeof(interrupt_sorted[0]), intrcmp); /* * Restart the scan from the same location to avoid moving in the * common case. */ - current_cpu = 0; + intr_init_cpus(); /* * Assign round-robin from most loaded to least. */ for (i = NUM_IO_INTS - 1; i >= 0; i--) { isrc = interrupt_sorted[i]; if (isrc == NULL || isrc->is_event->ie_cpu != NOCPU) continue; - cpu = current_cpu; - intr_next_cpu(); + cpu = current_cpu[isrc->is_domain]; + intr_next_cpu(isrc->is_domain); if (isrc->is_cpu != cpu && isrc->is_pic->pic_assign_cpu(isrc, cpu_apic_ids[cpu]) == 0) isrc->is_cpu = cpu; } sx_xunlock(&intrsrc_lock); out: taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, interval ? hz * interval : hz * 60); } static void intr_balance_init(void *dummy __unused) { TIMEOUT_TASK_INIT(taskqueue_thread, &intrbalance_task, 0, intr_balance, NULL); taskqueue_enqueue_timeout(taskqueue_thread, &intrbalance_task, hz); } SYSINIT(intr_balance_init, SI_SUB_SMP, SI_ORDER_ANY, intr_balance_init, NULL); #else /* * Always route interrupts to the current processor in the UP case. */ u_int -intr_next_cpu(void) +intr_next_cpu(int domain) { return (PCPU_GET(apic_id)); } #endif Index: head/sys/x86/x86/io_apic.c =================================================================== --- head/sys/x86/x86/io_apic.c (revision 331697) +++ head/sys/x86/x86/io_apic.c (revision 331698) @@ -1,1234 +1,1234 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IOAPIC_ISA_INTS 16 #define IOAPIC_MEM_REGION 32 #define IOAPIC_REDTBL_LO(i) (IOAPIC_REDTBL + (i) * 2) #define IOAPIC_REDTBL_HI(i) (IOAPIC_REDTBL_LO(i) + 1) static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures"); /* * I/O APIC interrupt source driver. Each pin is assigned an IRQ cookie * as laid out in the ACPI System Interrupt number model where each I/O * APIC has a contiguous chunk of the System Interrupt address space. * We assume that IRQs 1 - 15 behave like ISA IRQs and that all other * IRQs behave as PCI IRQs by default. We also assume that the pin for * IRQ 0 is actually an ExtINT pin. The apic enumerators override the * configuration of individual pins as indicated by their tables. * * Documentation for the I/O APIC: "82093AA I/O Advanced Programmable * Interrupt Controller (IOAPIC)", May 1996, Intel Corp. * ftp://download.intel.com/design/chipsets/datashts/29056601.pdf */ struct ioapic_intsrc { struct intsrc io_intsrc; u_int io_irq; u_int io_intpin:8; u_int io_vector:8; u_int io_cpu; u_int io_activehi:1; u_int io_edgetrigger:1; u_int io_masked:1; int io_bus:4; uint32_t io_lowreg; u_int io_remap_cookie; }; struct ioapic { struct pic io_pic; u_int io_id:8; /* logical ID */ u_int io_apic_id:4; u_int io_intbase:8; /* System Interrupt base */ u_int io_numintr:8; u_int io_haseoi:1; volatile ioapic_t *io_addr; /* XXX: should use bus_space */ vm_paddr_t io_paddr; STAILQ_ENTRY(ioapic) io_next; device_t pci_dev; /* matched pci device, if found */ struct resource *pci_wnd; /* BAR 0, should be same or alias to io_paddr */ struct ioapic_intsrc io_pins[0]; }; static u_int ioapic_read(volatile ioapic_t *apic, int reg); static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val); static const char *ioapic_bus_string(int bus_type); static void ioapic_print_irq(struct ioapic_intsrc *intpin); static void ioapic_enable_source(struct intsrc *isrc); static void ioapic_disable_source(struct intsrc *isrc, int eoi); static void ioapic_eoi_source(struct intsrc *isrc); static void ioapic_enable_intr(struct intsrc *isrc); static void ioapic_disable_intr(struct intsrc *isrc); static int ioapic_vector(struct intsrc *isrc); static int ioapic_source_pending(struct intsrc *isrc); static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static void ioapic_resume(struct pic *pic, bool suspend_cancelled); static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id); static void ioapic_program_intpin(struct ioapic_intsrc *intpin); static void ioapic_reprogram_intpin(struct intsrc *isrc); static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list); struct pic ioapic_template = { .pic_enable_source = ioapic_enable_source, .pic_disable_source = ioapic_disable_source, .pic_eoi_source = ioapic_eoi_source, .pic_enable_intr = ioapic_enable_intr, .pic_disable_intr = ioapic_disable_intr, .pic_vector = ioapic_vector, .pic_source_pending = ioapic_source_pending, .pic_suspend = NULL, .pic_resume = ioapic_resume, .pic_config_intr = ioapic_config_intr, .pic_assign_cpu = ioapic_assign_cpu, .pic_reprogram_pin = ioapic_reprogram_intpin, }; static int next_ioapic_base; static u_int next_id; static int enable_extint; SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0, "Enable the ExtINT pin in the first I/O APIC"); static void _ioapic_eoi_source(struct intsrc *isrc, int locked) { struct ioapic_intsrc *src; struct ioapic *io; volatile uint32_t *apic_eoi; uint32_t low1; lapic_eoi(); if (!lapic_eoi_suppression) return; src = (struct ioapic_intsrc *)isrc; if (src->io_edgetrigger) return; io = (struct ioapic *)isrc->is_pic; /* * Handle targeted EOI for level-triggered pins, if broadcast * EOI suppression is supported by LAPICs. */ if (io->io_haseoi) { /* * If IOAPIC has EOI Register, simply write vector * number into the reg. */ apic_eoi = (volatile uint32_t *)((volatile char *) io->io_addr + IOAPIC_EOIR); *apic_eoi = src->io_vector; } else { /* * Otherwise, if IO-APIC is too old to provide EOIR, * do what Intel did for the Linux kernel. Temporary * switch the pin to edge-trigger and back, masking * the pin during the trick. */ if (!locked) mtx_lock_spin(&icu_lock); low1 = src->io_lowreg; low1 &= ~IOART_TRGRLVL; low1 |= IOART_TRGREDG | IOART_INTMSET; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin), low1); ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(src->io_intpin), src->io_lowreg); if (!locked) mtx_unlock_spin(&icu_lock); } } static u_int ioapic_read(volatile ioapic_t *apic, int reg) { mtx_assert(&icu_lock, MA_OWNED); apic->ioregsel = reg; return (apic->iowin); } static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val) { mtx_assert(&icu_lock, MA_OWNED); apic->ioregsel = reg; apic->iowin = val; } static const char * ioapic_bus_string(int bus_type) { switch (bus_type) { case APIC_BUS_ISA: return ("ISA"); case APIC_BUS_EISA: return ("EISA"); case APIC_BUS_PCI: return ("PCI"); default: return ("unknown"); } } static void ioapic_print_irq(struct ioapic_intsrc *intpin) { switch (intpin->io_irq) { case IRQ_DISABLED: printf("disabled"); break; case IRQ_EXTINT: printf("ExtINT"); break; case IRQ_NMI: printf("NMI"); break; case IRQ_SMI: printf("SMI"); break; default: printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus), intpin->io_irq); } } static void ioapic_enable_source(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; uint32_t flags; mtx_lock_spin(&icu_lock); if (intpin->io_masked) { flags = intpin->io_lowreg & ~IOART_INTMASK; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), flags); intpin->io_masked = 0; } mtx_unlock_spin(&icu_lock); } static void ioapic_disable_source(struct intsrc *isrc, int eoi) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; uint32_t flags; mtx_lock_spin(&icu_lock); if (!intpin->io_masked && !intpin->io_edgetrigger) { flags = intpin->io_lowreg | IOART_INTMSET; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), flags); intpin->io_masked = 1; } if (eoi == PIC_EOI) _ioapic_eoi_source(isrc, 1); mtx_unlock_spin(&icu_lock); } static void ioapic_eoi_source(struct intsrc *isrc) { _ioapic_eoi_source(isrc, 0); } /* * Completely program an intpin based on the data in its interrupt source * structure. */ static void ioapic_program_intpin(struct ioapic_intsrc *intpin) { struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic; uint32_t low, high; #ifdef ACPI_DMAR int error; #endif /* * If a pin is completely invalid or if it is valid but hasn't * been enabled yet, just ensure that the pin is masked. */ mtx_assert(&icu_lock, MA_OWNED); if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS && intpin->io_vector == 0)) { low = ioapic_read(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin)); if ((low & IOART_INTMASK) == IOART_INTMCLR) ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low | IOART_INTMSET); #ifdef ACPI_DMAR mtx_unlock_spin(&icu_lock); iommu_unmap_ioapic_intr(io->io_apic_id, &intpin->io_remap_cookie); mtx_lock_spin(&icu_lock); #endif return; } #ifdef ACPI_DMAR mtx_unlock_spin(&icu_lock); error = iommu_map_ioapic_intr(io->io_apic_id, intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger, intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie, &high, &low); mtx_lock_spin(&icu_lock); if (error == 0) { ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), high); intpin->io_lowreg = low; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low); return; } else if (error != EOPNOTSUPP) { return; } #endif /* * Set the destination. Note that with Intel interrupt remapping, * the previously reserved bits 55:48 now have a purpose so ensure * these are zero. */ low = IOART_DESTPHY; high = intpin->io_cpu << APIC_ID_SHIFT; /* Program the rest of the low word. */ if (intpin->io_edgetrigger) low |= IOART_TRGREDG; else low |= IOART_TRGRLVL; if (intpin->io_activehi) low |= IOART_INTAHI; else low |= IOART_INTALO; if (intpin->io_masked) low |= IOART_INTMSET; switch (intpin->io_irq) { case IRQ_EXTINT: KASSERT(intpin->io_edgetrigger, ("ExtINT not edge triggered")); low |= IOART_DELEXINT; break; case IRQ_NMI: KASSERT(intpin->io_edgetrigger, ("NMI not edge triggered")); low |= IOART_DELNMI; break; case IRQ_SMI: KASSERT(intpin->io_edgetrigger, ("SMI not edge triggered")); low |= IOART_DELSMI; break; default: KASSERT(intpin->io_vector != 0, ("No vector for IRQ %u", intpin->io_irq)); low |= IOART_DELFIXED | intpin->io_vector; } /* Write the values to the APIC. */ ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), high); intpin->io_lowreg = low; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low); } static void ioapic_reprogram_intpin(struct intsrc *isrc) { mtx_lock_spin(&icu_lock); ioapic_program_intpin((struct ioapic_intsrc *)isrc); mtx_unlock_spin(&icu_lock); } static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; u_int old_vector, new_vector; u_int old_id; /* * On Hyper-V: * - Stick to the first cpu for all I/O APIC pins. * - And don't allow destination cpu changes. */ if (vm_guest == VM_GUEST_HV) { if (intpin->io_vector) return (EINVAL); else apic_id = 0; } /* * keep 1st core as the destination for NMI */ if (intpin->io_irq == IRQ_NMI) apic_id = 0; /* * Set us up to free the old irq. */ old_vector = intpin->io_vector; old_id = intpin->io_cpu; if (old_vector && apic_id == old_id) return (0); /* * Allocate an APIC vector for this interrupt pin. Once * we have a vector we program the interrupt pin. */ new_vector = apic_alloc_vector(apic_id, intpin->io_irq); if (new_vector == 0) return (ENOSPC); /* * Mask the old intpin if it is enabled while it is migrated. * * At least some level-triggered interrupts seem to need the * extra DELAY() to avoid being stuck in a non-EOI'd state. */ mtx_lock_spin(&icu_lock); if (!intpin->io_masked && !intpin->io_edgetrigger) { ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), intpin->io_lowreg | IOART_INTMSET); mtx_unlock_spin(&icu_lock); DELAY(100); mtx_lock_spin(&icu_lock); } intpin->io_cpu = apic_id; intpin->io_vector = new_vector; if (isrc->is_handlers > 0) apic_enable_vector(intpin->io_cpu, intpin->io_vector); if (bootverbose) { printf("ioapic%u: routing intpin %u (", io->io_id, intpin->io_intpin); ioapic_print_irq(intpin); printf(") to lapic %u vector %u\n", intpin->io_cpu, intpin->io_vector); } ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); /* * Free the old vector after the new one is established. This is done * to prevent races where we could miss an interrupt. */ if (old_vector) { if (isrc->is_handlers > 0) apic_disable_vector(old_id, old_vector); apic_free_vector(old_id, old_vector, intpin->io_irq); } return (0); } static void ioapic_enable_intr(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; if (intpin->io_vector == 0) - if (ioapic_assign_cpu(isrc, intr_next_cpu()) != 0) + if (ioapic_assign_cpu(isrc, intr_next_cpu(isrc->is_domain)) != 0) panic("Couldn't find an APIC vector for IRQ %d", intpin->io_irq); apic_enable_vector(intpin->io_cpu, intpin->io_vector); } static void ioapic_disable_intr(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; u_int vector; if (intpin->io_vector != 0) { /* Mask this interrupt pin and free its APIC vector. */ vector = intpin->io_vector; apic_disable_vector(intpin->io_cpu, vector); mtx_lock_spin(&icu_lock); intpin->io_masked = 1; intpin->io_vector = 0; ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); apic_free_vector(intpin->io_cpu, vector, intpin->io_irq); } } static int ioapic_vector(struct intsrc *isrc) { struct ioapic_intsrc *pin; pin = (struct ioapic_intsrc *)isrc; return (pin->io_irq); } static int ioapic_source_pending(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; if (intpin->io_vector == 0) return 0; return (lapic_intr_pending(intpin->io_vector)); } static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; int changed; KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM), ("%s: Conforming trigger or polarity\n", __func__)); /* * EISA interrupts always use active high polarity, so don't allow * them to be set to active low. * * XXX: Should we write to the ELCR if the trigger mode changes for * an EISA IRQ or an ISA IRQ with the ELCR present? */ mtx_lock_spin(&icu_lock); if (intpin->io_bus == APIC_BUS_EISA) pol = INTR_POLARITY_HIGH; changed = 0; if (intpin->io_edgetrigger != (trig == INTR_TRIGGER_EDGE)) { if (bootverbose) printf("ioapic%u: Changing trigger for pin %u to %s\n", io->io_id, intpin->io_intpin, trig == INTR_TRIGGER_EDGE ? "edge" : "level"); intpin->io_edgetrigger = (trig == INTR_TRIGGER_EDGE); changed++; } if (intpin->io_activehi != (pol == INTR_POLARITY_HIGH)) { if (bootverbose) printf("ioapic%u: Changing polarity for pin %u to %s\n", io->io_id, intpin->io_intpin, pol == INTR_POLARITY_HIGH ? "high" : "low"); intpin->io_activehi = (pol == INTR_POLARITY_HIGH); changed++; } if (changed) ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); return (0); } static void ioapic_resume(struct pic *pic, bool suspend_cancelled) { struct ioapic *io = (struct ioapic *)pic; int i; mtx_lock_spin(&icu_lock); for (i = 0; i < io->io_numintr; i++) ioapic_program_intpin(&io->io_pins[i]); mtx_unlock_spin(&icu_lock); } /* * Create a plain I/O APIC object. */ void * ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase) { struct ioapic *io; struct ioapic_intsrc *intpin; volatile ioapic_t *apic; u_int numintr, i; uint32_t value; /* Map the register window so we can access the device. */ apic = pmap_mapdev(addr, IOAPIC_MEM_REGION); mtx_lock_spin(&icu_lock); value = ioapic_read(apic, IOAPIC_VER); mtx_unlock_spin(&icu_lock); /* If it's version register doesn't seem to work, punt. */ if (value == 0xffffffff) { pmap_unmapdev((vm_offset_t)apic, IOAPIC_MEM_REGION); return (NULL); } /* Determine the number of vectors and set the APIC ID. */ numintr = ((value & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) + 1; io = malloc(sizeof(struct ioapic) + numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK); io->io_pic = ioapic_template; io->pci_dev = NULL; io->pci_wnd = NULL; mtx_lock_spin(&icu_lock); io->io_id = next_id++; io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT; if (apic_id != -1 && io->io_apic_id != apic_id) { ioapic_write(apic, IOAPIC_ID, apic_id << APIC_ID_SHIFT); mtx_unlock_spin(&icu_lock); io->io_apic_id = apic_id; printf("ioapic%u: Changing APIC ID to %d\n", io->io_id, apic_id); } else mtx_unlock_spin(&icu_lock); if (intbase == -1) { intbase = next_ioapic_base; printf("ioapic%u: Assuming intbase of %d\n", io->io_id, intbase); } else if (intbase != next_ioapic_base && bootverbose) printf("ioapic%u: WARNING: intbase %d != expected base %d\n", io->io_id, intbase, next_ioapic_base); io->io_intbase = intbase; next_ioapic_base = intbase + numintr; io->io_numintr = numintr; io->io_addr = apic; io->io_paddr = addr; if (bootverbose) { printf("ioapic%u: ver 0x%02x maxredir 0x%02x\n", io->io_id, (value & IOART_VER_VERSION), (value & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT); } /* * The summary information about IO-APIC versions is taken from * the Linux kernel source: * 0Xh 82489DX * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant * 2Xh I/O(x)APIC which is PCI 2.2 Compliant * 30h-FFh Reserved * IO-APICs with version >= 0x20 have working EOIR register. */ io->io_haseoi = (value & IOART_VER_VERSION) >= 0x20; /* * Initialize pins. Start off with interrupts disabled. Default * to active-hi and edge-triggered for ISA interrupts and active-lo * and level-triggered for all others. */ bzero(io->io_pins, sizeof(struct ioapic_intsrc) * numintr); mtx_lock_spin(&icu_lock); for (i = 0, intpin = io->io_pins; i < numintr; i++, intpin++) { intpin->io_intsrc.is_pic = (struct pic *)io; intpin->io_intpin = i; intpin->io_irq = intbase + i; /* * Assume that pin 0 on the first I/O APIC is an ExtINT pin. * Assume that pins 1-15 are ISA interrupts and that all * other pins are PCI interrupts. */ if (intpin->io_irq == 0) ioapic_set_extint(io, i); else if (intpin->io_irq < IOAPIC_ISA_INTS) { intpin->io_bus = APIC_BUS_ISA; intpin->io_activehi = 1; intpin->io_edgetrigger = 1; intpin->io_masked = 1; } else { intpin->io_bus = APIC_BUS_PCI; intpin->io_activehi = 0; intpin->io_edgetrigger = 0; intpin->io_masked = 1; } /* * Route interrupts to the BSP by default. Interrupts may * be routed to other CPUs later after they are enabled. */ intpin->io_cpu = PCPU_GET(apic_id); value = ioapic_read(apic, IOAPIC_REDTBL_LO(i)); ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET); #ifdef ACPI_DMAR /* dummy, but sets cookie */ mtx_unlock_spin(&icu_lock); iommu_map_ioapic_intr(io->io_apic_id, intpin->io_cpu, intpin->io_vector, intpin->io_edgetrigger, intpin->io_activehi, intpin->io_irq, &intpin->io_remap_cookie, NULL, NULL); mtx_lock_spin(&icu_lock); #endif } mtx_unlock_spin(&icu_lock); return (io); } int ioapic_get_vector(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (-1); return (io->io_pins[pin].io_irq); } int ioapic_disable_pin(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_DISABLED) return (EINVAL); io->io_pins[pin].io_irq = IRQ_DISABLED; if (bootverbose) printf("ioapic%u: intpin %d disabled\n", io->io_id, pin); return (0); } int ioapic_remap_vector(void *cookie, u_int pin, int vector) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || vector < 0) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_irq = vector; if (bootverbose) printf("ioapic%u: Routing IRQ %d -> intpin %d\n", io->io_id, vector, pin); return (0); } int ioapic_set_bus(void *cookie, u_int pin, int bus_type) { struct ioapic *io; if (bus_type < 0 || bus_type > APIC_BUS_MAX) return (EINVAL); io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); if (io->io_pins[pin].io_bus == bus_type) return (0); io->io_pins[pin].io_bus = bus_type; if (bootverbose) printf("ioapic%u: intpin %d bus %s\n", io->io_id, pin, ioapic_bus_string(bus_type)); return (0); } int ioapic_set_nmi(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_NMI) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_NMI; io->io_pins[pin].io_masked = 0; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing NMI -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_smi(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_SMI) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_SMI; io->io_pins[pin].io_masked = 0; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing SMI -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_extint(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_EXTINT) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_EXTINT; if (enable_extint) io->io_pins[pin].io_masked = 0; else io->io_pins[pin].io_masked = 1; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing external 8259A's -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol) { struct ioapic *io; int activehi; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); activehi = (pol == INTR_POLARITY_HIGH); if (io->io_pins[pin].io_activehi == activehi) return (0); io->io_pins[pin].io_activehi = activehi; if (bootverbose) printf("ioapic%u: intpin %d polarity: %s\n", io->io_id, pin, pol == INTR_POLARITY_HIGH ? "high" : "low"); return (0); } int ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger) { struct ioapic *io; int edgetrigger; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (io->io_pins[pin].io_edgetrigger == edgetrigger) return (0); io->io_pins[pin].io_edgetrigger = edgetrigger; if (bootverbose) printf("ioapic%u: intpin %d trigger: %s\n", io->io_id, pin, trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); return (0); } /* * Register a complete I/O APIC object with the interrupt subsystem. */ void ioapic_register(void *cookie) { struct ioapic_intsrc *pin; struct ioapic *io; volatile ioapic_t *apic; uint32_t flags; int i; io = (struct ioapic *)cookie; apic = io->io_addr; mtx_lock_spin(&icu_lock); flags = ioapic_read(apic, IOAPIC_VER) & IOART_VER_VERSION; STAILQ_INSERT_TAIL(&ioapic_list, io, io_next); mtx_unlock_spin(&icu_lock); printf("ioapic%u irqs %u-%u on motherboard\n", io->io_id, flags >> 4, flags & 0xf, io->io_intbase, io->io_intbase + io->io_numintr - 1); /* * Reprogram pins to handle special case pins (such as NMI and * SMI) and register valid pins as interrupt sources. */ intr_register_pic(&io->io_pic); for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) { ioapic_reprogram_intpin(&pin->io_intsrc); if (pin->io_irq < NUM_IO_INTS) intr_register_source(&pin->io_intsrc); } } /* A simple new-bus driver to consume PCI I/O APIC devices. */ static int ioapic_pci_probe(device_t dev) { if (pci_get_class(dev) == PCIC_BASEPERIPH && pci_get_subclass(dev) == PCIS_BASEPERIPH_PIC) { switch (pci_get_progif(dev)) { case PCIP_BASEPERIPH_PIC_IO_APIC: device_set_desc(dev, "IO APIC"); break; case PCIP_BASEPERIPH_PIC_IOX_APIC: device_set_desc(dev, "IO(x) APIC"); break; default: return (ENXIO); } device_quiet(dev); return (-10000); } return (ENXIO); } static int ioapic_pci_attach(device_t dev) { struct resource *res; volatile ioapic_t *apic; struct ioapic *io; int rid; u_int apic_id; /* * Try to match the enumerated ioapic. Match BAR start * against io_paddr. Due to a fear that PCI window is not the * same as the MADT reported io window, but an alias, read the * APIC ID from the mapped BAR and match against it. */ rid = PCIR_BAR(0); res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE | RF_SHAREABLE); if (res == NULL) { if (bootverbose) device_printf(dev, "cannot activate BAR0\n"); return (ENXIO); } apic = (volatile ioapic_t *)rman_get_virtual(res); if (rman_get_size(res) < IOAPIC_WND_SIZE) { if (bootverbose) device_printf(dev, "BAR0 too small (%jd) for IOAPIC window\n", (uintmax_t)rman_get_size(res)); goto fail; } mtx_lock_spin(&icu_lock); apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT; /* First match by io window address */ STAILQ_FOREACH(io, &ioapic_list, io_next) { if (io->io_paddr == (vm_paddr_t)rman_get_start(res)) goto found; } /* Then by apic id */ STAILQ_FOREACH(io, &ioapic_list, io_next) { if (io->io_apic_id == apic_id) goto found; } mtx_unlock_spin(&icu_lock); if (bootverbose) device_printf(dev, "cannot match pci bar apic id %d against MADT\n", apic_id); fail: bus_release_resource(dev, SYS_RES_MEMORY, rid, res); return (ENXIO); found: KASSERT(io->pci_dev == NULL, ("ioapic %d pci_dev not NULL", io->io_id)); KASSERT(io->pci_wnd == NULL, ("ioapic %d pci_wnd not NULL", io->io_id)); io->pci_dev = dev; io->pci_wnd = res; if (bootverbose && (io->io_paddr != (vm_paddr_t)rman_get_start(res) || io->io_apic_id != apic_id)) { device_printf(dev, "pci%d:%d:%d:%d pci BAR0@%jx id %d " "MADT id %d paddr@%jx\n", pci_get_domain(dev), pci_get_bus(dev), pci_get_slot(dev), pci_get_function(dev), (uintmax_t)rman_get_start(res), apic_id, io->io_apic_id, (uintmax_t)io->io_paddr); } mtx_unlock_spin(&icu_lock); return (0); } static device_method_t ioapic_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ioapic_pci_probe), DEVMETHOD(device_attach, ioapic_pci_attach), { 0, 0 } }; DEFINE_CLASS_0(ioapic, ioapic_pci_driver, ioapic_pci_methods, 0); static devclass_t ioapic_devclass; DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0); int ioapic_get_rid(u_int apic_id, uint16_t *ridp) { struct ioapic *io; uintptr_t rid; int error; mtx_lock_spin(&icu_lock); STAILQ_FOREACH(io, &ioapic_list, io_next) { if (io->io_apic_id == apic_id) break; } mtx_unlock_spin(&icu_lock); if (io == NULL || io->pci_dev == NULL) return (EINVAL); error = pci_get_id(io->pci_dev, PCI_ID_RID, &rid); if (error != 0) return (error); *ridp = rid; return (0); } /* * A new-bus driver to consume the memory resources associated with * the APICs in the system. On some systems ACPI or PnPBIOS system * resource devices may already claim these resources. To keep from * breaking those devices, we attach ourself to the nexus device after * legacy0 and acpi0 and ignore any allocation failures. */ static void apic_identify(driver_t *driver, device_t parent) { /* * Add at order 12. acpi0 is probed at order 10 and legacy0 * is probed at order 11. */ if (lapic_paddr != 0) BUS_ADD_CHILD(parent, 12, "apic", 0); } static int apic_probe(device_t dev) { device_set_desc(dev, "APIC resources"); device_quiet(dev); return (0); } static void apic_add_resource(device_t dev, int rid, vm_paddr_t base, size_t length) { int error; error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length); if (error) panic("apic_add_resource: resource %d failed set with %d", rid, error); bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_SHAREABLE); } static int apic_attach(device_t dev) { struct ioapic *io; int i; /* Reserve the local APIC. */ apic_add_resource(dev, 0, lapic_paddr, LAPIC_MEM_REGION); i = 1; STAILQ_FOREACH(io, &ioapic_list, io_next) { apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION); i++; } return (0); } static device_method_t apic_methods[] = { /* Device interface */ DEVMETHOD(device_identify, apic_identify), DEVMETHOD(device_probe, apic_probe), DEVMETHOD(device_attach, apic_attach), { 0, 0 } }; DEFINE_CLASS_0(apic, apic_driver, apic_methods, 0); static devclass_t apic_devclass; DRIVER_MODULE(apic, nexus, apic_driver, apic_devclass, 0, 0); #include "opt_ddb.h" #ifdef DDB #include static const char * ioapic_delivery_mode(uint32_t mode) { switch (mode) { case IOART_DELFIXED: return ("fixed"); case IOART_DELLOPRI: return ("lowestpri"); case IOART_DELSMI: return ("SMI"); case IOART_DELRSV1: return ("rsrvd1"); case IOART_DELNMI: return ("NMI"); case IOART_DELINIT: return ("INIT"); case IOART_DELRSV2: return ("rsrvd2"); case IOART_DELEXINT: return ("ExtINT"); default: return (""); } } static u_int db_ioapic_read(volatile ioapic_t *apic, int reg) { apic->ioregsel = reg; return (apic->iowin); } static void db_show_ioapic_one(volatile ioapic_t *io_addr) { uint32_t r, lo, hi; int mre, i; r = db_ioapic_read(io_addr, IOAPIC_VER); mre = (r & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT; db_printf("Id 0x%08x Ver 0x%02x MRE %d\n", db_ioapic_read(io_addr, IOAPIC_ID), r & IOART_VER_VERSION, mre); for (i = 0; i < mre; i++) { lo = db_ioapic_read(io_addr, IOAPIC_REDTBL_LO(i)); hi = db_ioapic_read(io_addr, IOAPIC_REDTBL_HI(i)); db_printf(" pin %d Dest %s/%x %smasked Trig %s RemoteIRR %d " "Polarity %s Status %s DeliveryMode %s Vec %d\n", i, (lo & IOART_DESTMOD) == IOART_DESTLOG ? "log" : "phy", (hi & IOART_DEST) >> 24, (lo & IOART_INTMASK) == IOART_INTMSET ? "" : "not", (lo & IOART_TRGRMOD) == IOART_TRGRLVL ? "lvl" : "edge", (lo & IOART_REM_IRR) == IOART_REM_IRR ? 1 : 0, (lo & IOART_INTPOL) == IOART_INTALO ? "low" : "high", (lo & IOART_DELIVS) == IOART_DELIVS ? "pend" : "idle", ioapic_delivery_mode(lo & IOART_DELMOD), (lo & IOART_INTVEC)); } } DB_SHOW_COMMAND(ioapic, db_show_ioapic) { struct ioapic *ioapic; int idx, i; if (!have_addr) { db_printf("usage: show ioapic index\n"); return; } idx = (int)addr; i = 0; STAILQ_FOREACH(ioapic, &ioapic_list, io_next) { if (idx == i) { db_show_ioapic_one(ioapic->io_addr); break; } i++; } } DB_SHOW_ALL_COMMAND(ioapics, db_show_all_ioapics) { struct ioapic *ioapic; STAILQ_FOREACH(ioapic, &ioapic_list, io_next) db_show_ioapic_one(ioapic->io_addr); } #endif Index: head/sys/x86/x86/msi.c =================================================================== --- head/sys/x86/x86/msi.c (revision 331697) +++ head/sys/x86/x86/msi.c (revision 331698) @@ -1,732 +1,738 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2006 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for PCI Message Signalled Interrupts (MSI). MSI interrupts on * x86 are basically APIC messages that the northbridge delivers directly * to the local APICs as if they had come from an I/O APIC. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Fields in address for Intel MSI messages. */ #define MSI_INTEL_ADDR_DEST 0x000ff000 #define MSI_INTEL_ADDR_RH 0x00000008 # define MSI_INTEL_ADDR_RH_ON 0x00000008 # define MSI_INTEL_ADDR_RH_OFF 0x00000000 #define MSI_INTEL_ADDR_DM 0x00000004 # define MSI_INTEL_ADDR_DM_PHYSICAL 0x00000000 # define MSI_INTEL_ADDR_DM_LOGICAL 0x00000004 /* Fields in data for Intel MSI messages. */ #define MSI_INTEL_DATA_TRGRMOD IOART_TRGRMOD /* Trigger mode. */ # define MSI_INTEL_DATA_TRGREDG IOART_TRGREDG # define MSI_INTEL_DATA_TRGRLVL IOART_TRGRLVL #define MSI_INTEL_DATA_LEVEL 0x00004000 /* Polarity. */ # define MSI_INTEL_DATA_DEASSERT 0x00000000 # define MSI_INTEL_DATA_ASSERT 0x00004000 #define MSI_INTEL_DATA_DELMOD IOART_DELMOD /* Delivery mode. */ # define MSI_INTEL_DATA_DELFIXED IOART_DELFIXED # define MSI_INTEL_DATA_DELLOPRI IOART_DELLOPRI # define MSI_INTEL_DATA_DELSMI IOART_DELSMI # define MSI_INTEL_DATA_DELNMI IOART_DELNMI # define MSI_INTEL_DATA_DELINIT IOART_DELINIT # define MSI_INTEL_DATA_DELEXINT IOART_DELEXINT #define MSI_INTEL_DATA_INTVEC IOART_INTVEC /* Interrupt vector. */ /* * Build Intel MSI message and data values from a source. AMD64 systems * seem to be compatible, so we use the same function for both. */ #define INTEL_ADDR(msi) \ (MSI_INTEL_ADDR_BASE | (msi)->msi_cpu << 12 | \ MSI_INTEL_ADDR_RH_OFF | MSI_INTEL_ADDR_DM_PHYSICAL) #define INTEL_DATA(msi) \ (MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED | (msi)->msi_vector) static MALLOC_DEFINE(M_MSI, "msi", "PCI MSI"); /* * MSI sources are bunched into groups. This is because MSI forces * all of the messages to share the address and data registers and * thus certain properties (such as the local APIC ID target on x86). * Each group has a 'first' source that contains information global to * the group. These fields are marked with (g) below. * * Note that local APIC ID is kind of special. Each message will be * assigned an ID by the system; however, a group will use the ID from * the first message. * * For MSI-X, each message is isolated. */ struct msi_intsrc { struct intsrc msi_intsrc; device_t msi_dev; /* Owning device. (g) */ struct msi_intsrc *msi_first; /* First source in group. */ u_int msi_irq; /* IRQ cookie. */ u_int msi_msix; /* MSI-X message. */ u_int msi_vector:8; /* IDT vector. */ u_int msi_cpu; /* Local APIC ID. (g) */ u_int msi_count:8; /* Messages in this group. (g) */ u_int msi_maxcount:8; /* Alignment for this group. (g) */ int *msi_irqs; /* Group's IRQ list. (g) */ u_int msi_remap_cookie; }; static void msi_create_source(void); static void msi_enable_source(struct intsrc *isrc); static void msi_disable_source(struct intsrc *isrc, int eoi); static void msi_eoi_source(struct intsrc *isrc); static void msi_enable_intr(struct intsrc *isrc); static void msi_disable_intr(struct intsrc *isrc); static int msi_vector(struct intsrc *isrc); static int msi_source_pending(struct intsrc *isrc); static int msi_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id); struct pic msi_pic = { .pic_enable_source = msi_enable_source, .pic_disable_source = msi_disable_source, .pic_eoi_source = msi_eoi_source, .pic_enable_intr = msi_enable_intr, .pic_disable_intr = msi_disable_intr, .pic_vector = msi_vector, .pic_source_pending = msi_source_pending, .pic_suspend = NULL, .pic_resume = NULL, .pic_config_intr = msi_config_intr, .pic_assign_cpu = msi_assign_cpu, .pic_reprogram_pin = NULL, }; #ifdef SMP /** * Xen hypervisors prior to 4.6.0 do not properly handle updates to * enabled MSI-X table entries. Allow migration of MSI-X interrupts * to be disabled via a tunable. Values have the following meaning: * * -1: automatic detection by FreeBSD * 0: enable migration * 1: disable migration */ int msix_disable_migration = -1; SYSCTL_INT(_machdep, OID_AUTO, disable_msix_migration, CTLFLAG_RDTUN, &msix_disable_migration, 0, "Disable migration of MSI-X interrupts between CPUs"); #endif static int msi_enabled; static int msi_last_irq; static struct mtx msi_lock; static void msi_enable_source(struct intsrc *isrc) { } static void msi_disable_source(struct intsrc *isrc, int eoi) { if (eoi == PIC_EOI) lapic_eoi(); } static void msi_eoi_source(struct intsrc *isrc) { lapic_eoi(); } static void msi_enable_intr(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; apic_enable_vector(msi->msi_cpu, msi->msi_vector); } static void msi_disable_intr(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; apic_disable_vector(msi->msi_cpu, msi->msi_vector); } static int msi_vector(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; return (msi->msi_irq); } static int msi_source_pending(struct intsrc *isrc) { return (0); } static int msi_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { return (ENODEV); } static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id) { struct msi_intsrc *sib, *msi = (struct msi_intsrc *)isrc; int old_vector; u_int old_id; int i, vector; /* * Only allow CPUs to be assigned to the first message for an * MSI group. */ if (msi->msi_first != msi) return (EINVAL); #ifdef SMP if (msix_disable_migration && msi->msi_msix) return (EINVAL); #endif /* Store information to free existing irq. */ old_vector = msi->msi_vector; old_id = msi->msi_cpu; if (old_id == apic_id) return (0); /* Allocate IDT vectors on this cpu. */ if (msi->msi_count > 1) { KASSERT(msi->msi_msix == 0, ("MSI-X message group")); vector = apic_alloc_vectors(apic_id, msi->msi_irqs, msi->msi_count, msi->msi_maxcount); } else vector = apic_alloc_vector(apic_id, msi->msi_irq); if (vector == 0) return (ENOSPC); msi->msi_cpu = apic_id; msi->msi_vector = vector; if (msi->msi_intsrc.is_handlers > 0) apic_enable_vector(msi->msi_cpu, msi->msi_vector); if (bootverbose) printf("msi: Assigning %s IRQ %d to local APIC %u vector %u\n", msi->msi_msix ? "MSI-X" : "MSI", msi->msi_irq, msi->msi_cpu, msi->msi_vector); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); sib->msi_cpu = apic_id; sib->msi_vector = vector + i; if (sib->msi_intsrc.is_handlers > 0) apic_enable_vector(sib->msi_cpu, sib->msi_vector); if (bootverbose) printf( "msi: Assigning MSI IRQ %d to local APIC %u vector %u\n", sib->msi_irq, sib->msi_cpu, sib->msi_vector); } BUS_REMAP_INTR(device_get_parent(msi->msi_dev), msi->msi_dev, msi->msi_irq); /* * Free the old vector after the new one is established. This is done * to prevent races where we could miss an interrupt. */ if (msi->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector); apic_free_vector(old_id, old_vector, msi->msi_irq); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); if (sib->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector + i); apic_free_vector(old_id, old_vector + i, msi->msi_irqs[i]); } return (0); } void msi_init(void) { /* Check if we have a supported CPU. */ switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: case CPU_VENDOR_AMD: break; case CPU_VENDOR_CENTAUR: if (CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf) break; /* FALLTHROUGH */ default: return; } #ifdef SMP if (msix_disable_migration == -1) { /* The default is to allow migration of MSI-X interrupts. */ msix_disable_migration = 0; } #endif msi_enabled = 1; intr_register_pic(&msi_pic); mtx_init(&msi_lock, "msi", NULL, MTX_DEF); } static void msi_create_source(void) { struct msi_intsrc *msi; u_int irq; mtx_lock(&msi_lock); if (msi_last_irq >= NUM_MSI_INTS) { mtx_unlock(&msi_lock); return; } irq = msi_last_irq + FIRST_MSI_INT; msi_last_irq++; mtx_unlock(&msi_lock); msi = malloc(sizeof(struct msi_intsrc), M_MSI, M_WAITOK | M_ZERO); msi->msi_intsrc.is_pic = &msi_pic; msi->msi_irq = irq; intr_register_source(&msi->msi_intsrc); nexus_add_irq(irq); } /* * Try to allocate 'count' interrupt sources with contiguous IDT values. */ int msi_alloc(device_t dev, int count, int maxcount, int *irqs) { struct msi_intsrc *msi, *fsrc; - u_int cpu; + u_int cpu, domain; int cnt, i, *mirqs, vector; #ifdef ACPI_DMAR u_int cookies[count]; int error; #endif if (!msi_enabled) return (ENXIO); + if (bus_get_domain(dev, &domain) != 0) + domain = 0; + if (count > 1) mirqs = malloc(count * sizeof(*mirqs), M_MSI, M_WAITOK); else mirqs = NULL; again: mtx_lock(&msi_lock); /* Try to find 'count' free IRQs. */ cnt = 0; for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ if (msi == NULL) break; /* If this is a free one, save its IRQ in the array. */ if (msi->msi_dev == NULL) { irqs[cnt] = i; cnt++; if (cnt == count) break; } } /* Do we need to create some new sources? */ if (cnt < count) { /* If we would exceed the max, give up. */ if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENXIO); } mtx_unlock(&msi_lock); /* We need count - cnt more sources. */ while (cnt < count) { msi_create_source(); cnt++; } goto again; } /* Ok, we now have the IRQs allocated. */ KASSERT(cnt == count, ("count mismatch")); /* Allocate 'count' IDT vectors. */ - cpu = intr_next_cpu(); + cpu = intr_next_cpu(domain); vector = apic_alloc_vectors(cpu, irqs, count, maxcount); if (vector == 0) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENOSPC); } #ifdef ACPI_DMAR mtx_unlock(&msi_lock); error = iommu_alloc_msi_intr(dev, cookies, count); mtx_lock(&msi_lock); if (error == EOPNOTSUPP) error = 0; if (error != 0) { for (i = 0; i < count; i++) apic_free_vector(cpu, vector + i, irqs[i]); free(mirqs, M_MSI); return (error); } for (i = 0; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); msi->msi_remap_cookie = cookies[i]; } #endif /* Assign IDT vectors and make these messages owned by 'dev'. */ fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]); for (i = 0; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); msi->msi_cpu = cpu; msi->msi_dev = dev; msi->msi_vector = vector + i; if (bootverbose) printf( "msi: routing MSI IRQ %d to local APIC %u vector %u\n", msi->msi_irq, msi->msi_cpu, msi->msi_vector); msi->msi_first = fsrc; KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI has handlers")); } fsrc->msi_count = count; fsrc->msi_maxcount = maxcount; if (count > 1) bcopy(irqs, mirqs, count * sizeof(*mirqs)); fsrc->msi_irqs = mirqs; mtx_unlock(&msi_lock); return (0); } int msi_release(int *irqs, int count) { struct msi_intsrc *msi, *first; int i; mtx_lock(&msi_lock); first = (struct msi_intsrc *)intr_lookup_source(irqs[0]); if (first == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this isn't an MSI-X message. */ if (first->msi_msix) { mtx_unlock(&msi_lock); return (EINVAL); } /* Make sure this message is allocated to a group. */ if (first->msi_first == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } /* * Make sure this is the start of a group and that we are releasing * the entire group. */ if (first->msi_first != first || first->msi_count != count) { mtx_unlock(&msi_lock); return (EINVAL); } KASSERT(first->msi_dev != NULL, ("unowned group")); /* Clear all the extra messages in the group. */ for (i = 1; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); KASSERT(msi->msi_first == first, ("message not in group")); KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch")); #ifdef ACPI_DMAR iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie); #endif msi->msi_first = NULL; msi->msi_dev = NULL; apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq); msi->msi_vector = 0; } /* Clear out the first message. */ #ifdef ACPI_DMAR mtx_unlock(&msi_lock); iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie); mtx_lock(&msi_lock); #endif first->msi_first = NULL; first->msi_dev = NULL; apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq); first->msi_vector = 0; first->msi_count = 0; first->msi_maxcount = 0; free(first->msi_irqs, M_MSI); first->msi_irqs = NULL; mtx_unlock(&msi_lock); return (0); } int msi_map(int irq, uint64_t *addr, uint32_t *data) { struct msi_intsrc *msi; int error; #ifdef ACPI_DMAR struct msi_intsrc *msi1; int i, k; #endif mtx_lock(&msi_lock); msi = (struct msi_intsrc *)intr_lookup_source(irq); if (msi == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this message is allocated to a device. */ if (msi->msi_dev == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } /* * If this message isn't an MSI-X message, make sure it's part * of a group, and switch to the first message in the * group. */ if (!msi->msi_msix) { if (msi->msi_first == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } msi = msi->msi_first; } #ifdef ACPI_DMAR if (!msi->msi_msix) { for (k = msi->msi_count - 1, i = FIRST_MSI_INT; k > 0 && i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { if (i == msi->msi_irq) continue; msi1 = (struct msi_intsrc *)intr_lookup_source(i); if (!msi1->msi_msix && msi1->msi_first == msi) { mtx_unlock(&msi_lock); iommu_map_msi_intr(msi1->msi_dev, msi1->msi_cpu, msi1->msi_vector, msi1->msi_remap_cookie, NULL, NULL); k--; mtx_lock(&msi_lock); } } } mtx_unlock(&msi_lock); error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu, msi->msi_vector, msi->msi_remap_cookie, addr, data); #else mtx_unlock(&msi_lock); error = EOPNOTSUPP; #endif if (error == EOPNOTSUPP) { *addr = INTEL_ADDR(msi); *data = INTEL_DATA(msi); error = 0; } return (error); } int msix_alloc(device_t dev, int *irq) { struct msi_intsrc *msi; - u_int cpu; + u_int cpu, domain; int i, vector; #ifdef ACPI_DMAR u_int cookie; int error; #endif if (!msi_enabled) return (ENXIO); + if (bus_get_domain(dev, &domain) != 0) + domain = 0; + again: mtx_lock(&msi_lock); /* Find a free IRQ. */ for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ if (msi == NULL) break; /* Stop at the first free source. */ if (msi->msi_dev == NULL) break; } /* Do we need to create a new source? */ if (msi == NULL) { /* If we would exceed the max, give up. */ if (i + 1 > FIRST_MSI_INT + NUM_MSI_INTS) { mtx_unlock(&msi_lock); return (ENXIO); } mtx_unlock(&msi_lock); /* Create a new source. */ msi_create_source(); goto again; } /* Allocate an IDT vector. */ - cpu = intr_next_cpu(); + cpu = intr_next_cpu(domain); vector = apic_alloc_vector(cpu, i); if (vector == 0) { mtx_unlock(&msi_lock); return (ENOSPC); } msi->msi_dev = dev; #ifdef ACPI_DMAR mtx_unlock(&msi_lock); error = iommu_alloc_msi_intr(dev, &cookie, 1); mtx_lock(&msi_lock); if (error == EOPNOTSUPP) error = 0; if (error != 0) { msi->msi_dev = NULL; apic_free_vector(cpu, vector, i); return (error); } msi->msi_remap_cookie = cookie; #endif if (bootverbose) printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n", msi->msi_irq, cpu, vector); /* Setup source. */ msi->msi_cpu = cpu; msi->msi_first = msi; msi->msi_vector = vector; msi->msi_msix = 1; msi->msi_count = 1; msi->msi_maxcount = 1; msi->msi_irqs = NULL; KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI-X has handlers")); mtx_unlock(&msi_lock); *irq = i; return (0); } int msix_release(int irq) { struct msi_intsrc *msi; mtx_lock(&msi_lock); msi = (struct msi_intsrc *)intr_lookup_source(irq); if (msi == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this is an MSI-X message. */ if (!msi->msi_msix) { mtx_unlock(&msi_lock); return (EINVAL); } KASSERT(msi->msi_dev != NULL, ("unowned message")); /* Clear out the message. */ #ifdef ACPI_DMAR mtx_unlock(&msi_lock); iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie); mtx_lock(&msi_lock); #endif msi->msi_first = NULL; msi->msi_dev = NULL; apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq); msi->msi_vector = 0; msi->msi_msix = 0; msi->msi_count = 0; msi->msi_maxcount = 0; mtx_unlock(&msi_lock); return (0); } Index: head/sys/x86/x86/nexus.c =================================================================== --- head/sys/x86/x86/nexus.c (revision 331697) +++ head/sys/x86/x86/nexus.c (revision 331698) @@ -1,905 +1,907 @@ /*- * Copyright 1998 Massachusetts Institute of Technology * * Permission to use, copy, modify, and distribute this software and * its documentation for any purpose and without fee is hereby * granted, provided that both the above copyright notice and this * permission notice appear in all copies, that both the above * copyright notice and this permission notice appear in all * supporting documentation, and that the name of M.I.T. not be used * in advertising or publicity pertaining to distribution of the * software without specific, written prior permission. M.I.T. makes * no representations about the suitability of this software for any * purpose. It is provided "as is" without express or implied * warranty. * * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * This code implements a `root nexus' for Intel Architecture * machines. The function of the root nexus is to serve as an * attachment point for both processors and buses, and to manage * resources which are common to all of them. In particular, * this code implements the core resource managers for interrupt * requests, DMA requests (which rightfully should be a part of the * ISA code but it's easier to do it here for now), I/O port addresses, * and I/O memory address space. */ #ifdef __amd64__ #define DEV_APIC #else #include "opt_apic.h" #endif #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC #include "pcib_if.h" #endif #ifdef DEV_ISA #include #include #endif #include #define ELF_KERN_STR ("elf"__XSTRING(__ELF_WORD_SIZE)" kernel") static MALLOC_DEFINE(M_NEXUSDEV, "nexusdev", "Nexus device"); #define DEVTONX(dev) ((struct nexus_device *)device_get_ivars(dev)) struct rman irq_rman, drq_rman, port_rman, mem_rman; static int nexus_probe(device_t); static int nexus_attach(device_t); static int nexus_print_all_resources(device_t dev); static int nexus_print_child(device_t, device_t); static device_t nexus_add_child(device_t bus, u_int order, const char *name, int unit); static struct resource *nexus_alloc_resource(device_t, device_t, int, int *, rman_res_t, rman_res_t, rman_res_t, u_int); static int nexus_adjust_resource(device_t, device_t, int, struct resource *, rman_res_t, rman_res_t); #ifdef SMP static int nexus_bind_intr(device_t, device_t, struct resource *, int); #endif static int nexus_config_intr(device_t, int, enum intr_trigger, enum intr_polarity); static int nexus_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr); static int nexus_activate_resource(device_t, device_t, int, int, struct resource *); static int nexus_deactivate_resource(device_t, device_t, int, int, struct resource *); static int nexus_map_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map_request *argsp, struct resource_map *map); static int nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map *map); static int nexus_release_resource(device_t, device_t, int, int, struct resource *); static int nexus_setup_intr(device_t, device_t, struct resource *, int flags, driver_filter_t filter, void (*)(void *), void *, void **); static int nexus_teardown_intr(device_t, device_t, struct resource *, void *); static struct resource_list *nexus_get_reslist(device_t dev, device_t child); static int nexus_set_resource(device_t, device_t, int, int, rman_res_t, rman_res_t); static int nexus_get_resource(device_t, device_t, int, int, rman_res_t *, rman_res_t *); static void nexus_delete_resource(device_t, device_t, int, int); static int nexus_get_cpus(device_t, device_t, enum cpu_sets, size_t, cpuset_t *); #ifdef DEV_APIC static int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs); static int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs); static int nexus_alloc_msix(device_t pcib, device_t dev, int *irq); static int nexus_release_msix(device_t pcib, device_t dev, int irq); static int nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data); #endif static device_method_t nexus_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nexus_probe), DEVMETHOD(device_attach, nexus_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), /* Bus interface */ DEVMETHOD(bus_print_child, nexus_print_child), DEVMETHOD(bus_add_child, nexus_add_child), DEVMETHOD(bus_alloc_resource, nexus_alloc_resource), DEVMETHOD(bus_adjust_resource, nexus_adjust_resource), DEVMETHOD(bus_release_resource, nexus_release_resource), DEVMETHOD(bus_activate_resource, nexus_activate_resource), DEVMETHOD(bus_deactivate_resource, nexus_deactivate_resource), DEVMETHOD(bus_map_resource, nexus_map_resource), DEVMETHOD(bus_unmap_resource, nexus_unmap_resource), DEVMETHOD(bus_setup_intr, nexus_setup_intr), DEVMETHOD(bus_teardown_intr, nexus_teardown_intr), #ifdef SMP DEVMETHOD(bus_bind_intr, nexus_bind_intr), #endif DEVMETHOD(bus_config_intr, nexus_config_intr), DEVMETHOD(bus_describe_intr, nexus_describe_intr), DEVMETHOD(bus_get_resource_list, nexus_get_reslist), DEVMETHOD(bus_set_resource, nexus_set_resource), DEVMETHOD(bus_get_resource, nexus_get_resource), DEVMETHOD(bus_delete_resource, nexus_delete_resource), DEVMETHOD(bus_get_cpus, nexus_get_cpus), /* pcib interface */ #ifdef DEV_APIC DEVMETHOD(pcib_alloc_msi, nexus_alloc_msi), DEVMETHOD(pcib_release_msi, nexus_release_msi), DEVMETHOD(pcib_alloc_msix, nexus_alloc_msix), DEVMETHOD(pcib_release_msix, nexus_release_msix), DEVMETHOD(pcib_map_msi, nexus_map_msi), #endif { 0, 0 } }; DEFINE_CLASS_0(nexus, nexus_driver, nexus_methods, 1); static devclass_t nexus_devclass; DRIVER_MODULE(nexus, root, nexus_driver, nexus_devclass, 0, 0); static int nexus_probe(device_t dev) { device_quiet(dev); /* suppress attach message for neatness */ return (BUS_PROBE_GENERIC); } void nexus_init_resources(void) { int irq; /* * XXX working notes: * * - IRQ resource creation should be moved to the PIC/APIC driver. * - DRQ resource creation should be moved to the DMAC driver. * - The above should be sorted to probe earlier than any child buses. * * - Leave I/O and memory creation here, as child probes may need them. * (especially eg. ACPI) */ /* * IRQ's are on the mainboard on old systems, but on the ISA part * of PCI->ISA bridges. There would be multiple sets of IRQs on * multi-ISA-bus systems. PCI interrupts are routed to the ISA * component, so in a way, PCI can be a partial child of an ISA bus(!). * APIC interrupts are global though. */ irq_rman.rm_start = 0; irq_rman.rm_type = RMAN_ARRAY; irq_rman.rm_descr = "Interrupt request lines"; irq_rman.rm_end = NUM_IO_INTS - 1; if (rman_init(&irq_rman)) panic("nexus_init_resources irq_rman"); /* * We search for regions of existing IRQs and add those to the IRQ * resource manager. */ for (irq = 0; irq < NUM_IO_INTS; irq++) if (intr_lookup_source(irq) != NULL) if (rman_manage_region(&irq_rman, irq, irq) != 0) panic("nexus_init_resources irq_rman add"); /* * ISA DMA on PCI systems is implemented in the ISA part of each * PCI->ISA bridge and the channels can be duplicated if there are * multiple bridges. (eg: laptops with docking stations) */ drq_rman.rm_start = 0; drq_rman.rm_end = 7; drq_rman.rm_type = RMAN_ARRAY; drq_rman.rm_descr = "DMA request lines"; /* XXX drq 0 not available on some machines */ if (rman_init(&drq_rman) || rman_manage_region(&drq_rman, drq_rman.rm_start, drq_rman.rm_end)) panic("nexus_init_resources drq_rman"); /* * However, IO ports and Memory truely are global at this level, * as are APIC interrupts (however many IO APICS there turn out * to be on large systems..) */ port_rman.rm_start = 0; port_rman.rm_end = 0xffff; port_rman.rm_type = RMAN_ARRAY; port_rman.rm_descr = "I/O ports"; if (rman_init(&port_rman) || rman_manage_region(&port_rman, 0, 0xffff)) panic("nexus_init_resources port_rman"); mem_rman.rm_start = 0; #ifndef PAE mem_rman.rm_end = BUS_SPACE_MAXADDR; #else mem_rman.rm_end = ((1ULL << cpu_maxphyaddr) - 1); #endif mem_rman.rm_type = RMAN_ARRAY; mem_rman.rm_descr = "I/O memory addresses"; if (rman_init(&mem_rman) || rman_manage_region(&mem_rman, 0, mem_rman.rm_end)) panic("nexus_init_resources mem_rman"); } static int nexus_attach(device_t dev) { nexus_init_resources(); bus_generic_probe(dev); /* * Explicitly add the legacy0 device here. Other platform * types (such as ACPI), use their own nexus(4) subclass * driver to override this routine and add their own root bus. */ if (BUS_ADD_CHILD(dev, 10, "legacy", 0) == NULL) panic("legacy: could not attach"); bus_generic_attach(dev); return 0; } static int nexus_print_all_resources(device_t dev) { struct nexus_device *ndev = DEVTONX(dev); struct resource_list *rl = &ndev->nx_resources; int retval = 0; if (STAILQ_FIRST(rl)) retval += printf(" at"); retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#jx"); retval += resource_list_print_type(rl, "iomem", SYS_RES_MEMORY, "%#jx"); retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%jd"); return retval; } static int nexus_print_child(device_t bus, device_t child) { int retval = 0; retval += bus_print_child_header(bus, child); retval += nexus_print_all_resources(child); if (device_get_flags(child)) retval += printf(" flags %#x", device_get_flags(child)); retval += printf(" on motherboard\n"); /* XXX "motherboard", ick */ return (retval); } static device_t nexus_add_child(device_t bus, u_int order, const char *name, int unit) { device_t child; struct nexus_device *ndev; ndev = malloc(sizeof(struct nexus_device), M_NEXUSDEV, M_NOWAIT|M_ZERO); if (!ndev) return(0); resource_list_init(&ndev->nx_resources); child = device_add_child_ordered(bus, order, name, unit); /* should we free this in nexus_child_detached? */ device_set_ivars(child, ndev); return(child); } static struct rman * nexus_rman(int type) { switch (type) { case SYS_RES_IRQ: return (&irq_rman); case SYS_RES_DRQ: return (&drq_rman); case SYS_RES_IOPORT: return (&port_rman); case SYS_RES_MEMORY: return (&mem_rman); default: return (NULL); } } /* * Allocate a resource on behalf of child. NB: child is usually going to be a * child of one of our descendants, not a direct child of nexus0. * (Exceptions include npx.) */ static struct resource * nexus_alloc_resource(device_t bus, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { struct nexus_device *ndev = DEVTONX(child); struct resource *rv; struct resource_list_entry *rle; struct rman *rm; int needactivate = flags & RF_ACTIVE; /* * If this is an allocation of the "default" range for a given * RID, and we know what the resources for this device are * (ie. they aren't maintained by a child bus), then work out * the start/end values. */ if (RMAN_IS_DEFAULT_RANGE(start, end) && (count == 1)) { if (device_get_parent(child) != bus || ndev == NULL) return(NULL); rle = resource_list_find(&ndev->nx_resources, type, *rid); if (rle == NULL) return(NULL); start = rle->start; end = rle->end; count = rle->count; } flags &= ~RF_ACTIVE; rm = nexus_rman(type); if (rm == NULL) return (NULL); rv = rman_reserve_resource(rm, start, end, count, flags, child); if (rv == NULL) return 0; rman_set_rid(rv, *rid); if (needactivate) { if (bus_activate_resource(child, type, *rid, rv)) { rman_release_resource(rv); return 0; } } return rv; } static int nexus_adjust_resource(device_t bus, device_t child, int type, struct resource *r, rman_res_t start, rman_res_t end) { struct rman *rm; rm = nexus_rman(type); if (rm == NULL) return (ENXIO); if (!rman_is_region_manager(r, rm)) return (EINVAL); return (rman_adjust_resource(r, start, end)); } static int nexus_activate_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { struct resource_map map; int error; error = rman_activate_resource(r); if (error != 0) return (error); if (!(rman_get_flags(r) & RF_UNMAPPED) && (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) { error = nexus_map_resource(bus, child, type, r, NULL, &map); if (error) { rman_deactivate_resource(r); return (error); } rman_set_mapping(r,&map); } return (0); } static int nexus_deactivate_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { struct resource_map map; int error; error = rman_deactivate_resource(r); if (error) return (error); if (!(rman_get_flags(r) & RF_UNMAPPED) && (type == SYS_RES_MEMORY || type == SYS_RES_IOPORT)) { rman_get_mapping(r, &map); nexus_unmap_resource(bus, child, type, r, &map); } return (0); } static int nexus_map_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map_request *argsp, struct resource_map *map) { struct resource_map_request args; rman_res_t end, length, start; /* Resources must be active to be mapped. */ if (!(rman_get_flags(r) & RF_ACTIVE)) return (ENXIO); /* Mappings are only supported on I/O and memory resources. */ switch (type) { case SYS_RES_IOPORT: case SYS_RES_MEMORY: break; default: return (EINVAL); } resource_init_map_request(&args); if (argsp != NULL) bcopy(argsp, &args, imin(argsp->size, args.size)); start = rman_get_start(r) + args.offset; if (args.length == 0) length = rman_get_size(r); else length = args.length; end = start + length - 1; if (start > rman_get_end(r) || start < rman_get_start(r)) return (EINVAL); if (end > rman_get_end(r) || end < start) return (EINVAL); /* * If this is a memory resource, map it into the kernel. */ switch (type) { case SYS_RES_IOPORT: map->r_bushandle = start; map->r_bustag = X86_BUS_SPACE_IO; map->r_size = length; map->r_vaddr = NULL; break; case SYS_RES_MEMORY: map->r_vaddr = pmap_mapdev_attr(start, length, args.memattr); map->r_bustag = X86_BUS_SPACE_MEM; map->r_size = length; /* * The handle is the virtual address. */ map->r_bushandle = (bus_space_handle_t)map->r_vaddr; break; } return (0); } static int nexus_unmap_resource(device_t bus, device_t child, int type, struct resource *r, struct resource_map *map) { /* * If this is a memory resource, unmap it. */ switch (type) { case SYS_RES_MEMORY: pmap_unmapdev((vm_offset_t)map->r_vaddr, map->r_size); /* FALLTHROUGH */ case SYS_RES_IOPORT: break; default: return (EINVAL); } return (0); } static int nexus_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r) { if (rman_get_flags(r) & RF_ACTIVE) { int error = bus_deactivate_resource(child, type, rid, r); if (error) return error; } return (rman_release_resource(r)); } /* * Currently this uses the really grody interface from kern/kern_intr.c * (which really doesn't belong in kern/anything.c). Eventually, all of * the code in kern_intr.c and machdep_intr.c should get moved here, since * this is going to be the official interface. */ static int nexus_setup_intr(device_t bus, device_t child, struct resource *irq, int flags, driver_filter_t filter, void (*ihand)(void *), void *arg, void **cookiep) { - int error; + int error, domain; /* somebody tried to setup an irq that failed to allocate! */ if (irq == NULL) panic("nexus_setup_intr: NULL irq resource!"); *cookiep = NULL; if ((rman_get_flags(irq) & RF_SHAREABLE) == 0) flags |= INTR_EXCL; /* * We depend here on rman_activate_resource() being idempotent. */ error = rman_activate_resource(irq); if (error) return (error); + if (bus_get_domain(child, &domain) != 0) + domain = 0; error = intr_add_handler(device_get_nameunit(child), - rman_get_start(irq), filter, ihand, arg, flags, cookiep); + rman_get_start(irq), filter, ihand, arg, flags, cookiep, domain); return (error); } static int nexus_teardown_intr(device_t dev, device_t child, struct resource *r, void *ih) { return (intr_remove_handler(ih)); } #ifdef SMP static int nexus_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu) { return (intr_bind(rman_get_start(irq), cpu)); } #endif static int nexus_config_intr(device_t dev, int irq, enum intr_trigger trig, enum intr_polarity pol) { return (intr_config_intr(irq, trig, pol)); } static int nexus_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr) { return (intr_describe(rman_get_start(irq), cookie, descr)); } static struct resource_list * nexus_get_reslist(device_t dev, device_t child) { struct nexus_device *ndev = DEVTONX(child); return (&ndev->nx_resources); } static int nexus_set_resource(device_t dev, device_t child, int type, int rid, rman_res_t start, rman_res_t count) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; /* XXX this should return a success/failure indicator */ resource_list_add(rl, type, rid, start, start + count - 1, count); return(0); } static int nexus_get_resource(device_t dev, device_t child, int type, int rid, rman_res_t *startp, rman_res_t *countp) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (!rle) return(ENOENT); if (startp) *startp = rle->start; if (countp) *countp = rle->count; return(0); } static void nexus_delete_resource(device_t dev, device_t child, int type, int rid) { struct nexus_device *ndev = DEVTONX(child); struct resource_list *rl = &ndev->nx_resources; resource_list_delete(rl, type, rid); } static int nexus_get_cpus(device_t dev, device_t child, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { switch (op) { #ifdef SMP case INTR_CPUS: if (setsize != sizeof(cpuset_t)) return (EINVAL); *cpuset = intr_cpus; return (0); #endif default: return (bus_generic_get_cpus(dev, child, op, setsize, cpuset)); } } /* Called from the MSI code to add new IRQs to the IRQ rman. */ void nexus_add_irq(u_long irq) { if (rman_manage_region(&irq_rman, irq, irq) != 0) panic("%s: failed", __func__); } #ifdef DEV_APIC static int nexus_alloc_msix(device_t pcib, device_t dev, int *irq) { return (msix_alloc(dev, irq)); } static int nexus_release_msix(device_t pcib, device_t dev, int irq) { return (msix_release(irq)); } static int nexus_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs) { return (msi_alloc(dev, count, maxcount, irqs)); } static int nexus_release_msi(device_t pcib, device_t dev, int count, int *irqs) { return (msi_release(irqs, count)); } static int nexus_map_msi(device_t pcib, device_t dev, int irq, uint64_t *addr, uint32_t *data) { return (msi_map(irq, addr, data)); } #endif /* Placeholder for system RAM. */ static void ram_identify(driver_t *driver, device_t parent) { if (resource_disabled("ram", 0)) return; if (BUS_ADD_CHILD(parent, 0, "ram", 0) == NULL) panic("ram_identify"); } static int ram_probe(device_t dev) { device_quiet(dev); device_set_desc(dev, "System RAM"); return (0); } static int ram_attach(device_t dev) { struct bios_smap *smapbase, *smap, *smapend; struct resource *res; vm_paddr_t *p; caddr_t kmdp; uint32_t smapsize; int error, rid; /* Retrieve the system memory map from the loader. */ kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type(ELF_KERN_STR); smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase != NULL) { smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); rid = 0; for (smap = smapbase; smap < smapend; smap++) { if (smap->type != SMAP_TYPE_MEMORY || smap->length == 0) continue; #ifdef __i386__ /* * Resources use long's to track resources, so * we can't include memory regions above 4GB. */ if (smap->base > ~0ul) continue; #endif error = bus_set_resource(dev, SYS_RES_MEMORY, rid, smap->base, smap->length); if (error) panic( "ram_attach: resource %d failed set with %d", rid, error); res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0); if (res == NULL) panic("ram_attach: resource %d failed to attach", rid); rid++; } return (0); } /* * If the system map is not available, fall back to using * dump_avail[]. We use the dump_avail[] array rather than * phys_avail[] for the memory map as phys_avail[] contains * holes for kernel memory, page 0, the message buffer, and * the dcons buffer. We test the end address in the loop * instead of the start since the start address for the first * segment is 0. */ for (rid = 0, p = dump_avail; p[1] != 0; rid++, p += 2) { #ifdef PAE /* * Resources use long's to track resources, so we can't * include memory regions above 4GB. */ if (p[0] > ~0ul) break; #endif error = bus_set_resource(dev, SYS_RES_MEMORY, rid, p[0], p[1] - p[0]); if (error) panic("ram_attach: resource %d failed set with %d", rid, error); res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0); if (res == NULL) panic("ram_attach: resource %d failed to attach", rid); } return (0); } static device_method_t ram_methods[] = { /* Device interface */ DEVMETHOD(device_identify, ram_identify), DEVMETHOD(device_probe, ram_probe), DEVMETHOD(device_attach, ram_attach), { 0, 0 } }; static driver_t ram_driver = { "ram", ram_methods, 1, /* no softc */ }; static devclass_t ram_devclass; DRIVER_MODULE(ram, nexus, ram_driver, ram_devclass, 0, 0); #ifdef DEV_ISA /* * Placeholder which claims PnP 'devices' which describe system * resources. */ static struct isa_pnp_id sysresource_ids[] = { { 0x010cd041 /* PNP0c01 */, "System Memory" }, { 0x020cd041 /* PNP0c02 */, "System Resource" }, { 0 } }; static int sysresource_probe(device_t dev) { int result; if ((result = ISA_PNP_PROBE(device_get_parent(dev), dev, sysresource_ids)) <= 0) { device_quiet(dev); } return(result); } static int sysresource_attach(device_t dev) { return(0); } static device_method_t sysresource_methods[] = { /* Device interface */ DEVMETHOD(device_probe, sysresource_probe), DEVMETHOD(device_attach, sysresource_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), { 0, 0 } }; static driver_t sysresource_driver = { "sysresource", sysresource_methods, 1, /* no softc */ }; static devclass_t sysresource_devclass; DRIVER_MODULE(sysresource, isa, sysresource_driver, sysresource_devclass, 0, 0); ISA_PNP_INFO(sysresource_ids); #endif /* DEV_ISA */ Index: head/sys/x86/xen/xen_intr.c =================================================================== --- head/sys/x86/xen/xen_intr.c (revision 331697) +++ head/sys/x86/xen/xen_intr.c (revision 331698) @@ -1,1668 +1,1668 @@ /****************************************************************************** * xen_intr.c * * Xen event and interrupt services for x86 HVM guests. * * Copyright (c) 2002-2005, K A Fraser * Copyright (c) 2005, Intel Corporation * Copyright (c) 2012, Spectra Logic Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static MALLOC_DEFINE(M_XENINTR, "xen_intr", "Xen Interrupt Services"); /** * Per-cpu event channel processing state. */ struct xen_intr_pcpu_data { /** * The last event channel bitmap section (level one bit) processed. * This is used to ensure we scan all ports before * servicing an already servied port again. */ u_int last_processed_l1i; /** * The last event channel processed within the event channel * bitmap being scanned. */ u_int last_processed_l2i; /** Pointer to this CPU's interrupt statistic counter. */ u_long *evtchn_intrcnt; /** * A bitmap of ports that can be serviced from this CPU. * A set bit means interrupt handling is enabled. */ u_long evtchn_enabled[sizeof(u_long) * 8]; }; /* * Start the scan at port 0 by initializing the last scanned * location as the highest numbered event channel port. */ static DPCPU_DEFINE(struct xen_intr_pcpu_data, xen_intr_pcpu) = { .last_processed_l1i = LONG_BIT - 1, .last_processed_l2i = LONG_BIT - 1 }; DPCPU_DECLARE(struct vcpu_info *, vcpu_info); #define XEN_EEXIST 17 /* Xen "already exists" error */ #define XEN_ALLOCATE_VECTOR 0 /* Allocate a vector for this event channel */ #define XEN_INVALID_EVTCHN 0 /* Invalid event channel */ #define is_valid_evtchn(x) ((x) != XEN_INVALID_EVTCHN) struct xenisrc { struct intsrc xi_intsrc; enum evtchn_type xi_type; int xi_cpu; /* VCPU for delivery. */ int xi_vector; /* Global isrc vector number. */ evtchn_port_t xi_port; int xi_pirq; int xi_virq; void *xi_cookie; u_int xi_close:1; /* close on unbind? */ u_int xi_activehi:1; u_int xi_edgetrigger:1; u_int xi_masked:1; volatile u_int xi_refcount; }; static void xen_intr_suspend(struct pic *); static void xen_intr_resume(struct pic *, bool suspend_cancelled); static void xen_intr_enable_source(struct intsrc *isrc); static void xen_intr_disable_source(struct intsrc *isrc, int eoi); static void xen_intr_eoi_source(struct intsrc *isrc); static void xen_intr_enable_intr(struct intsrc *isrc); static void xen_intr_disable_intr(struct intsrc *isrc); static int xen_intr_vector(struct intsrc *isrc); static int xen_intr_source_pending(struct intsrc *isrc); static int xen_intr_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static int xen_intr_assign_cpu(struct intsrc *isrc, u_int apic_id); static void xen_intr_pirq_enable_source(struct intsrc *isrc); static void xen_intr_pirq_disable_source(struct intsrc *isrc, int eoi); static void xen_intr_pirq_eoi_source(struct intsrc *isrc); static void xen_intr_pirq_enable_intr(struct intsrc *isrc); static void xen_intr_pirq_disable_intr(struct intsrc *isrc); static int xen_intr_pirq_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); /** * PIC interface for all event channel port types except physical IRQs. */ struct pic xen_intr_pic = { .pic_enable_source = xen_intr_enable_source, .pic_disable_source = xen_intr_disable_source, .pic_eoi_source = xen_intr_eoi_source, .pic_enable_intr = xen_intr_enable_intr, .pic_disable_intr = xen_intr_disable_intr, .pic_vector = xen_intr_vector, .pic_source_pending = xen_intr_source_pending, .pic_suspend = xen_intr_suspend, .pic_resume = xen_intr_resume, .pic_config_intr = xen_intr_config_intr, .pic_assign_cpu = xen_intr_assign_cpu }; /** * PIC interface for all event channel representing * physical interrupt sources. */ struct pic xen_intr_pirq_pic = { .pic_enable_source = xen_intr_pirq_enable_source, .pic_disable_source = xen_intr_pirq_disable_source, .pic_eoi_source = xen_intr_pirq_eoi_source, .pic_enable_intr = xen_intr_pirq_enable_intr, .pic_disable_intr = xen_intr_pirq_disable_intr, .pic_vector = xen_intr_vector, .pic_source_pending = xen_intr_source_pending, .pic_config_intr = xen_intr_pirq_config_intr, .pic_assign_cpu = xen_intr_assign_cpu }; static struct mtx xen_intr_isrc_lock; static int xen_intr_auto_vector_count; static struct xenisrc *xen_intr_port_to_isrc[NR_EVENT_CHANNELS]; static u_long *xen_intr_pirq_eoi_map; static boolean_t xen_intr_pirq_eoi_map_enabled; /*------------------------- Private Functions --------------------------------*/ /** * Disable signal delivery for an event channel port on the * specified CPU. * * \param port The event channel port to mask. * * This API is used to manage the port<=>CPU binding of event * channel handlers. * * \note This operation does not preclude reception of an event * for this event channel on another CPU. To mask the * event channel globally, use evtchn_mask(). */ static inline void evtchn_cpu_mask_port(u_int cpu, evtchn_port_t port) { struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); xen_clear_bit(port, pcpu->evtchn_enabled); } /** * Enable signal delivery for an event channel port on the * specified CPU. * * \param port The event channel port to unmask. * * This API is used to manage the port<=>CPU binding of event * channel handlers. * * \note This operation does not guarantee that event delivery * is enabled for this event channel port. The port must * also be globally enabled. See evtchn_unmask(). */ static inline void evtchn_cpu_unmask_port(u_int cpu, evtchn_port_t port) { struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); xen_set_bit(port, pcpu->evtchn_enabled); } /** * Allocate and register a per-cpu Xen upcall interrupt counter. * * \param cpu The cpu for which to register this interrupt count. */ static void xen_intr_intrcnt_add(u_int cpu) { char buf[MAXCOMLEN + 1]; struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(cpu, xen_intr_pcpu); if (pcpu->evtchn_intrcnt != NULL) return; snprintf(buf, sizeof(buf), "cpu%d:xen", cpu); intrcnt_add(buf, &pcpu->evtchn_intrcnt); } /** * Search for an already allocated but currently unused Xen interrupt * source object. * * \param type Restrict the search to interrupt sources of the given * type. * * \return A pointer to a free Xen interrupt source object or NULL. */ static struct xenisrc * xen_intr_find_unused_isrc(enum evtchn_type type) { int isrc_idx; KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn isrc lock not held")); for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx ++) { struct xenisrc *isrc; u_int vector; vector = FIRST_EVTCHN_INT + isrc_idx; isrc = (struct xenisrc *)intr_lookup_source(vector); if (isrc != NULL && isrc->xi_type == EVTCHN_TYPE_UNBOUND) { KASSERT(isrc->xi_intsrc.is_handlers == 0, ("Free evtchn still has handlers")); isrc->xi_type = type; return (isrc); } } return (NULL); } /** * Allocate a Xen interrupt source object. * * \param type The type of interrupt source to create. * * \return A pointer to a newly allocated Xen interrupt source * object or NULL. */ static struct xenisrc * xen_intr_alloc_isrc(enum evtchn_type type, int vector) { static int warned; struct xenisrc *isrc; KASSERT(mtx_owned(&xen_intr_isrc_lock), ("Evtchn alloc lock not held")); if (xen_intr_auto_vector_count > NR_EVENT_CHANNELS) { if (!warned) { warned = 1; printf("xen_intr_alloc: Event channels exhausted.\n"); } return (NULL); } if (type != EVTCHN_TYPE_PIRQ) { vector = FIRST_EVTCHN_INT + xen_intr_auto_vector_count; xen_intr_auto_vector_count++; } KASSERT((intr_lookup_source(vector) == NULL), ("Trying to use an already allocated vector")); mtx_unlock(&xen_intr_isrc_lock); isrc = malloc(sizeof(*isrc), M_XENINTR, M_WAITOK | M_ZERO); isrc->xi_intsrc.is_pic = (type == EVTCHN_TYPE_PIRQ) ? &xen_intr_pirq_pic : &xen_intr_pic; isrc->xi_vector = vector; isrc->xi_type = type; intr_register_source(&isrc->xi_intsrc); mtx_lock(&xen_intr_isrc_lock); return (isrc); } /** * Attempt to free an active Xen interrupt source object. * * \param isrc The interrupt source object to release. * * \returns EBUSY if the source is still in use, otherwise 0. */ static int xen_intr_release_isrc(struct xenisrc *isrc) { mtx_lock(&xen_intr_isrc_lock); KASSERT(isrc->xi_intsrc.is_handlers == 0, ("Release called, but xenisrc still in use")); evtchn_mask_port(isrc->xi_port); evtchn_clear_port(isrc->xi_port); /* Rebind port to CPU 0. */ evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port); evtchn_cpu_unmask_port(0, isrc->xi_port); if (isrc->xi_close != 0 && is_valid_evtchn(isrc->xi_port)) { struct evtchn_close close = { .port = isrc->xi_port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); } xen_intr_port_to_isrc[isrc->xi_port] = NULL; isrc->xi_cpu = 0; isrc->xi_type = EVTCHN_TYPE_UNBOUND; isrc->xi_port = 0; isrc->xi_cookie = NULL; mtx_unlock(&xen_intr_isrc_lock); return (0); } /** * Associate an interrupt handler with an already allocated local Xen * event channel port. * * \param isrcp The returned Xen interrupt object associated with * the specified local port. * \param local_port The event channel to bind. * \param type The event channel type of local_port. * \param intr_owner The device making this bind request. * \param filter An interrupt filter handler. Specify NULL * to always dispatch to the ithread handler. * \param handler An interrupt ithread handler. Optional (can * specify NULL) if all necessary event actions * are performed by filter. * \param arg Argument to present to both filter and handler. * \param irqflags Interrupt handler flags. See sys/bus.h. * \param handlep Pointer to an opaque handle used to manage this * registration. * * \returns 0 on success, otherwise an errno. */ static int xen_intr_bind_isrc(struct xenisrc **isrcp, evtchn_port_t local_port, enum evtchn_type type, const char *intr_owner, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; int error; *isrcp = NULL; if (port_handlep == NULL) { printf("%s: xen_intr_bind_isrc: Bad event handle\n", intr_owner); return (EINVAL); } mtx_lock(&xen_intr_isrc_lock); isrc = xen_intr_find_unused_isrc(type); if (isrc == NULL) { isrc = xen_intr_alloc_isrc(type, XEN_ALLOCATE_VECTOR); if (isrc == NULL) { mtx_unlock(&xen_intr_isrc_lock); return (ENOSPC); } } isrc->xi_port = local_port; xen_intr_port_to_isrc[local_port] = isrc; refcount_init(&isrc->xi_refcount, 1); mtx_unlock(&xen_intr_isrc_lock); /* Assign the opaque handler (the event channel port) */ *port_handlep = &isrc->xi_vector; #ifdef SMP if (type == EVTCHN_TYPE_PORT) { /* * By default all interrupts are assigned to vCPU#0 * unless specified otherwise, so shuffle them to balance * the interrupt load. */ - xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu()); + xen_intr_assign_cpu(&isrc->xi_intsrc, intr_next_cpu(0)); } #endif if (filter == NULL && handler == NULL) { /* * No filter/handler provided, leave the event channel * masked and without a valid handler, the caller is * in charge of setting that up. */ *isrcp = isrc; return (0); } error = xen_intr_add_handler(intr_owner, filter, handler, arg, flags, *port_handlep); if (error != 0) { xen_intr_release_isrc(isrc); return (error); } *isrcp = isrc; return (0); } /** * Lookup a Xen interrupt source object given an interrupt binding handle. * * \param handle A handle initialized by a previous call to * xen_intr_bind_isrc(). * * \returns A pointer to the Xen interrupt source object associated * with the given interrupt handle. NULL if no association * currently exists. */ static struct xenisrc * xen_intr_isrc(xen_intr_handle_t handle) { int vector; if (handle == NULL) return (NULL); vector = *(int *)handle; KASSERT(vector >= FIRST_EVTCHN_INT && vector < (FIRST_EVTCHN_INT + xen_intr_auto_vector_count), ("Xen interrupt vector is out of range")); return ((struct xenisrc *)intr_lookup_source(vector)); } /** * Determine the event channel ports at the given section of the * event port bitmap which have pending events for the given cpu. * * \param pcpu The Xen interrupt pcpu data for the cpu being querried. * \param sh The Xen shared info area. * \param idx The index of the section of the event channel bitmap to * inspect. * * \returns A u_long with bits set for every event channel with pending * events. */ static inline u_long xen_intr_active_ports(struct xen_intr_pcpu_data *pcpu, shared_info_t *sh, u_int idx) { CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(sh->evtchn_pending[0])); CTASSERT(sizeof(sh->evtchn_mask[0]) == sizeof(pcpu->evtchn_enabled[0])); CTASSERT(sizeof(sh->evtchn_mask) == sizeof(sh->evtchn_pending)); CTASSERT(sizeof(sh->evtchn_mask) == sizeof(pcpu->evtchn_enabled)); return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx] & pcpu->evtchn_enabled[idx]); } /** * Interrupt handler for processing all Xen event channel events. * * \param trap_frame The trap frame context for the current interrupt. */ void xen_intr_handle_upcall(struct trapframe *trap_frame) { u_int l1i, l2i, port, cpu; u_long masked_l1, masked_l2; struct xenisrc *isrc; shared_info_t *s; vcpu_info_t *v; struct xen_intr_pcpu_data *pc; u_long l1, l2; /* * Disable preemption in order to always check and fire events * on the right vCPU */ critical_enter(); cpu = PCPU_GET(cpuid); pc = DPCPU_PTR(xen_intr_pcpu); s = HYPERVISOR_shared_info; v = DPCPU_GET(vcpu_info); if (xen_hvm_domain() && !xen_vector_callback_enabled) { KASSERT((cpu == 0), ("Fired PCI event callback on wrong CPU")); } v->evtchn_upcall_pending = 0; #if 0 #ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ /* Clear master flag /before/ clearing selector flag. */ wmb(); #endif #endif l1 = atomic_readandclear_long(&v->evtchn_pending_sel); l1i = pc->last_processed_l1i; l2i = pc->last_processed_l2i; (*pc->evtchn_intrcnt)++; while (l1 != 0) { l1i = (l1i + 1) % LONG_BIT; masked_l1 = l1 & ((~0UL) << l1i); if (masked_l1 == 0) { /* * if we masked out all events, wrap around * to the beginning. */ l1i = LONG_BIT - 1; l2i = LONG_BIT - 1; continue; } l1i = ffsl(masked_l1) - 1; do { l2 = xen_intr_active_ports(pc, s, l1i); l2i = (l2i + 1) % LONG_BIT; masked_l2 = l2 & ((~0UL) << l2i); if (masked_l2 == 0) { /* if we masked out all events, move on */ l2i = LONG_BIT - 1; break; } l2i = ffsl(masked_l2) - 1; /* process port */ port = (l1i * LONG_BIT) + l2i; synch_clear_bit(port, &s->evtchn_pending[0]); isrc = xen_intr_port_to_isrc[port]; if (__predict_false(isrc == NULL)) continue; /* Make sure we are firing on the right vCPU */ KASSERT((isrc->xi_cpu == PCPU_GET(cpuid)), ("Received unexpected event on vCPU#%d, event bound to vCPU#%d", PCPU_GET(cpuid), isrc->xi_cpu)); intr_execute_handlers(&isrc->xi_intsrc, trap_frame); /* * If this is the final port processed, * we'll pick up here+1 next time. */ pc->last_processed_l1i = l1i; pc->last_processed_l2i = l2i; } while (l2i != LONG_BIT - 1); l2 = xen_intr_active_ports(pc, s, l1i); if (l2 == 0) { /* * We handled all ports, so we can clear the * selector bit. */ l1 &= ~(1UL << l1i); } } critical_exit(); } static int xen_intr_init(void *dummy __unused) { shared_info_t *s = HYPERVISOR_shared_info; struct xen_intr_pcpu_data *pcpu; struct physdev_pirq_eoi_gmfn eoi_gmfn; int i, rc; if (!xen_domain()) return (0); mtx_init(&xen_intr_isrc_lock, "xen-irq-lock", NULL, MTX_DEF); /* * Register interrupt count manually as we aren't * guaranteed to see a call to xen_intr_assign_cpu() * before our first interrupt. Also set the per-cpu * mask of CPU#0 to enable all, since by default * all event channels are bound to CPU#0. */ CPU_FOREACH(i) { pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled)); xen_intr_intrcnt_add(i); } for (i = 0; i < nitems(s->evtchn_mask); i++) atomic_store_rel_long(&s->evtchn_mask[i], ~0); /* Try to register PIRQ EOI map */ xen_intr_pirq_eoi_map = malloc(PAGE_SIZE, M_XENINTR, M_WAITOK | M_ZERO); eoi_gmfn.gmfn = atop(vtophys(xen_intr_pirq_eoi_map)); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); if (rc != 0 && bootverbose) printf("Xen interrupts: unable to register PIRQ EOI map\n"); else xen_intr_pirq_eoi_map_enabled = true; intr_register_pic(&xen_intr_pic); intr_register_pic(&xen_intr_pirq_pic); if (bootverbose) printf("Xen interrupt system initialized\n"); return (0); } SYSINIT(xen_intr_init, SI_SUB_INTR, SI_ORDER_SECOND, xen_intr_init, NULL); /*--------------------------- Common PIC Functions ---------------------------*/ /** * Prepare this PIC for system suspension. */ static void xen_intr_suspend(struct pic *unused) { } static void xen_rebind_ipi(struct xenisrc *isrc) { #ifdef SMP int cpu = isrc->xi_cpu; int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; int error; struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id }; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi); if (error != 0) panic("unable to rebind xen IPI: %d", error); isrc->xi_port = bind_ipi.port; isrc->xi_cpu = 0; xen_intr_port_to_isrc[bind_ipi.port] = isrc; error = xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); if (error) panic("unable to bind xen IPI to CPU#%d: %d", cpu, error); evtchn_unmask_port(bind_ipi.port); #else panic("Resume IPI event channel on UP"); #endif } static void xen_rebind_virq(struct xenisrc *isrc) { int cpu = isrc->xi_cpu; int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; int error; struct evtchn_bind_virq bind_virq = { .virq = isrc->xi_virq, .vcpu = vcpu_id }; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq); if (error != 0) panic("unable to rebind xen VIRQ#%d: %d", isrc->xi_virq, error); isrc->xi_port = bind_virq.port; isrc->xi_cpu = 0; xen_intr_port_to_isrc[bind_virq.port] = isrc; #ifdef SMP error = xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); if (error) panic("unable to bind xen VIRQ#%d to CPU#%d: %d", isrc->xi_virq, cpu, error); #endif evtchn_unmask_port(bind_virq.port); } /** * Return this PIC to service after being suspended. */ static void xen_intr_resume(struct pic *unused, bool suspend_cancelled) { shared_info_t *s = HYPERVISOR_shared_info; struct xenisrc *isrc; u_int isrc_idx; int i; if (suspend_cancelled) return; /* Reset the per-CPU masks */ CPU_FOREACH(i) { struct xen_intr_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); memset(pcpu->evtchn_enabled, i == 0 ? ~0 : 0, sizeof(pcpu->evtchn_enabled)); } /* Mask all event channels. */ for (i = 0; i < nitems(s->evtchn_mask); i++) atomic_store_rel_long(&s->evtchn_mask[i], ~0); /* Remove port -> isrc mappings */ memset(xen_intr_port_to_isrc, 0, sizeof(xen_intr_port_to_isrc)); /* Free unused isrcs and rebind VIRQs and IPIs */ for (isrc_idx = 0; isrc_idx < xen_intr_auto_vector_count; isrc_idx++) { u_int vector; vector = FIRST_EVTCHN_INT + isrc_idx; isrc = (struct xenisrc *)intr_lookup_source(vector); if (isrc != NULL) { isrc->xi_port = 0; switch (isrc->xi_type) { case EVTCHN_TYPE_IPI: xen_rebind_ipi(isrc); break; case EVTCHN_TYPE_VIRQ: xen_rebind_virq(isrc); break; default: break; } } } } /** * Disable a Xen interrupt source. * * \param isrc The interrupt source to disable. */ static void xen_intr_disable_intr(struct intsrc *base_isrc) { struct xenisrc *isrc = (struct xenisrc *)base_isrc; evtchn_mask_port(isrc->xi_port); } /** * Determine the global interrupt vector number for * a Xen interrupt source. * * \param isrc The interrupt source to query. * * \return The vector number corresponding to the given interrupt source. */ static int xen_intr_vector(struct intsrc *base_isrc) { struct xenisrc *isrc = (struct xenisrc *)base_isrc; return (isrc->xi_vector); } /** * Determine whether or not interrupt events are pending on the * the given interrupt source. * * \param isrc The interrupt source to query. * * \returns 0 if no events are pending, otherwise non-zero. */ static int xen_intr_source_pending(struct intsrc *isrc) { /* * EventChannels are edge triggered and never masked. * There can be no pending events. */ return (0); } /** * Perform configuration of an interrupt source. * * \param isrc The interrupt source to configure. * \param trig Edge or level. * \param pol Active high or low. * * \returns 0 if no events are pending, otherwise non-zero. */ static int xen_intr_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { /* Configuration is only possible via the evtchn apis. */ return (ENODEV); } /** * Configure CPU affinity for interrupt source event delivery. * * \param isrc The interrupt source to configure. * \param apic_id The apic id of the CPU for handling future events. * * \returns 0 if successful, otherwise an errno. */ static int xen_intr_assign_cpu(struct intsrc *base_isrc, u_int apic_id) { #ifdef SMP struct evtchn_bind_vcpu bind_vcpu; struct xenisrc *isrc; u_int to_cpu, vcpu_id; int error, masked; if (xen_vector_callback_enabled == 0) return (EOPNOTSUPP); to_cpu = apic_cpuid(apic_id); vcpu_id = pcpu_find(to_cpu)->pc_vcpu_id; xen_intr_intrcnt_add(to_cpu); mtx_lock(&xen_intr_isrc_lock); isrc = (struct xenisrc *)base_isrc; if (!is_valid_evtchn(isrc->xi_port)) { mtx_unlock(&xen_intr_isrc_lock); return (EINVAL); } /* * Mask the event channel while binding it to prevent interrupt * delivery with an inconsistent state in isrc->xi_cpu. */ masked = evtchn_test_and_set_mask(isrc->xi_port); if ((isrc->xi_type == EVTCHN_TYPE_VIRQ) || (isrc->xi_type == EVTCHN_TYPE_IPI)) { /* * Virtual IRQs are associated with a cpu by * the Hypervisor at evtchn_bind_virq time, so * all we need to do is update the per-CPU masks. */ evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port); isrc->xi_cpu = to_cpu; evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port); goto out; } bind_vcpu.port = isrc->xi_port; bind_vcpu.vcpu = vcpu_id; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu); if (isrc->xi_cpu != to_cpu) { if (error == 0) { /* Commit to new binding by removing the old one. */ evtchn_cpu_mask_port(isrc->xi_cpu, isrc->xi_port); isrc->xi_cpu = to_cpu; evtchn_cpu_unmask_port(isrc->xi_cpu, isrc->xi_port); } } out: if (masked == 0) evtchn_unmask_port(isrc->xi_port); mtx_unlock(&xen_intr_isrc_lock); return (0); #else return (EOPNOTSUPP); #endif } /*------------------- Virtual Interrupt Source PIC Functions -----------------*/ /* * Mask a level triggered interrupt source. * * \param isrc The interrupt source to mask (if necessary). * \param eoi If non-zero, perform any necessary end-of-interrupt * acknowledgements. */ static void xen_intr_disable_source(struct intsrc *base_isrc, int eoi) { struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; /* * NB: checking if the event channel is already masked is * needed because the event channel user-space device * masks event channels on its filter as part of its * normal operation, and those shouldn't be automatically * unmasked by the generic interrupt code. The event channel * device will unmask them when needed. */ isrc->xi_masked = !!evtchn_test_and_set_mask(isrc->xi_port); } /* * Unmask a level triggered interrupt source. * * \param isrc The interrupt source to unmask (if necessary). */ static void xen_intr_enable_source(struct intsrc *base_isrc) { struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; if (isrc->xi_masked == 0) evtchn_unmask_port(isrc->xi_port); } /* * Perform any necessary end-of-interrupt acknowledgements. * * \param isrc The interrupt source to EOI. */ static void xen_intr_eoi_source(struct intsrc *base_isrc) { } /* * Enable and unmask the interrupt source. * * \param isrc The interrupt source to enable. */ static void xen_intr_enable_intr(struct intsrc *base_isrc) { struct xenisrc *isrc = (struct xenisrc *)base_isrc; evtchn_unmask_port(isrc->xi_port); } /*------------------ Physical Interrupt Source PIC Functions -----------------*/ /* * Mask a level triggered interrupt source. * * \param isrc The interrupt source to mask (if necessary). * \param eoi If non-zero, perform any necessary end-of-interrupt * acknowledgements. */ static void xen_intr_pirq_disable_source(struct intsrc *base_isrc, int eoi) { struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; if (isrc->xi_edgetrigger == 0) evtchn_mask_port(isrc->xi_port); if (eoi == PIC_EOI) xen_intr_pirq_eoi_source(base_isrc); } /* * Unmask a level triggered interrupt source. * * \param isrc The interrupt source to unmask (if necessary). */ static void xen_intr_pirq_enable_source(struct intsrc *base_isrc) { struct xenisrc *isrc; isrc = (struct xenisrc *)base_isrc; if (isrc->xi_edgetrigger == 0) evtchn_unmask_port(isrc->xi_port); } /* * Perform any necessary end-of-interrupt acknowledgements. * * \param isrc The interrupt source to EOI. */ static void xen_intr_pirq_eoi_source(struct intsrc *base_isrc) { struct xenisrc *isrc; int error; isrc = (struct xenisrc *)base_isrc; if (xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)) { struct physdev_eoi eoi = { .irq = isrc->xi_pirq }; error = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); if (error != 0) panic("Unable to EOI PIRQ#%d: %d\n", isrc->xi_pirq, error); } } /* * Enable and unmask the interrupt source. * * \param isrc The interrupt source to enable. */ static void xen_intr_pirq_enable_intr(struct intsrc *base_isrc) { struct xenisrc *isrc; struct evtchn_bind_pirq bind_pirq; struct physdev_irq_status_query irq_status; int error; isrc = (struct xenisrc *)base_isrc; if (!xen_intr_pirq_eoi_map_enabled) { irq_status.irq = isrc->xi_pirq; error = HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status); if (error) panic("unable to get status of IRQ#%d", isrc->xi_pirq); if (irq_status.flags & XENIRQSTAT_needs_eoi) { /* * Since the dynamic PIRQ EOI map is not available * mark the PIRQ as needing EOI unconditionally. */ xen_set_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map); } } bind_pirq.pirq = isrc->xi_pirq; bind_pirq.flags = isrc->xi_edgetrigger ? 0 : BIND_PIRQ__WILL_SHARE; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); if (error) panic("unable to bind IRQ#%d", isrc->xi_pirq); isrc->xi_port = bind_pirq.port; mtx_lock(&xen_intr_isrc_lock); KASSERT((xen_intr_port_to_isrc[bind_pirq.port] == NULL), ("trying to override an already setup event channel port")); xen_intr_port_to_isrc[bind_pirq.port] = isrc; mtx_unlock(&xen_intr_isrc_lock); evtchn_unmask_port(isrc->xi_port); } /* * Disable an interrupt source. * * \param isrc The interrupt source to disable. */ static void xen_intr_pirq_disable_intr(struct intsrc *base_isrc) { struct xenisrc *isrc; struct evtchn_close close; int error; isrc = (struct xenisrc *)base_isrc; evtchn_mask_port(isrc->xi_port); close.port = isrc->xi_port; error = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); if (error) panic("unable to close event channel %d IRQ#%d", isrc->xi_port, isrc->xi_pirq); mtx_lock(&xen_intr_isrc_lock); xen_intr_port_to_isrc[isrc->xi_port] = NULL; mtx_unlock(&xen_intr_isrc_lock); isrc->xi_port = 0; } /** * Perform configuration of an interrupt source. * * \param isrc The interrupt source to configure. * \param trig Edge or level. * \param pol Active high or low. * * \returns 0 if no events are pending, otherwise non-zero. */ static int xen_intr_pirq_config_intr(struct intsrc *base_isrc, enum intr_trigger trig, enum intr_polarity pol) { struct xenisrc *isrc = (struct xenisrc *)base_isrc; struct physdev_setup_gsi setup_gsi; int error; KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM), ("%s: Conforming trigger or polarity\n", __func__)); setup_gsi.gsi = isrc->xi_pirq; setup_gsi.triggering = trig == INTR_TRIGGER_EDGE ? 0 : 1; setup_gsi.polarity = pol == INTR_POLARITY_HIGH ? 0 : 1; error = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); if (error == -XEN_EEXIST) { if ((isrc->xi_edgetrigger && (trig != INTR_TRIGGER_EDGE)) || (isrc->xi_activehi && (pol != INTR_POLARITY_HIGH))) panic("unable to reconfigure interrupt IRQ#%d", isrc->xi_pirq); error = 0; } if (error) panic("unable to configure IRQ#%d\n", isrc->xi_pirq); isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0; isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0; return (0); } /*--------------------------- Public Functions -------------------------------*/ /*------- API comments for these methods can be found in xen/xenintr.h -------*/ int xen_intr_bind_local_port(device_t dev, evtchn_port_t local_port, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; int error; error = xen_intr_bind_isrc(&isrc, local_port, EVTCHN_TYPE_PORT, device_get_nameunit(dev), filter, handler, arg, flags, port_handlep); if (error != 0) return (error); /* * The Event Channel API didn't open this port, so it is not * responsible for closing it automatically on unbind. */ isrc->xi_close = 0; return (0); } int xen_intr_alloc_and_bind_local_port(device_t dev, u_int remote_domain, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; struct evtchn_alloc_unbound alloc_unbound; int error; alloc_unbound.dom = DOMID_SELF; alloc_unbound.remote_dom = remote_domain; error = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc_unbound); if (error != 0) { /* * XXX Trap Hypercall error code Linuxisms in * the HYPERCALL layer. */ return (-error); } error = xen_intr_bind_isrc(&isrc, alloc_unbound.port, EVTCHN_TYPE_PORT, device_get_nameunit(dev), filter, handler, arg, flags, port_handlep); if (error != 0) { evtchn_close_t close = { .port = alloc_unbound.port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); return (error); } isrc->xi_close = 1; return (0); } int xen_intr_bind_remote_port(device_t dev, u_int remote_domain, u_int remote_port, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; struct evtchn_bind_interdomain bind_interdomain; int error; bind_interdomain.remote_dom = remote_domain; bind_interdomain.remote_port = remote_port; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, &bind_interdomain); if (error != 0) { /* * XXX Trap Hypercall error code Linuxisms in * the HYPERCALL layer. */ return (-error); } error = xen_intr_bind_isrc(&isrc, bind_interdomain.local_port, EVTCHN_TYPE_PORT, device_get_nameunit(dev), filter, handler, arg, flags, port_handlep); if (error) { evtchn_close_t close = { .port = bind_interdomain.local_port }; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); return (error); } /* * The Event Channel API opened this port, so it is * responsible for closing it automatically on unbind. */ isrc->xi_close = 1; return (0); } int xen_intr_bind_virq(device_t dev, u_int virq, u_int cpu, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t *port_handlep) { int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; struct xenisrc *isrc; struct evtchn_bind_virq bind_virq = { .virq = virq, .vcpu = vcpu_id }; int error; /* Ensure the target CPU is ready to handle evtchn interrupts. */ xen_intr_intrcnt_add(cpu); isrc = NULL; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, &bind_virq); if (error != 0) { /* * XXX Trap Hypercall error code Linuxisms in * the HYPERCALL layer. */ return (-error); } error = xen_intr_bind_isrc(&isrc, bind_virq.port, EVTCHN_TYPE_VIRQ, device_get_nameunit(dev), filter, handler, arg, flags, port_handlep); #ifdef SMP if (error == 0) error = intr_event_bind(isrc->xi_intsrc.is_event, cpu); #endif if (error != 0) { evtchn_close_t close = { .port = bind_virq.port }; xen_intr_unbind(*port_handlep); if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); return (error); } #ifdef SMP if (isrc->xi_cpu != cpu) { /* * Too early in the boot process for the generic interrupt * code to perform the binding. Update our event channel * masks manually so events can't fire on the wrong cpu * during AP startup. */ xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); } #endif /* * The Event Channel API opened this port, so it is * responsible for closing it automatically on unbind. */ isrc->xi_close = 1; isrc->xi_virq = virq; return (0); } int xen_intr_alloc_and_bind_ipi(u_int cpu, driver_filter_t filter, enum intr_type flags, xen_intr_handle_t *port_handlep) { #ifdef SMP int vcpu_id = pcpu_find(cpu)->pc_vcpu_id; struct xenisrc *isrc; struct evtchn_bind_ipi bind_ipi = { .vcpu = vcpu_id }; /* Same size as the one used by intr_handler->ih_name. */ char name[MAXCOMLEN + 1]; int error; /* Ensure the target CPU is ready to handle evtchn interrupts. */ xen_intr_intrcnt_add(cpu); isrc = NULL; error = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, &bind_ipi); if (error != 0) { /* * XXX Trap Hypercall error code Linuxisms in * the HYPERCALL layer. */ return (-error); } snprintf(name, sizeof(name), "cpu%u", cpu); error = xen_intr_bind_isrc(&isrc, bind_ipi.port, EVTCHN_TYPE_IPI, name, filter, NULL, NULL, flags, port_handlep); if (error != 0) { evtchn_close_t close = { .port = bind_ipi.port }; xen_intr_unbind(*port_handlep); if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) panic("EVTCHNOP_close failed"); return (error); } if (isrc->xi_cpu != cpu) { /* * Too early in the boot process for the generic interrupt * code to perform the binding. Update our event channel * masks manually so events can't fire on the wrong cpu * during AP startup. */ xen_intr_assign_cpu(&isrc->xi_intsrc, cpu_apic_ids[cpu]); } /* * The Event Channel API opened this port, so it is * responsible for closing it automatically on unbind. */ isrc->xi_close = 1; return (0); #else return (EOPNOTSUPP); #endif } int xen_register_pirq(int vector, enum intr_trigger trig, enum intr_polarity pol) { struct physdev_map_pirq map_pirq; struct xenisrc *isrc; int error; if (vector == 0) return (EINVAL); if (bootverbose) printf("xen: register IRQ#%d\n", vector); map_pirq.domid = DOMID_SELF; map_pirq.type = MAP_PIRQ_TYPE_GSI; map_pirq.index = vector; map_pirq.pirq = vector; error = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_pirq); if (error) { printf("xen: unable to map IRQ#%d\n", vector); return (error); } mtx_lock(&xen_intr_isrc_lock); isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector); mtx_unlock(&xen_intr_isrc_lock); KASSERT((isrc != NULL), ("xen: unable to allocate isrc for interrupt")); isrc->xi_pirq = vector; isrc->xi_activehi = pol == INTR_POLARITY_HIGH ? 1 : 0; isrc->xi_edgetrigger = trig == INTR_TRIGGER_EDGE ? 1 : 0; return (0); } int xen_register_msi(device_t dev, int vector, int count) { struct physdev_map_pirq msi_irq; struct xenisrc *isrc; int ret; memset(&msi_irq, 0, sizeof(msi_irq)); msi_irq.domid = DOMID_SELF; msi_irq.type = count == 1 ? MAP_PIRQ_TYPE_MSI_SEG : MAP_PIRQ_TYPE_MULTI_MSI; msi_irq.index = -1; msi_irq.pirq = -1; msi_irq.bus = pci_get_bus(dev) | (pci_get_domain(dev) << 16); msi_irq.devfn = (pci_get_slot(dev) << 3) | pci_get_function(dev); msi_irq.entry_nr = count; ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &msi_irq); if (ret != 0) return (ret); if (count != msi_irq.entry_nr) { panic("unable to setup all requested MSI vectors " "(expected %d got %d)", count, msi_irq.entry_nr); } mtx_lock(&xen_intr_isrc_lock); for (int i = 0; i < count; i++) { isrc = xen_intr_alloc_isrc(EVTCHN_TYPE_PIRQ, vector + i); KASSERT(isrc != NULL, ("xen: unable to allocate isrc for interrupt")); isrc->xi_pirq = msi_irq.pirq + i; /* MSI interrupts are always edge triggered */ isrc->xi_edgetrigger = 1; } mtx_unlock(&xen_intr_isrc_lock); return (0); } int xen_release_msi(int vector) { struct physdev_unmap_pirq unmap; struct xenisrc *isrc; int ret; isrc = (struct xenisrc *)intr_lookup_source(vector); if (isrc == NULL) return (ENXIO); unmap.pirq = isrc->xi_pirq; ret = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap); if (ret != 0) return (ret); xen_intr_release_isrc(isrc); return (0); } int xen_intr_describe(xen_intr_handle_t port_handle, const char *fmt, ...) { char descr[MAXCOMLEN + 1]; struct xenisrc *isrc; va_list ap; isrc = xen_intr_isrc(port_handle); if (isrc == NULL) return (EINVAL); va_start(ap, fmt); vsnprintf(descr, sizeof(descr), fmt, ap); va_end(ap); return (intr_describe(isrc->xi_vector, isrc->xi_cookie, descr)); } void xen_intr_unbind(xen_intr_handle_t *port_handlep) { struct xenisrc *isrc; KASSERT(port_handlep != NULL, ("NULL xen_intr_handle_t passed to xen_intr_unbind")); isrc = xen_intr_isrc(*port_handlep); *port_handlep = NULL; if (isrc == NULL) return; mtx_lock(&xen_intr_isrc_lock); if (refcount_release(&isrc->xi_refcount) == 0) { mtx_unlock(&xen_intr_isrc_lock); return; } mtx_unlock(&xen_intr_isrc_lock); if (isrc->xi_cookie != NULL) intr_remove_handler(isrc->xi_cookie); xen_intr_release_isrc(isrc); } void xen_intr_signal(xen_intr_handle_t handle) { struct xenisrc *isrc; isrc = xen_intr_isrc(handle); if (isrc != NULL) { KASSERT(isrc->xi_type == EVTCHN_TYPE_PORT || isrc->xi_type == EVTCHN_TYPE_IPI, ("evtchn_signal on something other than a local port")); struct evtchn_send send = { .port = isrc->xi_port }; (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send); } } evtchn_port_t xen_intr_port(xen_intr_handle_t handle) { struct xenisrc *isrc; isrc = xen_intr_isrc(handle); if (isrc == NULL) return (0); return (isrc->xi_port); } int xen_intr_add_handler(const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t handle) { struct xenisrc *isrc; int error; isrc = xen_intr_isrc(handle); if (isrc == NULL || isrc->xi_cookie != NULL) return (EINVAL); error = intr_add_handler(name, isrc->xi_vector,filter, handler, arg, - flags|INTR_EXCL, &isrc->xi_cookie); + flags|INTR_EXCL, &isrc->xi_cookie, 0); if (error != 0) { printf( "%s: xen_intr_add_handler: intr_add_handler failed: %d\n", name, error); } return (error); } int xen_intr_get_evtchn_from_port(evtchn_port_t port, xen_intr_handle_t *handlep) { if (!is_valid_evtchn(port) || port >= NR_EVENT_CHANNELS) return (EINVAL); if (handlep == NULL) { return (EINVAL); } mtx_lock(&xen_intr_isrc_lock); if (xen_intr_port_to_isrc[port] == NULL) { mtx_unlock(&xen_intr_isrc_lock); return (EINVAL); } refcount_acquire(&xen_intr_port_to_isrc[port]->xi_refcount); mtx_unlock(&xen_intr_isrc_lock); /* Assign the opaque handler (the event channel port) */ *handlep = &xen_intr_port_to_isrc[port]->xi_vector; return (0); } #ifdef DDB static const char * xen_intr_print_type(enum evtchn_type type) { static const char *evtchn_type_to_string[EVTCHN_TYPE_COUNT] = { [EVTCHN_TYPE_UNBOUND] = "UNBOUND", [EVTCHN_TYPE_PIRQ] = "PIRQ", [EVTCHN_TYPE_VIRQ] = "VIRQ", [EVTCHN_TYPE_IPI] = "IPI", [EVTCHN_TYPE_PORT] = "PORT", }; if (type >= EVTCHN_TYPE_COUNT) return ("UNKNOWN"); return (evtchn_type_to_string[type]); } static void xen_intr_dump_port(struct xenisrc *isrc) { struct xen_intr_pcpu_data *pcpu; shared_info_t *s = HYPERVISOR_shared_info; int i; db_printf("Port %d Type: %s\n", isrc->xi_port, xen_intr_print_type(isrc->xi_type)); if (isrc->xi_type == EVTCHN_TYPE_PIRQ) { db_printf("\tPirq: %d ActiveHi: %d EdgeTrigger: %d " "NeedsEOI: %d\n", isrc->xi_pirq, isrc->xi_activehi, isrc->xi_edgetrigger, !!xen_test_bit(isrc->xi_pirq, xen_intr_pirq_eoi_map)); } if (isrc->xi_type == EVTCHN_TYPE_VIRQ) db_printf("\tVirq: %d\n", isrc->xi_virq); db_printf("\tMasked: %d Pending: %d\n", !!xen_test_bit(isrc->xi_port, &s->evtchn_mask[0]), !!xen_test_bit(isrc->xi_port, &s->evtchn_pending[0])); db_printf("\tPer-CPU Masks: "); CPU_FOREACH(i) { pcpu = DPCPU_ID_PTR(i, xen_intr_pcpu); db_printf("cpu#%d: %d ", i, !!xen_test_bit(isrc->xi_port, pcpu->evtchn_enabled)); } db_printf("\n"); } DB_SHOW_COMMAND(xen_evtchn, db_show_xen_evtchn) { int i; if (!xen_domain()) { db_printf("Only available on Xen guests\n"); return; } for (i = 0; i < NR_EVENT_CHANNELS; i++) { struct xenisrc *isrc; isrc = xen_intr_port_to_isrc[i]; if (isrc == NULL) continue; xen_intr_dump_port(isrc); } } #endif /* DDB */