Index: head/sys/amd64/include/intr_machdep.h
===================================================================
--- head/sys/amd64/include/intr_machdep.h	(revision 304636)
+++ head/sys/amd64/include/intr_machdep.h	(revision 304637)
@@ -1,194 +1,195 @@
/*-
 * Copyright (c) 2003 John Baldwin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef __MACHINE_INTR_MACHDEP_H__
#define __MACHINE_INTR_MACHDEP_H__

#ifdef _KERNEL

/*
 * The maximum number of I/O interrupts we allow.  This number is rather
 * arbitrary as it is just the maximum IRQ resource value.  The interrupt
 * source for a given IRQ maps that I/O interrupt to a device interrupt
 * source, whether it be a pin on an interrupt controller or an MSI interrupt.
 * The 16 ISA IRQs are assigned fixed IDT vectors, but all other device
 * interrupts allocate IDT vectors on demand.  Currently we have 191 IDT
 * vectors available for device interrupts.  On many systems with I/O APICs,
 * a lot of the IRQs are not used, so this number can be much larger than
 * 191 and still be safe since only interrupt sources in actual use will
 * allocate IDT vectors.
 *
 * The first 255 IRQs (0 - 254) are reserved for ISA IRQs and PCI intline IRQs.
 * IRQ values from 256 to 767 are used by MSI.  When running under the Xen
 * Hypervisor, IRQ values from 768 to 4863 are available for binding to
 * event channel events.  We leave 255 unused to avoid confusion since 255 is
 * used in PCI to indicate an invalid IRQ.
 */
#define NUM_MSI_INTS	512
#define FIRST_MSI_INT	256
#ifdef XENHVM
#include
#include
#define NUM_EVTCHN_INTS	NR_EVENT_CHANNELS
#define FIRST_EVTCHN_INT \
    (FIRST_MSI_INT + NUM_MSI_INTS)
#define LAST_EVTCHN_INT \
    (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1)
#else
#define NUM_EVTCHN_INTS	0
#endif
#define NUM_IO_INTS	(FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS)
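/*
 * [Editorial sketch] To make the IRQ number space above concrete: a
 * minimal, hypothetical helper (not part of this change; the name is
 * invented for illustration) classifying an IRQ cookie into the MSI
 * range, i.e. IRQs 256-767 given the definitions above.
 */
static inline int
irq_is_msi(u_int irq)
{

	return (irq >= FIRST_MSI_INT && irq < FIRST_MSI_INT + NUM_MSI_INTS);
}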
/*
 * Default base address for MSI messages on x86 platforms.
 */
#define MSI_INTEL_ADDR_BASE	0xfee00000

/*
 * - 1 ??? dummy counter.
 * - 2 counters for each I/O interrupt.
 * - 1 counter for each CPU for lapic timer.
 * - 8 counters for each CPU for IPI counters for SMP.
 */
#ifdef SMP
#define INTRCNT_COUNT	(1 + NUM_IO_INTS * 2 + (1 + 8) * MAXCPU)
#else
#define INTRCNT_COUNT	(1 + NUM_IO_INTS * 2 + 1)
#endif

#ifndef LOCORE

typedef void inthand_t(void);

#define IDTVEC(name)	__CONCAT(X,name)

struct intsrc;

/*
 * Methods that a PIC provides to mask/unmask a given interrupt source,
 * "turn on" the interrupt on the CPU side by setting up an IDT entry, and
 * return the vector associated with this source.
 */
struct pic {
	void (*pic_enable_source)(struct intsrc *);
	void (*pic_disable_source)(struct intsrc *, int);
	void (*pic_eoi_source)(struct intsrc *);
	void (*pic_enable_intr)(struct intsrc *);
	void (*pic_disable_intr)(struct intsrc *);
	int (*pic_vector)(struct intsrc *);
	int (*pic_source_pending)(struct intsrc *);
	void (*pic_suspend)(struct pic *);
	void (*pic_resume)(struct pic *, bool suspend_cancelled);
	int (*pic_config_intr)(struct intsrc *, enum intr_trigger,
	    enum intr_polarity);
	int (*pic_assign_cpu)(struct intsrc *, u_int apic_id);
	void (*pic_reprogram_pin)(struct intsrc *);
	TAILQ_ENTRY(pic) pics;
};

/* Flags for pic_disable_source() */
enum {
	PIC_EOI,
	PIC_NO_EOI,
};

/*
 * An interrupt source.  The upper-layer code uses the PIC methods to
 * control a given source.  The lower-layer PIC drivers can store additional
 * private data in a given interrupt source such as an interrupt pin number
 * or an I/O APIC pointer.
 */
struct intsrc {
	struct pic *is_pic;
	struct intr_event *is_event;
	u_long *is_count;
	u_long *is_straycount;
	u_int is_index;
	u_int is_handlers;
};

struct trapframe;

/*
 * The following data structure holds per-cpu data, and is placed just
 * above the top of the space used for the NMI stack.
 */
struct nmi_pcpu {
	register_t np_pcpu;
	register_t __padding;	/* pad to 16 bytes */
};

#ifdef SMP
extern cpuset_t intr_cpus;
#endif
extern struct mtx icu_lock;
extern int elcr_found;
-
+#ifdef SMP
extern int msix_disable_migration;
+#endif

#ifndef DEV_ATPIC
void atpic_reset(void);
#endif
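/*
 * [Editorial sketch] How a lower-layer driver plugs into the struct pic
 * interface above: fill in the methods it implements and register the
 * table.  A hypothetical, pared-down example (names invented); methods a
 * PIC never needs may be left NULL, as msi_pic in msi.c below does for
 * suspend/resume.
 */
static void
dummy_enable_source(struct intsrc *isrc)
{
	/* Unmask the pin/message behind 'isrc' here. */
}

static struct pic dummy_pic = {
	.pic_enable_source = dummy_enable_source,
};

static void
dummy_pic_attach(void)
{

	(void)intr_register_pic(&dummy_pic);
}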
/*
 * XXX: The elcr_* prototypes probably belong somewhere else.
 */
int elcr_probe(void);
enum intr_trigger elcr_read_trigger(u_int irq);
void elcr_resume(void);
void elcr_write_trigger(u_int irq, enum intr_trigger trigger);
#ifdef SMP
void intr_add_cpu(u_int cpu);
#endif
int intr_add_handler(const char *name, int vector, driver_filter_t filter,
    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep);
#ifdef SMP
int intr_bind(u_int vector, u_char cpu);
#endif
int intr_config_intr(int vector, enum intr_trigger trig,
    enum intr_polarity pol);
int intr_describe(u_int vector, void *ih, const char *descr);
void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame);
u_int intr_next_cpu(void);
struct intsrc *intr_lookup_source(int vector);
int intr_register_pic(struct pic *pic);
int intr_register_source(struct intsrc *isrc);
int intr_remove_handler(void *cookie);
void intr_resume(bool suspend_cancelled);
void intr_suspend(void);
void intr_reprogram(void);
void intrcnt_add(const char *name, u_long **countp);
void nexus_add_irq(u_long irq);
int msi_alloc(device_t dev, int count, int maxcount, int *irqs);
void msi_init(void);
int msi_map(int irq, uint64_t *addr, uint32_t *data);
int msi_release(int *irqs, int count);
int msix_alloc(device_t dev, int *irq);
int msix_release(int irq);

#endif /* !LOCORE */
#endif /* _KERNEL */
#endif /* !__MACHINE_INTR_MACHDEP_H__ */
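/*
 * [Editorial sketch] Typical life cycle of the MSI API declared above, as
 * a bus driver might drive it.  Hypothetical fragment (function name
 * invented, error handling abbreviated); 'dev' is the owning device_t.
 */
static int
example_msi_setup(device_t dev)
{
	uint64_t addr;
	uint32_t data;
	int irqs[2], error;

	error = msi_alloc(dev, 2, 2, irqs);	/* two contiguous messages */
	if (error != 0)
		return (error);
	error = msi_map(irqs[0], &addr, &data);	/* values to program into */
	if (error != 0)				/* the MSI capability */
		(void)msi_release(irqs, 2);
	return (error);
}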
Index: head/sys/i386/include/intr_machdep.h
===================================================================
--- head/sys/i386/include/intr_machdep.h	(revision 304636)
+++ head/sys/i386/include/intr_machdep.h	(revision 304637)
@@ -1,184 +1,185 @@
/*-
 * Copyright (c) 2003 John Baldwin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#ifndef __MACHINE_INTR_MACHDEP_H__
#define __MACHINE_INTR_MACHDEP_H__

#ifdef _KERNEL

/*
 * The maximum number of I/O interrupts we allow.  This number is rather
 * arbitrary as it is just the maximum IRQ resource value.  The interrupt
 * source for a given IRQ maps that I/O interrupt to a device interrupt
 * source, whether it be a pin on an interrupt controller or an MSI interrupt.
 * The 16 ISA IRQs are assigned fixed IDT vectors, but all other device
 * interrupts allocate IDT vectors on demand.  Currently we have 191 IDT
 * vectors available for device interrupts.  On many systems with I/O APICs,
 * a lot of the IRQs are not used, so this number can be much larger than
 * 191 and still be safe since only interrupt sources in actual use will
 * allocate IDT vectors.
 *
 * The first 255 IRQs (0 - 254) are reserved for ISA IRQs and PCI intline IRQs.
 * IRQ values from 256 to 767 are used by MSI.  When running under the Xen
 * Hypervisor, IRQ values from 768 to 4863 are available for binding to
 * event channel events.  We leave 255 unused to avoid confusion since 255 is
 * used in PCI to indicate an invalid IRQ.
 */
#define NUM_MSI_INTS	512
#define FIRST_MSI_INT	256
#ifdef XENHVM
#include
#include
#define NUM_EVTCHN_INTS	NR_EVENT_CHANNELS
#define FIRST_EVTCHN_INT \
    (FIRST_MSI_INT + NUM_MSI_INTS)
#define LAST_EVTCHN_INT \
    (FIRST_EVTCHN_INT + NUM_EVTCHN_INTS - 1)
#else /* !XENHVM */
#define NUM_EVTCHN_INTS	0
#endif
#define NUM_IO_INTS	(FIRST_MSI_INT + NUM_MSI_INTS + NUM_EVTCHN_INTS)

/*
 * Default base address for MSI messages on x86 platforms.
 */
#define MSI_INTEL_ADDR_BASE	0xfee00000

/*
 * - 1 ??? dummy counter.
 * - 2 counters for each I/O interrupt.
 * - 1 counter for each CPU for lapic timer.
 * - 9 counters for each CPU for IPI counters for SMP.
 */
#ifdef SMP
#define INTRCNT_COUNT	(1 + NUM_IO_INTS * 2 + (1 + 9) * MAXCPU)
#else
#define INTRCNT_COUNT	(1 + NUM_IO_INTS * 2 + 1)
#endif

#ifndef LOCORE

typedef void inthand_t(void);

#define IDTVEC(name)	__CONCAT(X,name)

struct intsrc;

/*
 * Methods that a PIC provides to mask/unmask a given interrupt source,
 * "turn on" the interrupt on the CPU side by setting up an IDT entry, and
 * return the vector associated with this source.
 */
struct pic {
	void (*pic_enable_source)(struct intsrc *);
	void (*pic_disable_source)(struct intsrc *, int);
	void (*pic_eoi_source)(struct intsrc *);
	void (*pic_enable_intr)(struct intsrc *);
	void (*pic_disable_intr)(struct intsrc *);
	int (*pic_vector)(struct intsrc *);
	int (*pic_source_pending)(struct intsrc *);
	void (*pic_suspend)(struct pic *);
	void (*pic_resume)(struct pic *, bool suspend_cancelled);
	int (*pic_config_intr)(struct intsrc *, enum intr_trigger,
	    enum intr_polarity);
	int (*pic_assign_cpu)(struct intsrc *, u_int apic_id);
	void (*pic_reprogram_pin)(struct intsrc *);
	TAILQ_ENTRY(pic) pics;
};

/* Flags for pic_disable_source() */
enum {
	PIC_EOI,
	PIC_NO_EOI,
};

/*
 * An interrupt source.  The upper-layer code uses the PIC methods to
 * control a given source.  The lower-layer PIC drivers can store additional
 * private data in a given interrupt source such as an interrupt pin number
 * or an I/O APIC pointer.
 */
struct intsrc {
	struct pic *is_pic;
	struct intr_event *is_event;
	u_long *is_count;
	u_long *is_straycount;
	u_int is_index;
	u_int is_handlers;
};

struct trapframe;

#ifdef SMP
extern cpuset_t intr_cpus;
#endif
extern struct mtx icu_lock;
extern int elcr_found;
-
+#ifdef SMP
extern int msix_disable_migration;
+#endif

#ifndef DEV_ATPIC
void atpic_reset(void);
#endif
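/*
 * [Editorial note] Worked example of the SMP INTRCNT_COUNT formula above,
 * assuming XENHVM is off (NUM_IO_INTS = 256 + 512 = 768) and MAXCPU = 32:
 *
 *	1 + 768 * 2 + (1 + 9) * 32 = 1 + 1536 + 320 = 1857 counters.
 *
 * The i386 kernel reserves one more IPI counter per CPU than amd64 (9 vs.
 * 8), which is the only difference between the two formulas.
 */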
/*
 * XXX: The elcr_* prototypes probably belong somewhere else.
 */
int elcr_probe(void);
enum intr_trigger elcr_read_trigger(u_int irq);
void elcr_resume(void);
void elcr_write_trigger(u_int irq, enum intr_trigger trigger);
#ifdef SMP
void intr_add_cpu(u_int cpu);
#endif
int intr_add_handler(const char *name, int vector, driver_filter_t filter,
    driver_intr_t handler, void *arg, enum intr_type flags, void **cookiep);
#ifdef SMP
int intr_bind(u_int vector, u_char cpu);
#endif
int intr_config_intr(int vector, enum intr_trigger trig,
    enum intr_polarity pol);
int intr_describe(u_int vector, void *ih, const char *descr);
void intr_execute_handlers(struct intsrc *isrc, struct trapframe *frame);
u_int intr_next_cpu(void);
struct intsrc *intr_lookup_source(int vector);
int intr_register_pic(struct pic *pic);
int intr_register_source(struct intsrc *isrc);
int intr_remove_handler(void *cookie);
void intr_resume(bool suspend_cancelled);
void intr_suspend(void);
void intr_reprogram(void);
void intrcnt_add(const char *name, u_long **countp);
void nexus_add_irq(u_long irq);
int msi_alloc(device_t dev, int count, int maxcount, int *irqs);
void msi_init(void);
int msi_map(int irq, uint64_t *addr, uint32_t *data);
int msi_release(int *irqs, int count);
int msix_alloc(device_t dev, int *irq);
int msix_release(int irq);

#endif /* !LOCORE */
#endif /* _KERNEL */
#endif /* !__MACHINE_INTR_MACHDEP_H__ */
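/*
 * [Editorial note] The MSI message encoding used by msi.c below, worked in
 * isolation.  For a local APIC ID of 2 and IDT vector 0x60, the values a
 * device would be programmed with are:
 *
 *	address = 0xfee00000 | (2 << 12) = 0xfee02000
 *	data    = edge trigger | fixed delivery | 0x60
 *
 * Illustrative only; the INTEL_ADDR()/INTEL_DATA() macros in msi.c are
 * the authoritative encoding.
 */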
Index: head/sys/x86/x86/msi.c
===================================================================
--- head/sys/x86/x86/msi.c	(revision 304636)
+++ head/sys/x86/x86/msi.c	(revision 304637)
@@ -1,724 +1,730 @@
/*-
 * Copyright (c) 2006 Yahoo!, Inc.
 * All rights reserved.
 * Written by: John Baldwin
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Support for PCI Message Signalled Interrupts (MSI).  MSI interrupts on
 * x86 are basically APIC messages that the northbridge delivers directly
 * to the local APICs as if they had come from an I/O APIC.
 */

#include
__FBSDID("$FreeBSD$");

#include "opt_acpi.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* Fields in address for Intel MSI messages. */
#define MSI_INTEL_ADDR_DEST		0x000ff000
#define MSI_INTEL_ADDR_RH		0x00000008
# define MSI_INTEL_ADDR_RH_ON		0x00000008
# define MSI_INTEL_ADDR_RH_OFF		0x00000000
#define MSI_INTEL_ADDR_DM		0x00000004
# define MSI_INTEL_ADDR_DM_PHYSICAL	0x00000000
# define MSI_INTEL_ADDR_DM_LOGICAL	0x00000004

/* Fields in data for Intel MSI messages. */
#define MSI_INTEL_DATA_TRGRMOD	IOART_TRGRMOD	/* Trigger mode. */
# define MSI_INTEL_DATA_TRGREDG	IOART_TRGREDG
# define MSI_INTEL_DATA_TRGRLVL	IOART_TRGRLVL
#define MSI_INTEL_DATA_LEVEL	0x00004000	/* Polarity. */
# define MSI_INTEL_DATA_DEASSERT	0x00000000
# define MSI_INTEL_DATA_ASSERT	0x00004000
#define MSI_INTEL_DATA_DELMOD	IOART_DELMOD	/* Delivery mode. */
# define MSI_INTEL_DATA_DELFIXED	IOART_DELFIXED
# define MSI_INTEL_DATA_DELLOPRI	IOART_DELLOPRI
# define MSI_INTEL_DATA_DELSMI	IOART_DELSMI
# define MSI_INTEL_DATA_DELNMI	IOART_DELNMI
# define MSI_INTEL_DATA_DELINIT	IOART_DELINIT
# define MSI_INTEL_DATA_DELEXINT	IOART_DELEXINT
#define MSI_INTEL_DATA_INTVEC	IOART_INTVEC	/* Interrupt vector. */

/*
 * Build Intel MSI message and data values from a source.  AMD64 systems
 * seem to be compatible, so we use the same function for both.
 */
#define INTEL_ADDR(msi) \
	(MSI_INTEL_ADDR_BASE | (msi)->msi_cpu << 12 | \
	    MSI_INTEL_ADDR_RH_OFF | MSI_INTEL_ADDR_DM_PHYSICAL)
#define INTEL_DATA(msi) \
	(MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED | (msi)->msi_vector)

static MALLOC_DEFINE(M_MSI, "msi", "PCI MSI");

/*
 * MSI sources are bunched into groups.  This is because MSI forces
 * all of the messages to share the address and data registers and
 * thus certain properties (such as the local APIC ID target on x86).
 * Each group has a 'first' source that contains information global to
 * the group.  These fields are marked with (g) below.
 *
 * Note that local APIC ID is kind of special.  Each message will be
 * assigned an ID by the system; however, a group will use the ID from
 * the first message.
 *
 * For MSI-X, each message is isolated.
 */
struct msi_intsrc {
	struct intsrc msi_intsrc;
	device_t msi_dev;		/* Owning device. (g) */
	struct msi_intsrc *msi_first;	/* First source in group. */
	u_int msi_irq;			/* IRQ cookie. */
	u_int msi_msix;			/* MSI-X message. */
	u_int msi_vector:8;		/* IDT vector. */
	u_int msi_cpu;			/* Local APIC ID. (g) */
	u_int msi_count:8;		/* Messages in this group. (g) */
	u_int msi_maxcount:8;		/* Alignment for this group. (g) */
	int *msi_irqs;			/* Group's IRQ list. (g) */
	u_int msi_remap_cookie;
};
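/*
 * [Editorial sketch] Group layout implied by the comment above: for a
 * 4-message MSI group rooted at IRQ 256, msi_first of every member points
 * at the IRQ 256 source, and only that source carries the (g) fields:
 *
 *	IRQ 256:      msi_first = &irq256, msi_count = 4, msi_maxcount = 4,
 *	              msi_irqs = {256, 257, 258, 259}
 *	IRQ 257..259: msi_first = &irq256, msi_count = 0, msi_irqs = NULL
 *
 * Members share the group's target CPU and get consecutive IDT vectors
 * (vector + i), which msi_assign_cpu() below maintains on migration.
 */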
static void msi_create_source(void);
static void msi_enable_source(struct intsrc *isrc);
static void msi_disable_source(struct intsrc *isrc, int eoi);
static void msi_eoi_source(struct intsrc *isrc);
static void msi_enable_intr(struct intsrc *isrc);
static void msi_disable_intr(struct intsrc *isrc);
static int msi_vector(struct intsrc *isrc);
static int msi_source_pending(struct intsrc *isrc);
static int msi_config_intr(struct intsrc *isrc, enum intr_trigger trig,
    enum intr_polarity pol);
static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id);

struct pic msi_pic = {
	.pic_enable_source = msi_enable_source,
	.pic_disable_source = msi_disable_source,
	.pic_eoi_source = msi_eoi_source,
	.pic_enable_intr = msi_enable_intr,
	.pic_disable_intr = msi_disable_intr,
	.pic_vector = msi_vector,
	.pic_source_pending = msi_source_pending,
	.pic_suspend = NULL,
	.pic_resume = NULL,
	.pic_config_intr = msi_config_intr,
	.pic_assign_cpu = msi_assign_cpu,
	.pic_reprogram_pin = NULL,
};

+#ifdef SMP
/**
 * Xen hypervisors prior to 4.6.0 do not properly handle updates to
 * enabled MSI-X table entries.  Allow migration of MSI-X interrupts
 * to be disabled via a tunable.  Values have the following meaning:
 *
 * -1: automatic detection by FreeBSD
 *  0: enable migration
 *  1: disable migration
 */
int msix_disable_migration = -1;
SYSCTL_INT(_machdep, OID_AUTO, disable_msix_migration, CTLFLAG_RDTUN,
    &msix_disable_migration, 0,
    "Disable migration of MSI-X interrupts between CPUs");
+#endif
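/*
 * [Editorial note] Because the sysctl above is CTLFLAG_RDTUN, the knob is
 * read-only at runtime and is set as a loader tunable instead, e.g. in
 * /boot/loader.conf:
 *
 *	machdep.disable_msix_migration=1
 */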
"MSI-X" : "MSI", msi->msi_irq, msi->msi_cpu, msi->msi_vector); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); sib->msi_cpu = apic_id; sib->msi_vector = vector + i; if (sib->msi_intsrc.is_handlers > 0) apic_enable_vector(sib->msi_cpu, sib->msi_vector); if (bootverbose) printf( "msi: Assigning MSI IRQ %d to local APIC %u vector %u\n", sib->msi_irq, sib->msi_cpu, sib->msi_vector); } BUS_REMAP_INTR(device_get_parent(msi->msi_dev), msi->msi_dev, msi->msi_irq); /* * Free the old vector after the new one is established. This is done * to prevent races where we could miss an interrupt. */ if (msi->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector); apic_free_vector(old_id, old_vector, msi->msi_irq); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); if (sib->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector + i); apic_free_vector(old_id, old_vector + i, msi->msi_irqs[i]); } return (0); } void msi_init(void) { /* Check if we have a supported CPU. */ switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: case CPU_VENDOR_AMD: break; case CPU_VENDOR_CENTAUR: if (CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf) break; /* FALLTHROUGH */ default: return; } +#ifdef SMP if (msix_disable_migration == -1) { /* The default is to allow migration of MSI-X interrupts. */ msix_disable_migration = 0; } +#endif msi_enabled = 1; intr_register_pic(&msi_pic); mtx_init(&msi_lock, "msi", NULL, MTX_DEF); } static void msi_create_source(void) { struct msi_intsrc *msi; u_int irq; mtx_lock(&msi_lock); if (msi_last_irq >= NUM_MSI_INTS) { mtx_unlock(&msi_lock); return; } irq = msi_last_irq + FIRST_MSI_INT; msi_last_irq++; mtx_unlock(&msi_lock); msi = malloc(sizeof(struct msi_intsrc), M_MSI, M_WAITOK | M_ZERO); msi->msi_intsrc.is_pic = &msi_pic; msi->msi_irq = irq; intr_register_source(&msi->msi_intsrc); nexus_add_irq(irq); } /* * Try to allocate 'count' interrupt sources with contiguous IDT values. */ int msi_alloc(device_t dev, int count, int maxcount, int *irqs) { struct msi_intsrc *msi, *fsrc; u_int cpu; int cnt, i, *mirqs, vector; #ifdef ACPI_DMAR u_int cookies[count]; int error; #endif if (!msi_enabled) return (ENXIO); if (count > 1) mirqs = malloc(count * sizeof(*mirqs), M_MSI, M_WAITOK); else mirqs = NULL; again: mtx_lock(&msi_lock); /* Try to find 'count' free IRQs. */ cnt = 0; for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ if (msi == NULL) break; /* If this is a free one, save its IRQ in the array. */ if (msi->msi_dev == NULL) { irqs[cnt] = i; cnt++; if (cnt == count) break; } } /* Do we need to create some new sources? */ if (cnt < count) { /* If we would exceed the max, give up. */ if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENXIO); } mtx_unlock(&msi_lock); /* We need count - cnt more sources. */ while (cnt < count) { msi_create_source(); cnt++; } goto again; } /* Ok, we now have the IRQs allocated. */ KASSERT(cnt == count, ("count mismatch")); /* Allocate 'count' IDT vectors. 
/*
 * Try to allocate 'count' interrupt sources with contiguous IDT values.
 */
int
msi_alloc(device_t dev, int count, int maxcount, int *irqs)
{
	struct msi_intsrc *msi, *fsrc;
	u_int cpu;
	int cnt, i, *mirqs, vector;
#ifdef ACPI_DMAR
	u_int cookies[count];
	int error;
#endif

	if (!msi_enabled)
		return (ENXIO);

	if (count > 1)
		mirqs = malloc(count * sizeof(*mirqs), M_MSI, M_WAITOK);
	else
		mirqs = NULL;
again:
	mtx_lock(&msi_lock);

	/* Try to find 'count' free IRQs. */
	cnt = 0;
	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
		msi = (struct msi_intsrc *)intr_lookup_source(i);

		/* End of allocated sources, so break. */
		if (msi == NULL)
			break;

		/* If this is a free one, save its IRQ in the array. */
		if (msi->msi_dev == NULL) {
			irqs[cnt] = i;
			cnt++;
			if (cnt == count)
				break;
		}
	}

	/* Do we need to create some new sources? */
	if (cnt < count) {
		/* If we would exceed the max, give up. */
		if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) {
			mtx_unlock(&msi_lock);
			free(mirqs, M_MSI);
			return (ENXIO);
		}
		mtx_unlock(&msi_lock);

		/* We need count - cnt more sources. */
		while (cnt < count) {
			msi_create_source();
			cnt++;
		}
		goto again;
	}

	/* Ok, we now have the IRQs allocated. */
	KASSERT(cnt == count, ("count mismatch"));

	/* Allocate 'count' IDT vectors. */
	cpu = intr_next_cpu();
	vector = apic_alloc_vectors(cpu, irqs, count, maxcount);
	if (vector == 0) {
		mtx_unlock(&msi_lock);
		free(mirqs, M_MSI);
		return (ENOSPC);
	}

#ifdef ACPI_DMAR
	mtx_unlock(&msi_lock);
	error = iommu_alloc_msi_intr(dev, cookies, count);
	mtx_lock(&msi_lock);
	if (error == EOPNOTSUPP)
		error = 0;
	if (error != 0) {
		for (i = 0; i < count; i++)
			apic_free_vector(cpu, vector + i, irqs[i]);
		free(mirqs, M_MSI);
		return (error);
	}
	for (i = 0; i < count; i++) {
		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
		msi->msi_remap_cookie = cookies[i];
	}
#endif

	/* Assign IDT vectors and make these messages owned by 'dev'. */
	fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
	for (i = 0; i < count; i++) {
		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
		msi->msi_cpu = cpu;
		msi->msi_dev = dev;
		msi->msi_vector = vector + i;
		if (bootverbose)
			printf(
		    "msi: routing MSI IRQ %d to local APIC %u vector %u\n",
			    msi->msi_irq, msi->msi_cpu, msi->msi_vector);
		msi->msi_first = fsrc;
		KASSERT(msi->msi_intsrc.is_handlers == 0,
		    ("dead MSI has handlers"));
	}
	fsrc->msi_count = count;
	fsrc->msi_maxcount = maxcount;
	if (count > 1)
		bcopy(irqs, mirqs, count * sizeof(*mirqs));
	fsrc->msi_irqs = mirqs;
	mtx_unlock(&msi_lock);
	return (0);
}

int
msi_release(int *irqs, int count)
{
	struct msi_intsrc *msi, *first;
	int i;

	mtx_lock(&msi_lock);
	first = (struct msi_intsrc *)intr_lookup_source(irqs[0]);
	if (first == NULL) {
		mtx_unlock(&msi_lock);
		return (ENOENT);
	}

	/* Make sure this isn't an MSI-X message. */
	if (first->msi_msix) {
		mtx_unlock(&msi_lock);
		return (EINVAL);
	}

	/* Make sure this message is allocated to a group. */
	if (first->msi_first == NULL) {
		mtx_unlock(&msi_lock);
		return (ENXIO);
	}

	/*
	 * Make sure this is the start of a group and that we are releasing
	 * the entire group.
	 */
	if (first->msi_first != first || first->msi_count != count) {
		mtx_unlock(&msi_lock);
		return (EINVAL);
	}
	KASSERT(first->msi_dev != NULL, ("unowned group"));

	/* Clear all the extra messages in the group. */
	for (i = 1; i < count; i++) {
		msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]);
		KASSERT(msi->msi_first == first, ("message not in group"));
		KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch"));
#ifdef ACPI_DMAR
		iommu_unmap_msi_intr(first->msi_dev, msi->msi_remap_cookie);
#endif
		msi->msi_first = NULL;
		msi->msi_dev = NULL;
		apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
		msi->msi_vector = 0;
	}

	/* Clear out the first message. */
#ifdef ACPI_DMAR
	mtx_unlock(&msi_lock);
	iommu_unmap_msi_intr(first->msi_dev, first->msi_remap_cookie);
	mtx_lock(&msi_lock);
#endif
	first->msi_first = NULL;
	first->msi_dev = NULL;
	apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq);
	first->msi_vector = 0;
	first->msi_count = 0;
	first->msi_maxcount = 0;
	free(first->msi_irqs, M_MSI);
	first->msi_irqs = NULL;

	mtx_unlock(&msi_lock);
	return (0);
}
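/*
 * [Editorial note] msi_release() above deliberately refuses partial
 * frees: the caller must pass the first IRQ of a group together with the
 * group's full count, mirroring how PCI MSI enables or disables all
 * messages of a function at once.  For example, releasing only {257} out
 * of a {256..259} group fails with EINVAL.
 */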
int
msi_map(int irq, uint64_t *addr, uint32_t *data)
{
	struct msi_intsrc *msi;
	int error;
#ifdef ACPI_DMAR
	struct msi_intsrc *msi1;
	int i, k;
#endif

	mtx_lock(&msi_lock);
	msi = (struct msi_intsrc *)intr_lookup_source(irq);
	if (msi == NULL) {
		mtx_unlock(&msi_lock);
		return (ENOENT);
	}

	/* Make sure this message is allocated to a device. */
	if (msi->msi_dev == NULL) {
		mtx_unlock(&msi_lock);
		return (ENXIO);
	}

	/*
	 * If this message isn't an MSI-X message, make sure it's part
	 * of a group, and switch to the first message in the
	 * group.
	 */
	if (!msi->msi_msix) {
		if (msi->msi_first == NULL) {
			mtx_unlock(&msi_lock);
			return (ENXIO);
		}
		msi = msi->msi_first;
	}

#ifdef ACPI_DMAR
	if (!msi->msi_msix) {
		for (k = msi->msi_count - 1, i = FIRST_MSI_INT; k > 0 &&
		    i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
			if (i == msi->msi_irq)
				continue;
			msi1 = (struct msi_intsrc *)intr_lookup_source(i);
			if (!msi1->msi_msix && msi1->msi_first == msi) {
				mtx_unlock(&msi_lock);
				iommu_map_msi_intr(msi1->msi_dev,
				    msi1->msi_cpu, msi1->msi_vector,
				    msi1->msi_remap_cookie, NULL, NULL);
				k--;
				mtx_lock(&msi_lock);
			}
		}
	}
	mtx_unlock(&msi_lock);
	error = iommu_map_msi_intr(msi->msi_dev, msi->msi_cpu,
	    msi->msi_vector, msi->msi_remap_cookie, addr, data);
#else
	mtx_unlock(&msi_lock);
	error = EOPNOTSUPP;
#endif
	if (error == EOPNOTSUPP) {
		*addr = INTEL_ADDR(msi);
		*data = INTEL_DATA(msi);
		error = 0;
	}
	return (error);
}

int
msix_alloc(device_t dev, int *irq)
{
	struct msi_intsrc *msi;
	u_int cpu;
	int i, vector;
#ifdef ACPI_DMAR
	u_int cookie;
	int error;
#endif

	if (!msi_enabled)
		return (ENXIO);

again:
	mtx_lock(&msi_lock);

	/* Find a free IRQ. */
	for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) {
		msi = (struct msi_intsrc *)intr_lookup_source(i);

		/* End of allocated sources, so break. */
		if (msi == NULL)
			break;

		/* Stop at the first free source. */
		if (msi->msi_dev == NULL)
			break;
	}

	/* Do we need to create a new source? */
	if (msi == NULL) {
		/* If we would exceed the max, give up. */
		if (i + 1 > FIRST_MSI_INT + NUM_MSI_INTS) {
			mtx_unlock(&msi_lock);
			return (ENXIO);
		}
		mtx_unlock(&msi_lock);

		/* Create a new source. */
		msi_create_source();
		goto again;
	}

	/* Allocate an IDT vector. */
	cpu = intr_next_cpu();
	vector = apic_alloc_vector(cpu, i);
	if (vector == 0) {
		mtx_unlock(&msi_lock);
		return (ENOSPC);
	}

	msi->msi_dev = dev;
#ifdef ACPI_DMAR
	mtx_unlock(&msi_lock);
	error = iommu_alloc_msi_intr(dev, &cookie, 1);
	mtx_lock(&msi_lock);
	if (error == EOPNOTSUPP)
		error = 0;
	if (error != 0) {
		msi->msi_dev = NULL;
		apic_free_vector(cpu, vector, i);
		return (error);
	}
	msi->msi_remap_cookie = cookie;
#endif

	if (bootverbose)
		printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n",
		    msi->msi_irq, cpu, vector);

	/* Setup source. */
	msi->msi_cpu = cpu;
	msi->msi_first = msi;
	msi->msi_vector = vector;
	msi->msi_msix = 1;
	msi->msi_count = 1;
	msi->msi_maxcount = 1;
	msi->msi_irqs = NULL;

	KASSERT(msi->msi_intsrc.is_handlers == 0,
	    ("dead MSI-X has handlers"));
	mtx_unlock(&msi_lock);

	*irq = i;
	return (0);
}

int
msix_release(int irq)
{
	struct msi_intsrc *msi;

	mtx_lock(&msi_lock);
	msi = (struct msi_intsrc *)intr_lookup_source(irq);
	if (msi == NULL) {
		mtx_unlock(&msi_lock);
		return (ENOENT);
	}

	/* Make sure this is an MSI-X message. */
	if (!msi->msi_msix) {
		mtx_unlock(&msi_lock);
		return (EINVAL);
	}

	KASSERT(msi->msi_dev != NULL, ("unowned message"));

	/* Clear out the message. */
#ifdef ACPI_DMAR
	mtx_unlock(&msi_lock);
	iommu_unmap_msi_intr(msi->msi_dev, msi->msi_remap_cookie);
	mtx_lock(&msi_lock);
#endif
	msi->msi_first = NULL;
	msi->msi_dev = NULL;
	apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
	msi->msi_vector = 0;
	msi->msi_msix = 0;
	msi->msi_count = 0;
	msi->msi_maxcount = 0;

	mtx_unlock(&msi_lock);
	return (0);
}
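/*
 * [Editorial sketch] MSI-X contrast with the grouped MSI code above: each
 * table entry is allocated and mapped independently.  Hypothetical
 * fragment (function name invented, error handling abbreviated) for one
 * entry:
 */
static int
example_msix_entry(device_t dev)
{
	uint64_t addr;
	uint32_t data;
	int irq, error;

	error = msix_alloc(dev, &irq);
	if (error == 0)
		error = msi_map(irq, &addr, &data);
	/* Program 'addr'/'data' into the device's MSI-X table entry. */
	return (error);
}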
Index: head/sys/x86/xen/hvm.c
===================================================================
--- head/sys/x86/xen/hvm.c	(revision 304636)
+++ head/sys/x86/xen/hvm.c	(revision 304637)
@@ -1,423 +1,425 @@
/*
 * Copyright (c) 2008, 2013 Citrix Systems, Inc.
 * Copyright (c) 2012 Spectra Logic Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*--------------------------- Forward Declarations ---------------------------*/
static void xen_hvm_cpu_init(void);

/*-------------------------------- Local Types -------------------------------*/
enum xen_hvm_init_type {
	XEN_HVM_INIT_COLD,
	XEN_HVM_INIT_CANCELLED_SUSPEND,
	XEN_HVM_INIT_RESUME
};

/*-------------------------------- Global Data -------------------------------*/
enum xen_domain_type xen_domain_type = XEN_NATIVE;

#ifdef SMP
struct cpu_ops xen_hvm_cpu_ops = {
	.cpu_init = xen_hvm_cpu_init,
	.cpu_resume = xen_hvm_cpu_init
};
#endif

static MALLOC_DEFINE(M_XENHVM, "xen_hvm", "Xen HVM PV Support");

/**
 * If non-zero, the hypervisor has been configured to use a direct
 * IDT event callback for interrupt injection.
 */
int xen_vector_callback_enabled;

/*------------------------------- Per-CPU Data -------------------------------*/
DPCPU_DEFINE(struct vcpu_info, vcpu_local_info);
DPCPU_DEFINE(struct vcpu_info *, vcpu_info);

/*------------------ Hypervisor Access Shared Memory Regions -----------------*/
shared_info_t *HYPERVISOR_shared_info;
start_info_t *HYPERVISOR_start_info;

/*------------------------------ Sysctl tunables -----------------------------*/
int xen_disable_pv_disks = 0;
int xen_disable_pv_nics = 0;
TUNABLE_INT("hw.xen.disable_pv_disks", &xen_disable_pv_disks);
TUNABLE_INT("hw.xen.disable_pv_nics", &xen_disable_pv_nics);

/*---------------------- XEN Hypervisor Probe and Setup ----------------------*/
static uint32_t
xen_hvm_cpuid_base(void)
{
	uint32_t base, regs[4];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		do_cpuid(base, regs);
		if (!memcmp("XenVMMXenVMM", &regs[1], 12) &&
		    (regs[0] - base) >= 2)
			return (base);
	}
	return (0);
}
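/*
 * [Editorial note] In xen_hvm_cpuid_base() above, do_cpuid() stores
 * EAX..EDX in regs[0..3], so the memcmp against &regs[1] checks the
 * 12-byte "XenVMMXenVMM" signature spread across EBX, ECX and EDX, and
 * (regs[0] - base) >= 2 requires at least two further leaves (version
 * and hypercall pages, consumed below).  The 0x100 stride exists because
 * Xen can expose its leaves at an offset within the 0x40000000 region,
 * e.g. when a Hyper-V compatibility interface occupies the base slot.
 */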
/*
 * Allocate and fill in the hypercall page.
 */
static int
xen_hvm_init_hypercall_stubs(enum xen_hvm_init_type init_type)
{
	uint32_t base, regs[4];
	int i;

	if (xen_pv_domain()) {
		/* hypercall page is already set in the PV case */
		return (0);
	}

	base = xen_hvm_cpuid_base();
	if (base == 0)
		return (ENXIO);

	if (init_type == XEN_HVM_INIT_COLD) {
		int major, minor;

		do_cpuid(base + 1, regs);

		major = regs[0] >> 16;
		minor = regs[0] & 0xffff;
		printf("XEN: Hypervisor version %d.%d detected.\n", major,
		    minor);

+#ifdef SMP
		if (((major < 4) || (major == 4 && minor <= 5)) &&
		    msix_disable_migration == -1) {
			/*
			 * Xen hypervisors prior to 4.6.0 do not properly
			 * handle updates to enabled MSI-X table entries,
			 * so disable MSI-X interrupt migration in that
			 * case.
			 */
			if (bootverbose)
				printf(
"Disabling MSI-X interrupt migration due to Xen hypervisor bug.\n"
"Set machdep.disable_msix_migration=0 to forcefully enable it.\n");
			msix_disable_migration = 1;
		}
+#endif
	}

	/*
	 * Find the hypercall pages.
	 */
	do_cpuid(base + 2, regs);
	for (i = 0; i < regs[0]; i++)
		wrmsr(regs[1], vtophys(&hypercall_page + i * PAGE_SIZE) + i);

	return (0);
}

static void
xen_hvm_init_shared_info_page(void)
{
	struct xen_add_to_physmap xatp;

	if (xen_pv_domain()) {
		/*
		 * Already setup in the PV case, shared_info is passed inside
		 * of the start_info struct at start of day.
		 */
		return;
	}

	if (HYPERVISOR_shared_info == NULL) {
		HYPERVISOR_shared_info = malloc(PAGE_SIZE, M_XENHVM, M_NOWAIT);
		if (HYPERVISOR_shared_info == NULL)
			panic("Unable to allocate Xen shared info page");
	}

	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = vtophys(HYPERVISOR_shared_info) >> PAGE_SHIFT;
	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
		panic("HYPERVISOR_memory_op failed");
}

/*
 * Tell the hypervisor how to contact us for event channel callbacks.
 */
void
xen_hvm_set_callback(device_t dev)
{
	struct xen_hvm_param xhp;
	int irq;

	if (xen_vector_callback_enabled)
		return;

	xhp.domid = DOMID_SELF;
	xhp.index = HVM_PARAM_CALLBACK_IRQ;
	if (xen_feature(XENFEAT_hvm_callback_vector) != 0) {
		int error;

		xhp.value = HVM_CALLBACK_VECTOR(IDT_EVTCHN);
		error = HYPERVISOR_hvm_op(HVMOP_set_param, &xhp);
		if (error == 0) {
			xen_vector_callback_enabled = 1;
			return;
		}
		printf("Xen HVM callback vector registration failed (%d). "
		    "Falling back to emulated device interrupt\n", error);
	}
	xen_vector_callback_enabled = 0;
	if (dev == NULL) {
		/*
		 * Called from early boot or resume.
		 * xenpci will invoke us again later.
		 */
		return;
	}

	irq = pci_get_irq(dev);
	if (irq < 16) {
		xhp.value = HVM_CALLBACK_GSI(irq);
	} else {
		u_int slot;
		u_int pin;

		slot = pci_get_slot(dev);
		pin = pci_get_intpin(dev) - 1;
		xhp.value = HVM_CALLBACK_PCI_INTX(slot, pin);
	}

	if (HYPERVISOR_hvm_op(HVMOP_set_param, &xhp) != 0)
		panic("Can't set evtchn callback");
}

#define	XEN_MAGIC_IOPORT 0x10
enum {
	XMI_MAGIC			 = 0x49d2,
	XMI_UNPLUG_IDE_DISKS		 = 0x01,
	XMI_UNPLUG_NICS			 = 0x02,
	XMI_UNPLUG_IDE_EXCEPT_PRI_MASTER = 0x04
};
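/*
 * [Editorial note] The unplug handshake driven below: reading XMI_MAGIC
 * back from I/O port 0x10 proves the device model understands the Xen
 * unplug protocol, after which writing a mask of XMI_UNPLUG_* bits to the
 * same port removes the matching emulated devices so their PV
 * counterparts can attach without conflict, e.g.:
 *
 *	if (inw(XEN_MAGIC_IOPORT) == XMI_MAGIC)
 *		outw(XEN_MAGIC_IOPORT,
 *		    XMI_UNPLUG_IDE_DISKS | XMI_UNPLUG_NICS);
 */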
static void
xen_hvm_disable_emulated_devices(void)
{
	u_short disable_devs = 0;

	if (xen_pv_domain()) {
		/*
		 * No emulated devices in the PV case, so no need to unplug
		 * anything.
		 */
		if (xen_disable_pv_disks != 0 || xen_disable_pv_nics != 0)
			printf("PV devices cannot be disabled in PV guests\n");
		return;
	}

	if (inw(XEN_MAGIC_IOPORT) != XMI_MAGIC)
		return;

	if (xen_disable_pv_disks == 0) {
		if (bootverbose)
			printf("XEN: disabling emulated disks\n");
		disable_devs |= XMI_UNPLUG_IDE_DISKS;
	}
	if (xen_disable_pv_nics == 0) {
		if (bootverbose)
			printf("XEN: disabling emulated nics\n");
		disable_devs |= XMI_UNPLUG_NICS;
	}

	if (disable_devs != 0)
		outw(XEN_MAGIC_IOPORT, disable_devs);
}

static void
xen_hvm_init(enum xen_hvm_init_type init_type)
{
	int error;
	int i;

	if (init_type == XEN_HVM_INIT_CANCELLED_SUSPEND)
		return;

	error = xen_hvm_init_hypercall_stubs(init_type);

	switch (init_type) {
	case XEN_HVM_INIT_COLD:
		if (error != 0)
			return;

		/*
		 * If xen_domain_type is not set at this point
		 * it means we are inside a (PV)HVM guest, because
		 * for PVH the guest type is set much earlier
		 * (see hammer_time_xen).
		 */
		if (!xen_domain()) {
			xen_domain_type = XEN_HVM_DOMAIN;
			vm_guest = VM_GUEST_XEN;
		}

		setup_xen_features();
#ifdef SMP
		cpu_ops = xen_hvm_cpu_ops;
#endif
		break;
	case XEN_HVM_INIT_RESUME:
		if (error != 0)
			panic("Unable to init Xen hypercall stubs on resume");

		/* Clear stale vcpu_info. */
		CPU_FOREACH(i)
			DPCPU_ID_SET(i, vcpu_info, NULL);
		break;
	default:
		panic("Unsupported HVM initialization type");
	}

	xen_vector_callback_enabled = 0;
	xen_hvm_set_callback(NULL);

	/*
	 * On (PV)HVM domains we need to request the hypervisor to
	 * fill the shared info page; for PVH guests the shared_info page
	 * is passed inside the start_info struct and is already set, so
	 * these functions are no-ops.
	 */
	xen_hvm_init_shared_info_page();
	xen_hvm_disable_emulated_devices();
}

void
xen_hvm_suspend(void)
{
}

void
xen_hvm_resume(bool suspend_cancelled)
{

	xen_hvm_init(suspend_cancelled ?
	    XEN_HVM_INIT_CANCELLED_SUSPEND : XEN_HVM_INIT_RESUME);

	/* Register vcpu_info area for CPU#0. */
	xen_hvm_cpu_init();
}

static void
xen_hvm_sysinit(void *arg __unused)
{

	xen_hvm_init(XEN_HVM_INIT_COLD);
}

static void
xen_set_vcpu_id(void)
{
	struct pcpu *pc;
	int i;

	if (!xen_hvm_domain())
		return;

	/* Set vcpu_id to acpi_id */
	CPU_FOREACH(i) {
		pc = pcpu_find(i);
		pc->pc_vcpu_id = pc->pc_acpi_id;
		if (bootverbose)
			printf("XEN: CPU %u has VCPU ID %u\n", i,
			    pc->pc_vcpu_id);
	}
}

static void
xen_hvm_cpu_init(void)
{
	struct vcpu_register_vcpu_info info;
	struct vcpu_info *vcpu_info;
	int cpu, rc;

	if (!xen_domain())
		return;

	if (DPCPU_GET(vcpu_info) != NULL) {
		/*
		 * vcpu_info is already set.  We're resuming
		 * from a failed migration and our pre-suspend
		 * configuration is still valid.
		 */
		return;
	}

	vcpu_info = DPCPU_PTR(vcpu_local_info);
	cpu = PCPU_GET(vcpu_id);
	info.mfn = vtophys(vcpu_info) >> PAGE_SHIFT;
	info.offset = vtophys(vcpu_info) - trunc_page(vtophys(vcpu_info));

	rc = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
	if (rc != 0)
		DPCPU_SET(vcpu_info, &HYPERVISOR_shared_info->vcpu_info[cpu]);
	else
		DPCPU_SET(vcpu_info, vcpu_info);
}

SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_sysinit,
    NULL);
SYSINIT(xen_hvm_cpu_init, SI_SUB_INTR, SI_ORDER_FIRST, xen_hvm_cpu_init,
    NULL);
SYSINIT(xen_set_vcpu_id, SI_SUB_CPU, SI_ORDER_ANY, xen_set_vcpu_id, NULL);
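/*
 * [Editorial sketch] Verifying the end result of this change from
 * userland: a hypothetical program reading the tunable back.  After this
 * revision the OID only exists on SMP kernels, so the lookup is expected
 * to fail on non-SMP kernels.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("machdep.disable_msix_migration", &val, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");	/* expected on non-SMP kernels */
		return (1);
	}
	printf("MSI-X migration %s\n", val ? "disabled" : "enabled");
	return (0);
}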