diff --git a/sys/amd64/acpica/acpi_wakeup.c b/sys/amd64/acpica/acpi_wakeup.c index 534a7746dfff..c25d802dada0 100644 --- a/sys/amd64/acpica/acpi_wakeup.c +++ b/sys/amd64/acpica/acpi_wakeup.c @@ -1,410 +1,410 @@ /*- * Copyright (c) 2001 Takanori Watanabe * Copyright (c) 2001 Mitsuru IWASAKI * Copyright (c) 2003 Peter Wemm * Copyright (c) 2008-2010 Jung-uk Kim * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP -#include +#include #include #include #endif #include #include #include "acpi_wakecode.h" #include "acpi_wakedata.h" /* Make sure the code is less than a page and leave room for the stack. */ CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024); extern int acpi_resume_beep; extern int acpi_reset_video; #ifdef SMP extern struct pcb **susppcbs; #else static struct pcb **susppcbs; #endif int acpi_restorecpu(vm_offset_t, struct pcb *); static void *acpi_alloc_wakeup_handler(void); static void acpi_stop_beep(void *); #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *, int); static void acpi_wakeup_cpus(struct acpi_softc *, cpumask_t); #endif #define WAKECODE_VADDR(sc) ((sc)->acpi_wakeaddr + (3 * PAGE_SIZE)) #define WAKECODE_PADDR(sc) ((sc)->acpi_wakephys + (3 * PAGE_SIZE)) #define WAKECODE_FIXUP(offset, type, val) do { \ type *addr; \ addr = (type *)(WAKECODE_VADDR(sc) + offset); \ *addr = val; \ } while (0) /* Turn off bits 1&2 of the PIT, stopping the beep. 
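 * (Bit 0 of port 0x61 gates PIT channel 2 and bit 1 feeds its output to the
 * speaker, so clearing the low two bits silences the beep.)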
*/ static void acpi_stop_beep(void *arg) { outb(0x61, inb(0x61) & ~0x3); } #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *sc, int cpu) { int vector = (WAKECODE_PADDR(sc) >> 12) & 0xff; int apic_id = cpu_apic_ids[cpu]; int ms; WAKECODE_FIXUP(wakeup_pcb, struct pcb *, susppcbs[cpu]); WAKECODE_FIXUP(wakeup_gdt, uint16_t, susppcbs[cpu]->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, susppcbs[cpu]->pcb_gdt.rd_base); WAKECODE_FIXUP(wakeup_cpu, int, cpu); /* do an INIT IPI: assert RESET */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); /* wait for pending status end */ lapic_ipi_wait(-1); /* do an INIT IPI: deassert RESET */ lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0); /* wait for pending status end */ DELAY(10000); /* wait ~10mS */ lapic_ipi_wait(-1); /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ /* do a STARTUP IPI */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); lapic_ipi_wait(-1); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); lapic_ipi_wait(-1); DELAY(200); /* wait ~200uS */ /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (*(int *)(WAKECODE_VADDR(sc) + wakeup_cpu) == 0) return (1); /* return SUCCESS */ DELAY(1000); } return (0); /* return FAILURE */ } #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) static void acpi_wakeup_cpus(struct acpi_softc *sc, cpumask_t wakeup_cpus) { uint32_t mpbioswarmvec; int cpu; u_char mpbiosreason; /* save the current value of the warm-start vector */ mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *)WARMBOOT_SEG) = WAKECODE_PADDR(sc) >> 4; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ /* Wake up each AP. 
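 * acpi_wakeup_ap() kicks each AP through the INIT/STARTUP sequence so that it
 * re-enters the kernel via the wakeup trampoline; on failure we restore the
 * BIOS warm-start vector before panicking.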
*/ for (cpu = 1; cpu < mp_ncpus; cpu++) { if ((wakeup_cpus & (1 << cpu)) == 0) continue; if (acpi_wakeup_ap(sc, cpu) == 0) { /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)", cpu, cpu_apic_ids[cpu]); } } /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); } #endif int acpi_sleep_machdep(struct acpi_softc *sc, int state) { #ifdef SMP cpumask_t wakeup_cpus; #endif register_t cr3, rf; ACPI_STATUS status; int ret; ret = -1; if (sc->acpi_wakeaddr == 0ul) return (ret); #ifdef SMP wakeup_cpus = PCPU_GET(other_cpus); #endif AcpiSetFirmwareWakingVector(WAKECODE_PADDR(sc)); rf = intr_disable(); intr_suspend(); /* * Temporarily switch to the kernel pmap because it provides * an identity mapping (setup at boot) for the low physical * memory region containing the wakeup code. */ cr3 = rcr3(); load_cr3(KPML4phys); if (savectx(susppcbs[0])) { #ifdef SMP if (wakeup_cpus != 0 && suspend_cpus(wakeup_cpus) == 0) { device_printf(sc->acpi_dev, "Failed to suspend APs: CPU mask = 0x%jx\n", (uintmax_t)(wakeup_cpus & ~stopped_cpus)); goto out; } #endif WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0)); WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0)); WAKECODE_FIXUP(wakeup_pcb, struct pcb *, susppcbs[0]); WAKECODE_FIXUP(wakeup_gdt, uint16_t, susppcbs[0]->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, susppcbs[0]->pcb_gdt.rd_base); WAKECODE_FIXUP(wakeup_cpu, int, 0); /* Call ACPICA to enter the desired sleep state */ if (state == ACPI_STATE_S4 && sc->acpi_s4bios) status = AcpiEnterSleepStateS4bios(); else status = AcpiEnterSleepState(state); if (status != AE_OK) { device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n", AcpiFormatException(status)); goto out; } for (;;) ia32_pause(); } else { PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); #ifdef SMP if (wakeup_cpus != 0) acpi_wakeup_cpus(sc, wakeup_cpus); #endif acpi_resync_clock(sc); ret = 0; } out: #ifdef SMP if (wakeup_cpus != 0) restart_cpus(wakeup_cpus); #endif load_cr3(cr3); mca_resume(); intr_resume(); intr_restore(rf); AcpiSetFirmwareWakingVector(0); if (ret == 0 && mem_range_softc.mr_op != NULL && mem_range_softc.mr_op->reinit != NULL) mem_range_softc.mr_op->reinit(&mem_range_softc); /* If we beeped, turn it off after a delay. */ if (acpi_resume_beep) timeout(acpi_stop_beep, NULL, 3 * hz); return (ret); } static void * acpi_alloc_wakeup_handler(void) { void *wakeaddr; int i; /* * Specify the region for our wakeup code. We want it in the low 1 MB * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA * (less than 128KB, below 0xa0000, must be excluded by SMAP and DSDT), * and ROM area (0xa0000 and above). The temporary page tables must be * page-aligned. 
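 * The four pages allocated below hold three temporary page-table pages
 * followed by one page of real-mode wakeup code, which is why
 * WAKECODE_VADDR()/WAKECODE_PADDR() add 3 * PAGE_SIZE.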
*/ wakeaddr = contigmalloc(4 * PAGE_SIZE, M_DEVBUF, M_NOWAIT, 0x500, 0xa0000, PAGE_SIZE, 0ul); if (wakeaddr == NULL) { printf("%s: can't alloc wake memory\n", __func__); return (NULL); } susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK); for (i = 0; i < mp_ncpus; i++) susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK); return (wakeaddr); } void acpi_install_wakeup_handler(struct acpi_softc *sc) { static void *wakeaddr = NULL; uint64_t *pt4, *pt3, *pt2; int i; if (wakeaddr != NULL) return; wakeaddr = acpi_alloc_wakeup_handler(); if (wakeaddr == NULL) return; sc->acpi_wakeaddr = (vm_offset_t)wakeaddr; sc->acpi_wakephys = vtophys(wakeaddr); bcopy(wakecode, (void *)WAKECODE_VADDR(sc), sizeof(wakecode)); /* Patch GDT base address, ljmp targets and page table base address. */ WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t, WAKECODE_PADDR(sc) + bootgdt); WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t, WAKECODE_PADDR(sc) + wakeup_32); WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t, WAKECODE_PADDR(sc) + wakeup_64); WAKECODE_FIXUP(wakeup_pagetables, uint32_t, sc->acpi_wakephys); /* Save pointers to some global data. */ WAKECODE_FIXUP(wakeup_retaddr, void *, acpi_restorecpu); WAKECODE_FIXUP(wakeup_kpml4, uint64_t, KPML4phys); WAKECODE_FIXUP(wakeup_ctx, vm_offset_t, WAKECODE_VADDR(sc) + wakeup_ctx); WAKECODE_FIXUP(wakeup_efer, uint64_t, rdmsr(MSR_EFER)); WAKECODE_FIXUP(wakeup_pat, uint64_t, rdmsr(MSR_PAT)); WAKECODE_FIXUP(wakeup_star, uint64_t, rdmsr(MSR_STAR)); WAKECODE_FIXUP(wakeup_lstar, uint64_t, rdmsr(MSR_LSTAR)); WAKECODE_FIXUP(wakeup_cstar, uint64_t, rdmsr(MSR_CSTAR)); WAKECODE_FIXUP(wakeup_sfmask, uint64_t, rdmsr(MSR_SF_MASK)); /* Build temporary page tables below realmode code. */ pt4 = wakeaddr; pt3 = pt4 + (PAGE_SIZE) / sizeof(uint64_t); pt2 = pt3 + (PAGE_SIZE) / sizeof(uint64_t); /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { /* * Each slot of the level 4 pages points * to the same level 3 page */ pt4[i] = (uint64_t)(sc->acpi_wakephys + PAGE_SIZE); pt4[i] |= PG_V | PG_RW | PG_U; /* * Each slot of the level 3 pages points * to the same level 2 page */ pt3[i] = (uint64_t)(sc->acpi_wakephys + (2 * PAGE_SIZE)); pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. */ pt2[i] = i * (2 * 1024 * 1024); pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; } if (bootverbose) device_printf(sc->acpi_dev, "wakeup code va %p pa %p\n", (void *)sc->acpi_wakeaddr, (void *)sc->acpi_wakephys); } diff --git a/sys/amd64/acpica/madt.c b/sys/amd64/acpica/madt.c index 90ffd640dc19..84bc831163d0 100644 --- a/sys/amd64/acpica/madt.c +++ b/sys/amd64/acpica/madt.c @@ -1,573 +1,573 @@ /*- * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
-#include <machine/apicreg.h>
+#include <x86/apicreg.h>
#include
#include
#include
#include
#include
#include

/* These two arrays are indexed by APIC IDs. */
struct ioapic_info {
	void *io_apic;
	UINT32 io_vector;
} ioapics[MAX_APIC_ID + 1];

struct lapic_info {
	u_int la_enabled:1;
	u_int la_acpi_id:8;
} lapics[MAX_APIC_ID + 1];

static int madt_found_sci_override;
static ACPI_TABLE_MADT *madt;
static vm_paddr_t madt_physaddr;
static vm_offset_t madt_length;

MALLOC_DEFINE(M_MADT, "madt_table", "ACPI MADT Table Items");

static enum intr_polarity interrupt_polarity(UINT16 IntiFlags, UINT8 Source);
static enum intr_trigger interrupt_trigger(UINT16 IntiFlags, UINT8 Source);
static int madt_find_cpu(u_int acpi_id, u_int *apic_id);
static int madt_find_interrupt(int intr, void **apic, u_int *pin);
static void madt_parse_apics(ACPI_SUBTABLE_HEADER *entry, void *arg);
static void madt_parse_interrupt_override(
    ACPI_MADT_INTERRUPT_OVERRIDE *intr);
static void madt_parse_ints(ACPI_SUBTABLE_HEADER *entry,
    void *arg __unused);
static void madt_parse_local_nmi(ACPI_MADT_LOCAL_APIC_NMI *nmi);
static void madt_parse_nmi(ACPI_MADT_NMI_SOURCE *nmi);
static int madt_probe(void);
static int madt_probe_cpus(void);
static void madt_probe_cpus_handler(ACPI_SUBTABLE_HEADER *entry,
    void *arg __unused);
static void madt_register(void *dummy);
static int madt_setup_local(void);
static int madt_setup_io(void);
static void madt_walk_table(acpi_subtable_handler *handler, void *arg);

static struct apic_enumerator madt_enumerator = {
	"MADT",
	madt_probe,
	madt_probe_cpus,
	madt_setup_local,
	madt_setup_io
};

/*
 * Look for an ACPI Multiple APIC Description Table ("APIC")
 */
static int
madt_probe(void)
{

	madt_physaddr = acpi_find_table(ACPI_SIG_MADT);
	if (madt_physaddr == 0)
		return (ENXIO);
	return (0);
}

/*
 * Run through the MP table enumerating CPUs.
 */
static int
madt_probe_cpus(void)
{

	madt = acpi_map_table(madt_physaddr, ACPI_SIG_MADT);
	KASSERT(madt != NULL, ("Unable to re-map MADT"));
	madt_length = madt->Header.Length;
	madt_walk_table(madt_probe_cpus_handler, NULL);
	acpi_unmap_table(madt);
	madt = NULL;
	return (0);
}

/*
 * Initialize the local APIC on the BSP.
 */
static int
madt_setup_local(void)
{

	madt = pmap_mapbios(madt_physaddr, madt_length);
	lapic_init(madt->Address);
	printf("ACPI APIC Table: <%.*s %.*s>\n",
	    (int)sizeof(madt->Header.OemId), madt->Header.OemId,
	    (int)sizeof(madt->Header.OemTableId), madt->Header.OemTableId);

	/*
	 * We ignore 64-bit local APIC override entries. Should we
	 * perhaps emit a warning here if we find one?
	 */
	return (0);
}

/*
 * Enumerate I/O APICs and setup interrupt sources.
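 * This walks the MADT twice: first to create each I/O APIC, then to apply
 * interrupt source overrides and NMI entries; the SCI is forced to
 * level/active-low if no override mentioned it.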
*/ static int madt_setup_io(void) { void *ioapic; u_int pin; int i; /* Try to initialize ACPI so that we can access the FADT. */ i = acpi_Startup(); if (ACPI_FAILURE(i)) { printf("MADT: ACPI Startup failed with %s\n", AcpiFormatException(i)); printf("Try disabling either ACPI or apic support.\n"); panic("Using MADT but ACPI doesn't work"); } /* First, we run through adding I/O APIC's. */ madt_walk_table(madt_parse_apics, NULL); /* Second, we run through the table tweaking interrupt sources. */ madt_walk_table(madt_parse_ints, NULL); /* * If there was not an explicit override entry for the SCI, * force it to use level trigger and active-low polarity. */ if (!madt_found_sci_override) { if (madt_find_interrupt(AcpiGbl_FADT.SciInterrupt, &ioapic, &pin) != 0) printf("MADT: Could not find APIC for SCI IRQ %u\n", AcpiGbl_FADT.SciInterrupt); else { printf( "MADT: Forcing active-low polarity and level trigger for SCI\n"); ioapic_set_polarity(ioapic, pin, INTR_POLARITY_LOW); ioapic_set_triggermode(ioapic, pin, INTR_TRIGGER_LEVEL); } } /* Third, we register all the I/O APIC's. */ for (i = 0; i <= MAX_APIC_ID; i++) if (ioapics[i].io_apic != NULL) ioapic_register(ioapics[i].io_apic); /* Finally, we throw the switch to enable the I/O APIC's. */ acpi_SetDefaultIntrModel(ACPI_INTR_APIC); return (0); } static void madt_register(void *dummy __unused) { apic_register_enumerator(&madt_enumerator); } SYSINIT(madt_register, SI_SUB_TUNABLES - 1, SI_ORDER_FIRST, madt_register, NULL); /* * Call the handler routine for each entry in the MADT table. */ static void madt_walk_table(acpi_subtable_handler *handler, void *arg) { acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length, handler, arg); } static void madt_probe_cpus_handler(ACPI_SUBTABLE_HEADER *entry, void *arg) { ACPI_MADT_LOCAL_APIC *proc; struct lapic_info *la; switch (entry->Type) { case ACPI_MADT_TYPE_LOCAL_APIC: /* * The MADT does not include a BSP flag, so we have to * let the MP code figure out which CPU is the BSP on * its own. */ proc = (ACPI_MADT_LOCAL_APIC *)entry; if (bootverbose) printf("MADT: Found CPU APIC ID %u ACPI ID %u: %s\n", proc->Id, proc->ProcessorId, (proc->LapicFlags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); if (!(proc->LapicFlags & ACPI_MADT_ENABLED)) break; if (proc->Id > MAX_APIC_ID) panic("%s: CPU ID %u too high", __func__, proc->Id); la = &lapics[proc->Id]; KASSERT(la->la_enabled == 0, ("Duplicate local APIC ID %u", proc->Id)); la->la_enabled = 1; la->la_acpi_id = proc->ProcessorId; lapic_create(proc->Id, 0); break; } } /* * Add an I/O APIC from an entry in the table. */ static void madt_parse_apics(ACPI_SUBTABLE_HEADER *entry, void *arg __unused) { ACPI_MADT_IO_APIC *apic; switch (entry->Type) { case ACPI_MADT_TYPE_IO_APIC: apic = (ACPI_MADT_IO_APIC *)entry; if (bootverbose) printf( "MADT: Found IO APIC ID %u, Interrupt %u at %p\n", apic->Id, apic->GlobalIrqBase, (void *)(uintptr_t)apic->Address); if (apic->Id > MAX_APIC_ID) panic("%s: I/O APIC ID %u too high", __func__, apic->Id); if (ioapics[apic->Id].io_apic != NULL) panic("%s: Double APIC ID %u", __func__, apic->Id); if (apic->GlobalIrqBase >= FIRST_MSI_INT) { printf("MADT: Ignoring bogus I/O APIC ID %u", apic->Id); break; } ioapics[apic->Id].io_apic = ioapic_create(apic->Address, apic->Id, apic->GlobalIrqBase); ioapics[apic->Id].io_vector = apic->GlobalIrqBase; break; default: break; } } /* * Determine properties of an interrupt source. 
Note that for ACPI these
 * functions are only used for ISA interrupts, so we assume ISA bus values
 * (Active Hi, Edge Triggered) for conforming values except for the ACPI
 * SCI for which we use Active Lo, Level Triggered.
 */
static enum intr_polarity
interrupt_polarity(UINT16 IntiFlags, UINT8 Source)
{

	switch (IntiFlags & ACPI_MADT_POLARITY_MASK) {
	case ACPI_MADT_POLARITY_CONFORMS:
		if (Source == AcpiGbl_FADT.SciInterrupt)
			return (INTR_POLARITY_LOW);
		else
			return (INTR_POLARITY_HIGH);
	case ACPI_MADT_POLARITY_ACTIVE_HIGH:
		return (INTR_POLARITY_HIGH);
	case ACPI_MADT_POLARITY_ACTIVE_LOW:
		return (INTR_POLARITY_LOW);
	default:
		panic("Bogus Interrupt Polarity");
	}
}

static enum intr_trigger
interrupt_trigger(UINT16 IntiFlags, UINT8 Source)
{

	switch (IntiFlags & ACPI_MADT_TRIGGER_MASK) {
	case ACPI_MADT_TRIGGER_CONFORMS:
		if (Source == AcpiGbl_FADT.SciInterrupt)
			return (INTR_TRIGGER_LEVEL);
		else
			return (INTR_TRIGGER_EDGE);
	case ACPI_MADT_TRIGGER_EDGE:
		return (INTR_TRIGGER_EDGE);
	case ACPI_MADT_TRIGGER_LEVEL:
		return (INTR_TRIGGER_LEVEL);
	default:
		panic("Bogus Interrupt Trigger Mode");
	}
}

/*
 * Find the local APIC ID associated with a given ACPI Processor ID.
 */
static int
madt_find_cpu(u_int acpi_id, u_int *apic_id)
{
	int i;

	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (!lapics[i].la_enabled)
			continue;
		if (lapics[i].la_acpi_id != acpi_id)
			continue;
		*apic_id = i;
		return (0);
	}
	return (ENOENT);
}

/*
 * Find the IO APIC and pin on that APIC associated with a given global
 * interrupt.
 */
static int
madt_find_interrupt(int intr, void **apic, u_int *pin)
{
	int i, best;

	best = -1;
	for (i = 0; i <= MAX_APIC_ID; i++) {
		if (ioapics[i].io_apic == NULL ||
		    ioapics[i].io_vector > intr)
			continue;
		if (best == -1 ||
		    ioapics[best].io_vector < ioapics[i].io_vector)
			best = i;
	}
	if (best == -1)
		return (ENOENT);
	*apic = ioapics[best].io_apic;
	*pin = intr - ioapics[best].io_vector;
	if (*pin > 32)
		printf("WARNING: Found intpin of %u for vector %d\n",
		    *pin, intr);
	return (0);
}

/*
 * Parse an interrupt source override for an ISA interrupt.
 */
static void
madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr)
{
	void *new_ioapic, *old_ioapic;
	u_int new_pin, old_pin;
	enum intr_trigger trig;
	enum intr_polarity pol;
	char buf[64];

	if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 &&
	    intr->GlobalIrq == 2) {
		if (bootverbose)
			printf("MADT: Skipping timer override\n");
		return;
	}
	if (bootverbose)
		printf("MADT: Interrupt override: source %u, irq %u\n",
		    intr->SourceIrq, intr->GlobalIrq);
	KASSERT(intr->Bus == 0, ("bus for interrupt overrides must be zero"));
	if (madt_find_interrupt(intr->GlobalIrq, &new_ioapic, &new_pin) != 0) {
		printf("MADT: Could not find APIC for vector %u (IRQ %u)\n",
		    intr->GlobalIrq, intr->SourceIrq);
		return;
	}

	/*
	 * Lookup the appropriate trigger and polarity modes for this
	 * entry.
	 */
	trig = interrupt_trigger(intr->IntiFlags, intr->SourceIrq);
	pol = interrupt_polarity(intr->IntiFlags, intr->SourceIrq);

	/*
	 * If the SCI is identity mapped but has edge trigger and
	 * active-hi polarity or the force_sci_lo tunable is set,
	 * force it to use level/lo.
	 */
	if (intr->SourceIrq == AcpiGbl_FADT.SciInterrupt) {
		madt_found_sci_override = 1;
		if (getenv_string("hw.acpi.sci.trigger", buf, sizeof(buf))) {
			if (tolower(buf[0]) == 'e')
				trig = INTR_TRIGGER_EDGE;
			else if (tolower(buf[0]) == 'l')
				trig = INTR_TRIGGER_LEVEL;
			else
				panic(
				"Invalid trigger %s: must be 'edge' or 'level'",
				    buf);
			printf("MADT: Forcing SCI to %s trigger\n",
			    trig == INTR_TRIGGER_EDGE ? "edge" : "level");
		}
		if (getenv_string("hw.acpi.sci.polarity", buf, sizeof(buf))) {
			if (tolower(buf[0]) == 'h')
				pol = INTR_POLARITY_HIGH;
			else if (tolower(buf[0]) == 'l')
				pol = INTR_POLARITY_LOW;
			else
				panic(
				"Invalid polarity %s: must be 'high' or 'low'",
				    buf);
			printf("MADT: Forcing SCI to active %s polarity\n",
			    pol == INTR_POLARITY_HIGH ? "high" : "low");
		}
	}

	/* Remap the IRQ if it is mapped to a different interrupt vector. */
	if (intr->SourceIrq != intr->GlobalIrq) {
		/*
		 * If the SCI is remapped to a non-ISA global interrupt,
		 * then override the vector we use to setup and allocate
		 * the interrupt.
		 */
		if (intr->GlobalIrq > 15 &&
		    intr->SourceIrq == AcpiGbl_FADT.SciInterrupt)
			acpi_OverrideInterruptLevel(intr->GlobalIrq);
		else
			ioapic_remap_vector(new_ioapic, new_pin,
			    intr->SourceIrq);
		if (madt_find_interrupt(intr->SourceIrq, &old_ioapic,
		    &old_pin) != 0)
			printf("MADT: Could not find APIC for source IRQ %u\n",
			    intr->SourceIrq);
		else if (ioapic_get_vector(old_ioapic, old_pin) ==
		    intr->SourceIrq)
			ioapic_disable_pin(old_ioapic, old_pin);
	}

	/* Program the polarity and trigger mode. */
	ioapic_set_triggermode(new_ioapic, new_pin, trig);
	ioapic_set_polarity(new_ioapic, new_pin, pol);
}

/*
 * Parse an entry for an NMI routed to an IO APIC.
 */
static void
madt_parse_nmi(ACPI_MADT_NMI_SOURCE *nmi)
{
	void *ioapic;
	u_int pin;

	if (madt_find_interrupt(nmi->GlobalIrq, &ioapic, &pin) != 0) {
		printf("MADT: Could not find APIC for vector %u\n",
		    nmi->GlobalIrq);
		return;
	}
	ioapic_set_nmi(ioapic, pin);
	if (!(nmi->IntiFlags & ACPI_MADT_TRIGGER_CONFORMS))
		ioapic_set_triggermode(ioapic, pin,
		    interrupt_trigger(nmi->IntiFlags, 0));
	if (!(nmi->IntiFlags & ACPI_MADT_POLARITY_CONFORMS))
		ioapic_set_polarity(ioapic, pin,
		    interrupt_polarity(nmi->IntiFlags, 0));
}

/*
 * Parse an entry for an NMI routed to a local APIC LVT pin.
 */
static void
madt_parse_local_nmi(ACPI_MADT_LOCAL_APIC_NMI *nmi)
{
	u_int apic_id, pin;

	if (nmi->ProcessorId == 0xff)
		apic_id = APIC_ID_ALL;
	else if (madt_find_cpu(nmi->ProcessorId, &apic_id) != 0) {
		if (bootverbose)
			printf("MADT: Ignoring local NMI routed to "
			    "ACPI CPU %u\n", nmi->ProcessorId);
		return;
	}
	if (nmi->Lint == 0)
		pin = LVT_LINT0;
	else
		pin = LVT_LINT1;
	lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_NMI);
	if (!(nmi->IntiFlags & ACPI_MADT_TRIGGER_CONFORMS))
		lapic_set_lvt_triggermode(apic_id, pin,
		    interrupt_trigger(nmi->IntiFlags, 0));
	if (!(nmi->IntiFlags & ACPI_MADT_POLARITY_CONFORMS))
		lapic_set_lvt_polarity(apic_id, pin,
		    interrupt_polarity(nmi->IntiFlags, 0));
}

/*
 * Parse interrupt entries.
 */
static void
madt_parse_ints(ACPI_SUBTABLE_HEADER *entry, void *arg __unused)
{

	switch (entry->Type) {
	case ACPI_MADT_TYPE_INTERRUPT_OVERRIDE:
		madt_parse_interrupt_override(
		    (ACPI_MADT_INTERRUPT_OVERRIDE *)entry);
		break;
	case ACPI_MADT_TYPE_NMI_SOURCE:
		madt_parse_nmi((ACPI_MADT_NMI_SOURCE *)entry);
		break;
	case ACPI_MADT_TYPE_LOCAL_APIC_NMI:
		madt_parse_local_nmi((ACPI_MADT_LOCAL_APIC_NMI *)entry);
		break;
	}
}

/*
 * Setup per-CPU ACPI IDs.
 */
static void
madt_set_ids(void *dummy)
{
	struct lapic_info *la;
	struct pcpu *pc;
	u_int i;

	if (madt == NULL)
		return;
	CPU_FOREACH(i) {
		pc = pcpu_find(i);
		KASSERT(pc != NULL, ("no pcpu data for CPU %u", i));
		la = &lapics[pc->pc_apic_id];
		if (!la->la_enabled)
			panic("APIC: CPU with APIC ID %u is not enabled",
			    pc->pc_apic_id);
		pc->pc_acpi_id = la->la_acpi_id;
		if (bootverbose)
			printf("APIC: CPU %u has ACPI ID %u\n", i,
			    la->la_acpi_id);
	}
}
SYSINIT(madt_set_ids, SI_SUB_CPU, SI_ORDER_ANY, madt_set_ids, NULL);
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 73a0eabffe78..e4c4ddf0dcea 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -1,331 +1,331 @@
/*-
 * Copyright (c) 1989, 1990 William F. Jolitz.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: vector.s, 386BSD 0.1 unknown origin
 * $FreeBSD$
 */

/*
 * Interrupt entry points for external interrupts triggered by I/O APICs
 * as well as IPI handlers.
 */

#include "opt_smp.h"

#include
-#include <machine/apicreg.h>
+#include <x86/apicreg.h>

#include "assym.s"

/*
 * I/O Interrupt Entry Point. Rather than having one entry point for
 * each interrupt source, we use one entry point for each 32-bit word
 * in the ISR. The handler determines the highest bit set in the ISR,
 * translates that into a vector, and passes the vector to the
 * lapic_handle_intr() function.
 */
#define	ISR_VEC(index, vec_name)					\
	.text ;								\
	SUPERALIGN_TEXT ;						\
IDTVEC(vec_name) ;							\
	PUSH_FRAME ;							\
	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
	movq	lapic, %rdx ;	/* pointer to local APIC */		\
	movl	LA_ISR + 16 * (index)(%rdx), %eax ;	/* load ISR */	\
	bsrl	%eax, %eax ;	/* index of highest set bit in ISR */	\
	jz	2f ;							\
	addl	$(32 * index),%eax ;					\
1: ;									\
	movq	%rsp, %rsi	;					\
	movl	%eax, %edi ;	/* pass the IRQ */			\
	call	lapic_handle_intr ;					\
	MEXITCOUNT ;							\
	jmp	doreti ;						\
2:	movl	$-1, %eax ;	/* send a vector of -1 */		\
	jmp	1b

/*
 * Handle "spurious INTerrupts".
 * Notes:
 *  This is different than the "spurious INTerrupt" generated by an
 *  8259 PIC for missing INTs. See the APIC documentation for details.
* This routine should NOT do an 'EOI' cycle. */ .text SUPERALIGN_TEXT IDTVEC(spuriousint) /* No EOI cycle used here */ jmp doreti_iret ISR_VEC(1, apic_isr1) ISR_VEC(2, apic_isr2) ISR_VEC(3, apic_isr3) ISR_VEC(4, apic_isr4) ISR_VEC(5, apic_isr5) ISR_VEC(6, apic_isr6) ISR_VEC(7, apic_isr7) /* * Local APIC periodic timer handler. */ .text SUPERALIGN_TEXT IDTVEC(timerint) PUSH_FRAME FAKE_MCOUNT(TF_RIP(%rsp)) movq %rsp, %rdi call lapic_handle_timer MEXITCOUNT jmp doreti /* * Local APIC CMCI handler. */ .text SUPERALIGN_TEXT IDTVEC(cmcint) PUSH_FRAME FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_cmc MEXITCOUNT jmp doreti /* * Local APIC error interrupt handler. */ .text SUPERALIGN_TEXT IDTVEC(errorint) PUSH_FRAME FAKE_MCOUNT(TF_RIP(%rsp)) call lapic_handle_error MEXITCOUNT jmp doreti #ifdef SMP /* * Global address space TLB shootdown. */ .text SUPERALIGN_TEXT IDTVEC(invltlb) #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) PUSH_FRAME movl PCPU(CPUID), %eax #ifdef COUNT_XINVLTLB_HITS incl xhits_gbl(,%rax,4) #endif #ifdef COUNT_IPIS movq ipi_invltlb_counts(,%rax,8),%rax incq (%rax) #endif POP_FRAME #endif pushq %rax movq %cr3, %rax /* invalidate the TLB */ movq %rax, %cr3 movq lapic, %rax movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popq %rax jmp doreti_iret /* * Single page TLB shootdown */ .text SUPERALIGN_TEXT IDTVEC(invlpg) #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) PUSH_FRAME movl PCPU(CPUID), %eax #ifdef COUNT_XINVLTLB_HITS incl xhits_pg(,%rax,4) #endif #ifdef COUNT_IPIS movq ipi_invlpg_counts(,%rax,8),%rax incq (%rax) #endif POP_FRAME #endif pushq %rax movq smp_tlb_addr1, %rax invlpg (%rax) /* invalidate single page */ movq lapic, %rax movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popq %rax jmp doreti_iret /* * Page range TLB shootdown. */ .text SUPERALIGN_TEXT IDTVEC(invlrng) #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) PUSH_FRAME movl PCPU(CPUID), %eax #ifdef COUNT_XINVLTLB_HITS incl xhits_rng(,%rax,4) #endif #ifdef COUNT_IPIS movq ipi_invlrng_counts(,%rax,8),%rax incq (%rax) #endif POP_FRAME #endif pushq %rax pushq %rdx movq smp_tlb_addr1, %rdx movq smp_tlb_addr2, %rax 1: invlpg (%rdx) /* invalidate single page */ addq $PAGE_SIZE, %rdx cmpq %rax, %rdx jb 1b movq lapic, %rax movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popq %rdx popq %rax jmp doreti_iret /* * Invalidate cache. */ .text SUPERALIGN_TEXT IDTVEC(invlcache) #ifdef COUNT_IPIS PUSH_FRAME movl PCPU(CPUID), %eax movq ipi_invlcache_counts(,%rax,8),%rax incq (%rax) POP_FRAME #endif pushq %rax wbinvd movq lapic, %rax movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popq %rax jmp doreti_iret /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ .text SUPERALIGN_TEXT IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME movq lapic, %rdx movl $0, LA_EOI(%rdx) /* End Of Interrupt to APIC */ FAKE_MCOUNT(TF_RIP(%rsp)) call ipi_bitmap_handler MEXITCOUNT jmp doreti /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ .text SUPERALIGN_TEXT IDTVEC(cpustop) PUSH_FRAME movq lapic, %rax movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ call cpustop_handler jmp doreti /* * Executed by a CPU when it receives an IPI_SUSPEND from another CPU. */ .text SUPERALIGN_TEXT IDTVEC(cpususpend) PUSH_FRAME movq lapic, %rax movl $0, LA_EOI(%rax) /* End Of Interrupt to APIC */ call cpususpend_handler POP_FRAME jmp doreti_iret /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. 
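 * This IPI is sent by smp_rendezvous() to run an action function on every CPU.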
 *
 * - Calls the generic rendezvous action function.
 */
	.text
	SUPERALIGN_TEXT
IDTVEC(rendezvous)
	PUSH_FRAME
#ifdef COUNT_IPIS
	movl	PCPU(CPUID), %eax
	movq	ipi_rendezvous_counts(,%rax,8), %rax
	incq	(%rax)
#endif
	call	smp_rendezvous_action
	movq	lapic, %rax
	movl	$0, LA_EOI(%rax)	/* End Of Interrupt to APIC */
	jmp	doreti
#endif /* SMP */
diff --git a/sys/amd64/amd64/genassym.c b/sys/amd64/amd64/genassym.c
index d6ebec52b4e1..613bce587a73 100644
--- a/sys/amd64/amd64/genassym.c
+++ b/sys/amd64/amd64/genassym.c
@@ -1,246 +1,246 @@
/*-
 * Copyright (c) 1982, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); ASSYM(MD_LDT_SD, offsetof(struct mdproc, md_ldt_sd)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_TID, offsetof(struct thread, td_tid)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN); ASSYM(TDP_KTHREAD, TDP_KTHREAD); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); ASSYM(KSTACK_PAGES, KSTACK_PAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(addr_PTmap, addr_PTmap); ASSYM(addr_PDmap, addr_PDmap); ASSYM(addr_PDPmap, addr_PDPmap); ASSYM(addr_PML4map, addr_PML4map); ASSYM(addr_PML4pml4e, addr_PML4pml4e); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PTESHIFT, PTESHIFT); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(PDPSHIFT, PDPSHIFT); ASSYM(PML4SHIFT, PML4SHIFT); ASSYM(val_KPDPI, KPDPI); ASSYM(val_KPML4I, KPML4I); ASSYM(val_PML4PML4I, PML4PML4I); ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(DMAP_MIN_ADDRESS, DMAP_MIN_ADDRESS); ASSYM(DMAP_MAX_ADDRESS, DMAP_MAX_ADDRESS); ASSYM(MCLBYTES, MCLBYTES); ASSYM(PCB_R15, offsetof(struct pcb, pcb_r15)); ASSYM(PCB_R14, offsetof(struct pcb, pcb_r14)); ASSYM(PCB_R13, offsetof(struct pcb, pcb_r13)); ASSYM(PCB_R12, offsetof(struct pcb, pcb_r12)); ASSYM(PCB_RBP, offsetof(struct pcb, pcb_rbp)); ASSYM(PCB_RSP, offsetof(struct pcb, pcb_rsp)); ASSYM(PCB_RBX, offsetof(struct pcb, pcb_rbx)); ASSYM(PCB_RIP, offsetof(struct pcb, pcb_rip)); ASSYM(PCB_FSBASE, offsetof(struct pcb, pcb_fsbase)); ASSYM(PCB_GSBASE, offsetof(struct pcb, pcb_gsbase)); ASSYM(PCB_KGSBASE, offsetof(struct pcb, pcb_kgsbase)); ASSYM(PCB_CR0, offsetof(struct pcb, pcb_cr0)); ASSYM(PCB_CR2, offsetof(struct pcb, pcb_cr2)); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_CR4, offsetof(struct pcb, pcb_cr4)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); ASSYM(PCB_GS32SD, offsetof(struct pcb, pcb_gs32sd)); ASSYM(PCB_TSSP, offsetof(struct pcb, pcb_tssp)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct savefpu)); ASSYM(PCB_FULL_IRET, 
offsetof(struct pcb, pcb_full_iret)); ASSYM(PCB_GDT, offsetof(struct pcb, pcb_gdt)); ASSYM(PCB_IDT, offsetof(struct pcb, pcb_idt)); ASSYM(PCB_LDT, offsetof(struct pcb, pcb_ldt)); ASSYM(PCB_TR, offsetof(struct pcb, pcb_tr)); ASSYM(PCB_USERFPU, offsetof(struct pcb, pcb_user_save)); ASSYM(PCB_SIZE, sizeof(struct pcb)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_32BIT, PCB_32BIT); ASSYM(PCB_GS32BIT, PCB_GS32BIT); ASSYM(PCB_FULLCTX, PCB_FULLCTX); ASSYM(COMMON_TSS_RSP0, offsetof(struct amd64tss, tss_rsp0)); ASSYM(TF_R15, offsetof(struct trapframe, tf_r15)); ASSYM(TF_R14, offsetof(struct trapframe, tf_r14)); ASSYM(TF_R13, offsetof(struct trapframe, tf_r13)); ASSYM(TF_R12, offsetof(struct trapframe, tf_r12)); ASSYM(TF_R11, offsetof(struct trapframe, tf_r11)); ASSYM(TF_R10, offsetof(struct trapframe, tf_r10)); ASSYM(TF_R9, offsetof(struct trapframe, tf_r9)); ASSYM(TF_R8, offsetof(struct trapframe, tf_r8)); ASSYM(TF_RDI, offsetof(struct trapframe, tf_rdi)); ASSYM(TF_RSI, offsetof(struct trapframe, tf_rsi)); ASSYM(TF_RBP, offsetof(struct trapframe, tf_rbp)); ASSYM(TF_RBX, offsetof(struct trapframe, tf_rbx)); ASSYM(TF_RDX, offsetof(struct trapframe, tf_rdx)); ASSYM(TF_RCX, offsetof(struct trapframe, tf_rcx)); ASSYM(TF_RAX, offsetof(struct trapframe, tf_rax)); ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ADDR, offsetof(struct trapframe, tf_addr)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_RIP, offsetof(struct trapframe, tf_rip)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_RFLAGS, offsetof(struct trapframe, tf_rflags)); ASSYM(TF_RSP, offsetof(struct trapframe, tf_rsp)); ASSYM(TF_SS, offsetof(struct trapframe, tf_ss)); ASSYM(TF_DS, offsetof(struct trapframe, tf_ds)); ASSYM(TF_ES, offsetof(struct trapframe, tf_es)); ASSYM(TF_FS, offsetof(struct trapframe, tf_fs)); ASSYM(TF_GS, offsetof(struct trapframe, tf_gs)); ASSYM(TF_FLAGS, offsetof(struct trapframe, tf_flags)); ASSYM(TF_SIZE, sizeof(struct trapframe)); ASSYM(TF_HASSEGS, TF_HASSEGS); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_rflags)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXCPU, MAXCPU); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_SCRATCH_RSP, offsetof(struct pcpu, pc_scratch_rsp)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_TSSP, offsetof(struct pcpu, pc_tssp)); ASSYM(PC_RSP0, offsetof(struct pcpu, pc_rsp0)); ASSYM(PC_FS32P, offsetof(struct pcpu, pc_fs32p)); ASSYM(PC_GS32P, offsetof(struct pcpu, pc_gs32p)); ASSYM(PC_LDT, offsetof(struct pcpu, pc_ldt)); ASSYM(PC_COMMONTSSP, offsetof(struct pcpu, pc_commontssp)); ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss)); ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); ASSYM(LA_ISR, offsetof(struct LAPIC, isr0)); ASSYM(KCSEL, GSEL(GCODE_SEL, 
SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KUCSEL, GSEL(GUCODE_SEL, SEL_UPL)); ASSYM(KUDSEL, GSEL(GUDATA_SEL, SEL_UPL)); ASSYM(KUC32SEL, GSEL(GUCODE32_SEL, SEL_UPL)); ASSYM(KUF32SEL, GSEL(GUFS32_SEL, SEL_UPL)); ASSYM(KUG32SEL, GSEL(GUGS32_SEL, SEL_UPL)); ASSYM(TSSSEL, GSEL(GPROC0_SEL, SEL_KPL)); ASSYM(LDTSEL, GSEL(GUSERLDT_SEL, SEL_KPL)); ASSYM(SEL_RPL_MASK, SEL_RPL_MASK); #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c index 26ad9fd3ccfe..3868428f97e1 100644 --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -1,1627 +1,1627 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_mp_watchdog.h" #include "opt_sched.h" #include "opt_smp.h" #include #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern struct pcpu __pcpu[]; /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; static int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; /* Temporary variables for init_secondary() */ char *doublefault_stack; char *nmi_stack; void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct pcb **susppcbs = NULL; /* Variables needed for SMP tlb shootdown. */ vm_offset_t smp_tlb_addr1; vm_offset_t smp_tlb_addr2; volatile int smp_tlb_wait; #ifdef COUNT_IPIS /* Interrupt counts. 
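 * These per-CPU counters exist only with the COUNT_IPIS option; the IPI entry
 * points (see apic_vector.S) and ipi_bitmap_handler() bump them as IPIs are
 * delivered.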
*/ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; u_long *ipi_lazypmap_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); /* * Local data and functions. */ static volatile cpumask_t ipi_nmi_pending; /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ static volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info { int cpu_present:1; int cpu_bsp:1; int cpu_disabled:1; int cpu_hyperthread:1; } static cpu_info[MAX_APIC_ID + 1]; int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; /* Holds pending bitmap based IPIs per CPU */ static volatile u_int cpu_ipi_pending[MAXCPU]; static u_int boot_address; static int cpu_logical; /* logical cpus per core */ static int cpu_cores; /* cores per package */ static void assign_cpu_ids(void); static void set_interrupt_apic_ids(void); static int start_all_aps(void); static int start_ap(int apic_id); static void release_aps(void *dummy); static int hlt_logical_cpus; static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ static cpumask_t hyperthreading_cpus_mask; static int hyperthreading_allowed = 1; static struct sysctl_ctx_list logical_cpu_clist; static u_int bootMP_size; static void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } static void topo_probe_amd(void) { /* AMD processors do not support HTT. */ cpu_cores = (amd_feature2 & AMDID2_CMP) != 0 ? (cpu_procinfo2 & AMDID_CMP_CORES) + 1 : 1; cpu_logical = 1; } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } static void topo_probe_0x4(void) { u_int p[4]; int pkg_id_bits; int core_id_bits; int max_cores; int max_logical; int id; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; /* * Because of uniformity assumption we examine only * those logical processors that belong to the same * package as BSP. Further, we count number of * logical processors that belong to the same core * as BSP thus deducing number of threads per core. */ cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; core_id_bits = mask_width(max_logical/max_cores); if (core_id_bits < 0) return; pkg_id_bits = core_id_bits + mask_width(max_cores); for (id = 0; id <= MAX_APIC_ID; id++) { /* Check logical CPU availability. */ if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) continue; /* Check if logical CPU has the same package ID. */ if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) continue; cpu_cores++; /* Check if logical CPU has the same package and core IDs. */ if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) cpu_logical++; } cpu_cores /= cpu_logical; hyperthreading_cpus = cpu_logical; } static void topo_probe_0xb(void) { u_int p[4]; int bits; int cnt; int i; int logical; int type; int x; /* We only support three levels for now. 
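 * CPUID leaf 0xb reports one level per topology layer (SMT, core, ...);
 * only the SMT and core counts are consumed below.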
 */
	for (i = 0; i < 3; i++) {
		cpuid_count(0x0b, i, p);

		/* Fall back if CPU leaf 11 doesn't really exist. */
		if (i == 0 && p[1] == 0) {
			topo_probe_0x4();
			return;
		}

		bits = p[0] & 0x1f;
		logical = p[1] &= 0xffff;
		type = (p[2] >> 8) & 0xff;
		if (type == 0 || logical == 0)
			break;
		/*
		 * Because of uniformity assumption we examine only
		 * those logical processors that belong to the same
		 * package as BSP.
		 */
		for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) {
			if (!cpu_info[x].cpu_present ||
			    cpu_info[x].cpu_disabled)
				continue;
			if (x >> bits == boot_cpu_id >> bits)
				cnt++;
		}
		if (type == CPUID_TYPE_SMT)
			cpu_logical = cnt;
		else if (type == CPUID_TYPE_CORE)
			cpu_cores = cnt;
	}
	if (cpu_logical == 0)
		cpu_logical = 1;
	cpu_cores /= cpu_logical;
}

/*
 * Both topology discovery code and code that consumes topology
 * information assume top-down uniformity of the topology.
 * That is, all physical packages must be identical and each
 * core in a package must have the same number of threads.
 * Topology information is queried only on BSP, on which this
 * code runs and for which it can query CPUID information.
 * Then topology is extrapolated on all packages using the
 * uniformity assumption.
 */
static void
topo_probe(void)
{
	static int cpu_topo_probed = 0;

	if (cpu_topo_probed)
		return;

	logical_cpus_mask = 0;
	if (cpu_vendor_id == CPU_VENDOR_AMD)
		topo_probe_amd();
	else if (cpu_vendor_id == CPU_VENDOR_INTEL) {
		/*
		 * See Intel(R) 64 Architecture Processor
		 * Topology Enumeration article for details.
		 *
		 * Note that 0x1 <= cpu_high < 4 case should be
		 * compatible with topo_probe_0x4() logic when
		 * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1)
		 * or it should trigger the fallback otherwise.
		 */
		if (cpu_high >= 0xb)
			topo_probe_0xb();
		else if (cpu_high >= 0x1)
			topo_probe_0x4();
	}

	/*
	 * Fallback: assume each logical CPU is in separate
	 * physical package. That is, no multi-core, no SMT.
	 */
	if (cpu_cores == 0)
		cpu_cores = 1;
	if (cpu_logical == 0)
		cpu_logical = 1;
	cpu_topo_probed = 1;
}

struct cpu_group *
cpu_topo(void)
{
	int cg_flags;

	/*
	 * Determine whether any threading flags are
	 * necessary.
	 */
	topo_probe();
	if (cpu_logical > 1 && hyperthreading_cpus)
		cg_flags = CG_FLAG_HTT;
	else if (cpu_logical > 1)
		cg_flags = CG_FLAG_SMT;
	else
		cg_flags = 0;
	if (mp_ncpus % (cpu_cores * cpu_logical) != 0) {
		printf("WARNING: Non-uniform processors.\n");
		printf("WARNING: Using suboptimal topology.\n");
		return (smp_topo_none());
	}

	/*
	 * No multi-core or hyper-threaded.
	 */
	if (cpu_logical * cpu_cores == 1)
		return (smp_topo_none());

	/*
	 * Only HTT no multi-core.
	 */
	if (cpu_logical > 1 && cpu_cores == 1)
		return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags));

	/*
	 * Only multi-core no HTT.
	 */
	if (cpu_cores > 1 && cpu_logical == 1)
		return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags));

	/*
	 * Both HTT and multi-core.
	 */
	return (smp_topo_2level(CG_SHARE_L2, cpu_cores,
	    CG_SHARE_L1, cpu_logical, cg_flags));
}

/*
 * Calculate usable address in base memory for AP trampoline code.
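 * The trampoline must live below 1 MB: the APs start in real mode, and a
 * STARTUP IPI vector can only name a 4 KB-aligned page in the first megabyte.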
*/ u_int mp_bootaddress(u_int basemem) { bootMP_size = mptramp_end - mptramp_start; boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */ if (((basemem * 1024) - boot_address) < bootMP_size) boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ /* 3 levels of page table pages */ mptramp_pagetables = boot_address - (PAGE_SIZE * 3); return mptramp_pagetables; } void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) { mp_ncpus++; mp_maxid = mp_ncpus -1; } if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_maxid should be already set by calls to cpu_add(). * Just sanity check its value here. */ if (mp_ncpus == 0) KASSERT(mp_maxid == 0, ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); else if (mp_ncpus == 1) mp_maxid = 0; else KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ all_cpus = 1; if (mp_ncpus == 0) { /* * No CPUs were found, so this must be a UP system. Setup * the variables to represent a system with a single CPU * with an id of 0. */ mp_ncpus = 1; return (0); } /* At least one CPU was found. */ if (mp_ncpus == 1) { /* * One CPU was found, so this must be a UP system with * an I/O APIC. */ mp_maxid = 0; return (0); } /* At least two CPUs were found. */ return (1); } /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; cpu_ipi_pending[i] = 0; } /* Install an inter-CPU IPI for TLB invalidation */ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0); setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0); setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for cache invalidation. */ setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); set_interrupt_apic_ids(); } /* * Print various information about the SMP system hardware and setup. 
*/ void cpu_mp_announce(void) { const char *hyperthread; int i; printf("FreeBSD/SMP: %d package(s) x %d core(s)", mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); if (hyperthreading_cpus > 1) printf(" x %d HTT threads", cpu_logical); else if (cpu_logical > 1) printf(" x %d SMT threads", cpu_logical); printf("\n"); /* List active CPUs first. */ printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); for (i = 1; i < mp_ncpus; i++) { if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, cpu_apic_ids[i]); } /* List disabled CPUs last. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) continue; if (cpu_info[i].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, i); } } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; struct nmi_pcpu *np; u_int64_t msr, cr0; int cpu, gsel_tss, x; struct region_descriptor ap_gdt; /* Set by the startup code for us to use */ cpu = bootAP; /* Init tss */ common_tss[cpu] = common_tss[0]; common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */ common_tss[cpu].tss_iobase = sizeof(struct amd64tss) + IOPAGES * PAGE_SIZE; common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE]; /* The NMI stack runs on IST2. */ np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1; common_tss[cpu].tss_ist2 = (long) np; /* Prepare private GDT */ gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu]; for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1)) ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]); } ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]); ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; ap_gdt.rd_base = (long) &gdt[NGDT * cpu]; lgdt(&ap_gdt); /* does magic intra-segment return */ /* Get per-cpu data */ pc = &__pcpu[cpu]; /* prime data page for it to use */ pcpu_init(pc, cpu, sizeof(struct pcpu)); dpcpu_init(dpcpu, cpu); pc->pc_apic_id = cpu_apic_ids[cpu]; pc->pc_prvspace = pc; pc->pc_curthread = 0; pc->pc_tssp = &common_tss[cpu]; pc->pc_commontssp = &common_tss[cpu]; pc->pc_rsp0 = 0; pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]; pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL]; pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL]; pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu + GUSERLDT_SEL]; /* Save the per-cpu pointer for use by the NMI handler. */ np->np_pcpu = (register_t) pc; wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */ lidt(&r_idt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); /* Set up the fast syscall stuff */ msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); /* Disable local APIC just to be sure. */ lapic_disable(); /* signal our startup to the BSP. 
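 * Bumping mp_naps below tells the BSP this AP reached init_secondary();
 * the AP then spins until release_aps() sets aps_ready.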
*/ mp_naps++; /* Spin until the BSP releases the AP's. */ while (!aps_ready) ia32_pause(); /* Initialize the PAT MSR. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX registers */ initializecpu(); /* set up FPU state on the AP */ fpuinit(); /* A quick check from sanity claus */ if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mca_init(); mtx_lock_spin(&ap_boot_mtx); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid)); printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); /* Determine if we are a logical CPU. */ /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) logical_cpus_mask |= PCPU_GET(cpumask); /* Determine if we are a hyperthread. */ if (hyperthreading_cpus > 1 && PCPU_GET(apic_id) % hyperthreading_cpus != 0) hyperthreading_cpus_mask |= PCPU_GET(cpumask); /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); smp_active = 1; /* historic */ } /* * Enable global pages TLB extension * This also implicitly flushes the TLB */ load_cr4(rcr4() | CR4_PGE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (smp_started == 0) ia32_pause(); /* Start per-CPU event timers. */ cpu_initclocks_ap(); sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } /******************************************************************* * local functions and data */ /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ static void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (hyperthreading_cpus > 1 && apic_id % hyperthreading_cpus != 0) continue; intr_add_cpu(i); } } /* * Assign logical CPU IDs to local APICs. */ static void assign_cpu_ids(void) { u_int i; TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", &hyperthreading_allowed); /* Check for explicitly disabled CPUs. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) continue; if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { cpu_info[i].cpu_hyperthread = 1; #if defined(SCHED_ULE) /* * Don't use HT CPU if it has been disabled by a * tunable. */ if (hyperthreading_allowed == 0) { cpu_info[i].cpu_disabled = 1; continue; } #endif } /* Don't use this CPU if it has been disabled by a tunable. 
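* With the standard resource_disabled() hint semantics this is the
* hint.lapic.N.disabled hint, so e.g. setting hint.lapic.2.disabled=1
* in loader.conf should keep the CPU with APIC ID 2 out of the system.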
*/ if (resource_disabled("lapic", i)) { cpu_info[i].cpu_disabled = 1; continue; } } /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. * * To minimize confusion for userland, we attempt to number * CPUs such that all threads and cores in a package are * grouped together. For now we assume that the BSP is always * the first thread in a package and just start adding APs * starting with the BSP's APIC ID. */ mp_ncpus = 1; cpu_apic_ids[0] = boot_cpu_id; apic_cpuids[boot_cpu_id] = 0; for (i = boot_cpu_id + 1; i != boot_cpu_id; i == MAX_APIC_ID ? i = 0 : i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || cpu_info[i].cpu_disabled) continue; if (mp_ncpus < MAXCPU) { cpu_apic_ids[mp_ncpus] = i; apic_cpuids[i] = mp_ncpus; mp_ncpus++; } else cpu_info[i].cpu_disabled = 1; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } /* * start each AP in our list */ static int start_all_aps(void) { vm_offset_t va = boot_address + KERNBASE; u_int64_t *pt4, *pt3, *pt2; u_int32_t mpbioswarmvec; int apic_id, cpu, i; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* install the AP 1st level boot code */ pmap_kenter(va, boot_address); pmap_invalidate_page(kernel_pmap, va); bcopy(mptramp_start, (void *)va, bootMP_size); /* Locate the page tables, they'll be below the trampoline */ pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE); pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { /* Each slot of the level 4 pages points to the same level 3 page */ pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); pt4[i] |= PG_V | PG_RW | PG_U; /* Each slot of the level 3 pages points to the same level 2 page */ pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. 
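* That is, slot 0 maps physical 0-2MB, slot 1 maps 2-4MB, and so on up
* to slot 511 at 1022-1024MB; and since every level 3 and level 4 slot
* points at the same lower-level page, that low 1GB repeats throughout
* the whole address space the trampoline sees.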
*/ pt2[i] = i * (2 * 1024 * 1024); pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; } /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; /* allocate and set up an idle stack data page */ bootstacks[cpu] = (void *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); doublefault_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE); nmi_stack = (char *)kmem_alloc(kernel_map, PAGE_SIZE); dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE); bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; bootAP = cpu; /* attempt to start the Application Processor */ if (!start_ap(apic_id)) { /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; panic("AP #%d (PHY# %d) failed!", cpu, apic_id); } all_cpus |= (1 << cpu); /* record AP in CPU map */ } /* build our map of 'other' CPUs */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); /* number of APs actually started */ return mp_naps; } /* * This function starts the AP (application processor) identified * by the APIC ID 'apic_id'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; /* * first we do an INIT/RESET IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ /* do an INIT IPI: assert RESET */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); /* wait for pending status end */ lapic_ipi_wait(-1); /* do an INIT IPI: deassert RESET */ lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0); /* wait for pending status end */ DELAY(10000); /* wait ~10mS */ lapic_ipi_wait(-1); /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched (P5 bug); this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run, and this STARTUP IPI will * run. OR the previous INIT IPI was ignored, and this STARTUP IPI * will run. */ /* do a STARTUP IPI */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); lapic_ipi_wait(-1); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. 
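*
* Condensed, the full sequence this function performs is roughly:
*
*	INIT (assert)	-> lapic_ipi_wait()
*	INIT (deassert)	-> DELAY(10000), lapic_ipi_wait()
*	STARTUP, vector	-> lapic_ipi_wait(), DELAY(200)
*	STARTUP, vector	-> lapic_ipi_wait(), DELAY(200)
*	poll mp_naps for up to five seconds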
*/ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); lapic_ipi_wait(-1); DELAY(200); /* wait ~200uS */ /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); u_int ipi_masked_global; u_int ipi_masked_page; u_int ipi_masked_range; u_int ipi_masked_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, &ipi_masked_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, &ipi_masked_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, &ipi_masked_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, &ipi_masked_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Flush the TLB on all other CPU's */ static void smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) { u_int ncpu; ncpu = mp_ncpus - 1; /* does not shootdown self */ if (ncpu < 1) return; /* no other cpus */ if (!(read_rflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); ipi_all_but_self(vector); while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } static void smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) { int ncpu, othercpus; othercpus = mp_ncpus - 1; if (mask == (cpumask_t)-1) { ncpu = othercpus; if (ncpu < 1) return; } else { mask &= ~PCPU_GET(cpumask); if (mask == 0) return; ncpu = bitcount32(mask); if (ncpu > othercpus) { /* XXX this should be a panic offence */ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", ncpu, othercpus); ncpu = othercpus; } /* XXX should be a panic, implied by mask == 0 above */ if (ncpu < 1) return; } if (!(read_rflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); if (mask == (cpumask_t)-1) ipi_all_but_self(vector); else ipi_selected(mask, vector); while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } /* * Send an IPI to specified CPU handling the bitmap logic. 
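* Bitmapped IPIs (IPI_PREEMPT, IPI_AST, IPI_HARDCLOCK) are coalesced:
* the bit is ORed into cpu_ipi_pending[cpu] with a cmpset loop, and
* only the first setter sends the actual IPI_BITMAP_VECTOR; later
* senders see old_pending != 0 and return, since a single delivery
* drains the whole bitmap in ipi_bitmap_handler().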
*/ static void ipi_send_cpu(int cpu, u_int ipi) { u_int bitmap, old_pending, new_pending; KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (old_pending) return; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } void smp_cache_flush(void) { if (smp_started) smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); } void smp_invltlb(void) { if (smp_started) { smp_tlb_shootdown(IPI_INVLTLB, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_global++; #endif } } void smp_invlpg(vm_offset_t addr) { if (smp_started) { smp_tlb_shootdown(IPI_INVLPG, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif } } void smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void smp_masked_invltlb(cpumask_t mask) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_global++; #endif } } void smp_masked_invlpg(cpumask_t mask, vm_offset_t addr) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_page++; #endif } } void smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_masked_range++; ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpumask_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) atomic_set_int(&ipi_nmi_pending, cpus); CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); while ((cpu = ffs(cpus)) != 0) { cpu--; cpus &= ~(1 << cpu); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. 
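* (ipi_nmi_handler() later tests and clears this CPU's bit in
* ipi_nmi_pending to tell a deliberate IPI_STOP_HARD apart from any
* other NMI source.)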
*/ if (ipi == IPI_STOP_HARD) atomic_set_int(&ipi_nmi_pending, 1 << cpu); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { if (IPI_IS_BITMAPED(ipi)) { ipi_selected(PCPU_GET(other_cpus), ipi); return; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus)); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int ipi_nmi_handler() { cpumask_t cpumask; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpumask = PCPU_GET(cpumask); if ((ipi_nmi_pending & cpumask) == 0) return (1); atomic_clear_int(&ipi_nmi_pending, cpumask); cpustop_handler(); return (0); } /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { cpumask_t cpumask; u_int cpu; cpu = PCPU_GET(cpuid); cpumask = PCPU_GET(cpumask); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ atomic_set_int(&stopped_cpus, cpumask); /* Wait for restart */ while (!(started_cpus & cpumask)) ia32_pause(); atomic_clear_int(&started_cpus, cpumask); atomic_clear_int(&stopped_cpus, cpumask); if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { cpumask_t cpumask; register_t cr3, rf; u_int cpu; cpu = PCPU_GET(cpuid); cpumask = PCPU_GET(cpumask); rf = intr_disable(); cr3 = rcr3(); if (savectx(susppcbs[cpu])) { wbinvd(); atomic_set_int(&stopped_cpus, cpumask); } else { PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); } /* Wait for resume */ while (!(started_cpus & cpumask)) ia32_pause(); atomic_clear_int(&started_cpus, cpumask); atomic_clear_int(&stopped_cpus, cpumask); /* Restore CR3 and enable interrupts */ load_cr3(cr3); mca_resume(); lapic_setup(0); intr_restore(rf); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); static int sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS) { cpumask_t mask; int error; mask = hlt_cpus_mask; error = sysctl_handle_int(oidp, &mask, 0, req); if (error || !req->newptr) return (error); if (logical_cpus_mask != 0 && (mask & logical_cpus_mask) == logical_cpus_mask) hlt_logical_cpus = 1; else hlt_logical_cpus = 0; if (! hyperthreading_allowed) mask |= hyperthreading_cpus_mask; if ((mask & all_cpus) == all_cpus) mask &= ~(1<<0); hlt_cpus_mask = mask; return (error); } SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_hlt_cpus, "IU", "Bitmap of CPUs to halt. 
101 (binary) will halt CPUs 0 and 2."); static int sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS) { int disable, error; disable = hlt_logical_cpus; error = sysctl_handle_int(oidp, &disable, 0, req); if (error || !req->newptr) return (error); if (disable) hlt_cpus_mask |= logical_cpus_mask; else hlt_cpus_mask &= ~logical_cpus_mask; if (! hyperthreading_allowed) hlt_cpus_mask |= hyperthreading_cpus_mask; if ((hlt_cpus_mask & all_cpus) == all_cpus) hlt_cpus_mask &= ~(1<<0); hlt_logical_cpus = disable; return (error); } static int sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS) { int allowed, error; allowed = hyperthreading_allowed; error = sysctl_handle_int(oidp, &allowed, 0, req); if (error || !req->newptr) return (error); #ifdef SCHED_ULE /* * SCHED_ULE doesn't allow enabling/disabling HT cores at * run-time. */ if (allowed != hyperthreading_allowed) return (ENOTSUP); return (error); #endif if (allowed) hlt_cpus_mask &= ~hyperthreading_cpus_mask; else hlt_cpus_mask |= hyperthreading_cpus_mask; if (logical_cpus_mask != 0 && (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask) hlt_logical_cpus = 1; else hlt_logical_cpus = 0; if ((hlt_cpus_mask & all_cpus) == all_cpus) hlt_cpus_mask &= ~(1<<0); hyperthreading_allowed = allowed; return (error); } static void cpu_hlt_setup(void *dummy __unused) { if (logical_cpus_mask != 0) { TUNABLE_INT_FETCH("machdep.hlt_logical_cpus", &hlt_logical_cpus); sysctl_ctx_init(&logical_cpu_clist); SYSCTL_ADD_PROC(&logical_cpu_clist, SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_hlt_logical_cpus, "IU", ""); SYSCTL_ADD_UINT(&logical_cpu_clist, SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD, &logical_cpus_mask, 0, ""); if (hlt_logical_cpus) hlt_cpus_mask |= logical_cpus_mask; /* * If necessary for security purposes, force * hyperthreading off, regardless of the value * of hlt_logical_cpus. */ if (hyperthreading_cpus_mask) { SYSCTL_ADD_PROC(&logical_cpu_clist, SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_hyperthreading_allowed, "IU", ""); if (! hyperthreading_allowed) hlt_cpus_mask |= hyperthreading_cpus_mask; } } } SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL); int mp_grab_cpu_hlt(void) { cpumask_t mask; #ifdef MP_WATCHDOG u_int cpuid; #endif int retval; mask = PCPU_GET(cpumask); #ifdef MP_WATCHDOG cpuid = PCPU_GET(cpuid); ap_watchdog(cpuid); #endif retval = 0; while (mask & hlt_cpus_mask) { retval = 1; __asm __volatile("sti; hlt" : : : "memory"); } return (retval); } #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. 
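* The names registered below (cpu0:invltlb, cpu0:preempt, ...) should
* show up alongside device interrupt counts, e.g. in the output of
* vmstat -i, assuming the usual intrcnt reporting path.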
*/ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:lazypmap", i); intrcnt_add(buf, &ipi_lazypmap_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif diff --git a/sys/amd64/amd64/mp_watchdog.c b/sys/amd64/amd64/mp_watchdog.c index 1803270b573f..5cbd649f7e60 100644 --- a/sys/amd64/amd64/mp_watchdog.c +++ b/sys/amd64/amd64/mp_watchdog.c @@ -1,211 +1,211 @@ /*- * Copyright (c) 2004 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_mp_watchdog.h" #include "opt_sched.h" #ifdef SCHED_ULE #error MP_WATCHDOG cannot currently be used with SCHED_ULE #endif #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include /* * mp_watchdog hijacks the idle thread on a specified CPU, prevents new work * from being scheduled there, and uses it as a "watchdog" to detect kernel * failure on other CPUs. This is made reasonable by inclusion of logical * processors in Xeon hardware. The watchdog is configured by setting the * debug.watchdog sysctl/tunable to the CPU of interest. A callout will then * begin executing, resetting a timer that is gradually lowered by the watching * thread. If the timer reaches 0, the watchdog fires by either dropping * directly to the debugger, or by sending an NMI IPI to the boot processor. * This is a somewhat less efficient substitute for dedicated watchdog * hardware, but can be quite an effective tool for debugging hangs. * * XXXRW: This should really use the watchdog(9)/watchdog(4) framework, but * doesn't yet. 
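*
* Typical use is something like (illustrative):
*
*	sysctl debug.watchdog=1		# watch from CPU 1
*	sysctl debug.watchdog_nmi=1	# NMI the BSP when it fires
*	sysctl debug.watchdog=-1	# disable again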
*/ static int watchdog_cpu = -1; static int watchdog_dontfire = 1; static int watchdog_timer = -1; static int watchdog_nmi = 1; TUNABLE_INT("debug.watchdog", &watchdog_cpu); SYSCTL_INT(_debug, OID_AUTO, watchdog_nmi, CTLFLAG_RW, &watchdog_nmi, 0, "IPI the boot processor with an NMI to enter the debugger"); static struct callout watchdog_callout; static void watchdog_change(int wdcpu); /* * Number of seconds before the watchdog will fire if the callout fails to * reset the timer. */ #define WATCHDOG_THRESHOLD 10 static void watchdog_init(void *arg) { callout_init(&watchdog_callout, CALLOUT_MPSAFE); if (watchdog_cpu != -1) watchdog_change(watchdog_cpu); } /* * This callout resets a timer until the watchdog kicks in. It acquires some * critical locks to make sure things haven't gotten wedged with those locks * held. */ static void watchdog_function(void *arg) { /* * Since the timer ran, we must not be wedged. Acquire some critical * locks to make sure. Then reset the timer. */ mtx_lock(&Giant); watchdog_timer = WATCHDOG_THRESHOLD; mtx_unlock(&Giant); callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL); } SYSINIT(watchdog_init, SI_SUB_DRIVERS, SI_ORDER_ANY, watchdog_init, NULL); static void watchdog_change(int wdcpu) { if (wdcpu == -1 || wdcpu == 0xffffffff) { /* * Disable the watchdog. */ watchdog_cpu = -1; watchdog_dontfire = 1; callout_stop(&watchdog_callout); printf("watchdog stopped\n"); } else { watchdog_timer = WATCHDOG_THRESHOLD; watchdog_dontfire = 0; watchdog_cpu = wdcpu; callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL); } } /* * This sysctl sets which CPU is the watchdog CPU. Set to -1 or 0xffffffff * to disable the watchdog. */ static int sysctl_watchdog(SYSCTL_HANDLER_ARGS) { int error, temp; temp = watchdog_cpu; error = sysctl_handle_int(oidp, &temp, 0, req); if (error) return (error); if (req->newptr != NULL) watchdog_change(temp); return (0); } SYSCTL_PROC(_debug, OID_AUTO, watchdog, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_watchdog, "I", ""); /* * Drop into the debugger by sending an IPI NMI to the boot processor. */ static void watchdog_ipi_nmi(void) { /* * Deliver NMI to the boot processor. Why not? */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_NMI, boot_cpu_id); lapic_ipi_wait(-1); } /* * ap_watchdog() is called by the SMP idle loop code. It works on the same * premise that the disabling of logical processors does: that if the cpu is * idle, then it can ignore the world from then on, as nothing will be * scheduled on it. Leaving aside multi-runqueue schedulers (SCHED_ULE) and * explicit process migration (sched_bind()), this is not an unreasonable * assumption. */ void ap_watchdog(u_int cpuid) { char old_pcomm[MAXCOMLEN + 1]; struct proc *p; if (watchdog_cpu != cpuid) return; printf("watchdog started on cpu %d\n", cpuid); p = curproc; bcopy(p->p_comm, old_pcomm, MAXCOMLEN + 1); snprintf(p->p_comm, MAXCOMLEN + 1, "mp_watchdog cpu %d", cpuid); while (1) { DELAY(1000000); /* One second. 
*/ if (watchdog_cpu != cpuid) break; atomic_subtract_int(&watchdog_timer, 1); if (watchdog_timer < 4) printf("Watchdog timer: %d\n", watchdog_timer); if (watchdog_timer == 0 && watchdog_dontfire == 0) { printf("Watchdog firing!\n"); watchdog_dontfire = 1; if (watchdog_nmi) watchdog_ipi_nmi(); else kdb_enter(KDB_WHY_WATCHDOG, "mp_watchdog"); } } bcopy(old_pcomm, p->p_comm, MAXCOMLEN + 1); printf("watchdog stopped on cpu %d\n", cpuid); } diff --git a/sys/i386/acpica/madt.c b/sys/i386/acpica/madt.c index 5013c21287a8..f153696690e2 100644 --- a/sys/i386/acpica/madt.c +++ b/sys/i386/acpica/madt.c @@ -1,573 +1,573 @@ /*- * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include /* These two arrays are indexed by APIC IDs. 
*/ struct ioapic_info { void *io_apic; UINT32 io_vector; } ioapics[MAX_APIC_ID + 1]; struct lapic_info { u_int la_enabled:1; u_int la_acpi_id:8; } lapics[MAX_APIC_ID + 1]; static int madt_found_sci_override; static ACPI_TABLE_MADT *madt; static vm_paddr_t madt_physaddr; static vm_offset_t madt_length; MALLOC_DEFINE(M_MADT, "madt_table", "ACPI MADT Table Items"); static enum intr_polarity interrupt_polarity(UINT16 IntiFlags, UINT8 Source); static enum intr_trigger interrupt_trigger(UINT16 IntiFlags, UINT8 Source); static int madt_find_cpu(u_int acpi_id, u_int *apic_id); static int madt_find_interrupt(int intr, void **apic, u_int *pin); static void madt_parse_apics(ACPI_SUBTABLE_HEADER *entry, void *arg); static void madt_parse_interrupt_override( ACPI_MADT_INTERRUPT_OVERRIDE *intr); static void madt_parse_ints(ACPI_SUBTABLE_HEADER *entry, void *arg __unused); static void madt_parse_local_nmi(ACPI_MADT_LOCAL_APIC_NMI *nmi); static void madt_parse_nmi(ACPI_MADT_NMI_SOURCE *nmi); static int madt_probe(void); static int madt_probe_cpus(void); static void madt_probe_cpus_handler(ACPI_SUBTABLE_HEADER *entry, void *arg __unused); static void madt_register(void *dummy); static int madt_setup_local(void); static int madt_setup_io(void); static void madt_walk_table(acpi_subtable_handler *handler, void *arg); static struct apic_enumerator madt_enumerator = { "MADT", madt_probe, madt_probe_cpus, madt_setup_local, madt_setup_io }; /* * Look for an ACPI Multiple APIC Description Table ("APIC") */ static int madt_probe(void) { madt_physaddr = acpi_find_table(ACPI_SIG_MADT); if (madt_physaddr == 0) return (ENXIO); return (0); } /* * Run through the MADT enumerating CPUs. */ static int madt_probe_cpus(void) { madt = acpi_map_table(madt_physaddr, ACPI_SIG_MADT); KASSERT(madt != NULL, ("Unable to re-map MADT")); madt_length = madt->Header.Length; madt_walk_table(madt_probe_cpus_handler, NULL); acpi_unmap_table(madt); madt = NULL; return (0); } /* * Initialize the local APIC on the BSP. */ static int madt_setup_local(void) { madt = pmap_mapbios(madt_physaddr, madt_length); lapic_init(madt->Address); printf("ACPI APIC Table: <%.*s %.*s>\n", (int)sizeof(madt->Header.OemId), madt->Header.OemId, (int)sizeof(madt->Header.OemTableId), madt->Header.OemTableId); /* * We ignore 64-bit local APIC override entries. Should we * perhaps emit a warning here if we find one? */ return (0); } /* * Enumerate I/O APICs and setup interrupt sources. */ static int madt_setup_io(void) { void *ioapic; u_int pin; int i; /* Try to initialize ACPI so that we can access the FADT. */ i = acpi_Startup(); if (ACPI_FAILURE(i)) { printf("MADT: ACPI Startup failed with %s\n", AcpiFormatException(i)); printf("Try disabling either ACPI or apic support.\n"); panic("Using MADT but ACPI doesn't work"); } /* First, we run through adding I/O APIC's. */ madt_walk_table(madt_parse_apics, NULL); /* Second, we run through the table tweaking interrupt sources. */ madt_walk_table(madt_parse_ints, NULL); /* * If there was not an explicit override entry for the SCI, * force it to use level trigger and active-low polarity. */ if (!madt_found_sci_override) { if (madt_find_interrupt(AcpiGbl_FADT.SciInterrupt, &ioapic, &pin) != 0) printf("MADT: Could not find APIC for SCI IRQ %u\n", AcpiGbl_FADT.SciInterrupt); else { printf( "MADT: Forcing active-low polarity and level trigger for SCI\n"); ioapic_set_polarity(ioapic, pin, INTR_POLARITY_LOW); ioapic_set_triggermode(ioapic, pin, INTR_TRIGGER_LEVEL); } } /* Third, we register all the I/O APIC's. 
*/ for (i = 0; i <= MAX_APIC_ID; i++) if (ioapics[i].io_apic != NULL) ioapic_register(ioapics[i].io_apic); /* Finally, we throw the switch to enable the I/O APIC's. */ acpi_SetDefaultIntrModel(ACPI_INTR_APIC); return (0); } static void madt_register(void *dummy __unused) { apic_register_enumerator(&madt_enumerator); } SYSINIT(madt_register, SI_SUB_CPU - 1, SI_ORDER_SECOND, madt_register, NULL); /* * Call the handler routine for each entry in the MADT table. */ static void madt_walk_table(acpi_subtable_handler *handler, void *arg) { acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length, handler, arg); } static void madt_probe_cpus_handler(ACPI_SUBTABLE_HEADER *entry, void *arg) { ACPI_MADT_LOCAL_APIC *proc; struct lapic_info *la; switch (entry->Type) { case ACPI_MADT_TYPE_LOCAL_APIC: /* * The MADT does not include a BSP flag, so we have to * let the MP code figure out which CPU is the BSP on * its own. */ proc = (ACPI_MADT_LOCAL_APIC *)entry; if (bootverbose) printf("MADT: Found CPU APIC ID %u ACPI ID %u: %s\n", proc->Id, proc->ProcessorId, (proc->LapicFlags & ACPI_MADT_ENABLED) ? "enabled" : "disabled"); if (!(proc->LapicFlags & ACPI_MADT_ENABLED)) break; if (proc->Id > MAX_APIC_ID) panic("%s: CPU ID %u too high", __func__, proc->Id); la = &lapics[proc->Id]; KASSERT(la->la_enabled == 0, ("Duplicate local APIC ID %u", proc->Id)); la->la_enabled = 1; la->la_acpi_id = proc->ProcessorId; lapic_create(proc->Id, 0); break; } } /* * Add an I/O APIC from an entry in the table. */ static void madt_parse_apics(ACPI_SUBTABLE_HEADER *entry, void *arg __unused) { ACPI_MADT_IO_APIC *apic; switch (entry->Type) { case ACPI_MADT_TYPE_IO_APIC: apic = (ACPI_MADT_IO_APIC *)entry; if (bootverbose) printf( "MADT: Found IO APIC ID %u, Interrupt %u at %p\n", apic->Id, apic->GlobalIrqBase, (void *)(uintptr_t)apic->Address); if (apic->Id > MAX_APIC_ID) panic("%s: I/O APIC ID %u too high", __func__, apic->Id); if (ioapics[apic->Id].io_apic != NULL) panic("%s: Double APIC ID %u", __func__, apic->Id); if (apic->GlobalIrqBase >= FIRST_MSI_INT) { printf("MADT: Ignoring bogus I/O APIC ID %u", apic->Id); break; } ioapics[apic->Id].io_apic = ioapic_create(apic->Address, apic->Id, apic->GlobalIrqBase); ioapics[apic->Id].io_vector = apic->GlobalIrqBase; break; default: break; } } /* * Determine properties of an interrupt source. Note that for ACPI these * functions are only used for ISA interrupts, so we assume ISA bus values * (Active Hi, Edge Triggered) for conforming values except for the ACPI * SCI for which we use Active Lo, Level Triggered. */ static enum intr_polarity interrupt_polarity(UINT16 IntiFlags, UINT8 Source) { switch (IntiFlags & ACPI_MADT_POLARITY_MASK) { case ACPI_MADT_POLARITY_CONFORMS: if (Source == AcpiGbl_FADT.SciInterrupt) return (INTR_POLARITY_LOW); else return (INTR_POLARITY_HIGH); case ACPI_MADT_POLARITY_ACTIVE_HIGH: return (INTR_POLARITY_HIGH); case ACPI_MADT_POLARITY_ACTIVE_LOW: return (INTR_POLARITY_LOW); default: panic("Bogus Interrupt Polarity"); } } static enum intr_trigger interrupt_trigger(UINT16 IntiFlags, UINT8 Source) { switch (IntiFlags & ACPI_MADT_TRIGGER_MASK) { case ACPI_MADT_TRIGGER_CONFORMS: if (Source == AcpiGbl_FADT.SciInterrupt) return (INTR_TRIGGER_LEVEL); else return (INTR_TRIGGER_EDGE); case ACPI_MADT_TRIGGER_EDGE: return (INTR_TRIGGER_EDGE); case ACPI_MADT_TRIGGER_LEVEL: return (INTR_TRIGGER_LEVEL); default: panic("Bogus Interrupt Trigger Mode"); } } /* * Find the local APIC ID associated with a given ACPI Processor ID. 
*/ static int madt_find_cpu(u_int acpi_id, u_int *apic_id) { int i; for (i = 0; i <= MAX_APIC_ID; i++) { if (!lapics[i].la_enabled) continue; if (lapics[i].la_acpi_id != acpi_id) continue; *apic_id = i; return (0); } return (ENOENT); } /* * Find the IO APIC and pin on that APIC associated with a given global * interrupt. */ static int madt_find_interrupt(int intr, void **apic, u_int *pin) { int i, best; best = -1; for (i = 0; i <= MAX_APIC_ID; i++) { if (ioapics[i].io_apic == NULL || ioapics[i].io_vector > intr) continue; if (best == -1 || ioapics[best].io_vector < ioapics[i].io_vector) best = i; } if (best == -1) return (ENOENT); *apic = ioapics[best].io_apic; *pin = intr - ioapics[best].io_vector; if (*pin > 32) printf("WARNING: Found intpin of %u for vector %d\n", *pin, intr); return (0); } /* * Parse an interrupt source override for an ISA interrupt. */ static void madt_parse_interrupt_override(ACPI_MADT_INTERRUPT_OVERRIDE *intr) { void *new_ioapic, *old_ioapic; u_int new_pin, old_pin; enum intr_trigger trig; enum intr_polarity pol; char buf[64]; if (acpi_quirks & ACPI_Q_MADT_IRQ0 && intr->SourceIrq == 0 && intr->GlobalIrq == 2) { if (bootverbose) printf("MADT: Skipping timer override\n"); return; } if (bootverbose) printf("MADT: Interrupt override: source %u, irq %u\n", intr->SourceIrq, intr->GlobalIrq); KASSERT(intr->Bus == 0, ("bus for interrupt overrides must be zero")); if (madt_find_interrupt(intr->GlobalIrq, &new_ioapic, &new_pin) != 0) { printf("MADT: Could not find APIC for vector %u (IRQ %u)\n", intr->GlobalIrq, intr->SourceIrq); return; } /* * Lookup the appropriate trigger and polarity modes for this * entry. */ trig = interrupt_trigger(intr->IntiFlags, intr->SourceIrq); pol = interrupt_polarity(intr->IntiFlags, intr->SourceIrq); /* * If the SCI is identity mapped but has edge trigger and * active-hi polarity or the force_sci_lo tunable is set, * force it to use level/lo. */ if (intr->SourceIrq == AcpiGbl_FADT.SciInterrupt) { madt_found_sci_override = 1; if (getenv_string("hw.acpi.sci.trigger", buf, sizeof(buf))) { if (tolower(buf[0]) == 'e') trig = INTR_TRIGGER_EDGE; else if (tolower(buf[0]) == 'l') trig = INTR_TRIGGER_LEVEL; else panic( "Invalid trigger %s: must be 'edge' or 'level'", buf); printf("MADT: Forcing SCI to %s trigger\n", trig == INTR_TRIGGER_EDGE ? "edge" : "level"); } if (getenv_string("hw.acpi.sci.polarity", buf, sizeof(buf))) { if (tolower(buf[0]) == 'h') pol = INTR_POLARITY_HIGH; else if (tolower(buf[0]) == 'l') pol = INTR_POLARITY_LOW; else panic( "Invalid polarity %s: must be 'high' or 'low'", buf); printf("MADT: Forcing SCI to active %s polarity\n", pol == INTR_POLARITY_HIGH ? "high" : "low"); } } /* Remap the IRQ if it is mapped to a different interrupt vector. */ if (intr->SourceIrq != intr->GlobalIrq) { /* * If the SCI is remapped to a non-ISA global interrupt, * then override the vector we use to setup and allocate * the interrupt. */ if (intr->GlobalIrq > 15 && intr->SourceIrq == AcpiGbl_FADT.SciInterrupt) acpi_OverrideInterruptLevel(intr->GlobalIrq); else ioapic_remap_vector(new_ioapic, new_pin, intr->SourceIrq); if (madt_find_interrupt(intr->SourceIrq, &old_ioapic, &old_pin) != 0) printf("MADT: Could not find APIC for source IRQ %u\n", intr->SourceIrq); else if (ioapic_get_vector(old_ioapic, old_pin) == intr->SourceIrq) ioapic_disable_pin(old_ioapic, old_pin); } /* Program the polarity and trigger mode. 
*/ ioapic_set_triggermode(new_ioapic, new_pin, trig); ioapic_set_polarity(new_ioapic, new_pin, pol); } /* * Parse an entry for an NMI routed to an IO APIC. */ static void madt_parse_nmi(ACPI_MADT_NMI_SOURCE *nmi) { void *ioapic; u_int pin; if (madt_find_interrupt(nmi->GlobalIrq, &ioapic, &pin) != 0) { printf("MADT: Could not find APIC for vector %u\n", nmi->GlobalIrq); return; } ioapic_set_nmi(ioapic, pin); if (!(nmi->IntiFlags & ACPI_MADT_TRIGGER_CONFORMS)) ioapic_set_triggermode(ioapic, pin, interrupt_trigger(nmi->IntiFlags, 0)); if (!(nmi->IntiFlags & ACPI_MADT_POLARITY_CONFORMS)) ioapic_set_polarity(ioapic, pin, interrupt_polarity(nmi->IntiFlags, 0)); } /* * Parse an entry for an NMI routed to a local APIC LVT pin. */ static void madt_parse_local_nmi(ACPI_MADT_LOCAL_APIC_NMI *nmi) { u_int apic_id, pin; if (nmi->ProcessorId == 0xff) apic_id = APIC_ID_ALL; else if (madt_find_cpu(nmi->ProcessorId, &apic_id) != 0) { if (bootverbose) printf("MADT: Ignoring local NMI routed to " "ACPI CPU %u\n", nmi->ProcessorId); return; } if (nmi->Lint == 0) pin = LVT_LINT0; else pin = LVT_LINT1; lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_NMI); if (!(nmi->IntiFlags & ACPI_MADT_TRIGGER_CONFORMS)) lapic_set_lvt_triggermode(apic_id, pin, interrupt_trigger(nmi->IntiFlags, 0)); if (!(nmi->IntiFlags & ACPI_MADT_POLARITY_CONFORMS)) lapic_set_lvt_polarity(apic_id, pin, interrupt_polarity(nmi->IntiFlags, 0)); } /* * Parse interrupt entries. */ static void madt_parse_ints(ACPI_SUBTABLE_HEADER *entry, void *arg __unused) { switch (entry->Type) { case ACPI_MADT_TYPE_INTERRUPT_OVERRIDE: madt_parse_interrupt_override( (ACPI_MADT_INTERRUPT_OVERRIDE *)entry); break; case ACPI_MADT_TYPE_NMI_SOURCE: madt_parse_nmi((ACPI_MADT_NMI_SOURCE *)entry); break; case ACPI_MADT_TYPE_LOCAL_APIC_NMI: madt_parse_local_nmi((ACPI_MADT_LOCAL_APIC_NMI *)entry); break; } } /* * Setup per-CPU ACPI IDs. */ static void madt_set_ids(void *dummy) { struct lapic_info *la; struct pcpu *pc; u_int i; if (madt == NULL) return; CPU_FOREACH(i) { pc = pcpu_find(i); KASSERT(pc != NULL, ("no pcpu data for CPU %u", i)); la = &lapics[pc->pc_apic_id]; if (!la->la_enabled) panic("APIC: CPU with APIC ID %u is not enabled", pc->pc_apic_id); pc->pc_acpi_id = la->la_acpi_id; if (bootverbose) printf("APIC: CPU %u has ACPI ID %u\n", i, la->la_acpi_id); } } SYSINIT(madt_set_ids, SI_SUB_CPU, SI_ORDER_ANY, madt_set_ids, NULL); diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index cbe3871b9ad0..100ce90331e1 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -1,378 +1,378 @@ /*- * Copyright (c) 1989, 1990 William F. Jolitz. * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: vector.s, 386BSD 0.1 unknown origin * $FreeBSD$ */ /* * Interrupt entry points for external interrupts triggered by I/O APICs * as well as IPI handlers. */ #include "opt_smp.h" #include -#include +#include #include "assym.s" /* * I/O Interrupt Entry Point. Rather than having one entry point for * each interrupt source, we use one entry point for each 32-bit word * in the ISR. The handler determines the highest bit set in the ISR, * translates that into a vector, and passes the vector to the * lapic_handle_intr() function. */ #define ISR_VEC(index, vec_name) \ .text ; \ SUPERALIGN_TEXT ; \ IDTVEC(vec_name) ; \ PUSH_FRAME ; \ SET_KERNEL_SREGS ; \ cld ; \ FAKE_MCOUNT(TF_EIP(%esp)) ; \ movl lapic, %edx ; /* pointer to local APIC */ \ movl LA_ISR + 16 * (index)(%edx), %eax ; /* load ISR */ \ bsrl %eax, %eax ; /* index of highest set bit in ISR */ \ jz 2f ; \ addl $(32 * index),%eax ; \ 1: ; \ pushl %esp ; \ pushl %eax ; /* pass the IRQ */ \ call lapic_handle_intr ; \ addl $8, %esp ; /* discard parameter */ \ MEXITCOUNT ; \ jmp doreti ; \ 2: movl $-1, %eax ; /* send a vector of -1 */ \ jmp 1b /* * Handle "spurious INTerrupts". * Notes: * This is different from the "spurious INTerrupt" generated by an * 8259 PIC for missing INTs. See the APIC documentation for details. * This routine should NOT do an 'EOI' cycle. */ .text SUPERALIGN_TEXT IDTVEC(spuriousint) /* No EOI cycle used here */ iret ISR_VEC(1, apic_isr1) ISR_VEC(2, apic_isr2) ISR_VEC(3, apic_isr3) ISR_VEC(4, apic_isr4) ISR_VEC(5, apic_isr5) ISR_VEC(6, apic_isr6) ISR_VEC(7, apic_isr7) /* * Local APIC periodic timer handler. */ .text SUPERALIGN_TEXT IDTVEC(timerint) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) pushl %esp call lapic_handle_timer add $4, %esp MEXITCOUNT jmp doreti /* * Local APIC CMCI handler. */ .text SUPERALIGN_TEXT IDTVEC(cmcint) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) call lapic_handle_cmc MEXITCOUNT jmp doreti /* * Local APIC error interrupt handler. */ .text SUPERALIGN_TEXT IDTVEC(errorint) PUSH_FRAME SET_KERNEL_SREGS cld FAKE_MCOUNT(TF_EIP(%esp)) call lapic_handle_error MEXITCOUNT jmp doreti #ifdef SMP /* * Global address space TLB shootdown. 
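* Each shootdown handler below follows the same pattern: perform the
* invalidation, EOI the local APIC, then "lock incl smp_tlb_wait" so
* that the initiating CPU, spinning in smp_tlb_shootdown(), can tell
* when every target has finished.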
*/ .text SUPERALIGN_TEXT IDTVEC(invltlb) pushl %eax pushl %ds movl $KDSEL, %eax /* Kernel data selector */ movl %eax, %ds #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) pushl %fs movl $KPSEL, %eax /* Private space selector */ movl %eax, %fs movl PCPU(CPUID), %eax popl %fs #ifdef COUNT_XINVLTLB_HITS incl xhits_gbl(,%eax,4) #endif #ifdef COUNT_IPIS movl ipi_invltlb_counts(,%eax,4),%eax incl (%eax) #endif #endif movl %cr3, %eax /* invalidate the TLB */ movl %eax, %cr3 movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popl %ds popl %eax iret /* * Single page TLB shootdown */ .text SUPERALIGN_TEXT IDTVEC(invlpg) pushl %eax pushl %ds movl $KDSEL, %eax /* Kernel data selector */ movl %eax, %ds #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) pushl %fs movl $KPSEL, %eax /* Private space selector */ movl %eax, %fs movl PCPU(CPUID), %eax popl %fs #ifdef COUNT_XINVLTLB_HITS incl xhits_pg(,%eax,4) #endif #ifdef COUNT_IPIS movl ipi_invlpg_counts(,%eax,4),%eax incl (%eax) #endif #endif movl smp_tlb_addr1, %eax invlpg (%eax) /* invalidate single page */ movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popl %ds popl %eax iret /* * Page range TLB shootdown. */ .text SUPERALIGN_TEXT IDTVEC(invlrng) pushl %eax pushl %edx pushl %ds movl $KDSEL, %eax /* Kernel data selector */ movl %eax, %ds #if defined(COUNT_XINVLTLB_HITS) || defined(COUNT_IPIS) pushl %fs movl $KPSEL, %eax /* Private space selector */ movl %eax, %fs movl PCPU(CPUID), %eax popl %fs #ifdef COUNT_XINVLTLB_HITS incl xhits_rng(,%eax,4) #endif #ifdef COUNT_IPIS movl ipi_invlrng_counts(,%eax,4),%eax incl (%eax) #endif #endif movl smp_tlb_addr1, %edx movl smp_tlb_addr2, %eax 1: invlpg (%edx) /* invalidate single page */ addl $PAGE_SIZE, %edx cmpl %eax, %edx jb 1b movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popl %ds popl %edx popl %eax iret /* * Invalidate cache. */ .text SUPERALIGN_TEXT IDTVEC(invlcache) pushl %eax pushl %ds movl $KDSEL, %eax /* Kernel data selector */ movl %eax, %ds #ifdef COUNT_IPIS pushl %fs movl $KPSEL, %eax /* Private space selector */ movl %eax, %fs movl PCPU(CPUID), %eax popl %fs movl ipi_invlcache_counts(,%eax,4),%eax incl (%eax) #endif wbinvd movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ lock incl smp_tlb_wait popl %ds popl %eax iret /* * Handler for IPIs sent via the per-cpu IPI bitmap. */ #ifndef XEN .text SUPERALIGN_TEXT IDTVEC(ipi_intr_bitmap_handler) PUSH_FRAME SET_KERNEL_SREGS cld movl lapic, %edx movl $0, LA_EOI(%edx) /* End Of Interrupt to APIC */ FAKE_MCOUNT(TF_EIP(%esp)) call ipi_bitmap_handler MEXITCOUNT jmp doreti #endif /* * Executed by a CPU when it receives an IPI_STOP from another CPU. */ .text SUPERALIGN_TEXT IDTVEC(cpustop) PUSH_FRAME SET_KERNEL_SREGS cld movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ call cpustop_handler POP_FRAME iret /* * Executed by a CPU when it receives a RENDEZVOUS IPI from another CPU. * * - Calls the generic rendezvous action function. */ .text SUPERALIGN_TEXT IDTVEC(rendezvous) PUSH_FRAME SET_KERNEL_SREGS cld #ifdef COUNT_IPIS movl PCPU(CPUID), %eax movl ipi_rendezvous_counts(,%eax,4), %eax incl (%eax) #endif call smp_rendezvous_action movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ POP_FRAME iret /* * Clean up when we lose out on the lazy context switch optimization. * ie: when we are about to release a PTD but a cpu is still borrowing it. 
*/ SUPERALIGN_TEXT IDTVEC(lazypmap) PUSH_FRAME SET_KERNEL_SREGS cld call pmap_lazyfix_action movl lapic, %eax movl $0, LA_EOI(%eax) /* End Of Interrupt to APIC */ POP_FRAME iret #endif /* SMP */ diff --git a/sys/i386/i386/genassym.c b/sys/i386/i386/genassym.c index fae78339809b..523b194fabc4 100644 --- a/sys/i386/i386/genassym.c +++ b/sys/i386/i386/genassym.c @@ -1,244 +1,244 @@ /*- * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_kstack_pages.h" #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_APIC -#include +#include #endif #include #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(TD_FLAGS, offsetof(struct thread, td_flags)); ASSYM(TD_LOCK, offsetof(struct thread, td_lock)); ASSYM(TD_PCB, offsetof(struct thread, td_pcb)); ASSYM(TD_PFLAGS, offsetof(struct thread, td_pflags)); ASSYM(TD_PROC, offsetof(struct thread, td_proc)); ASSYM(TD_MD, offsetof(struct thread, td_md)); ASSYM(TD_TID, offsetof(struct thread, td_tid)); ASSYM(TDP_CALLCHAIN, TDP_CALLCHAIN); ASSYM(P_MD, offsetof(struct proc, p_md)); ASSYM(MD_LDT, offsetof(struct mdproc, md_ldt)); ASSYM(TDF_ASTPENDING, TDF_ASTPENDING); ASSYM(TDF_NEEDRESCHED, TDF_NEEDRESCHED); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); /* ASSYM(UPAGES, UPAGES);*/ ASSYM(KSTACK_PAGES, KSTACK_PAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(NPDEPTD, NPDEPTD); ASSYM(NPGPTD, NPGPTD); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PDESHIFT, PDESHIFT); ASSYM(PTESHIFT, PTESHIFT); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(PDRMASK, PDRMASK); ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(KERNLOAD, KERNLOAD); ASSYM(MCLBYTES, MCLBYTES); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_USERFPU, offsetof(struct pcb, pcb_user_save)); ASSYM(PCB_PSL, offsetof(struct pcb, pcb_psl)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_FSD, offsetof(struct pcb, pcb_fsd)); ASSYM(PCB_VM86, offsetof(struct pcb, pcb_vm86)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_save)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(union savefpu)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); ASSYM(PCB_SIZE, sizeof(struct pcb)); ASSYM(PCB_VM86CALL, PCB_VM86CALL); ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_EIP, offsetof(struct trapframe, tf_eip)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(SIGF_HANDLER, offsetof(struct 
sigframe, sf_ahu.sf_handler)); #ifdef COMPAT_43 ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); #endif ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); #ifdef COMPAT_FREEBSD4 ASSYM(SIGF_UC4, offsetof(struct sigframe4, sf_uc)); #endif #ifdef COMPAT_43 ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); #endif #ifdef COMPAT_FREEBSD4 ASSYM(UC4_EFLAGS, offsetof(struct ucontext4, uc_mcontext.mc_eflags)); ASSYM(UC4_GS, offsetof(struct ucontext4, uc_mcontext.mc_gs)); #endif ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXCPU, MAXCPU); ASSYM(MAXCOMLEN, MAXCOMLEN); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(PC_SIZEOF, sizeof(struct pcpu)); ASSYM(PC_PRVSPACE, offsetof(struct pcpu, pc_prvspace)); ASSYM(PC_CURTHREAD, offsetof(struct pcpu, pc_curthread)); ASSYM(PC_FPCURTHREAD, offsetof(struct pcpu, pc_fpcurthread)); ASSYM(PC_IDLETHREAD, offsetof(struct pcpu, pc_idlethread)); ASSYM(PC_CURPCB, offsetof(struct pcpu, pc_curpcb)); ASSYM(PC_COMMON_TSS, offsetof(struct pcpu, pc_common_tss)); ASSYM(PC_COMMON_TSSD, offsetof(struct pcpu, pc_common_tssd)); ASSYM(PC_TSS_GDT, offsetof(struct pcpu, pc_tss_gdt)); ASSYM(PC_FSGS_GDT, offsetof(struct pcpu, pc_fsgs_gdt)); ASSYM(PC_CURRENTLDT, offsetof(struct pcpu, pc_currentldt)); ASSYM(PC_CPUID, offsetof(struct pcpu, pc_cpuid)); ASSYM(PC_CURPMAP, offsetof(struct pcpu, pc_curpmap)); ASSYM(PC_PRIVATE_TSS, offsetof(struct pcpu, pc_private_tss)); #ifdef DEV_APIC ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); ASSYM(LA_ISR, offsetof(struct LAPIC, isr0)); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); #ifdef PC98 #include ASSYM(BUS_SPACE_HANDLE_BASE, offsetof(struct bus_space_handle, bsh_base)); ASSYM(BUS_SPACE_HANDLE_IAT, offsetof(struct bus_space_handle, bsh_iat)); #endif #ifdef XEN #include ASSYM(PC_CR3, offsetof(struct pcpu, pc_cr3)); ASSYM(HYPERVISOR_VIRT_START, __HYPERVISOR_VIRT_START); #endif #ifdef HWPMC_HOOKS ASSYM(PMC_FN_USER_CALLCHAIN, PMC_FN_USER_CALLCHAIN); #endif diff --git a/sys/i386/i386/mp_machdep.c b/sys/i386/i386/mp_machdep.c index d98660975f93..b99c19caede4 100644 --- a/sys/i386/i386/mp_machdep.c +++ b/sys/i386/i386/mp_machdep.c @@ -1,1674 +1,1674 @@ /*- * Copyright (c) 1996, by Steve Passe * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_mp_watchdog.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #if !defined(lint) #if !defined(SMP) #error How did you get here? #endif #ifndef DEV_APIC #error The apic device is required for SMP, add "device apic" to your config file. #endif #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) #error SMP not supported with CPU_DISABLE_CMPXCHG #endif #endif /* not lint */ #include #include #include #include /* cngetc() */ #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) && !defined(PC98) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #define CHECK_WRITE(A, D) #endif /* CHECK_POINTS */ /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern struct pcpu __pcpu[]; /* AP uses this during bootstrap. Do not staticize. 
*/ char *bootSTK; static int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; static void *dpcpu; /* Hotwire a 0->4MB V==P mapping */ extern pt_entry_t *KPTphys; struct pcb stoppcbs[MAXCPU]; /* Variables needed for SMP tlb shootdown. */ vm_offset_t smp_tlb_addr1; vm_offset_t smp_tlb_addr2; volatile int smp_tlb_wait; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; u_long *ipi_lazypmap_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* * Local data and functions. */ static volatile cpumask_t ipi_nmi_pending; /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ static volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info { int cpu_present:1; int cpu_bsp:1; int cpu_disabled:1; int cpu_hyperthread:1; } static cpu_info[MAX_APIC_ID + 1]; int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; /* Holds pending bitmap based IPIs per CPU */ static volatile u_int cpu_ipi_pending[MAXCPU]; static u_int boot_address; static int cpu_logical; /* logical cpus per core */ static int cpu_cores; /* cores per package */ static void assign_cpu_ids(void); static void install_ap_tramp(void); static void set_interrupt_apic_ids(void); static int start_all_aps(void); static int start_ap(int apic_id); static void release_aps(void *dummy); static int hlt_logical_cpus; static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ static cpumask_t hyperthreading_cpus_mask; static int hyperthreading_allowed = 1; static struct sysctl_ctx_list logical_cpu_clist; static void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } static void topo_probe_amd(void) { /* AMD processors do not support HTT. */ cpu_cores = (amd_feature2 & AMDID2_CMP) != 0 ? (cpu_procinfo2 & AMDID_CMP_CORES) + 1 : 1; cpu_logical = 1; } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } static void topo_probe_0x4(void) { u_int p[4]; int pkg_id_bits; int core_id_bits; int max_cores; int max_logical; int id; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; /* * Because of uniformity assumption we examine only * those logical processors that belong to the same * package as BSP. Further, we count number of * logical processors that belong to the same core * as BSP thus deducing number of threads per core. */ cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; core_id_bits = mask_width(max_logical/max_cores); if (core_id_bits < 0) return; pkg_id_bits = core_id_bits + mask_width(max_cores); for (id = 0; id <= MAX_APIC_ID; id++) { /* Check logical CPU availability. */ if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) continue; /* Check if logical CPU has the same package ID. 
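(The mask_width() helper above is the core of the CPUID-leaf-4 topology math: it rounds its argument up to a power of two and takes log2, i.e. it computes how many low APIC-ID bits a field of x distinct values occupies. A standalone check of the trick, where my_fls() is a portable stand-in for the kernel's fls(9):)

#include <stdio.h>

#define powerof2(x) ((((x) - 1) & (x)) == 0)

static int
my_fls(unsigned int x)	/* index of the most significant set bit, 1-based */
{
	return (x == 0 ? 0 : 32 - __builtin_clz(x));
}

/*
 * If x is already a power of two it is not shifted and we get log2(x);
 * otherwise x << 1 pushes the MSB one position up, which is equivalent
 * to rounding x up to the next power of two before taking the log.
 */
static int
mask_width(unsigned int x)
{
	return (my_fls(x << (1 - powerof2(x))) - 1);
}

int
main(void)
{
	printf("%d %d %d %d\n", mask_width(1), mask_width(2),
	    mask_width(3), mask_width(8));	/* expect: 0 1 2 3 */
	return (0);
}

The shift by (1 - powerof2(x)) is the entire rounding step; mask_width(0) falls out as -1, which the caller treats as "no field".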
*/ if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) continue; cpu_cores++; /* Check if logical CPU has the same package and core IDs. */ if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) cpu_logical++; } cpu_cores /= cpu_logical; hyperthreading_cpus = cpu_logical; } static void topo_probe_0xb(void) { u_int p[4]; int bits; int cnt; int i; int logical; int type; int x; /* We only support three levels for now. */ for (i = 0; i < 3; i++) { cpuid_count(0x0b, i, p); /* Fall back if CPU leaf 11 doesn't really exist. */ if (i == 0 && p[1] == 0) { topo_probe_0x4(); return; } bits = p[0] & 0x1f; logical = p[1] &= 0xffff; type = (p[2] >> 8) & 0xff; if (type == 0 || logical == 0) break; /* * Because of the uniformity assumption we examine only * those logical processors that belong to the same * package as the BSP. */ for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { if (!cpu_info[x].cpu_present || cpu_info[x].cpu_disabled) continue; if (x >> bits == boot_cpu_id >> bits) cnt++; } if (type == CPUID_TYPE_SMT) cpu_logical = cnt; else if (type == CPUID_TYPE_CORE) cpu_cores = cnt; } if (cpu_logical == 0) cpu_logical = 1; cpu_cores /= cpu_logical; } /* * Both the topology discovery code and the code that consumes topology * information assume top-down uniformity of the topology. * That is, all physical packages must be identical and each * core in a package must have the same number of threads. * Topology information is queried only on the BSP, on which this * code runs and for which it can query CPUID information. * The topology is then extrapolated to all packages using the * uniformity assumption. */ static void topo_probe(void) { static int cpu_topo_probed = 0; if (cpu_topo_probed) return; logical_cpus_mask = 0; if (cpu_vendor_id == CPU_VENDOR_AMD) topo_probe_amd(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) { /* * See the Intel(R) 64 Architecture Processor * Topology Enumeration article for details. * * Note that the 0x1 <= cpu_high < 4 case should be * compatible with topo_probe_0x4() logic when * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) * or it should trigger the fallback otherwise. */ if (cpu_high >= 0xb) topo_probe_0xb(); else if (cpu_high >= 0x1) topo_probe_0x4(); } /* * Fallback: assume each logical CPU is in a separate * physical package. That is, no multi-core, no SMT. */ if (cpu_cores == 0) cpu_cores = 1; if (cpu_logical == 0) cpu_logical = 1; cpu_topo_probed = 1; } struct cpu_group * cpu_topo(void) { int cg_flags; /* * Determine whether any threading flags are * necessary. */ topo_probe(); if (cpu_logical > 1 && hyperthreading_cpus) cg_flags = CG_FLAG_HTT; else if (cpu_logical > 1) cg_flags = CG_FLAG_SMT; else cg_flags = 0; if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { printf("WARNING: Non-uniform processors.\n"); printf("WARNING: Using suboptimal topology.\n"); return (smp_topo_none()); } /* * Neither multi-core nor hyper-threaded. */ if (cpu_logical * cpu_cores == 1) return (smp_topo_none()); /* * Only HTT, no multi-core. */ if (cpu_logical > 1 && cpu_cores == 1) return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); /* * Only multi-core, no HTT. */ if (cpu_cores > 1 && cpu_logical == 1) return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); /* * Both HTT and multi-core. */ return (smp_topo_2level(CG_SHARE_L2, cpu_cores, CG_SHARE_L1, cpu_logical, cg_flags)); } /* * Calculate a usable address in base memory for the AP trampoline code. 
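(Both probe paths above reduce to the same comparison: shift an APIC ID right by the per-core or per-package field width and compare it with the shifted BSP ID. A sketch with assumed widths, 1 SMT bit and 3 package-offset bits, i.e. 4 cores x 2 threads per package:)

#include <stdio.h>

int
main(void)
{
	int core_id_bits = 1;	/* assumed: 2 threads per core */
	int pkg_id_bits = 3;	/* assumed: 4 cores x 2 threads per package */
	int boot_id = 0;	/* the BSP's APIC ID */

	for (int id = 0; id < 16; id++) {
		int same_pkg = (id >> pkg_id_bits) == (boot_id >> pkg_id_bits);
		int same_core = (id >> core_id_bits) == (boot_id >> core_id_bits);
		printf("apic id %2d: %s%s\n", id,
		    same_pkg ? "BSP package" : "other package",
		    same_core ? ", BSP core" : "");
	}
	return (0);
}

Counting how many present, enabled IDs land in the "BSP package" and "BSP core" buckets is exactly how cpu_cores and cpu_logical are derived, after which the uniformity assumption extends the counts to every other package.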
*/ u_int mp_bootaddress(u_int basemem) { boot_address = trunc_page(basemem); /* round down to 4k boundary */ if ((basemem - boot_address) < bootMP_size) boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ return boot_address; } void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) mp_ncpus++; if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { mp_maxid = MAXCPU - 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ all_cpus = 1; if (mp_ncpus == 0) { /* * No CPUs were found, so this must be a UP system. Setup * the variables to represent a system with a single CPU * with an id of 0. */ mp_ncpus = 1; return (0); } /* At least one CPU was found. */ if (mp_ncpus == 1) { /* * One CPU was found, so this must be a UP system with * an I/O APIC. */ return (0); } /* At least two CPUs were found. */ return (1); } /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; cpu_ipi_pending[i] = 0; } /* Install an inter-CPU IPI for TLB invalidation */ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for cache invalidation. */ setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for lazy pmap release */ setidt(IPI_LAZYPMAP, IDTVEC(lazypmap), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); set_interrupt_apic_ids(); } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { const char *hyperthread; int i; printf("FreeBSD/SMP: %d package(s) x %d core(s)", mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); if (hyperthreading_cpus > 1) printf(" x %d HTT threads", cpu_logical); else if (cpu_logical > 1) printf(" x %d SMT threads", cpu_logical); printf("\n"); /* List active CPUs first. 
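(The placement rule in mp_bootaddress() above is small enough to check in isolation: round the top of base memory down to a page boundary, then back off one more page if the trampoline would not fit in the slack. A userland sketch with assumed sizes, treating basemem as a byte count:)

#include <stdio.h>

#define PAGE_SIZE	4096u
#define trunc_page(x)	((x) & ~(PAGE_SIZE - 1))

int
main(void)
{
	unsigned basemem = 639 * 1024u;	/* assumed: 639KB of base memory */
	unsigned bootMP_size = 600;	/* assumed trampoline size in bytes */
	unsigned boot_address = trunc_page(basemem);

	if (basemem - boot_address < bootMP_size)
		boot_address -= PAGE_SIZE;	/* not enough slack, go down a page */
	printf("trampoline at %#x\n", boot_address);
	return (0);
}

The page alignment matters later: the STARTUP IPI can only express the trampoline location as a page number below 1MB.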
*/ printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); for (i = 1; i < mp_ncpus; i++) { if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, cpu_apic_ids[i]); } /* List disabled CPUs last. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) continue; if (cpu_info[i].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, i); } } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; vm_offset_t addr; int gsel_tss; int x, myid; u_int cr0; /* bootAP is set in start_ap() to our ID. */ myid = bootAP; /* Get per-cpu data */ pc = &__pcpu[myid]; /* prime data page for it to use */ pcpu_init(pc, myid, sizeof(struct pcpu)); dpcpu_init(dpcpu, myid); pc->pc_apic_id = cpu_apic_ids[myid]; pc->pc_prvspace = pc; pc->pc_curthread = 0; gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) { ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) &gdt[myid * NGDT]; lgdt(&r_gdt); /* does magic intra-segment return */ lidt(&r_idt); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); CHECK_WRITE(0x38, 5); /* Disable local APIC just to be sure. */ lapic_disable(); /* signal our startup to the BSP. */ mp_naps++; CHECK_WRITE(0x39, 6); /* Spin until the BSP releases the AP's. */ while (!aps_ready) ia32_pause(); /* BSP may have changed PTD while we were waiting */ invltlb(); for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) invlpg(addr); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif /* Initialize the PAT MSR if present. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up FPU state on the AP */ npxinit(); /* set up SSE registers */ enable_sse(); #ifdef PAE /* Enable the PTE no-execute bit. */ if ((amd_feature & AMDID_NX) != 0) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); } #endif /* A quick check from sanity claus */ if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mca_init(); mtx_lock_spin(&ap_boot_mtx); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid)); printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); /* Determine if we are a logical CPU. 
*/ /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) logical_cpus_mask |= PCPU_GET(cpumask); /* Determine if we are a hyperthread. */ if (hyperthreading_cpus > 1 && PCPU_GET(apic_id) % hyperthreading_cpus != 0) hyperthreading_cpus_mask |= PCPU_GET(cpumask); /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); smp_active = 1; /* historic */ } mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (smp_started == 0) ia32_pause(); /* Start per-CPU event timers. */ cpu_initclocks_ap(); /* Enter the scheduler. */ sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } /******************************************************************* * local functions and data */ /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ static void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (hyperthreading_cpus > 1 && apic_id % hyperthreading_cpus != 0) continue; intr_add_cpu(i); } } /* * Assign logical CPU IDs to local APICs. */ static void assign_cpu_ids(void) { u_int i; TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", &hyperthreading_allowed); /* Check for explicitly disabled CPUs. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) continue; if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { cpu_info[i].cpu_hyperthread = 1; #if defined(SCHED_ULE) /* * Don't use HT CPU if it has been disabled by a * tunable. */ if (hyperthreading_allowed == 0) { cpu_info[i].cpu_disabled = 1; continue; } #endif } /* Don't use this CPU if it has been disabled by a tunable. */ if (resource_disabled("lapic", i)) { cpu_info[i].cpu_disabled = 1; continue; } } /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. * * To minimize confusion for userland, we attempt to number * CPUs such that all threads and cores in a package are * grouped together. For now we assume that the BSP is always * the first thread in a package and just start adding APs * starting with the BSP's APIC ID. */ mp_ncpus = 1; cpu_apic_ids[0] = boot_cpu_id; apic_cpuids[boot_cpu_id] = 0; for (i = boot_cpu_id + 1; i != boot_cpu_id; i == MAX_APIC_ID ? 
i = 0 : i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || cpu_info[i].cpu_disabled) continue; if (mp_ncpus < MAXCPU) { cpu_apic_ids[mp_ncpus] = i; apic_cpuids[i] = mp_ncpus; mp_ncpus++; } else cpu_info[i].cpu_disabled = 1; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } /* * start each AP in our list */ /* Lowest 1MB is already mapped: don't touch */ #define TMPMAP_START 1 static int start_all_aps(void) { #ifndef PC98 u_char mpbiosreason; #endif uintptr_t kptbase; u_int32_t mpbioswarmvec; int apic_id, cpu, i; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* install the AP 1st level boot code */ install_ap_tramp(); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); #endif /* set up temporary P==V mapping for AP boot */ /* XXX this is a hack, we should boot the AP on its own stack/PTD */ kptbase = (uintptr_t)(void *)KPTphys; for (i = TMPMAP_START; i < NKPT; i++) PTD[i] = (pd_entry_t)(PG_V | PG_RW | ((kptbase + i * PAGE_SIZE) & PG_FRAME)); invltlb(); /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; /* allocate and set up a boot stack data page */ bootstacks[cpu] = (char *)kmem_alloc(kernel_map, KSTACK_PAGES * PAGE_SIZE); dpcpu = (void *)kmem_alloc(kernel_map, DPCPU_SIZE); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ #endif bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 4; bootAP = cpu; /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(apic_id)) { printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); CHECK_PRINT("trace"); /* show checkpoints */ /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } CHECK_PRINT("trace"); /* show checkpoints */ all_cpus |= (1 << cpu); /* record AP in CPU map */ } /* build our map of 'other' CPUs */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); #endif /* Undo V==P hack from above */ for (i = TMPMAP_START; i < NKPT; i++) PTD[i] = 0; pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); /* number of APs actually started */ return mp_naps; } /* * load the 1st level AP boot code into base memory. */ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(void) { int x; int size = *(int *) ((u_long) & bootMP_size); vm_offset_t va = boot_address + KERNBASE; u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) va; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; KASSERT(size <= PAGE_SIZE, ("'size' does not fit into PAGE_SIZE, as expected.")); pmap_kenter(va, boot_address); pmap_invalidate_page(kernel_pmap, va); for (x = 0; x < size; ++x) *dst++ = *src++; /* * Modify addresses in the code we just moved to basemem. Unfortunately * we need fairly detailed info about mpboot.s for this to work. Changes * to mpboot.s might require changes here. 
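(The warm-start vector programming in start_all_aps() relies on real-mode address arithmetic: after a warm reset, the BIOS sees CMOS shutdown code 0x0a, fetches an offset:segment pair from the BIOS data area at 40:67, and jumps there, so the page-aligned trampoline address is encoded as segment = addr >> 4, offset = 0. A sketch of the round trip, with an assumed trampoline address:)

#include <stdio.h>

int
main(void)
{
	unsigned boot_address = 0x9f000;	/* assumed trampoline page */
	unsigned short warmboot_off = 0;	/* WARMBOOT_TARGET */
	unsigned short warmboot_seg = boot_address >> 4;

	/* The real-mode CS:IP resolves back to the physical address. */
	printf("vector %04x:%04x -> phys %#x\n", warmboot_seg, warmboot_off,
	    (warmboot_seg << 4) + warmboot_off);
	return (0);
}

Saving and restoring mpbioswarmvec and mpbiosreason around the AP bring-up, as the function does, keeps a later real warm boot from landing in the (by then reclaimed) trampoline.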
*/ /* boot code is located in KERNEL space */ dst = (u_char *) va; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_address + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); *dst32 = ((u_int) MPentry - KERNBASE); /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; } /* * This function starts the AP (application processor) identified * by APIC ID 'apic_id'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; /* * first we do an INIT/RESET IPI. This INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), the CPU waiting for a STARTUP IPI. OR this INIT IPI might be * ignored. */ /* do an INIT IPI: assert RESET */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); /* wait for pending status end */ lapic_ipi_wait(-1); /* do an INIT IPI: deassert RESET */ lapic_ipi_raw(APIC_DEST_ALLESELF | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, 0); /* wait for pending status end */ DELAY(10000); /* wait ~10ms */ lapic_ipi_wait(-1); /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched (P5 bug); this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run, and this STARTUP IPI will * run. OR the previous INIT IPI was ignored, and this STARTUP IPI * will run. */ /* do a STARTUP IPI */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); lapic_ipi_wait(-1); DELAY(200); /* wait ~200us */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); lapic_ipi_wait(-1); DELAY(200); /* wait ~200us */ /* Wait up to 5 seconds for it to start. 
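(The ICR payloads used by start_ap() can be reconstructed from the apicreg.h field values, the same constants that appear in the deleted header further down. This sketch leaves out the destination-shorthand bits of the broadcast deassert step and only shows the field packing; the key point is that the SIPI vector is simply the trampoline's page number, which is why the boot code must sit on a 4KB boundary below 1MB:)

#include <stdio.h>

/* Values taken from apicreg.h. */
#define APIC_DELMODE_INIT	0x00000500
#define APIC_DELMODE_STARTUP	0x00000600
#define APIC_LEVEL_ASSERT	0x00004000
#define APIC_TRIGMOD_LEVEL	0x00008000

int
main(void)
{
	unsigned boot_address = 0x9f000;		/* assumed */
	unsigned vector = (boot_address >> 12) & 0xff;	/* SIPI vector = page # */

	printf("INIT assert:   icr_lo = %#010x\n",
	    APIC_DELMODE_INIT | APIC_LEVEL_ASSERT);
	printf("INIT deassert: icr_lo = %#010x\n",
	    APIC_DELMODE_INIT | APIC_TRIGMOD_LEVEL);
	printf("SIPI:          icr_lo = %#010x (vector %#x)\n",
	    APIC_DELMODE_STARTUP | vector, vector);
	return (0);
}

The AP then begins fetching in real mode at vector << 12, i.e. at the trampoline itself.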
*/ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); u_int ipi_masked_global; u_int ipi_masked_page; u_int ipi_masked_range; u_int ipi_masked_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, &ipi_masked_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, &ipi_masked_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, &ipi_masked_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, &ipi_masked_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Flush the TLB on all other CPU's */ static void smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) { u_int ncpu; ncpu = mp_ncpus - 1; /* does not shootdown self */ if (ncpu < 1) return; /* no other cpus */ if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); ipi_all_but_self(vector); while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } static void smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) { int ncpu, othercpus; othercpus = mp_ncpus - 1; if (mask == (u_int)-1) { ncpu = othercpus; if (ncpu < 1) return; } else { mask &= ~PCPU_GET(cpumask); if (mask == 0) return; ncpu = bitcount32(mask); if (ncpu > othercpus) { /* XXX this should be a panic offence */ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", ncpu, othercpus); ncpu = othercpus; } /* XXX should be a panic, implied by mask == 0 above */ if (ncpu < 1) return; } if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); if (mask == (u_int)-1) ipi_all_but_self(vector); else ipi_selected(mask, vector); while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } /* * Send an IPI to specified CPU handling the bitmap logic. 
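(In smp_targeted_tlb_shootdown() the acknowledgement count is derived purely from the mask: strip the caller's own bit, then popcount, and that is how many increments of smp_tlb_wait the initiator must observe before releasing the spin lock. A minimal sketch, where __builtin_popcount() stands in for the kernel's bitcount32():)

#include <stdio.h>

typedef unsigned int cpumask_t;

int
main(void)
{
	cpumask_t self = 1u << 2;	/* assumed: initiator is CPU 2 */
	cpumask_t mask = 0x0000000f;	/* request: invalidate on CPUs 0-3 */

	mask &= ~self;			/* never IPI ourselves */
	int ncpu = __builtin_popcount(mask);
	printf("mask = %#x, wait for %d acks\n", mask, ncpu);
	return (0);
}

The all-CPUs case short-circuits this: mask == (u_int)-1 uses ipi_all_but_self() and waits for mp_ncpus - 1 acknowledgements instead.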
*/ static void ipi_send_cpu(int cpu, u_int ipi) { u_int bitmap, old_pending, new_pending; KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (old_pending) return; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } void smp_cache_flush(void) { if (smp_started) smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); } void smp_invltlb(void) { if (smp_started) { smp_tlb_shootdown(IPI_INVLTLB, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_global++; #endif } } void smp_invlpg(vm_offset_t addr) { if (smp_started) { smp_tlb_shootdown(IPI_INVLPG, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif } } void smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void smp_masked_invltlb(cpumask_t mask) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_global++; #endif } } void smp_masked_invlpg(cpumask_t mask, vm_offset_t addr) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_page++; #endif } } void smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_masked_range++; ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpumask_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) atomic_set_int(&ipi_nmi_pending, cpus); CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); while ((cpu = ffs(cpus)) != 0) { cpu--; cpus &= ~(1 << cpu); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. 
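(ipi_send_cpu() above coalesces the bitmap-class IPIs: the request bit is OR-ed into the target's cpu_ipi_pending word with a CAS loop, and the single IPI_BITMAP_VECTOR interrupt is raised only when the word was previously empty, so any number of pending requests costs at most one interrupt. A userland model of the same idea with C11 atomics:)

#include <stdio.h>
#include <stdatomic.h>

static _Atomic unsigned cpu_ipi_pending;	/* one word per CPU in the kernel */

static void
send_bitmapped(unsigned ipi_bit)
{
	unsigned old, new;

	old = atomic_load(&cpu_ipi_pending);
	do {
		new = old | (1u << ipi_bit);
		/* On failure, 'old' is reloaded and 'new' recomputed. */
	} while (!atomic_compare_exchange_weak(&cpu_ipi_pending, &old, new));
	if (old == 0)
		printf("send IPI_BITMAP_VECTOR\n");	/* first bit: one real IPI */
	else
		printf("coalesced, bit %u piggybacks\n", ipi_bit);
}

int
main(void)
{
	send_bitmapped(0);	/* e.g. IPI_PREEMPT: raises the vector */
	send_bitmapped(1);	/* e.g. IPI_AST: rides along for free */
	return (0);
}

On the receiving side, ipi_bitmap_handler() drains the whole word with a single atomic_readandclear and dispatches each set bit.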
*/ if (ipi == IPI_STOP_HARD) atomic_set_int(&ipi_nmi_pending, 1 << cpu); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { if (IPI_IS_BITMAPED(ipi)) { ipi_selected(PCPU_GET(other_cpus), ipi); return; } /* * IPI_STOP_HARD maps to an NMI and the trap handler needs a bit * of help to understand what the source is. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus)); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int ipi_nmi_handler() { cpumask_t cpumask; /* * As long as there is not a simple way to know about an NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpumask = PCPU_GET(cpumask); if ((ipi_nmi_pending & cpumask) == 0) return (1); atomic_clear_int(&ipi_nmi_pending, cpumask); cpustop_handler(); return (0); } /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { cpumask_t cpumask; u_int cpu; cpu = PCPU_GET(cpuid); cpumask = PCPU_GET(cpumask); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ atomic_set_int(&stopped_cpus, cpumask); /* Wait for restart */ while (!(started_cpus & cpumask)) ia32_pause(); atomic_clear_int(&started_cpus, cpumask); atomic_clear_int(&stopped_cpus, cpumask); if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * This is called once the rest of the system is up and running and we're * ready to let the APs out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); static int sysctl_hlt_cpus(SYSCTL_HANDLER_ARGS) { cpumask_t mask; int error; mask = hlt_cpus_mask; error = sysctl_handle_int(oidp, &mask, 0, req); if (error || !req->newptr) return (error); if (logical_cpus_mask != 0 && (mask & logical_cpus_mask) == logical_cpus_mask) hlt_logical_cpus = 1; else hlt_logical_cpus = 0; if (! hyperthreading_allowed) mask |= hyperthreading_cpus_mask; if ((mask & all_cpus) == all_cpus) mask &= ~(1<<0); hlt_cpus_mask = mask; return (error); } SYSCTL_PROC(_machdep, OID_AUTO, hlt_cpus, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_hlt_cpus, "IU", "Bitmap of CPUs to halt. 101 (binary) will halt CPUs 0 and 2."); static int sysctl_hlt_logical_cpus(SYSCTL_HANDLER_ARGS) { int disable, error; disable = hlt_logical_cpus; error = sysctl_handle_int(oidp, &disable, 0, req); if (error || !req->newptr) return (error); if (disable) hlt_cpus_mask |= logical_cpus_mask; else hlt_cpus_mask &= ~logical_cpus_mask; if (! hyperthreading_allowed) hlt_cpus_mask |= hyperthreading_cpus_mask; if ((hlt_cpus_mask & all_cpus) == all_cpus) hlt_cpus_mask &= ~(1<<0); hlt_logical_cpus = disable; return (error); } static int sysctl_hyperthreading_allowed(SYSCTL_HANDLER_ARGS) { int allowed, error; allowed = hyperthreading_allowed; error = sysctl_handle_int(oidp, &allowed, 0, req); if (error || !req->newptr) return (error); #ifdef SCHED_ULE /* * SCHED_ULE doesn't allow enabling/disabling HT cores at * run-time. 
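(The mask fix-ups in sysctl_hlt_cpus() above, repeated in the other two handlers, enforce two invariants: disallowed hyperthreads are always folded into the halt mask, and the mask may never cover every CPU, so CPU 0 is forced back on. Extracted as a pure function for illustration:)

#include <stdio.h>

typedef unsigned int cpumask_t;

static cpumask_t
normalize_hlt_mask(cpumask_t mask, cpumask_t all_cpus, cpumask_t ht_mask,
    int ht_allowed)
{
	if (!ht_allowed)
		mask |= ht_mask;	/* disallowed hyperthreads stay halted */
	if ((mask & all_cpus) == all_cpus)
		mask &= ~(cpumask_t)1;	/* never halt CPU 0 */
	return (mask);
}

int
main(void)
{
	cpumask_t all = 0xf;	/* assumed: 4 CPUs */
	cpumask_t ht = 0xa;	/* assumed: CPUs 1 and 3 are hyperthreads */

	printf("%#x\n", normalize_hlt_mask(0x5, all, ht, 0));	/* -> 0xe */
	printf("%#x\n", normalize_hlt_mask(0xf, all, ht, 1));	/* -> 0xe */
	return (0);
}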
*/ if (allowed != hyperthreading_allowed) return (ENOTSUP); return (error); #endif if (allowed) hlt_cpus_mask &= ~hyperthreading_cpus_mask; else hlt_cpus_mask |= hyperthreading_cpus_mask; if (logical_cpus_mask != 0 && (hlt_cpus_mask & logical_cpus_mask) == logical_cpus_mask) hlt_logical_cpus = 1; else hlt_logical_cpus = 0; if ((hlt_cpus_mask & all_cpus) == all_cpus) hlt_cpus_mask &= ~(1<<0); hyperthreading_allowed = allowed; return (error); } static void cpu_hlt_setup(void *dummy __unused) { if (logical_cpus_mask != 0) { TUNABLE_INT_FETCH("machdep.hlt_logical_cpus", &hlt_logical_cpus); sysctl_ctx_init(&logical_cpu_clist); SYSCTL_ADD_PROC(&logical_cpu_clist, SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, "hlt_logical_cpus", CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_hlt_logical_cpus, "IU", ""); SYSCTL_ADD_UINT(&logical_cpu_clist, SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, "logical_cpus_mask", CTLTYPE_INT|CTLFLAG_RD, &logical_cpus_mask, 0, ""); if (hlt_logical_cpus) hlt_cpus_mask |= logical_cpus_mask; /* * If necessary for security purposes, force * hyperthreading off, regardless of the value * of hlt_logical_cpus. */ if (hyperthreading_cpus_mask) { SYSCTL_ADD_PROC(&logical_cpu_clist, SYSCTL_STATIC_CHILDREN(_machdep), OID_AUTO, "hyperthreading_allowed", CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_hyperthreading_allowed, "IU", ""); if (! hyperthreading_allowed) hlt_cpus_mask |= hyperthreading_cpus_mask; } } } SYSINIT(cpu_hlt, SI_SUB_SMP, SI_ORDER_ANY, cpu_hlt_setup, NULL); int mp_grab_cpu_hlt(void) { cpumask_t mask; #ifdef MP_WATCHDOG u_int cpuid; #endif int retval; mask = PCPU_GET(cpumask); #ifdef MP_WATCHDOG cpuid = PCPU_GET(cpuid); ap_watchdog(cpuid); #endif retval = 0; while (mask & hlt_cpus_mask) { retval = 1; __asm __volatile("sti; hlt" : : : "memory"); } return (retval); } #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. */ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:lazypmap", i); intrcnt_add(buf, &ipi_lazypmap_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif diff --git a/sys/i386/i386/mp_watchdog.c b/sys/i386/i386/mp_watchdog.c index 1803270b573f..5cbd649f7e60 100644 --- a/sys/i386/i386/mp_watchdog.c +++ b/sys/i386/i386/mp_watchdog.c @@ -1,211 +1,211 @@ /*- * Copyright (c) 2004 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_mp_watchdog.h" #include "opt_sched.h" #ifdef SCHED_ULE #error MP_WATCHDOG cannot currently be used with SCHED_ULE #endif #include #include #include #include #include #include #include #include #include #include -#include <machine/apicreg.h> +#include <x86/apicreg.h> #include #include /* * mp_watchdog hijacks the idle thread on a specified CPU, prevents new work * from being scheduled there, and uses it as a "watchdog" to detect kernel * failure on other CPUs. This is made reasonable by the inclusion of logical * processors in Xeon hardware. The watchdog is configured by setting the * debug.watchdog sysctl/tunable to the CPU of interest. A callout will then * begin executing, resetting a timer that is gradually lowered by the watching * thread. If the timer reaches 0, the watchdog fires by either dropping * directly to the debugger, or by sending an NMI IPI to the boot processor. * This is a somewhat less efficient substitute for dedicated watchdog * hardware, but can be quite an effective tool for debugging hangs. * * XXXRW: This should really use the watchdog(9)/watchdog(4) framework, but * doesn't yet. */ static int watchdog_cpu = -1; static int watchdog_dontfire = 1; static int watchdog_timer = -1; static int watchdog_nmi = 1; TUNABLE_INT("debug.watchdog", &watchdog_cpu); SYSCTL_INT(_debug, OID_AUTO, watchdog_nmi, CTLFLAG_RW, &watchdog_nmi, 0, "IPI the boot processor with an NMI to enter the debugger"); static struct callout watchdog_callout; static void watchdog_change(int wdcpu); /* * Number of seconds before the watchdog will fire if the callout fails to * reset the timer. */ #define WATCHDOG_THRESHOLD 10 static void watchdog_init(void *arg) { callout_init(&watchdog_callout, CALLOUT_MPSAFE); if (watchdog_cpu != -1) watchdog_change(watchdog_cpu); } /* * This callout resets a timer until the watchdog kicks in. It acquires some * critical locks to make sure things haven't gotten wedged with those locks * held. */ static void watchdog_function(void *arg) { /* * Since the timer ran, we must not be wedged. Acquire some critical * locks to make sure. Then reset the timer. */ mtx_lock(&Giant); watchdog_timer = WATCHDOG_THRESHOLD; mtx_unlock(&Giant); callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL); } SYSINIT(watchdog_init, SI_SUB_DRIVERS, SI_ORDER_ANY, watchdog_init, NULL); static void watchdog_change(int wdcpu) { if (wdcpu == -1 || wdcpu == 0xffffffff) { /* * Disable the watchdog. */ watchdog_cpu = -1; watchdog_dontfire = 1; callout_stop(&watchdog_callout); printf("watchdog stopped\n"); } else { watchdog_timer = WATCHDOG_THRESHOLD; watchdog_dontfire = 0; watchdog_cpu = wdcpu; callout_reset(&watchdog_callout, 1 * hz, watchdog_function, NULL); } } /* * This sysctl sets which CPU is the watchdog CPU. 
Set to -1 or 0xffffffff * to disable the watchdog. */ static int sysctl_watchdog(SYSCTL_HANDLER_ARGS) { int error, temp; temp = watchdog_cpu; error = sysctl_handle_int(oidp, &temp, 0, req); if (error) return (error); if (req->newptr != NULL) watchdog_change(temp); return (0); } SYSCTL_PROC(_debug, OID_AUTO, watchdog, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_watchdog, "I", ""); /* * Drop into the debugger by sending an IPI NMI to the boot processor. */ static void watchdog_ipi_nmi(void) { /* * Deliver NMI to the boot processor. Why not? */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_NMI, boot_cpu_id); lapic_ipi_wait(-1); } /* * ap_watchdog() is called by the SMP idle loop code. It works on the same * premise that the disabling of logical processors does: that if the cpu is * idle, then it can ignore the world from then on, as nothing will be * scheduled on it. Leaving aside multi-runqueue schedulers (SCHED_ULE) and * explicit process migration (sched_bind()), this is not an unreasonable * assumption. */ void ap_watchdog(u_int cpuid) { char old_pcomm[MAXCOMLEN + 1]; struct proc *p; if (watchdog_cpu != cpuid) return; printf("watchdog started on cpu %d\n", cpuid); p = curproc; bcopy(p->p_comm, old_pcomm, MAXCOMLEN + 1); snprintf(p->p_comm, MAXCOMLEN + 1, "mp_watchdog cpu %d", cpuid); while (1) { DELAY(1000000); /* One second. */ if (watchdog_cpu != cpuid) break; atomic_subtract_int(&watchdog_timer, 1); if (watchdog_timer < 4) printf("Watchdog timer: %d\n", watchdog_timer); if (watchdog_timer == 0 && watchdog_dontfire == 0) { printf("Watchdog firing!\n"); watchdog_dontfire = 1; if (watchdog_nmi) watchdog_ipi_nmi(); else kdb_enter(KDB_WHY_WATCHDOG, "mp_watchdog"); } } bcopy(old_pcomm, p->p_comm, MAXCOMLEN + 1); printf("watchdog stopped on cpu %d\n", cpuid); } diff --git a/sys/i386/i386/mpboot.s b/sys/i386/i386/mpboot.s index 88708589048f..a3ef283023d6 100644 --- a/sys/i386/i386/mpboot.s +++ b/sys/i386/i386/mpboot.s @@ -1,279 +1,279 @@ /*- * Copyright (c) 1995 Jack F. Vogel * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * mpboot.s: FreeBSD machine support for the Intel MP Spec * multiprocessor systems. 
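(Taken together, watchdog_function() and ap_watchdog() implement a two-party protocol: the callout side keeps re-arming watchdog_timer at WATCHDOG_THRESHOLD, which it can only do while Giant is obtainable, while the hijacked idle loop decrements the timer once per second and fires when it reaches zero. A compressed single-threaded model of that countdown, with the hang injected at an arbitrary point:)

#include <stdio.h>

#define WATCHDOG_THRESHOLD 10

int
main(void)
{
	int timer = WATCHDOG_THRESHOLD;
	int callout_alive = 1;

	for (int second = 0; second < 20; second++) {
		if (second == 3)
			callout_alive = 0;	/* the rest of the kernel wedges */
		if (callout_alive)
			timer = WATCHDOG_THRESHOLD;	/* watchdog_function() ran */
		if (--timer == 0) {
			printf("watchdog fires at t=%d (NMI or kdb_enter)\n",
			    second);
			break;
		}
	}
	return (0);
}

The NMI path reuses the lapic_ipi_raw() machinery shown earlier, just with APIC_DELMODE_NMI aimed at the BSP.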
* * $FreeBSD$ */ #include "opt_pmap.h" #include /* miscellaneous asm macros */ -#include +#include #include #include "assym.s" #define R(x) ((x)-KERNBASE) /* * this code MUST be enabled here and in mp_machdep.c * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) && !defined(PC98) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define CHECKPOINT(A,D) \ movb $(A),%al ; \ outb %al,$CMOS_REG ; \ movb $(D),%al ; \ outb %al,$CMOS_DATA #else #define CHECKPOINT(A,D) #endif /* CHECK_POINTS */ /* * the APs enter here from their trampoline code (bootMP, below) */ .p2align 4 NON_GPROF_ENTRY(MPentry) CHECKPOINT(0x36, 3) /* * Enable features on this processor. We don't support SMP on * CPUs older than a Pentium, so we know that we can use the cpuid * instruction. */ movl $1,%eax cpuid /* Retrieve features */ movl %cr4,%eax #ifndef DISABLE_PSE testl $CPUID_PSE,%edx jz 1f orl $CR4_PSE,%eax /* Enable PSE */ 1: #endif #ifndef DISABLE_PG_G testl $CPUID_PGE,%edx jz 1f orl $CR4_PGE,%eax /* Enable PGE */ 1: #endif testl $CPUID_VME,%edx jz 1f orl $CR4_VME,%eax /* Enable VME */ 1: movl %eax,%cr4 /* Now enable paging mode */ #ifdef PAE movl R(IdlePDPT), %eax movl %eax, %cr3 movl %cr4, %eax orl $CR4_PAE, %eax movl %eax, %cr4 #else movl R(IdlePTD), %eax movl %eax,%cr3 #endif movl %cr0,%eax orl $CR0_PE|CR0_PG,%eax /* enable paging */ movl %eax,%cr0 /* let the games begin! */ movl bootSTK,%esp /* boot stack end loc. */ pushl $mp_begin /* jump to high mem */ ret /* * Wait for the booting CPU to signal startup */ mp_begin: /* now running relocated at KERNBASE */ CHECKPOINT(0x37, 4) call init_secondary /* load i386 tables */ /* * This is the embedded trampoline or bootstrap that is * copied into 'real-mode' low memory, it is where the * secondary processor "wakes up". When it is executed * the processor will eventually jump into the routine * MPentry, which resides in normal kernel text above * 1Meg. -jackv */ .data ALIGN_DATA /* just to be sure */ BOOTMP1: NON_GPROF_ENTRY(bootMP) .code16 cli CHECKPOINT(0x34, 1) /* First guarantee a 'clean slate' */ xorl %eax, %eax movl %eax, %ebx movl %eax, %ecx movl %eax, %edx movl %eax, %esi movl %eax, %edi /* set up data segments */ mov %cs, %ax mov %ax, %ds mov %ax, %es mov %ax, %fs mov %ax, %gs mov %ax, %ss mov $(boot_stk-bootMP), %esp /* Now load the global descriptor table */ lgdt MP_GDTptr-bootMP /* Enable protected mode */ movl %cr0, %eax orl $CR0_PE, %eax movl %eax, %cr0 /* * make intrasegment jump to flush the processor pipeline and * reload CS register */ pushl $0x18 pushl $(protmode-bootMP) lretl .code32 protmode: CHECKPOINT(0x35, 2) /* * we are NOW running for the first time with %eip * having the full physical address, BUT we still * are using a segment descriptor with the origin * not matching the booting kernel. * * SO NOW... for the BIG Jump into kernel's segment * and physical text above 1 Meg. 
*/ mov $0x10, %ebx movw %bx, %ds movw %bx, %es movw %bx, %fs movw %bx, %gs movw %bx, %ss .globl bigJump bigJump: /* this will be modified by mpInstallTramp() */ ljmp $0x08, $0 /* far jmp to MPentry() */ dead: hlt /* We should never get here */ jmp dead /* * MP boot strap Global Descriptor Table */ .p2align 4 .globl MP_GDT .globl bootCodeSeg .globl bootDataSeg MP_GDT: nulldesc: /* offset = 0x0 */ .word 0x0 .word 0x0 .byte 0x0 .byte 0x0 .byte 0x0 .byte 0x0 kernelcode: /* offset = 0x08 */ .word 0xffff /* segment limit 0..15 */ .word 0x0000 /* segment base 0..15 */ .byte 0x0 /* segment base 16..23; set for 0K */ .byte 0x9f /* flags; Type */ .byte 0xcf /* flags; Limit */ .byte 0x0 /* segment base 24..32 */ kerneldata: /* offset = 0x10 */ .word 0xffff /* segment limit 0..15 */ .word 0x0000 /* segment base 0..15 */ .byte 0x0 /* segment base 16..23; set for 0k */ .byte 0x93 /* flags; Type */ .byte 0xcf /* flags; Limit */ .byte 0x0 /* segment base 24..32 */ bootcode: /* offset = 0x18 */ .word 0xffff /* segment limit 0..15 */ bootCodeSeg: /* this will be modified by mpInstallTramp() */ .word 0x0000 /* segment base 0..15 */ .byte 0x00 /* segment base 16...23; set for 0x000xx000 */ .byte 0x9e /* flags; Type */ .byte 0xcf /* flags; Limit */ .byte 0x0 /*segment base 24..32 */ bootdata: /* offset = 0x20 */ .word 0xffff bootDataSeg: /* this will be modified by mpInstallTramp() */ .word 0x0000 /* segment base 0..15 */ .byte 0x00 /* segment base 16...23; set for 0x000xx000 */ .byte 0x92 .byte 0xcf .byte 0x0 /* * GDT pointer for the lgdt call */ .globl mp_gdtbase MP_GDTptr: mp_gdtlimit: .word 0x0028 mp_gdtbase: /* this will be modified by mpInstallTramp() */ .long 0 .space 0x100 /* space for boot_stk - 1st temporary stack */ boot_stk: BOOTMP2: .globl bootMP_size bootMP_size: .long BOOTMP2 - BOOTMP1 diff --git a/sys/i386/include/apicreg.h b/sys/i386/include/apicreg.h deleted file mode 100644 index fee629bb2c57..000000000000 --- a/sys/i386/include/apicreg.h +++ /dev/null @@ -1,445 +0,0 @@ -/*- - * Copyright (c) 1996, by Peter Wemm and Steve Passe - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. The name of the developer may NOT be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD$ - */ - -#ifndef _MACHINE_APICREG_H_ -#define _MACHINE_APICREG_H_ - -/* - * Local && I/O APIC definitions. 
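(The bootCodeSeg/bootDataSeg words in the trampoline GDT above are exactly the descriptor base fields that install_ap_tramp() patches: bits 0..15 of the physical trampoline address go into the 16-bit base field and bits 16..23 into the byte that follows it. A sketch of the split and of how the CPU reassembles the base when the descriptor is loaded, using an assumed address:)

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t boot_address = 0x9f000;	/* assumed trampoline page */
	uint8_t desc[8] = { 0xff, 0xff, 0, 0, 0, 0x9e, 0xcf, 0 };

	/* What install_ap_tramp() does through its dst16/dst8 pointers: */
	desc[2] = boot_address & 0xff;		/* base bits 0..7 */
	desc[3] = (boot_address >> 8) & 0xff;	/* base bits 8..15 */
	desc[4] = (boot_address >> 16) & 0xff;	/* base bits 16..23 */

	/* Reassemble the base the way a descriptor load does. */
	uint32_t base = desc[2] | (desc[3] << 8) | (desc[4] << 16) |
	    ((uint32_t)desc[7] << 24);
	printf("descriptor base = %#x\n", base);
	return (0);
}

With that base in place, the offsets inside the copied trampoline are valid regardless of where in base memory it landed, which is the whole point of the patching pass.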
- */ - -/* - * Pentium P54C+ Built-in APIC - * (Advanced programmable Interrupt Controller) - * - * Base Address of Built-in APIC in memory location - * is 0xfee00000. - * - * Map of APIC Registers: - * - * Offset (hex) Description Read/Write state - * 000 Reserved - * 010 Reserved - * 020 ID Local APIC ID R/W - * 030 VER Local APIC Version R - * 040 Reserved - * 050 Reserved - * 060 Reserved - * 070 Reserved - * 080 Task Priority Register R/W - * 090 Arbitration Priority Register R - * 0A0 Processor Priority Register R - * 0B0 EOI Register W - * 0C0 RRR Remote read R - * 0D0 Logical Destination R/W - * 0E0 Destination Format Register 0..27 R; 28..31 R/W - * 0F0 SVR Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W - * 100 ISR 000-031 R - * 110 ISR 032-063 R - * 120 ISR 064-095 R - * 130 ISR 095-128 R - * 140 ISR 128-159 R - * 150 ISR 160-191 R - * 160 ISR 192-223 R - * 170 ISR 224-255 R - * 180 TMR 000-031 R - * 190 TMR 032-063 R - * 1A0 TMR 064-095 R - * 1B0 TMR 095-128 R - * 1C0 TMR 128-159 R - * 1D0 TMR 160-191 R - * 1E0 TMR 192-223 R - * 1F0 TMR 224-255 R - * 200 IRR 000-031 R - * 210 IRR 032-063 R - * 220 IRR 064-095 R - * 230 IRR 095-128 R - * 240 IRR 128-159 R - * 250 IRR 160-191 R - * 260 IRR 192-223 R - * 270 IRR 224-255 R - * 280 Error Status Register R - * 290 Reserved - * 2A0 Reserved - * 2B0 Reserved - * 2C0 Reserved - * 2D0 Reserved - * 2E0 Reserved - * 2F0 Local Vector Table (CMCI) R/W - * 300 ICR_LOW Interrupt Command Reg. (0-31) R/W - * 310 ICR_HI Interrupt Command Reg. (32-63) R/W - * 320 Local Vector Table (Timer) R/W - * 330 Local Vector Table (Thermal) R/W (PIV+) - * 340 Local Vector Table (Performance) R/W (P6+) - * 350 LVT1 Local Vector Table (LINT0) R/W - * 360 LVT2 Local Vector Table (LINT1) R/W - * 370 LVT3 Local Vector Table (ERROR) R/W - * 380 Initial Count Reg. for Timer R/W - * 390 Current Count of Timer R - * 3A0 Reserved - * 3B0 Reserved - * 3C0 Reserved - * 3D0 Reserved - * 3E0 Timer Divide Configuration Reg. R/W - * 3F0 Reserved - */ - - -/****************************************************************************** - * global defines, etc. 
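(The register map being removed here is also what motivates the PAD3/PAD4 fill in the struct LAPIC just below: every architectural register is a single 32-bit word sitting alone on a 16-byte boundary, so three unnamed 32-bit members follow each real field. A quick offset check of the first few registers; unnamed bit-fields of this width are a GCC-style extension in principle, but this is precisely the layout trick the struct relies on:)

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PAD3 int : 32; int : 32; int : 32
#define PAD4 int : 32; int : 32; int : 32; int : 32

/* First few registers only; the full layout is in the hunk below. */
struct lapic_head {
	PAD4;			/* 0x00: reserved */
	PAD4;			/* 0x10: reserved */
	uint32_t id; PAD3;	/* 0x20 */
	uint32_t version; PAD3;	/* 0x30 */
};

int
main(void)
{
	printf("id at %#zx, version at %#zx\n",
	    offsetof(struct lapic_head, id),
	    offsetof(struct lapic_head, version));	/* expect 0x20, 0x30 */
	return (0);
}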
- */ - - -/****************************************************************************** - * LOCAL APIC structure - */ - -#ifndef LOCORE -#include - -#define PAD3 int : 32; int : 32; int : 32 -#define PAD4 int : 32; int : 32; int : 32; int : 32 - -struct LAPIC { - /* reserved */ PAD4; - /* reserved */ PAD4; - u_int32_t id; PAD3; - u_int32_t version; PAD3; - /* reserved */ PAD4; - /* reserved */ PAD4; - /* reserved */ PAD4; - /* reserved */ PAD4; - u_int32_t tpr; PAD3; - u_int32_t apr; PAD3; - u_int32_t ppr; PAD3; - u_int32_t eoi; PAD3; - /* reserved */ PAD4; - u_int32_t ldr; PAD3; - u_int32_t dfr; PAD3; - u_int32_t svr; PAD3; - u_int32_t isr0; PAD3; - u_int32_t isr1; PAD3; - u_int32_t isr2; PAD3; - u_int32_t isr3; PAD3; - u_int32_t isr4; PAD3; - u_int32_t isr5; PAD3; - u_int32_t isr6; PAD3; - u_int32_t isr7; PAD3; - u_int32_t tmr0; PAD3; - u_int32_t tmr1; PAD3; - u_int32_t tmr2; PAD3; - u_int32_t tmr3; PAD3; - u_int32_t tmr4; PAD3; - u_int32_t tmr5; PAD3; - u_int32_t tmr6; PAD3; - u_int32_t tmr7; PAD3; - u_int32_t irr0; PAD3; - u_int32_t irr1; PAD3; - u_int32_t irr2; PAD3; - u_int32_t irr3; PAD3; - u_int32_t irr4; PAD3; - u_int32_t irr5; PAD3; - u_int32_t irr6; PAD3; - u_int32_t irr7; PAD3; - u_int32_t esr; PAD3; - /* reserved */ PAD4; - /* reserved */ PAD4; - /* reserved */ PAD4; - /* reserved */ PAD4; - /* reserved */ PAD4; - /* reserved */ PAD4; - u_int32_t lvt_cmci; PAD3; - u_int32_t icr_lo; PAD3; - u_int32_t icr_hi; PAD3; - u_int32_t lvt_timer; PAD3; - u_int32_t lvt_thermal; PAD3; - u_int32_t lvt_pcint; PAD3; - u_int32_t lvt_lint0; PAD3; - u_int32_t lvt_lint1; PAD3; - u_int32_t lvt_error; PAD3; - u_int32_t icr_timer; PAD3; - u_int32_t ccr_timer; PAD3; - /* reserved */ PAD4; - /* reserved */ PAD4; - /* reserved */ PAD4; - /* reserved */ PAD4; - u_int32_t dcr_timer; PAD3; - /* reserved */ PAD4; -}; - -typedef struct LAPIC lapic_t; - -/****************************************************************************** - * I/O APIC structure - */ - -struct IOAPIC { - u_int32_t ioregsel; PAD3; - u_int32_t iowin; PAD3; -}; - -typedef struct IOAPIC ioapic_t; - -#undef PAD4 -#undef PAD3 - -#endif /* !LOCORE */ - - -/****************************************************************************** - * various code 'logical' values - */ - -/****************************************************************************** - * LOCAL APIC defines - */ - -/* default physical locations of LOCAL (CPU) APICs */ -#define DEFAULT_APIC_BASE 0xfee00000 - -/* constants relating to APIC ID registers */ -#define APIC_ID_MASK 0xff000000 -#define APIC_ID_SHIFT 24 -#define APIC_ID_CLUSTER 0xf0 -#define APIC_ID_CLUSTER_ID 0x0f -#define APIC_MAX_CLUSTER 0xe -#define APIC_MAX_INTRACLUSTER_ID 3 -#define APIC_ID_CLUSTER_SHIFT 4 - -/* fields in VER */ -#define APIC_VER_VERSION 0x000000ff -#define APIC_VER_MAXLVT 0x00ff0000 -#define MAXLVTSHIFT 16 -#define APIC_VER_EOI_SUPPRESSION 0x01000000 - -/* fields in LDR */ -#define APIC_LDR_RESERVED 0x00ffffff - -/* fields in DFR */ -#define APIC_DFR_RESERVED 0x0fffffff -#define APIC_DFR_MODEL_MASK 0xf0000000 -#define APIC_DFR_MODEL_FLAT 0xf0000000 -#define APIC_DFR_MODEL_CLUSTER 0x00000000 - -/* fields in SVR */ -#define APIC_SVR_VECTOR 0x000000ff -#define APIC_SVR_VEC_PROG 0x000000f0 -#define APIC_SVR_VEC_FIX 0x0000000f -#define APIC_SVR_ENABLE 0x00000100 -# define APIC_SVR_SWDIS 0x00000000 -# define APIC_SVR_SWEN 0x00000100 -#define APIC_SVR_FOCUS 0x00000200 -# define APIC_SVR_FEN 0x00000000 -# define APIC_SVR_FDIS 0x00000200 -#define APIC_SVR_EOI_SUPPRESSION 0x00001000 - -/* fields 
in TPR */ -#define APIC_TPR_PRIO 0x000000ff -# define APIC_TPR_INT 0x000000f0 -# define APIC_TPR_SUB 0x0000000f - -/* fields in ESR */ -#define APIC_ESR_SEND_CS_ERROR 0x00000001 -#define APIC_ESR_RECEIVE_CS_ERROR 0x00000002 -#define APIC_ESR_SEND_ACCEPT 0x00000004 -#define APIC_ESR_RECEIVE_ACCEPT 0x00000008 -#define APIC_ESR_SEND_ILLEGAL_VECTOR 0x00000020 -#define APIC_ESR_RECEIVE_ILLEGAL_VECTOR 0x00000040 -#define APIC_ESR_ILLEGAL_REGISTER 0x00000080 - -/* fields in ICR_LOW */ -#define APIC_VECTOR_MASK 0x000000ff - -#define APIC_DELMODE_MASK 0x00000700 -# define APIC_DELMODE_FIXED 0x00000000 -# define APIC_DELMODE_LOWPRIO 0x00000100 -# define APIC_DELMODE_SMI 0x00000200 -# define APIC_DELMODE_RR 0x00000300 -# define APIC_DELMODE_NMI 0x00000400 -# define APIC_DELMODE_INIT 0x00000500 -# define APIC_DELMODE_STARTUP 0x00000600 -# define APIC_DELMODE_RESV 0x00000700 - -#define APIC_DESTMODE_MASK 0x00000800 -# define APIC_DESTMODE_PHY 0x00000000 -# define APIC_DESTMODE_LOG 0x00000800 - -#define APIC_DELSTAT_MASK 0x00001000 -# define APIC_DELSTAT_IDLE 0x00000000 -# define APIC_DELSTAT_PEND 0x00001000 - -#define APIC_RESV1_MASK 0x00002000 - -#define APIC_LEVEL_MASK 0x00004000 -# define APIC_LEVEL_DEASSERT 0x00000000 -# define APIC_LEVEL_ASSERT 0x00004000 - -#define APIC_TRIGMOD_MASK 0x00008000 -# define APIC_TRIGMOD_EDGE 0x00000000 -# define APIC_TRIGMOD_LEVEL 0x00008000 - -#define APIC_RRSTAT_MASK 0x00030000 -# define APIC_RRSTAT_INVALID 0x00000000 -# define APIC_RRSTAT_INPROG 0x00010000 -# define APIC_RRSTAT_VALID 0x00020000 -# define APIC_RRSTAT_RESV 0x00030000 - -#define APIC_DEST_MASK 0x000c0000 -# define APIC_DEST_DESTFLD 0x00000000 -# define APIC_DEST_SELF 0x00040000 -# define APIC_DEST_ALLISELF 0x00080000 -# define APIC_DEST_ALLESELF 0x000c0000 - -#define APIC_RESV2_MASK 0xfff00000 - -#define APIC_ICRLO_RESV_MASK (APIC_RESV1_MASK | APIC_RESV2_MASK) - -/* fields in LVT1/2 */ -#define APIC_LVT_VECTOR 0x000000ff -#define APIC_LVT_DM 0x00000700 -# define APIC_LVT_DM_FIXED 0x00000000 -# define APIC_LVT_DM_SMI 0x00000200 -# define APIC_LVT_DM_NMI 0x00000400 -# define APIC_LVT_DM_INIT 0x00000500 -# define APIC_LVT_DM_EXTINT 0x00000700 -#define APIC_LVT_DS 0x00001000 -#define APIC_LVT_IIPP 0x00002000 -#define APIC_LVT_IIPP_INTALO 0x00002000 -#define APIC_LVT_IIPP_INTAHI 0x00000000 -#define APIC_LVT_RIRR 0x00004000 -#define APIC_LVT_TM 0x00008000 -#define APIC_LVT_M 0x00010000 - - -/* fields in LVT Timer */ -#define APIC_LVTT_VECTOR 0x000000ff -#define APIC_LVTT_DS 0x00001000 -#define APIC_LVTT_M 0x00010000 -#define APIC_LVTT_TM 0x00020000 -# define APIC_LVTT_TM_ONE_SHOT 0x00000000 -# define APIC_LVTT_TM_PERIODIC 0x00020000 - - -/* APIC timer current count */ -#define APIC_TIMER_MAX_COUNT 0xffffffff - -/* fields in TDCR */ -#define APIC_TDCR_2 0x00 -#define APIC_TDCR_4 0x01 -#define APIC_TDCR_8 0x02 -#define APIC_TDCR_16 0x03 -#define APIC_TDCR_32 0x08 -#define APIC_TDCR_64 0x09 -#define APIC_TDCR_128 0x0a -#define APIC_TDCR_1 0x0b - -/****************************************************************************** - * I/O APIC defines - */ - -/* default physical locations of an IO APIC */ -#define DEFAULT_IO_APIC_BASE 0xfec00000 - -/* window register offset */ -#define IOAPIC_WINDOW 0x10 -#define IOAPIC_EOIR 0x40 - -/* indexes into IO APIC */ -#define IOAPIC_ID 0x00 -#define IOAPIC_VER 0x01 -#define IOAPIC_ARB 0x02 -#define IOAPIC_REDTBL 0x10 -#define IOAPIC_REDTBL0 IOAPIC_REDTBL -#define IOAPIC_REDTBL1 (IOAPIC_REDTBL+0x02) -#define IOAPIC_REDTBL2 (IOAPIC_REDTBL+0x04) -#define IOAPIC_REDTBL3 
(IOAPIC_REDTBL+0x06) -#define IOAPIC_REDTBL4 (IOAPIC_REDTBL+0x08) -#define IOAPIC_REDTBL5 (IOAPIC_REDTBL+0x0a) -#define IOAPIC_REDTBL6 (IOAPIC_REDTBL+0x0c) -#define IOAPIC_REDTBL7 (IOAPIC_REDTBL+0x0e) -#define IOAPIC_REDTBL8 (IOAPIC_REDTBL+0x10) -#define IOAPIC_REDTBL9 (IOAPIC_REDTBL+0x12) -#define IOAPIC_REDTBL10 (IOAPIC_REDTBL+0x14) -#define IOAPIC_REDTBL11 (IOAPIC_REDTBL+0x16) -#define IOAPIC_REDTBL12 (IOAPIC_REDTBL+0x18) -#define IOAPIC_REDTBL13 (IOAPIC_REDTBL+0x1a) -#define IOAPIC_REDTBL14 (IOAPIC_REDTBL+0x1c) -#define IOAPIC_REDTBL15 (IOAPIC_REDTBL+0x1e) -#define IOAPIC_REDTBL16 (IOAPIC_REDTBL+0x20) -#define IOAPIC_REDTBL17 (IOAPIC_REDTBL+0x22) -#define IOAPIC_REDTBL18 (IOAPIC_REDTBL+0x24) -#define IOAPIC_REDTBL19 (IOAPIC_REDTBL+0x26) -#define IOAPIC_REDTBL20 (IOAPIC_REDTBL+0x28) -#define IOAPIC_REDTBL21 (IOAPIC_REDTBL+0x2a) -#define IOAPIC_REDTBL22 (IOAPIC_REDTBL+0x2c) -#define IOAPIC_REDTBL23 (IOAPIC_REDTBL+0x2e) - -/* fields in VER */ -#define IOART_VER_VERSION 0x000000ff -#define IOART_VER_MAXREDIR 0x00ff0000 -#define MAXREDIRSHIFT 16 - -/* - * fields in the IO APIC's redirection table entries - */ -#define IOART_DEST APIC_ID_MASK /* broadcast addr: all APICs */ - -#define IOART_RESV 0x00fe0000 /* reserved */ - -#define IOART_INTMASK 0x00010000 /* R/W: INTerrupt mask */ -# define IOART_INTMCLR 0x00000000 /* clear, allow INTs */ -# define IOART_INTMSET 0x00010000 /* set, inhibit INTs */ - -#define IOART_TRGRMOD 0x00008000 /* R/W: trigger mode */ -# define IOART_TRGREDG 0x00000000 /* edge */ -# define IOART_TRGRLVL 0x00008000 /* level */ - -#define IOART_REM_IRR 0x00004000 /* RO: remote IRR */ - -#define IOART_INTPOL 0x00002000 /* R/W: INT input pin polarity */ -# define IOART_INTAHI 0x00000000 /* active high */ -# define IOART_INTALO 0x00002000 /* active low */ - -#define IOART_DELIVS 0x00001000 /* RO: delivery status */ - -#define IOART_DESTMOD 0x00000800 /* R/W: destination mode */ -# define IOART_DESTPHY 0x00000000 /* physical */ -# define IOART_DESTLOG 0x00000800 /* logical */ - -#define IOART_DELMOD 0x00000700 /* R/W: delivery mode */ -# define IOART_DELFIXED 0x00000000 /* fixed */ -# define IOART_DELLOPRI 0x00000100 /* lowest priority */ -# define IOART_DELSMI 0x00000200 /* System Management INT */ -# define IOART_DELRSV1 0x00000300 /* reserved */ -# define IOART_DELNMI 0x00000400 /* NMI signal */ -# define IOART_DELINIT 0x00000500 /* INIT signal */ -# define IOART_DELRSV2 0x00000600 /* reserved */ -# define IOART_DELEXINT 0x00000700 /* External INTerrupt */ - -#define IOART_INTVEC 0x000000ff /* R/W: INTerrupt vector field */ - -#endif /* _MACHINE_APICREG_H_ */ diff --git a/sys/i386/xen/mp_machdep.c b/sys/i386/xen/mp_machdep.c index 43a82ab6fadd..60165cbb3b8a 100644 --- a/sys/i386/xen/mp_machdep.c +++ b/sys/i386/xen/mp_machdep.c @@ -1,1261 +1,1261 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2008, by Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_mp_watchdog.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #if !defined(lint) #if !defined(SMP) #error How did you get here? #endif #ifndef DEV_APIC #error The apic device is required for SMP, add "device apic" to your config file. #endif #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) #error SMP not supported with CPU_DISABLE_CMPXCHG #endif #endif /* not lint */ #include #include #include #include /* cngetc() */ #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include #include #include int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern struct pcpu __pcpu[]; static int bootAP; static union descriptor *bootAPgdt; static char resched_name[NR_CPUS][15]; static char callfunc_name[NR_CPUS][15]; /* Free these after use */ void *bootstacks[MAXCPU]; struct pcb stoppcbs[MAXCPU]; /* Variables needed for SMP tlb shootdown. */ vm_offset_t smp_tlb_addr1; vm_offset_t smp_tlb_addr2; volatile int smp_tlb_wait; typedef void call_data_func_t(uintptr_t , uintptr_t); static u_int logical_cpus; static volatile cpumask_t ipi_nmi_pending; /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ static volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info { int cpu_present:1; int cpu_bsp:1; int cpu_disabled:1; } static cpu_info[MAX_APIC_ID + 1]; int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; /* Holds pending bitmap based IPIs per CPU */ static volatile u_int cpu_ipi_pending[MAXCPU]; static int cpu_logical; static int cpu_cores; static void assign_cpu_ids(void); static void set_interrupt_apic_ids(void); int start_all_aps(void); static int start_ap(int apic_id); static void release_aps(void *dummy); static u_int hyperthreading_cpus; static cpumask_t hyperthreading_cpus_mask; extern void Xhypervisor_callback(void); extern void failsafe_callback(void); extern void pmap_lazyfix_action(void); struct cpu_group * cpu_topo(void) { if (cpu_cores == 0) cpu_cores = 1; if (cpu_logical == 0) cpu_logical = 1; if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { printf("WARNING: Non-uniform processors.\n"); printf("WARNING: Using suboptimal topology.\n"); return (smp_topo_none()); } /* * No multi-core or hyper-threaded. 
*/ if (cpu_logical * cpu_cores == 1) return (smp_topo_none()); /* * Only HTT, no multi-core. */ if (cpu_logical > 1 && cpu_cores == 1) return (smp_topo_1level(CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); /* * Only multi-core, no HTT. */ if (cpu_cores > 1 && cpu_logical == 1) return (smp_topo_1level(CG_SHARE_NONE, cpu_cores, 0)); /* * Both HTT and multi-core. */ return (smp_topo_2level(CG_SHARE_NONE, cpu_cores, CG_SHARE_L1, cpu_logical, CG_FLAG_HTT)); } /* * Calculate usable address in base memory for AP trampoline code. */ u_int mp_bootaddress(u_int basemem) { return (basemem); } void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) mp_ncpus++; if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { mp_maxid = MAXCPU - 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ all_cpus = 1; if (mp_ncpus == 0) { /* * No CPUs were found, so this must be a UP system. Set up * the variables to represent a system with a single CPU * with an id of 0. */ mp_ncpus = 1; return (0); } /* At least one CPU was found. */ if (mp_ncpus == 1) { /* * One CPU was found, so this must be a UP system with * an I/O APIC. */ return (0); } /* At least two CPUs were found. */ return (1); } /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; cpu_ipi_pending[i] = 0; } /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); cpu_apic_ids[0] = boot_cpu_id; apic_cpuids[boot_cpu_id] = 0; assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); /* Set up the initial logical CPU info. */ logical_cpus = logical_cpus_mask = 0; if (cpu_feature & CPUID_HTT) logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; set_interrupt_apic_ids(); } static void iv_rendezvous(uintptr_t a, uintptr_t b) { smp_rendezvous_action(); } static void iv_invltlb(uintptr_t a, uintptr_t b) { xen_tlb_flush(); } static void iv_invlpg(uintptr_t a, uintptr_t b) { xen_invlpg(a); } static void iv_invlrng(uintptr_t a, uintptr_t b) { vm_offset_t start = (vm_offset_t)a; vm_offset_t end = (vm_offset_t)b; while (start < end) { xen_invlpg(start); start += PAGE_SIZE; } } static void iv_invlcache(uintptr_t a, uintptr_t b) { wbinvd(); atomic_add_int(&smp_tlb_wait, 1); } static void iv_lazypmap(uintptr_t a, uintptr_t b) { pmap_lazyfix_action(); atomic_add_int(&smp_tlb_wait, 1); } /* * These start from "IPI offset" APIC_IPI_INTS */ static call_data_func_t *ipi_vectors[6] = { iv_rendezvous, iv_invltlb, iv_invlpg, iv_invlrng, iv_invlcache, iv_lazypmap, }; /* * Reschedule callback. Nothing to do, * all the work is done automatically when * we return from the interrupt.
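 *
 * Aside: ipi_vectors[] above is indexed by (func_id - APIC_IPI_INTS),
 * so the IPI number carried in the call data selects a handler with no
 * switch statement.  A minimal userland sketch of that idiom; the
 * demo_* names and the FIRST_VECTOR value are made up, not kernel API:
 *
 *	#include <stdio.h>
 *
 *	#define FIRST_VECTOR 240	// plays the role of APIC_IPI_INTS
 *
 *	typedef void demo_func_t(unsigned long, unsigned long);
 *
 *	static void demo_flush(unsigned long a, unsigned long b) { puts("flush"); }
 *	static void demo_page(unsigned long a, unsigned long b) { printf("%#lx\n", a); }
 *
 *	static demo_func_t *demo_vectors[] = { demo_flush, demo_page };
 *	#define NDEMO (sizeof(demo_vectors) / sizeof(demo_vectors[0]))
 *
 *	static void
 *	demo_dispatch(unsigned int vec, unsigned long a1, unsigned long a2)
 *	{
 *		// out-of-range ids are rejected, as the kernel panics on them
 *		if (vec < FIRST_VECTOR || vec >= FIRST_VECTOR + NDEMO)
 *			return;
 *		(*demo_vectors[vec - FIRST_VECTOR])(a1, a2);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		demo_dispatch(FIRST_VECTOR + 1, 0x1000, 0);
 *		return (0);
 *	}
 *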
*/ static int smp_reschedule_interrupt(void *unused) { int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(curthread); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } return (FILTER_HANDLED); } struct _call_data { uint16_t func_id; uint16_t wait; uintptr_t arg1; uintptr_t arg2; atomic_t started; atomic_t finished; }; static struct _call_data *call_data; static int smp_call_function_interrupt(void *unused) { call_data_func_t *func; uintptr_t arg1 = call_data->arg1; uintptr_t arg2 = call_data->arg2; int wait = call_data->wait; atomic_t *started = &call_data->started; atomic_t *finished = &call_data->finished; /* We only handle function IPIs, not bitmap IPIs */ if (call_data->func_id < APIC_IPI_INTS || call_data->func_id > IPI_BITMAP_VECTOR) panic("invalid function id %u", call_data->func_id); func = ipi_vectors[call_data->func_id - APIC_IPI_INTS]; /* * Notify initiating CPU that I've grabbed the data and am * about to execute the function */ mb(); atomic_inc(started); /* * At this point the info structure may be out of scope unless wait==1 */ (*func)(arg1, arg2); if (wait) { mb(); atomic_inc(finished); } atomic_add_int(&smp_tlb_wait, 1); return (FILTER_HANDLED); } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { int i, x; /* List CPUs */ printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); for (i = 1, x = 0; x <= MAX_APIC_ID; x++) { if (!cpu_info[x].cpu_present || cpu_info[x].cpu_bsp) continue; if (cpu_info[x].cpu_disabled) printf(" cpu (AP): APIC ID: %2d (disabled)\n", x); else { KASSERT(i < mp_ncpus, ("mp_ncpus and actual cpus are out of whack")); printf(" cpu%d (AP): APIC ID: %2d\n", i++, x); } } } static int xen_smp_intr_init(unsigned int cpu) { int rc; unsigned int irq; per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; sprintf(resched_name[cpu], "resched%u", cpu); rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, cpu, resched_name[cpu], smp_reschedule_interrupt, INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq); printf("[XEN] IPI cpu=%d irq=%d vector=RESCHEDULE_VECTOR (%d)\n", cpu, irq, RESCHEDULE_VECTOR); per_cpu(resched_irq, cpu) = irq; sprintf(callfunc_name[cpu], "callfunc%u", cpu); rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR, cpu, callfunc_name[cpu], smp_call_function_interrupt, INTR_FAST|INTR_TYPE_TTY|INTR_MPSAFE, &irq); if (rc < 0) goto fail; per_cpu(callfunc_irq, cpu) = irq; printf("[XEN] IPI cpu=%d irq=%d vector=CALL_FUNCTION_VECTOR (%d)\n", cpu, irq, CALL_FUNCTION_VECTOR); if ((cpu != 0) && ((rc = ap_cpu_initclocks(cpu)) != 0)) goto fail; return 0; fail: if (per_cpu(resched_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(resched_irq, cpu)); if (per_cpu(callfunc_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(callfunc_irq, cpu)); return rc; } static void xen_smp_intr_init_cpus(void *unused) { int i; for (i = 0; i < mp_ncpus; i++) xen_smp_intr_init(i); } #define MTOPSIZE (1<<(14 + PAGE_SHIFT)) /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { vm_offset_t addr; int gsel_tss; /* bootAP is set in start_ap() to our ID. 
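 *
 * Aside: a minimal sketch of the started/finished handshake used by
 * smp_call_function_interrupt() above, with C11 atomics standing in
 * for the kernel's mb()/atomic_inc().  Once `started` is bumped the
 * initiator may reclaim the stack-allocated call data, unless wait was
 * requested, in which case it must stay live until `finished` is
 * bumped too.  demo_* names are hypothetical:
 *
 *	#include <pthread.h>
 *	#include <stdatomic.h>
 *
 *	struct demo_call {
 *		void (*func)(void);
 *		int wait;
 *		atomic_int started;
 *		atomic_int finished;
 *	};
 *
 *	static void demo_work(void) { }
 *
 *	static void *
 *	demo_handler(void *arg)
 *	{
 *		struct demo_call *c = arg;
 *		void (*f)(void) = c->func;	// copy what we need first
 *		int wait = c->wait;
 *
 *		atomic_fetch_add(&c->started, 1);	// initiator may move on
 *		f();
 *		if (wait)
 *			atomic_fetch_add(&c->finished, 1);
 *		return (NULL);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		pthread_t t;
 *		struct demo_call c = { .func = demo_work, .wait = 1 };
 *
 *		pthread_create(&t, NULL, demo_handler, &c);
 *		while (atomic_load(&c.started) < 1)
 *			;	// like the kernel's ia32_pause() spin
 *		while (atomic_load(&c.finished) < 1)
 *			;	// wait == 1: c must stay live until here
 *		pthread_join(t, NULL);
 *		return (0);
 *	}
 *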
*/ PCPU_SET(currentldt, _default_ldt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); #if 0 gdt[bootAP * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; #endif PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); #if 0 PCPU_SET(tss_gdt, &gdt[bootAP * NGDT + GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); #endif PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ /* * signal our startup to the BSP. */ mp_naps++; /* Spin until the BSP releases the AP's. */ while (!aps_ready) ia32_pause(); /* BSP may have changed PTD while we were waiting */ invltlb(); for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) invlpg(addr); /* set up FPU state on the AP */ npxinit(); #if 0 /* set up SSE registers */ enable_sse(); #endif #if 0 && defined(PAE) /* Enable the PTE no-execute bit. */ if ((amd_feature & AMDID_NX) != 0) { uint64_t msr; msr = rdmsr(MSR_EFER) | EFER_NXE; wrmsr(MSR_EFER, msr); } #endif #if 0 /* A quick check from sanity claus */ if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", PCPU_GET(cpuid)); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } #endif /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mtx_lock_spin(&ap_boot_mtx); #if 0 /* Init local apic for irq's */ lapic_setup(1); #endif smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", PCPU_GET(cpuid)); printf("SMP: AP CPU #%d Launched!\n", PCPU_GET(cpuid)); /* Determine if we are a logical CPU. */ if (logical_cpus > 1 && PCPU_GET(apic_id) % logical_cpus != 0) logical_cpus_mask |= PCPU_GET(cpumask); /* Determine if we are a hyperthread. */ if (hyperthreading_cpus > 1 && PCPU_GET(apic_id) % hyperthreading_cpus != 0) hyperthreading_cpus_mask |= PCPU_GET(cpumask); /* Build our map of 'other' CPUs. */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); #if 0 if (bootverbose) lapic_dump("AP"); #endif if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); smp_active = 1; /* historic */ } mtx_unlock_spin(&ap_boot_mtx); /* wait until all the AP's are up */ while (smp_started == 0) ia32_pause(); PCPU_SET(curthread, PCPU_GET(idlethread)); /* enter the scheduler */ sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } /******************************************************************* * local functions and data */ /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ static void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (hyperthreading_cpus > 1 && apic_id % hyperthreading_cpus != 0) continue; intr_add_cpu(i); } } /* * Assign logical CPU IDs to local APICs. 
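 *
 * Aside: the aps_ready gate that init_secondary() spins on above is a
 * plain release/acquire flag.  The same shape in portable C11 (the
 * kernel uses atomic_store_rel_int() and ia32_pause() instead); the
 * demo_* names are hypothetical:
 *
 *	#include <pthread.h>
 *	#include <sched.h>
 *	#include <stdatomic.h>
 *
 *	static atomic_int demo_aps_ready;
 *
 *	static void *
 *	demo_ap(void *arg)
 *	{
 *		// park until the BSP opens the gate
 *		while (!atomic_load_explicit(&demo_aps_ready,
 *		    memory_order_acquire))
 *			sched_yield();
 *		// per-CPU setup that depends on BSP work goes here
 *		return (NULL);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		pthread_t t;
 *
 *		pthread_create(&t, NULL, demo_ap, NULL);
 *		// BSP: finish global setup, then release the APs
 *		atomic_store_explicit(&demo_aps_ready, 1,
 *		    memory_order_release);
 *		pthread_join(t, NULL);
 *		return (0);
 *	}
 *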
*/ static void assign_cpu_ids(void) { u_int i; /* Check for explicitly disabled CPUs. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) continue; /* Don't use this CPU if it has been disabled by a tunable. */ if (resource_disabled("lapic", i)) { cpu_info[i].cpu_disabled = 1; continue; } } /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 has already been assigned to the BSP, * so we only have to assign IDs for APs. */ mp_ncpus = 1; for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || cpu_info[i].cpu_disabled) continue; if (mp_ncpus < MAXCPU) { cpu_apic_ids[mp_ncpus] = i; apic_cpuids[i] = mp_ncpus; mp_ncpus++; } else cpu_info[i].cpu_disabled = 1; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } /* * Start each AP in our list. */ /* Lowest 1MB is already mapped: don't touch */ #define TMPMAP_START 1 int start_all_aps(void) { int x, apic_id, cpu; struct pcpu *pc; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* set up temporary P==V mapping for AP boot */ /* XXX this is a hack, we should boot the AP on its own stack/PTD */ /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; bootAP = cpu; bootAPgdt = gdt + (512*cpu); /* Get per-cpu data */ pc = &__pcpu[bootAP]; pcpu_init(pc, bootAP, sizeof(struct pcpu)); dpcpu_init((void *)kmem_alloc(kernel_map, DPCPU_SIZE), bootAP); pc->pc_apic_id = cpu_apic_ids[bootAP]; pc->pc_prvspace = pc; pc->pc_curthread = 0; gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; PT_SET_MA(bootAPgdt, xpmap_ptom(VTOP(bootAPgdt)) | PG_V | PG_RW); bzero(bootAPgdt, PAGE_SIZE); for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &bootAPgdt[x].sd); PT_SET_MA(bootAPgdt, vtomach(bootAPgdt) | PG_V); #ifdef notyet if (HYPERVISOR_vcpu_op(VCPUOP_get_physid, cpu, &cpu_id) == 0) { apicid = xen_vcpu_physid_to_x86_apicid(cpu_id.phys_id); acpiid = xen_vcpu_physid_to_x86_acpiid(cpu_id.phys_id); #ifdef CONFIG_ACPI if (acpiid != 0xff) x86_acpiid_to_apicid[acpiid] = apicid; #endif } #endif /* attempt to start the Application Processor */ if (!start_ap(cpu)) { printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } all_cpus |= (1 << cpu); /* record AP in CPU map */ } /* build our map of 'other' CPUs */ PCPU_SET(other_cpus, all_cpus & ~PCPU_GET(cpumask)); pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); /* number of APs actually started */ return mp_naps; } extern uint8_t *pcpu_boot_stack; extern trap_info_t trap_table[]; static void smp_trap_init(trap_info_t *trap_ctxt) { const trap_info_t *t = trap_table; for (t = trap_table; t->address; t++) { trap_ctxt[t->vector].flags = t->flags; trap_ctxt[t->vector].cs = t->cs; trap_ctxt[t->vector].address = t->address; } } extern int nkpt; static void cpu_initialize_context(unsigned int cpu) { /* vcpu_guest_context_t is too large to allocate on the stack.
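 *
 * Aside: assign_cpu_ids() above keeps two mirrored arrays so both
 * directions of the CPU id <-> APIC id lookup stay O(1).  The idiom in
 * miniature, with invented bounds (demo_* names are hypothetical):
 *
 *	#include <assert.h>
 *
 *	#define DEMO_MAXCPU	8
 *	#define DEMO_MAX_APIC	31
 *
 *	static int demo_cpu_apic_ids[DEMO_MAXCPU];	// cpu -> apic
 *	static int demo_apic_cpuids[DEMO_MAX_APIC + 1];	// apic -> cpu
 *
 *	static int
 *	demo_assign(const int ap_apic_ids[], int n)
 *	{
 *		int i, ncpus = 1;	// slot 0 is reserved for the BSP
 *
 *		for (i = 0; i < n && ncpus < DEMO_MAXCPU; i++) {
 *			demo_cpu_apic_ids[ncpus] = ap_apic_ids[i];
 *			demo_apic_cpuids[ap_apic_ids[i]] = ncpus;
 *			ncpus++;
 *		}
 *		return (ncpus);	// CPUs beyond the limit would be disabled
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		int aps[] = { 2, 4, 6 };
 *
 *		assert(demo_assign(aps, 3) == 4);
 *		assert(demo_apic_cpuids[demo_cpu_apic_ids[2]] == 2);
 *		return (0);
 *	}
 *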
* Hence we allocate statically and protect it with a lock */ vm_page_t m[4]; static vcpu_guest_context_t ctxt; vm_offset_t boot_stack; vm_offset_t newPTD; vm_paddr_t ma[NPGPTD]; static int color; int i; /* * Page 0,[0-3] PTD * Page 1, [4] boot stack * Page [5] PDPT * */ for (i = 0; i < NPGPTD + 2; i++) { m[i] = vm_page_alloc(NULL, color++, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); pmap_zero_page(m[i]); } boot_stack = kmem_alloc_nofault(kernel_map, 1); newPTD = kmem_alloc_nofault(kernel_map, NPGPTD); ma[0] = xpmap_ptom(VM_PAGE_TO_PHYS(m[0]))|PG_V; #ifdef PAE pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD + 1])); for (i = 0; i < NPGPTD; i++) { ((vm_paddr_t *)boot_stack)[i] = ma[i] = xpmap_ptom(VM_PAGE_TO_PHYS(m[i]))|PG_V; } #endif /* * Copy cpu0 IdlePTD to new IdlePTD - copying only * kernel mappings */ pmap_qenter(newPTD, m, 4); memcpy((uint8_t *)newPTD + KPTDI*sizeof(vm_paddr_t), (uint8_t *)PTOV(IdlePTD) + KPTDI*sizeof(vm_paddr_t), nkpt*sizeof(vm_paddr_t)); pmap_qremove(newPTD, 4); kmem_free(kernel_map, newPTD, 4); /* * map actual idle stack to boot_stack */ pmap_kenter(boot_stack, VM_PAGE_TO_PHYS(m[NPGPTD])); xen_pgdpt_pin(xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1]))); vm_page_lock_queues(); for (i = 0; i < 4; i++) { int pdir = (PTDPTDI + i) / NPDEPG; int curoffset = (PTDPTDI + i) % NPDEPG; xen_queue_pt_update((vm_paddr_t) ((ma[pdir] & ~PG_V) + (curoffset*sizeof(vm_paddr_t))), ma[i]); } PT_UPDATES_FLUSH(); vm_page_unlock_queues(); memset(&ctxt, 0, sizeof(ctxt)); ctxt.flags = VGCF_IN_KERNEL; ctxt.user_regs.ds = GSEL(GDATA_SEL, SEL_KPL); ctxt.user_regs.es = GSEL(GDATA_SEL, SEL_KPL); ctxt.user_regs.fs = GSEL(GPRIV_SEL, SEL_KPL); ctxt.user_regs.gs = GSEL(GDATA_SEL, SEL_KPL); ctxt.user_regs.cs = GSEL(GCODE_SEL, SEL_KPL); ctxt.user_regs.ss = GSEL(GDATA_SEL, SEL_KPL); ctxt.user_regs.eip = (unsigned long)init_secondary; ctxt.user_regs.eflags = PSL_KERNEL | 0x1000; /* IOPL_RING1 */ memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); smp_trap_init(ctxt.trap_ctxt); ctxt.ldt_ents = 0; ctxt.gdt_frames[0] = (uint32_t)((uint64_t)vtomach(bootAPgdt) >> PAGE_SHIFT); ctxt.gdt_ents = 512; #ifdef __i386__ ctxt.user_regs.esp = boot_stack + PAGE_SIZE; ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); ctxt.kernel_sp = boot_stack + PAGE_SIZE; ctxt.event_callback_cs = GSEL(GCODE_SEL, SEL_KPL); ctxt.event_callback_eip = (unsigned long)Xhypervisor_callback; ctxt.failsafe_callback_cs = GSEL(GCODE_SEL, SEL_KPL); ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; ctxt.ctrlreg[3] = xpmap_ptom(VM_PAGE_TO_PHYS(m[NPGPTD + 1])); #else /* __x86_64__ */ ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); ctxt.kernel_ss = GSEL(GDATA_SEL, SEL_KPL); ctxt.kernel_sp = idle->thread.rsp0; ctxt.event_callback_eip = (unsigned long)hypervisor_callback; ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; ctxt.syscall_callback_eip = (unsigned long)system_call; ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); #endif printf("gdtpfn=%lx pdptpfn=%lx\n", ctxt.gdt_frames[0], ctxt.ctrlreg[3] >> PAGE_SHIFT); PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)); DELAY(3000); PANIC_IF(HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)); } /* * This function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. 
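 *
 * Aside: the "copy cpu0 IdlePTD" step above duplicates only the kernel
 * slice of the page directory into the AP's new PTD, so every CPU
 * shares one set of kernel mappings while the user half stays per-CPU.
 * A sketch of just that copy, with invented non-PAE sizes (a real copy
 * depends on PAE and nkpt):
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	#define DEMO_NPDE	1024	// PDEs per directory, non-PAE i386
 *	#define DEMO_KPTDI	768	// first kernel PDE (3G/1G split)
 *
 *	static void
 *	demo_copy_kernel_pdes(uint32_t dst[DEMO_NPDE],
 *	    const uint32_t src[DEMO_NPDE], int nkpt)
 *	{
 *		// user half stays zero; kernel slice is shared verbatim
 *		memcpy(&dst[DEMO_KPTDI], &src[DEMO_KPTDI],
 *		    nkpt * sizeof(dst[0]));
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		static uint32_t bsp[DEMO_NPDE], ap[DEMO_NPDE];
 *
 *		bsp[DEMO_KPTDI] = 0x1163;	// pretend kernel PDE
 *		demo_copy_kernel_pdes(ap, bsp, 4);
 *		return (ap[DEMO_KPTDI] == 0x1163 ? 0 : 1);
 *	}
 *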
*/ int cpus; static int start_ap(int apic_id) { int ms; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; cpu_initialize_context(apic_id); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } /* * Flush the TLB on all other CPU's */ static void smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) { u_int ncpu; struct _call_data data; ncpu = mp_ncpus - 1; /* does not shootdown self */ if (ncpu < 1) return; /* no other cpus */ if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); KASSERT(call_data == NULL, ("call_data isn't null?!")); call_data = &data; call_data->func_id = vector; call_data->arg1 = addr1; call_data->arg2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); ipi_all_but_self(vector); while (smp_tlb_wait < ncpu) ia32_pause(); call_data = NULL; mtx_unlock_spin(&smp_ipi_mtx); } static void smp_targeted_tlb_shootdown(cpumask_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) { int ncpu, othercpus; struct _call_data data; othercpus = mp_ncpus - 1; if (mask == (u_int)-1) { ncpu = othercpus; if (ncpu < 1) return; } else { mask &= ~PCPU_GET(cpumask); if (mask == 0) return; ncpu = bitcount32(mask); if (ncpu > othercpus) { /* XXX this should be a panic offence */ printf("SMP: tlb shootdown to %d other cpus (only have %d)\n", ncpu, othercpus); ncpu = othercpus; } /* XXX should be a panic, implied by mask == 0 above */ if (ncpu < 1) return; } if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); KASSERT(call_data == NULL, ("call_data isn't null?!")); call_data = &data; call_data->func_id = vector; call_data->arg1 = addr1; call_data->arg2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); if (mask == (u_int)-1) ipi_all_but_self(vector); else ipi_selected(mask, vector); while (smp_tlb_wait < ncpu) ia32_pause(); call_data = NULL; mtx_unlock_spin(&smp_ipi_mtx); } void smp_cache_flush(void) { if (smp_started) smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); } void smp_invltlb(void) { if (smp_started) { smp_tlb_shootdown(IPI_INVLTLB, 0, 0); } } void smp_invlpg(vm_offset_t addr) { if (smp_started) { smp_tlb_shootdown(IPI_INVLPG, addr, 0); } } void smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); } } void smp_masked_invltlb(cpumask_t mask) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); } } void smp_masked_invlpg(cpumask_t mask, vm_offset_t addr) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); } } void smp_masked_invlpg_range(cpumask_t mask, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); } } /* * send an IPI to a set of cpus. */ void ipi_selected(cpumask_t cpus, u_int ipi) { int cpu; u_int bitmap = 0; u_int old_pending; u_int new_pending; if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. 
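 *
 * Aside: smp_tlb_shootdown() above is a broadcast-and-count rendezvous:
 * reset a shared counter, send the IPI to everyone else, then spin
 * until each responder has added one.  A thread-based sketch of the
 * counting half (no real IPIs; demo_* names are hypothetical):
 *
 *	#include <pthread.h>
 *	#include <sched.h>
 *	#include <stdatomic.h>
 *
 *	#define DEMO_OTHERS	3	// mp_ncpus - 1 in the kernel code
 *
 *	static atomic_int demo_tlb_wait;
 *
 *	static void *
 *	demo_ack(void *arg)
 *	{
 *		// a responder invalidates its TLB here, then acks
 *		atomic_fetch_add(&demo_tlb_wait, 1);
 *		return (NULL);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		pthread_t t[DEMO_OTHERS];
 *		int i;
 *
 *		atomic_store(&demo_tlb_wait, 0);
 *		for (i = 0; i < DEMO_OTHERS; i++)	// "ipi_all_but_self"
 *			pthread_create(&t[i], NULL, demo_ack, NULL);
 *		while (atomic_load(&demo_tlb_wait) < DEMO_OTHERS)
 *			sched_yield();	// like ia32_pause()
 *		for (i = 0; i < DEMO_OTHERS; i++)
 *			pthread_join(t[i], NULL);
 *		return (0);
 *	}
 *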
*/ if (ipi == IPI_STOP_HARD) atomic_set_int(&ipi_nmi_pending, cpus); CTR3(KTR_SMP, "%s: cpus: %x ipi: %x", __func__, cpus, ipi); while ((cpu = ffs(cpus)) != 0) { cpu--; cpus &= ~(1 << cpu); if (bitmap) { do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (!old_pending) ipi_pcpu(cpu, RESCHEDULE_VECTOR); } else { KASSERT(call_data != NULL, ("call_data not set")); ipi_pcpu(cpu, CALL_FUNCTION_VECTOR); } } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { u_int bitmap = 0; u_int old_pending; u_int new_pending; if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) atomic_set_int(&ipi_nmi_pending, 1 << cpu); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); if (bitmap) { do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (!old_pending) ipi_pcpu(cpu, RESCHEDULE_VECTOR); } else { KASSERT(call_data != NULL, ("call_data not set")); ipi_pcpu(cpu, CALL_FUNCTION_VECTOR); } } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) atomic_set_int(&ipi_nmi_pending, PCPU_GET(other_cpus)); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); ipi_selected(PCPU_GET(other_cpus), ipi); } int ipi_nmi_handler() { cpumask_t cpumask; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpumask = PCPU_GET(cpumask); if ((ipi_nmi_pending & cpumask) == 0) return (1); atomic_clear_int(&ipi_nmi_pending, cpumask); cpustop_handler(); return (0); } /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { int cpu = PCPU_GET(cpuid); int cpumask = PCPU_GET(cpumask); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ atomic_set_int(&stopped_cpus, cpumask); /* Wait for restart */ while (!(started_cpus & cpumask)) ia32_pause(); atomic_clear_int(&started_cpus, cpumask); atomic_clear_int(&stopped_cpus, cpumask); if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); SYSINIT(start_ipis, SI_SUB_INTR, SI_ORDER_ANY, xen_smp_intr_init_cpus, NULL); diff --git a/sys/i386/xen/mptable.c b/sys/i386/xen/mptable.c index c6c7d53ccb5e..fe01cd5b406c 100644 --- a/sys/i386/xen/mptable.c +++ b/sys/i386/xen/mptable.c @@ -1,130 +1,130 @@ /*- * Copyright (c) 2003 John Baldwin * Copyright (c) 1996, by Steve Passe * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #include static int mptable_probe(void); static int mptable_probe_cpus(void); static void mptable_register(void *dummy); static int mptable_setup_local(void); static int mptable_setup_io(void); static struct apic_enumerator mptable_enumerator = { "MPTable", mptable_probe, mptable_probe_cpus, mptable_setup_local, mptable_setup_io }; static int mptable_probe(void) { return (-100); } static int mptable_probe_cpus(void) { int i, rc; for (i = 0; i < MAXCPU; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) cpu_add(i, (i == 0)); } return (0); } /* * Initialize the local APIC on the BSP. */ static int mptable_setup_local(void) { return (0); } static int mptable_setup_io(void) { return (0); } static void mptable_register(void *dummy __unused) { apic_register_enumerator(&mptable_enumerator); } SYSINIT(mptable_register, SI_SUB_CPU - 1, SI_ORDER_FIRST, mptable_register, NULL); int mptable_pci_probe_table(int bus) { return (0); } int mptable_pci_route_interrupt(device_t pcib, device_t dev, int pin) { return (0); } diff --git a/sys/pc98/include/apicreg.h b/sys/pc98/include/apicreg.h deleted file mode 100644 index a9766d5bfd6a..000000000000 --- a/sys/pc98/include/apicreg.h +++ /dev/null @@ -1,6 +0,0 @@ -/*- - * This file is in the public domain. - */ -/* $FreeBSD$ */ - -#include diff --git a/sys/amd64/include/apicreg.h b/sys/x86/include/apicreg.h similarity index 99% rename from sys/amd64/include/apicreg.h rename to sys/x86/include/apicreg.h index fee629bb2c57..00cb571154e7 100644 --- a/sys/amd64/include/apicreg.h +++ b/sys/x86/include/apicreg.h @@ -1,445 +1,445 @@ /*- * Copyright (c) 1996, by Peter Wemm and Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ -#ifndef _MACHINE_APICREG_H_ -#define _MACHINE_APICREG_H_ +#ifndef _X86_APICREG_H_ +#define _X86_APICREG_H_ /* * Local && I/O APIC definitions. */ /* * Pentium P54C+ Built-in APIC * (Advanced programmable Interrupt Controller) * * Base Address of Built-in APIC in memory location * is 0xfee00000. * * Map of APIC Registers: * * Offset (hex) Description Read/Write state * 000 Reserved * 010 Reserved * 020 ID Local APIC ID R/W * 030 VER Local APIC Version R * 040 Reserved * 050 Reserved * 060 Reserved * 070 Reserved * 080 Task Priority Register R/W * 090 Arbitration Priority Register R * 0A0 Processor Priority Register R * 0B0 EOI Register W * 0C0 RRR Remote read R * 0D0 Logical Destination R/W * 0E0 Destination Format Register 0..27 R; 28..31 R/W * 0F0 SVR Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W * 100 ISR 000-031 R * 110 ISR 032-063 R * 120 ISR 064-095 R * 130 ISR 095-128 R * 140 ISR 128-159 R * 150 ISR 160-191 R * 160 ISR 192-223 R * 170 ISR 224-255 R * 180 TMR 000-031 R * 190 TMR 032-063 R * 1A0 TMR 064-095 R * 1B0 TMR 095-128 R * 1C0 TMR 128-159 R * 1D0 TMR 160-191 R * 1E0 TMR 192-223 R * 1F0 TMR 224-255 R * 200 IRR 000-031 R * 210 IRR 032-063 R * 220 IRR 064-095 R * 230 IRR 095-128 R * 240 IRR 128-159 R * 250 IRR 160-191 R * 260 IRR 192-223 R * 270 IRR 224-255 R * 280 Error Status Register R * 290 Reserved * 2A0 Reserved * 2B0 Reserved * 2C0 Reserved * 2D0 Reserved * 2E0 Reserved * 2F0 Local Vector Table (CMCI) R/W * 300 ICR_LOW Interrupt Command Reg. (0-31) R/W * 310 ICR_HI Interrupt Command Reg. (32-63) R/W * 320 Local Vector Table (Timer) R/W * 330 Local Vector Table (Thermal) R/W (PIV+) * 340 Local Vector Table (Performance) R/W (P6+) * 350 LVT1 Local Vector Table (LINT0) R/W * 360 LVT2 Local Vector Table (LINT1) R/W * 370 LVT3 Local Vector Table (ERROR) R/W * 380 Initial Count Reg. for Timer R/W * 390 Current Count of Timer R * 3A0 Reserved * 3B0 Reserved * 3C0 Reserved * 3D0 Reserved * 3E0 Timer Divide Configuration Reg. R/W * 3F0 Reserved */ /****************************************************************************** * global defines, etc. 
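 *
 * Aside: the map above is byte offsets from the local APIC base
 * (0xfee00000 by default), one 32-bit register per 16-byte stride.
 * Accessing a register is a volatile 32-bit load or store at
 * base + offset; sketched below for the version register at 0x030
 * (shape only -- a real kernel maps the page uncacheable first, and
 * this would fault in userland):
 *
 *	#include <stdint.h>
 *
 *	#define DEMO_LAPIC_BASE	0xfee00000u
 *	#define DEMO_LAPIC_VER	0x030u	// from the table above
 *
 *	static inline uint32_t
 *	demo_lapic_read(uintptr_t base, uint32_t off)
 *	{
 *		return (*(volatile uint32_t *)(base + off));
 *	}
 *
 *	// usage, in kernel context only:
 *	//	uint32_t ver = demo_lapic_read(DEMO_LAPIC_BASE,
 *	//	    DEMO_LAPIC_VER);
 *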
*/ /****************************************************************************** * LOCAL APIC structure */ #ifndef LOCORE #include #define PAD3 int : 32; int : 32; int : 32 #define PAD4 int : 32; int : 32; int : 32; int : 32 struct LAPIC { /* reserved */ PAD4; /* reserved */ PAD4; u_int32_t id; PAD3; u_int32_t version; PAD3; /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; u_int32_t tpr; PAD3; u_int32_t apr; PAD3; u_int32_t ppr; PAD3; u_int32_t eoi; PAD3; /* reserved */ PAD4; u_int32_t ldr; PAD3; u_int32_t dfr; PAD3; u_int32_t svr; PAD3; u_int32_t isr0; PAD3; u_int32_t isr1; PAD3; u_int32_t isr2; PAD3; u_int32_t isr3; PAD3; u_int32_t isr4; PAD3; u_int32_t isr5; PAD3; u_int32_t isr6; PAD3; u_int32_t isr7; PAD3; u_int32_t tmr0; PAD3; u_int32_t tmr1; PAD3; u_int32_t tmr2; PAD3; u_int32_t tmr3; PAD3; u_int32_t tmr4; PAD3; u_int32_t tmr5; PAD3; u_int32_t tmr6; PAD3; u_int32_t tmr7; PAD3; u_int32_t irr0; PAD3; u_int32_t irr1; PAD3; u_int32_t irr2; PAD3; u_int32_t irr3; PAD3; u_int32_t irr4; PAD3; u_int32_t irr5; PAD3; u_int32_t irr6; PAD3; u_int32_t irr7; PAD3; u_int32_t esr; PAD3; /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; u_int32_t lvt_cmci; PAD3; u_int32_t icr_lo; PAD3; u_int32_t icr_hi; PAD3; u_int32_t lvt_timer; PAD3; u_int32_t lvt_thermal; PAD3; u_int32_t lvt_pcint; PAD3; u_int32_t lvt_lint0; PAD3; u_int32_t lvt_lint1; PAD3; u_int32_t lvt_error; PAD3; u_int32_t icr_timer; PAD3; u_int32_t ccr_timer; PAD3; /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; u_int32_t dcr_timer; PAD3; /* reserved */ PAD4; }; typedef struct LAPIC lapic_t; /****************************************************************************** * I/O APIC structure */ struct IOAPIC { u_int32_t ioregsel; PAD3; u_int32_t iowin; PAD3; }; typedef struct IOAPIC ioapic_t; #undef PAD4 #undef PAD3 #endif /* !LOCORE */ /****************************************************************************** * various code 'logical' values */ /****************************************************************************** * LOCAL APIC defines */ /* default physical locations of LOCAL (CPU) APICs */ #define DEFAULT_APIC_BASE 0xfee00000 /* constants relating to APIC ID registers */ #define APIC_ID_MASK 0xff000000 #define APIC_ID_SHIFT 24 #define APIC_ID_CLUSTER 0xf0 #define APIC_ID_CLUSTER_ID 0x0f #define APIC_MAX_CLUSTER 0xe #define APIC_MAX_INTRACLUSTER_ID 3 #define APIC_ID_CLUSTER_SHIFT 4 /* fields in VER */ #define APIC_VER_VERSION 0x000000ff #define APIC_VER_MAXLVT 0x00ff0000 #define MAXLVTSHIFT 16 #define APIC_VER_EOI_SUPPRESSION 0x01000000 /* fields in LDR */ #define APIC_LDR_RESERVED 0x00ffffff /* fields in DFR */ #define APIC_DFR_RESERVED 0x0fffffff #define APIC_DFR_MODEL_MASK 0xf0000000 #define APIC_DFR_MODEL_FLAT 0xf0000000 #define APIC_DFR_MODEL_CLUSTER 0x00000000 /* fields in SVR */ #define APIC_SVR_VECTOR 0x000000ff #define APIC_SVR_VEC_PROG 0x000000f0 #define APIC_SVR_VEC_FIX 0x0000000f #define APIC_SVR_ENABLE 0x00000100 # define APIC_SVR_SWDIS 0x00000000 # define APIC_SVR_SWEN 0x00000100 #define APIC_SVR_FOCUS 0x00000200 # define APIC_SVR_FEN 0x00000000 # define APIC_SVR_FDIS 0x00000200 #define APIC_SVR_EOI_SUPPRESSION 0x00001000 /* fields in TPR */ #define APIC_TPR_PRIO 0x000000ff # define APIC_TPR_INT 0x000000f0 # define APIC_TPR_SUB 0x0000000f /* fields in ESR */ #define APIC_ESR_SEND_CS_ERROR 0x00000001 #define APIC_ESR_RECEIVE_CS_ERROR 0x00000002 #define 
APIC_ESR_SEND_ACCEPT 0x00000004 #define APIC_ESR_RECEIVE_ACCEPT 0x00000008 #define APIC_ESR_SEND_ILLEGAL_VECTOR 0x00000020 #define APIC_ESR_RECEIVE_ILLEGAL_VECTOR 0x00000040 #define APIC_ESR_ILLEGAL_REGISTER 0x00000080 /* fields in ICR_LOW */ #define APIC_VECTOR_MASK 0x000000ff #define APIC_DELMODE_MASK 0x00000700 # define APIC_DELMODE_FIXED 0x00000000 # define APIC_DELMODE_LOWPRIO 0x00000100 # define APIC_DELMODE_SMI 0x00000200 # define APIC_DELMODE_RR 0x00000300 # define APIC_DELMODE_NMI 0x00000400 # define APIC_DELMODE_INIT 0x00000500 # define APIC_DELMODE_STARTUP 0x00000600 # define APIC_DELMODE_RESV 0x00000700 #define APIC_DESTMODE_MASK 0x00000800 # define APIC_DESTMODE_PHY 0x00000000 # define APIC_DESTMODE_LOG 0x00000800 #define APIC_DELSTAT_MASK 0x00001000 # define APIC_DELSTAT_IDLE 0x00000000 # define APIC_DELSTAT_PEND 0x00001000 #define APIC_RESV1_MASK 0x00002000 #define APIC_LEVEL_MASK 0x00004000 # define APIC_LEVEL_DEASSERT 0x00000000 # define APIC_LEVEL_ASSERT 0x00004000 #define APIC_TRIGMOD_MASK 0x00008000 # define APIC_TRIGMOD_EDGE 0x00000000 # define APIC_TRIGMOD_LEVEL 0x00008000 #define APIC_RRSTAT_MASK 0x00030000 # define APIC_RRSTAT_INVALID 0x00000000 # define APIC_RRSTAT_INPROG 0x00010000 # define APIC_RRSTAT_VALID 0x00020000 # define APIC_RRSTAT_RESV 0x00030000 #define APIC_DEST_MASK 0x000c0000 # define APIC_DEST_DESTFLD 0x00000000 # define APIC_DEST_SELF 0x00040000 # define APIC_DEST_ALLISELF 0x00080000 # define APIC_DEST_ALLESELF 0x000c0000 #define APIC_RESV2_MASK 0xfff00000 #define APIC_ICRLO_RESV_MASK (APIC_RESV1_MASK | APIC_RESV2_MASK) /* fields in LVT1/2 */ #define APIC_LVT_VECTOR 0x000000ff #define APIC_LVT_DM 0x00000700 # define APIC_LVT_DM_FIXED 0x00000000 # define APIC_LVT_DM_SMI 0x00000200 # define APIC_LVT_DM_NMI 0x00000400 # define APIC_LVT_DM_INIT 0x00000500 # define APIC_LVT_DM_EXTINT 0x00000700 #define APIC_LVT_DS 0x00001000 #define APIC_LVT_IIPP 0x00002000 #define APIC_LVT_IIPP_INTALO 0x00002000 #define APIC_LVT_IIPP_INTAHI 0x00000000 #define APIC_LVT_RIRR 0x00004000 #define APIC_LVT_TM 0x00008000 #define APIC_LVT_M 0x00010000 /* fields in LVT Timer */ #define APIC_LVTT_VECTOR 0x000000ff #define APIC_LVTT_DS 0x00001000 #define APIC_LVTT_M 0x00010000 #define APIC_LVTT_TM 0x00020000 # define APIC_LVTT_TM_ONE_SHOT 0x00000000 # define APIC_LVTT_TM_PERIODIC 0x00020000 /* APIC timer current count */ #define APIC_TIMER_MAX_COUNT 0xffffffff /* fields in TDCR */ #define APIC_TDCR_2 0x00 #define APIC_TDCR_4 0x01 #define APIC_TDCR_8 0x02 #define APIC_TDCR_16 0x03 #define APIC_TDCR_32 0x08 #define APIC_TDCR_64 0x09 #define APIC_TDCR_128 0x0a #define APIC_TDCR_1 0x0b /****************************************************************************** * I/O APIC defines */ /* default physical locations of an IO APIC */ #define DEFAULT_IO_APIC_BASE 0xfec00000 /* window register offset */ #define IOAPIC_WINDOW 0x10 #define IOAPIC_EOIR 0x40 /* indexes into IO APIC */ #define IOAPIC_ID 0x00 #define IOAPIC_VER 0x01 #define IOAPIC_ARB 0x02 #define IOAPIC_REDTBL 0x10 #define IOAPIC_REDTBL0 IOAPIC_REDTBL #define IOAPIC_REDTBL1 (IOAPIC_REDTBL+0x02) #define IOAPIC_REDTBL2 (IOAPIC_REDTBL+0x04) #define IOAPIC_REDTBL3 (IOAPIC_REDTBL+0x06) #define IOAPIC_REDTBL4 (IOAPIC_REDTBL+0x08) #define IOAPIC_REDTBL5 (IOAPIC_REDTBL+0x0a) #define IOAPIC_REDTBL6 (IOAPIC_REDTBL+0x0c) #define IOAPIC_REDTBL7 (IOAPIC_REDTBL+0x0e) #define IOAPIC_REDTBL8 (IOAPIC_REDTBL+0x10) #define IOAPIC_REDTBL9 (IOAPIC_REDTBL+0x12) #define IOAPIC_REDTBL10 (IOAPIC_REDTBL+0x14) #define IOAPIC_REDTBL11 
(IOAPIC_REDTBL+0x16) #define IOAPIC_REDTBL12 (IOAPIC_REDTBL+0x18) #define IOAPIC_REDTBL13 (IOAPIC_REDTBL+0x1a) #define IOAPIC_REDTBL14 (IOAPIC_REDTBL+0x1c) #define IOAPIC_REDTBL15 (IOAPIC_REDTBL+0x1e) #define IOAPIC_REDTBL16 (IOAPIC_REDTBL+0x20) #define IOAPIC_REDTBL17 (IOAPIC_REDTBL+0x22) #define IOAPIC_REDTBL18 (IOAPIC_REDTBL+0x24) #define IOAPIC_REDTBL19 (IOAPIC_REDTBL+0x26) #define IOAPIC_REDTBL20 (IOAPIC_REDTBL+0x28) #define IOAPIC_REDTBL21 (IOAPIC_REDTBL+0x2a) #define IOAPIC_REDTBL22 (IOAPIC_REDTBL+0x2c) #define IOAPIC_REDTBL23 (IOAPIC_REDTBL+0x2e) /* fields in VER */ #define IOART_VER_VERSION 0x000000ff #define IOART_VER_MAXREDIR 0x00ff0000 #define MAXREDIRSHIFT 16 /* * fields in the IO APIC's redirection table entries */ #define IOART_DEST APIC_ID_MASK /* broadcast addr: all APICs */ #define IOART_RESV 0x00fe0000 /* reserved */ #define IOART_INTMASK 0x00010000 /* R/W: INTerrupt mask */ # define IOART_INTMCLR 0x00000000 /* clear, allow INTs */ # define IOART_INTMSET 0x00010000 /* set, inhibit INTs */ #define IOART_TRGRMOD 0x00008000 /* R/W: trigger mode */ # define IOART_TRGREDG 0x00000000 /* edge */ # define IOART_TRGRLVL 0x00008000 /* level */ #define IOART_REM_IRR 0x00004000 /* RO: remote IRR */ #define IOART_INTPOL 0x00002000 /* R/W: INT input pin polarity */ # define IOART_INTAHI 0x00000000 /* active high */ # define IOART_INTALO 0x00002000 /* active low */ #define IOART_DELIVS 0x00001000 /* RO: delivery status */ #define IOART_DESTMOD 0x00000800 /* R/W: destination mode */ # define IOART_DESTPHY 0x00000000 /* physical */ # define IOART_DESTLOG 0x00000800 /* logical */ #define IOART_DELMOD 0x00000700 /* R/W: delivery mode */ # define IOART_DELFIXED 0x00000000 /* fixed */ # define IOART_DELLOPRI 0x00000100 /* lowest priority */ # define IOART_DELSMI 0x00000200 /* System Management INT */ # define IOART_DELRSV1 0x00000300 /* reserved */ # define IOART_DELNMI 0x00000400 /* NMI signal */ # define IOART_DELINIT 0x00000500 /* INIT signal */ # define IOART_DELRSV2 0x00000600 /* reserved */ # define IOART_DELEXINT 0x00000700 /* External INTerrupt */ #define IOART_INTVEC 0x000000ff /* R/W: INTerrupt vector field */ -#endif /* _MACHINE_APICREG_H_ */ +#endif /* _X86_APICREG_H_ */ diff --git a/sys/x86/x86/io_apic.c b/sys/x86/x86/io_apic.c index 4468486d50c5..da97def5324f 100644 --- a/sys/x86/x86/io_apic.c +++ b/sys/x86/x86/io_apic.c @@ -1,922 +1,922 @@ /*- * Copyright (c) 2003 John Baldwin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_isa.h" #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #define IOAPIC_ISA_INTS 16 #define IOAPIC_MEM_REGION 32 #define IOAPIC_REDTBL_LO(i) (IOAPIC_REDTBL + (i) * 2) #define IOAPIC_REDTBL_HI(i) (IOAPIC_REDTBL_LO(i) + 1) #define IRQ_EXTINT (NUM_IO_INTS + 1) #define IRQ_NMI (NUM_IO_INTS + 2) #define IRQ_SMI (NUM_IO_INTS + 3) #define IRQ_DISABLED (NUM_IO_INTS + 4) static MALLOC_DEFINE(M_IOAPIC, "io_apic", "I/O APIC structures"); /* * I/O APIC interrupt source driver. Each pin is assigned an IRQ cookie * as laid out in the ACPI System Interrupt number model where each I/O * APIC has a contiguous chunk of the System Interrupt address space. * We assume that IRQs 1 - 15 behave like ISA IRQs and that all other * IRQs behave as PCI IRQs by default. We also assume that the pin for * IRQ 0 is actually an ExtINT pin. The apic enumerators override the * configuration of individual pins as indicated by their tables. * * Documentation for the I/O APIC: "82093AA I/O Advanced Programmable * Interrupt Controller (IOAPIC)", May 1996, Intel Corp. * ftp://download.intel.com/design/chipsets/datashts/29056601.pdf */ struct ioapic_intsrc { struct intsrc io_intsrc; u_int io_irq; u_int io_intpin:8; u_int io_vector:8; u_int io_cpu:8; u_int io_activehi:1; u_int io_edgetrigger:1; u_int io_masked:1; int io_bus:4; uint32_t io_lowreg; }; struct ioapic { struct pic io_pic; u_int io_id:8; /* logical ID */ u_int io_apic_id:4; u_int io_intbase:8; /* System Interrupt base */ u_int io_numintr:8; volatile ioapic_t *io_addr; /* XXX: should use bus_space */ vm_paddr_t io_paddr; STAILQ_ENTRY(ioapic) io_next; struct ioapic_intsrc io_pins[0]; }; static u_int ioapic_read(volatile ioapic_t *apic, int reg); static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val); static const char *ioapic_bus_string(int bus_type); static void ioapic_print_irq(struct ioapic_intsrc *intpin); static void ioapic_enable_source(struct intsrc *isrc); static void ioapic_disable_source(struct intsrc *isrc, int eoi); static void ioapic_eoi_source(struct intsrc *isrc); static void ioapic_enable_intr(struct intsrc *isrc); static void ioapic_disable_intr(struct intsrc *isrc); static int ioapic_vector(struct intsrc *isrc); static int ioapic_source_pending(struct intsrc *isrc); static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static void ioapic_resume(struct pic *pic); static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id); static void ioapic_program_intpin(struct ioapic_intsrc *intpin); static STAILQ_HEAD(,ioapic) ioapic_list = STAILQ_HEAD_INITIALIZER(ioapic_list); struct pic ioapic_template = { ioapic_enable_source, ioapic_disable_source, ioapic_eoi_source, ioapic_enable_intr, ioapic_disable_intr, ioapic_vector, ioapic_source_pending, NULL, ioapic_resume, ioapic_config_intr, ioapic_assign_cpu }; static 
int next_ioapic_base; static u_int next_id; SYSCTL_NODE(_hw, OID_AUTO, apic, CTLFLAG_RD, 0, "APIC options"); static int enable_extint; SYSCTL_INT(_hw_apic, OID_AUTO, enable_extint, CTLFLAG_RDTUN, &enable_extint, 0, "Enable the ExtINT pin in the first I/O APIC"); TUNABLE_INT("hw.apic.enable_extint", &enable_extint); static __inline void _ioapic_eoi_source(struct intsrc *isrc) { lapic_eoi(); } static u_int ioapic_read(volatile ioapic_t *apic, int reg) { mtx_assert(&icu_lock, MA_OWNED); apic->ioregsel = reg; return (apic->iowin); } static void ioapic_write(volatile ioapic_t *apic, int reg, u_int val) { mtx_assert(&icu_lock, MA_OWNED); apic->ioregsel = reg; apic->iowin = val; } static const char * ioapic_bus_string(int bus_type) { switch (bus_type) { case APIC_BUS_ISA: return ("ISA"); case APIC_BUS_EISA: return ("EISA"); case APIC_BUS_PCI: return ("PCI"); default: return ("unknown"); } } static void ioapic_print_irq(struct ioapic_intsrc *intpin) { switch (intpin->io_irq) { case IRQ_DISABLED: printf("disabled"); break; case IRQ_EXTINT: printf("ExtINT"); break; case IRQ_NMI: printf("NMI"); break; case IRQ_SMI: printf("SMI"); break; default: printf("%s IRQ %u", ioapic_bus_string(intpin->io_bus), intpin->io_irq); } } static void ioapic_enable_source(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; uint32_t flags; mtx_lock_spin(&icu_lock); if (intpin->io_masked) { flags = intpin->io_lowreg & ~IOART_INTMASK; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), flags); intpin->io_masked = 0; } mtx_unlock_spin(&icu_lock); } static void ioapic_disable_source(struct intsrc *isrc, int eoi) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; uint32_t flags; mtx_lock_spin(&icu_lock); if (!intpin->io_masked && !intpin->io_edgetrigger) { flags = intpin->io_lowreg | IOART_INTMSET; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), flags); intpin->io_masked = 1; } if (eoi == PIC_EOI) _ioapic_eoi_source(isrc); mtx_unlock_spin(&icu_lock); } static void ioapic_eoi_source(struct intsrc *isrc) { _ioapic_eoi_source(isrc); } /* * Completely program an intpin based on the data in its interrupt source * structure. */ static void ioapic_program_intpin(struct ioapic_intsrc *intpin) { struct ioapic *io = (struct ioapic *)intpin->io_intsrc.is_pic; uint32_t low, high, value; /* * If a pin is completely invalid or if it is valid but hasn't * been enabled yet, just ensure that the pin is masked. */ mtx_assert(&icu_lock, MA_OWNED); if (intpin->io_irq == IRQ_DISABLED || (intpin->io_irq < NUM_IO_INTS && intpin->io_vector == 0)) { low = ioapic_read(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin)); if ((low & IOART_INTMASK) == IOART_INTMCLR) ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low | IOART_INTMSET); return; } /* Set the destination. */ low = IOART_DESTPHY; high = intpin->io_cpu << APIC_ID_SHIFT; /* Program the rest of the low word. 
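 *
 * Aside: ioapic_read()/ioapic_write() above use the classic index/data
 * window -- write the register number to IOREGSEL, then read or write
 * IOWIN.  The two steps are why every caller must hold icu_lock: a
 * second CPU retargeting IOREGSEL between them would corrupt the
 * access.  The same shape over a plain array (demo_* names are
 * hypothetical):
 *
 *	#include <pthread.h>
 *	#include <stdint.h>
 *
 *	struct demo_win {
 *		pthread_mutex_t lock;	// plays the role of icu_lock
 *		uint32_t regsel;	// index register
 *		uint32_t regs[32];	// what the window resolves into
 *	};
 *
 *	static uint32_t
 *	demo_win_read(struct demo_win *w, uint32_t reg)
 *	{
 *		uint32_t v;
 *
 *		pthread_mutex_lock(&w->lock);	// select+read, atomically
 *		w->regsel = reg;
 *		v = w->regs[w->regsel];
 *		pthread_mutex_unlock(&w->lock);
 *		return (v);
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		static struct demo_win w = {
 *			.lock = PTHREAD_MUTEX_INITIALIZER,
 *		};
 *
 *		w.regs[1] = 0x00170011;	// pretend IOAPIC_VER contents
 *		return (demo_win_read(&w, 1) == 0x00170011 ? 0 : 1);
 *	}
 *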
*/ if (intpin->io_edgetrigger) low |= IOART_TRGREDG; else low |= IOART_TRGRLVL; if (intpin->io_activehi) low |= IOART_INTAHI; else low |= IOART_INTALO; if (intpin->io_masked) low |= IOART_INTMSET; switch (intpin->io_irq) { case IRQ_EXTINT: KASSERT(intpin->io_edgetrigger, ("ExtINT not edge triggered")); low |= IOART_DELEXINT; break; case IRQ_NMI: KASSERT(intpin->io_edgetrigger, ("NMI not edge triggered")); low |= IOART_DELNMI; break; case IRQ_SMI: KASSERT(intpin->io_edgetrigger, ("SMI not edge triggered")); low |= IOART_DELSMI; break; default: KASSERT(intpin->io_vector != 0, ("No vector for IRQ %u", intpin->io_irq)); low |= IOART_DELFIXED | intpin->io_vector; } /* Write the values to the APIC. */ intpin->io_lowreg = low; ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), low); value = ioapic_read(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin)); value &= ~IOART_DEST; value |= high; ioapic_write(io->io_addr, IOAPIC_REDTBL_HI(intpin->io_intpin), value); } static int ioapic_assign_cpu(struct intsrc *isrc, u_int apic_id) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; u_int old_vector, new_vector; u_int old_id; /* * keep 1st core as the destination for NMI */ if (intpin->io_irq == IRQ_NMI) apic_id = 0; /* * Set us up to free the old irq. */ old_vector = intpin->io_vector; old_id = intpin->io_cpu; if (old_vector && apic_id == old_id) return (0); /* * Allocate an APIC vector for this interrupt pin. Once * we have a vector we program the interrupt pin. */ new_vector = apic_alloc_vector(apic_id, intpin->io_irq); if (new_vector == 0) return (ENOSPC); /* * Mask the old intpin if it is enabled while it is migrated. * * At least some level-triggered interrupts seem to need the * extra DELAY() to avoid being stuck in a non-EOI'd state. */ mtx_lock_spin(&icu_lock); if (!intpin->io_masked && !intpin->io_edgetrigger) { ioapic_write(io->io_addr, IOAPIC_REDTBL_LO(intpin->io_intpin), intpin->io_lowreg | IOART_INTMSET); DELAY(100); } intpin->io_cpu = apic_id; intpin->io_vector = new_vector; if (isrc->is_handlers > 0) apic_enable_vector(intpin->io_cpu, intpin->io_vector); if (bootverbose) { printf("ioapic%u: routing intpin %u (", io->io_id, intpin->io_intpin); ioapic_print_irq(intpin); printf(") to lapic %u vector %u\n", intpin->io_cpu, intpin->io_vector); } ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); /* * Free the old vector after the new one is established. This is done * to prevent races where we could miss an interrupt. */ if (old_vector) { if (isrc->is_handlers > 0) apic_disable_vector(old_id, old_vector); apic_free_vector(old_id, old_vector, intpin->io_irq); } return (0); } static void ioapic_enable_intr(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; if (intpin->io_vector == 0) if (ioapic_assign_cpu(isrc, intr_next_cpu()) != 0) panic("Couldn't find an APIC vector for IRQ %d", intpin->io_irq); apic_enable_vector(intpin->io_cpu, intpin->io_vector); } static void ioapic_disable_intr(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; u_int vector; if (intpin->io_vector != 0) { /* Mask this interrupt pin and free its APIC vector. 
*/ vector = intpin->io_vector; apic_disable_vector(intpin->io_cpu, vector); mtx_lock_spin(&icu_lock); intpin->io_masked = 1; intpin->io_vector = 0; ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); apic_free_vector(intpin->io_cpu, vector, intpin->io_irq); } } static int ioapic_vector(struct intsrc *isrc) { struct ioapic_intsrc *pin; pin = (struct ioapic_intsrc *)isrc; return (pin->io_irq); } static int ioapic_source_pending(struct intsrc *isrc) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; if (intpin->io_vector == 0) return (0); return (lapic_intr_pending(intpin->io_vector)); } static int ioapic_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { struct ioapic_intsrc *intpin = (struct ioapic_intsrc *)isrc; struct ioapic *io = (struct ioapic *)isrc->is_pic; int changed; KASSERT(!(trig == INTR_TRIGGER_CONFORM || pol == INTR_POLARITY_CONFORM), ("%s: Conforming trigger or polarity\n", __func__)); /* * EISA interrupts always use active high polarity, so don't allow * them to be set to active low. * * XXX: Should we write to the ELCR if the trigger mode changes for * an EISA IRQ or an ISA IRQ with the ELCR present? */ mtx_lock_spin(&icu_lock); if (intpin->io_bus == APIC_BUS_EISA) pol = INTR_POLARITY_HIGH; changed = 0; if (intpin->io_edgetrigger != (trig == INTR_TRIGGER_EDGE)) { if (bootverbose) printf("ioapic%u: Changing trigger for pin %u to %s\n", io->io_id, intpin->io_intpin, trig == INTR_TRIGGER_EDGE ? "edge" : "level"); intpin->io_edgetrigger = (trig == INTR_TRIGGER_EDGE); changed++; } if (intpin->io_activehi != (pol == INTR_POLARITY_HIGH)) { if (bootverbose) printf("ioapic%u: Changing polarity for pin %u to %s\n", io->io_id, intpin->io_intpin, pol == INTR_POLARITY_HIGH ? "high" : "low"); intpin->io_activehi = (pol == INTR_POLARITY_HIGH); changed++; } if (changed) ioapic_program_intpin(intpin); mtx_unlock_spin(&icu_lock); return (0); } static void ioapic_resume(struct pic *pic) { struct ioapic *io = (struct ioapic *)pic; int i; mtx_lock_spin(&icu_lock); for (i = 0; i < io->io_numintr; i++) ioapic_program_intpin(&io->io_pins[i]); mtx_unlock_spin(&icu_lock); } /* * Create a plain I/O APIC object. */ void * ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase) { struct ioapic *io; struct ioapic_intsrc *intpin; volatile ioapic_t *apic; u_int numintr, i; uint32_t value; /* Map the register window so we can access the device. */ apic = pmap_mapdev(addr, IOAPIC_MEM_REGION); mtx_lock_spin(&icu_lock); value = ioapic_read(apic, IOAPIC_VER); mtx_unlock_spin(&icu_lock); /* If its version register doesn't seem to work, punt. */ if (value == 0xffffffff) { pmap_unmapdev((vm_offset_t)apic, IOAPIC_MEM_REGION); return (NULL); } /* Determine the number of vectors and set the APIC ID.
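 * As a worked example (register value assumed for illustration): a version
 * register reading of 0x00170011 carries a maximum redirection entry of
 * 0x17 in bits 16-23, so the computation below yields 0x17 + 1 = 24 intpins.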
*/ numintr = ((value & IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) + 1; io = malloc(sizeof(struct ioapic) + numintr * sizeof(struct ioapic_intsrc), M_IOAPIC, M_WAITOK); io->io_pic = ioapic_template; mtx_lock_spin(&icu_lock); io->io_id = next_id++; io->io_apic_id = ioapic_read(apic, IOAPIC_ID) >> APIC_ID_SHIFT; if (apic_id != -1 && io->io_apic_id != apic_id) { ioapic_write(apic, IOAPIC_ID, apic_id << APIC_ID_SHIFT); mtx_unlock_spin(&icu_lock); io->io_apic_id = apic_id; printf("ioapic%u: Changing APIC ID to %d\n", io->io_id, apic_id); } else mtx_unlock_spin(&icu_lock); if (intbase == -1) { intbase = next_ioapic_base; printf("ioapic%u: Assuming intbase of %d\n", io->io_id, intbase); } else if (intbase != next_ioapic_base && bootverbose) printf("ioapic%u: WARNING: intbase %d != expected base %d\n", io->io_id, intbase, next_ioapic_base); io->io_intbase = intbase; next_ioapic_base = intbase + numintr; io->io_numintr = numintr; io->io_addr = apic; io->io_paddr = addr; /* * Initialize pins. Start off with interrupts disabled. Default * to active-hi and edge-triggered for ISA interrupts and active-lo * and level-triggered for all others. */ bzero(io->io_pins, sizeof(struct ioapic_intsrc) * numintr); mtx_lock_spin(&icu_lock); for (i = 0, intpin = io->io_pins; i < numintr; i++, intpin++) { intpin->io_intsrc.is_pic = (struct pic *)io; intpin->io_intpin = i; intpin->io_irq = intbase + i; /* * Assume that pin 0 on the first I/O APIC is an ExtINT pin. * Assume that pins 1-15 are ISA interrupts and that all * other pins are PCI interrupts. */ if (intpin->io_irq == 0) ioapic_set_extint(io, i); else if (intpin->io_irq < IOAPIC_ISA_INTS) { intpin->io_bus = APIC_BUS_ISA; intpin->io_activehi = 1; intpin->io_edgetrigger = 1; intpin->io_masked = 1; } else { intpin->io_bus = APIC_BUS_PCI; intpin->io_activehi = 0; intpin->io_edgetrigger = 0; intpin->io_masked = 1; } /* * Route interrupts to the BSP by default. Interrupts may * be routed to other CPUs later after they are enabled. 
*/ intpin->io_cpu = PCPU_GET(apic_id); value = ioapic_read(apic, IOAPIC_REDTBL_LO(i)); ioapic_write(apic, IOAPIC_REDTBL_LO(i), value | IOART_INTMSET); } mtx_unlock_spin(&icu_lock); return (io); } int ioapic_get_vector(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (-1); return (io->io_pins[pin].io_irq); } int ioapic_disable_pin(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_DISABLED) return (EINVAL); io->io_pins[pin].io_irq = IRQ_DISABLED; if (bootverbose) printf("ioapic%u: intpin %d disabled\n", io->io_id, pin); return (0); } int ioapic_remap_vector(void *cookie, u_int pin, int vector) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || vector < 0) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_irq = vector; if (bootverbose) printf("ioapic%u: Routing IRQ %d -> intpin %d\n", io->io_id, vector, pin); return (0); } int ioapic_set_bus(void *cookie, u_int pin, int bus_type) { struct ioapic *io; if (bus_type < 0 || bus_type > APIC_BUS_MAX) return (EINVAL); io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); if (io->io_pins[pin].io_bus == bus_type) return (0); io->io_pins[pin].io_bus = bus_type; if (bootverbose) printf("ioapic%u: intpin %d bus %s\n", io->io_id, pin, ioapic_bus_string(bus_type)); return (0); } int ioapic_set_nmi(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_NMI) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_NMI; io->io_pins[pin].io_masked = 0; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing NMI -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_smi(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_SMI) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_SMI; io->io_pins[pin].io_masked = 0; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing SMI -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_extint(void *cookie, u_int pin) { struct ioapic *io; io = (struct ioapic *)cookie; if (pin >= io->io_numintr) return (EINVAL); if (io->io_pins[pin].io_irq == IRQ_EXTINT) return (0); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); io->io_pins[pin].io_bus = APIC_BUS_UNKNOWN; io->io_pins[pin].io_irq = IRQ_EXTINT; if (enable_extint) io->io_pins[pin].io_masked = 0; else io->io_pins[pin].io_masked = 1; io->io_pins[pin].io_edgetrigger = 1; io->io_pins[pin].io_activehi = 1; if (bootverbose) printf("ioapic%u: Routing external 8259A's -> intpin %d\n", io->io_id, pin); return (0); } int ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol) { struct ioapic *io; int activehi; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || pol == INTR_POLARITY_CONFORM) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); activehi = (pol == INTR_POLARITY_HIGH); if 
(io->io_pins[pin].io_activehi == activehi) return (0); io->io_pins[pin].io_activehi = activehi; if (bootverbose) printf("ioapic%u: intpin %d polarity: %s\n", io->io_id, pin, pol == INTR_POLARITY_HIGH ? "high" : "low"); return (0); } int ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger) { struct ioapic *io; int edgetrigger; io = (struct ioapic *)cookie; if (pin >= io->io_numintr || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); if (io->io_pins[pin].io_irq >= NUM_IO_INTS) return (EINVAL); edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (io->io_pins[pin].io_edgetrigger == edgetrigger) return (0); io->io_pins[pin].io_edgetrigger = edgetrigger; if (bootverbose) printf("ioapic%u: intpin %d trigger: %s\n", io->io_id, pin, trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); return (0); } /* * Register a complete I/O APIC object with the interrupt subsystem. */ void ioapic_register(void *cookie) { struct ioapic_intsrc *pin; struct ioapic *io; volatile ioapic_t *apic; uint32_t flags; int i; io = (struct ioapic *)cookie; apic = io->io_addr; mtx_lock_spin(&icu_lock); flags = ioapic_read(apic, IOAPIC_VER) & IOART_VER_VERSION; STAILQ_INSERT_TAIL(&ioapic_list, io, io_next); mtx_unlock_spin(&icu_lock); printf("ioapic%u <Version %u.%u> irqs %u-%u on motherboard\n", io->io_id, flags >> 4, flags & 0xf, io->io_intbase, io->io_intbase + io->io_numintr - 1); /* Register valid pins as interrupt sources. */ intr_register_pic(&io->io_pic); for (i = 0, pin = io->io_pins; i < io->io_numintr; i++, pin++) if (pin->io_irq < NUM_IO_INTS) intr_register_source(&pin->io_intsrc); } /* A simple new-bus driver to consume PCI I/O APIC devices. */ static int ioapic_pci_probe(device_t dev) { if (pci_get_class(dev) == PCIC_BASEPERIPH && pci_get_subclass(dev) == PCIS_BASEPERIPH_PIC) { switch (pci_get_progif(dev)) { case PCIP_BASEPERIPH_PIC_IO_APIC: device_set_desc(dev, "IO APIC"); break; case PCIP_BASEPERIPH_PIC_IOX_APIC: device_set_desc(dev, "IO(x) APIC"); break; default: return (ENXIO); } device_quiet(dev); return (-10000); } return (ENXIO); } static int ioapic_pci_attach(device_t dev) { return (0); } static device_method_t ioapic_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, ioapic_pci_probe), DEVMETHOD(device_attach, ioapic_pci_attach), { 0, 0 } }; DEFINE_CLASS_0(ioapic, ioapic_pci_driver, ioapic_pci_methods, 0); static devclass_t ioapic_devclass; DRIVER_MODULE(ioapic, pci, ioapic_pci_driver, ioapic_devclass, 0, 0); /* * A new-bus driver to consume the memory resources associated with * the APICs in the system. On some systems ACPI or PnPBIOS system * resource devices may already claim these resources. To keep from * breaking those devices, we attach ourselves to the nexus device after * legacy0 and acpi0 and ignore any allocation failures. */ static void apic_identify(driver_t *driver, device_t parent) { /* * Add at order 12. acpi0 is probed at order 10 and legacy0 * is probed at order 11. */ if (lapic_paddr != 0) BUS_ADD_CHILD(parent, 12, "apic", 0); } static int apic_probe(device_t dev) { device_set_desc(dev, "APIC resources"); device_quiet(dev); return (0); } static void apic_add_resource(device_t dev, int rid, vm_paddr_t base, size_t length) { int error; #ifdef PAE /* * Resource ranges are tracked with longs, so we can't * include memory regions above 4GB.
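 * Under PAE, vm_paddr_t is 64 bits wide while a long is still 32 bits,
 * so ~0ul (0xffffffff) is the largest base address the resource code
 * can represent; regions at or above 4GB are silently skipped below.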
*/ if (base >= ~0ul) return; #endif error = bus_set_resource(dev, SYS_RES_MEMORY, rid, base, length); if (error) panic("apic_add_resource: resource %d failed set with %d", rid, error); bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 0); } static int apic_attach(device_t dev) { struct ioapic *io; int i; /* Reserve the local APIC. */ apic_add_resource(dev, 0, lapic_paddr, sizeof(lapic_t)); i = 1; STAILQ_FOREACH(io, &ioapic_list, io_next) { apic_add_resource(dev, i, io->io_paddr, IOAPIC_MEM_REGION); i++; } return (0); } static device_method_t apic_methods[] = { /* Device interface */ DEVMETHOD(device_identify, apic_identify), DEVMETHOD(device_probe, apic_probe), DEVMETHOD(device_attach, apic_attach), { 0, 0 } }; DEFINE_CLASS_0(apic, apic_driver, apic_methods, 0); static devclass_t apic_devclass; DRIVER_MODULE(apic, nexus, apic_driver, apic_devclass, 0, 0); diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c index 76898f12b719..63a68433732e 100644 --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -1,1523 +1,1523 @@ /*- * Copyright (c) 2003 John Baldwin * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Local APIC support on Pentium and later processors. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_kdtrace.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #include #endif #ifdef __amd64__ #define SDT_APIC SDT_SYSIGT #define SDT_APICT SDT_SYSIGT #define GSEL_APIC 0 #else #define SDT_APIC SDT_SYS386IGT #define SDT_APICT SDT_SYS386TGT #define GSEL_APIC GSEL(GCODE_SEL, SEL_KPL) #endif /* Sanity checks on IDT vectors. */ CTASSERT(APIC_IO_INTS + APIC_NUM_IOINTS == APIC_TIMER_INT); CTASSERT(APIC_TIMER_INT < APIC_LOCAL_INTS); CTASSERT(APIC_LOCAL_INTS == 240); CTASSERT(IPI_STOP < APIC_SPURIOUS_INT); /* Magic IRQ values for the timer and syscalls. 
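 * These values sit above NUM_IO_INTS so that they can share the per-CPU
 * la_ioint_irqs[] vector-to-IRQ table with real interrupt sources without
 * colliding with any I/O APIC IRQ; lapic_create() below, for instance,
 * records la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] = IRQ_TIMER.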
*/ #define IRQ_TIMER (NUM_IO_INTS + 1) #define IRQ_SYSCALL (NUM_IO_INTS + 2) #define IRQ_DTRACE_RET (NUM_IO_INTS + 3) /* * Support for local APICs. Local APICs manage interrupts on each * individual processor as opposed to I/O APICs which receive interrupts * from I/O devices and then forward them on to the local APICs. * * Local APICs can also send interrupts to each other thus providing the * mechanism for IPIs. */ struct lvt { u_int lvt_edgetrigger:1; u_int lvt_activehi:1; u_int lvt_masked:1; u_int lvt_active:1; u_int lvt_mode:16; u_int lvt_vector:8; }; struct lapic { struct lvt la_lvts[LVT_MAX + 1]; u_int la_id:8; u_int la_cluster:4; u_int la_cluster_id:2; u_int la_present:1; u_long *la_timer_count; u_long la_timer_period; u_int la_timer_mode; /* Include IDT_SYSCALL to make indexing easier. */ int la_ioint_irqs[APIC_NUM_IOINTS + 1]; } static lapics[MAX_APIC_ID + 1]; /* Global defaults for local APIC LVT entries. */ static struct lvt lvts[LVT_MAX + 1] = { { 1, 1, 1, 1, APIC_LVT_DM_EXTINT, 0 }, /* LINT0: masked ExtINT */ { 1, 1, 0, 1, APIC_LVT_DM_NMI, 0 }, /* LINT1: NMI */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_TIMER_INT }, /* Timer */ { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */ }; static inthand_t *ioint_handlers[] = { NULL, /* 0 - 31 */ IDTVEC(apic_isr1), /* 32 - 63 */ IDTVEC(apic_isr2), /* 64 - 95 */ IDTVEC(apic_isr3), /* 96 - 127 */ IDTVEC(apic_isr4), /* 128 - 159 */ IDTVEC(apic_isr5), /* 160 - 191 */ IDTVEC(apic_isr6), /* 192 - 223 */ IDTVEC(apic_isr7), /* 224 - 255 */ }; static u_int32_t lapic_timer_divisors[] = { APIC_TDCR_1, APIC_TDCR_2, APIC_TDCR_4, APIC_TDCR_8, APIC_TDCR_16, APIC_TDCR_32, APIC_TDCR_64, APIC_TDCR_128 }; extern inthand_t IDTVEC(rsvd); volatile lapic_t *lapic; vm_paddr_t lapic_paddr; static u_long lapic_timer_divisor; static struct eventtimer lapic_et; static void lapic_enable(void); static void lapic_resume(struct pic *pic); static void lapic_timer_enable_intr(void); static void lapic_timer_oneshot(u_int count); static void lapic_timer_periodic(u_int count); static void lapic_timer_stop(void); static void lapic_timer_set_divisor(u_int divisor); static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value); static int lapic_et_start(struct eventtimer *et, struct bintime *first, struct bintime *period); static int lapic_et_stop(struct eventtimer *et); struct pic lapic_pic = { .pic_resume = lapic_resume }; static uint32_t lvt_mode(struct lapic *la, u_int pin, uint32_t value) { struct lvt *lvt; KASSERT(pin <= LVT_MAX, ("%s: pin %u out of range", __func__, pin)); if (la->la_lvts[pin].lvt_active) lvt = &la->la_lvts[pin]; else lvt = &lvts[pin]; value &= ~(APIC_LVT_M | APIC_LVT_TM | APIC_LVT_IIPP | APIC_LVT_DM | APIC_LVT_VECTOR); if (lvt->lvt_edgetrigger == 0) value |= APIC_LVT_TM; if (lvt->lvt_activehi == 0) value |= APIC_LVT_IIPP_INTALO; if (lvt->lvt_masked) value |= APIC_LVT_M; value |= lvt->lvt_mode; switch (lvt->lvt_mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: if (!lvt->lvt_edgetrigger) { printf("lapic%u: Forcing LINT%u to edge trigger\n", la->la_id, pin); value |= APIC_LVT_TM; } /* Use a vector of 0. */ break; case APIC_LVT_DM_FIXED: value |= lvt->lvt_vector; break; default: panic("bad APIC LVT delivery mode: %#x\n", value); } return (value); } /* * Map the local APIC and setup necessary interrupt vectors. 
*/ void lapic_init(vm_paddr_t addr) { u_int regs[4]; int i, arat; /* Map the local APIC and set up the spurious interrupt handler. */ KASSERT(trunc_page(addr) == addr, ("local APIC not aligned on a page boundary")); lapic = pmap_mapdev(addr, sizeof(lapic_t)); lapic_paddr = addr; setidt(APIC_SPURIOUS_INT, IDTVEC(spuriousint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Perform basic initialization of the BSP's local APIC. */ lapic_enable(); /* Set BSP's per-CPU local APIC ID. */ PCPU_SET(apic_id, lapic_id()); /* Local APIC timer interrupt. */ setidt(APIC_TIMER_INT, IDTVEC(timerint), SDT_APIC, SEL_KPL, GSEL_APIC); /* Local APIC error interrupt. */ setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC); /* XXX: Thermal interrupt */ /* Local APIC CMCI. */ setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC); if ((resource_int_value("apic", 0, "clock", &i) != 0 || i != 0)) { arat = 0; /* Intel CPUID 0x06 EAX[2] set if APIC timer runs in C3. */ if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_high >= 6) { do_cpuid(0x06, regs); if (regs[0] & 0x4) arat = 1; } bzero(&lapic_et, sizeof(lapic_et)); lapic_et.et_name = "LAPIC"; lapic_et.et_flags = ET_FLAGS_PERIODIC | ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; lapic_et.et_quality = 600; if (!arat) { lapic_et.et_flags |= ET_FLAGS_C3STOP; lapic_et.et_quality -= 200; } lapic_et.et_frequency = 0; /* We don't know the frequency yet; it is calibrated on first use. */ lapic_et.et_min_period.sec = 0; lapic_et.et_min_period.frac = 0x00001000LL << 32; lapic_et.et_max_period.sec = 1; lapic_et.et_max_period.frac = 0; lapic_et.et_start = lapic_et_start; lapic_et.et_stop = lapic_et_stop; lapic_et.et_priv = NULL; et_register(&lapic_et); } } /* * Create a local APIC instance. */ void lapic_create(u_int apic_id, int boot_cpu) { int i; if (apic_id > MAX_APIC_ID) { printf("APIC: Ignoring local APIC with ID %d\n", apic_id); if (boot_cpu) panic("Can't ignore BSP"); return; } KASSERT(!lapics[apic_id].la_present, ("duplicate local APIC %u", apic_id)); /* * Assume no local LVT overrides and a cluster of 0 and * intra-cluster ID of 0. */ lapics[apic_id].la_present = 1; lapics[apic_id].la_id = apic_id; for (i = 0; i <= LVT_MAX; i++) { lapics[apic_id].la_lvts[i] = lvts[i]; lapics[apic_id].la_lvts[i].lvt_active = 0; } for (i = 0; i <= APIC_NUM_IOINTS; i++) lapics[apic_id].la_ioint_irqs[i] = -1; lapics[apic_id].la_ioint_irqs[IDT_SYSCALL - APIC_IO_INTS] = IRQ_SYSCALL; lapics[apic_id].la_ioint_irqs[APIC_TIMER_INT - APIC_IO_INTS] = IRQ_TIMER; #ifdef KDTRACE_HOOKS lapics[apic_id].la_ioint_irqs[IDT_DTRACE_RET - APIC_IO_INTS] = IRQ_DTRACE_RET; #endif #ifdef SMP cpu_add(apic_id, boot_cpu); #endif } /* * Dump the contents of the local APIC registers. */ void lapic_dump(const char* str) { printf("cpu%d %s:\n", PCPU_GET(cpuid), str); printf(" ID: 0x%08x VER: 0x%08x LDR: 0x%08x DFR: 0x%08x\n", lapic->id, lapic->version, lapic->ldr, lapic->dfr); printf(" lint0: 0x%08x lint1: 0x%08x TPR: 0x%08x SVR: 0x%08x\n", lapic->lvt_lint0, lapic->lvt_lint1, lapic->tpr, lapic->svr); printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n", lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error, lapic->lvt_pcint); printf(" cmci: 0x%08x\n", lapic->lvt_cmci); } void lapic_setup(int boot) { struct lapic *la; u_int32_t maxlvt; register_t saveintr; char buf[MAXCOMLEN + 1]; la = &lapics[lapic_id()]; KASSERT(la->la_present, ("missing APIC structure")); saveintr = intr_disable(); maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; /* Initialize the TPR to allow all interrupts.
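 * The TPR holds off any vector whose priority class (vector >> 4) is at
 * or below the TPR's own class, so writing zero accepts everything; as a
 * made-up example, lapic_set_tpr(0x4f) would block vectors 0x00 to 0x4f.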
*/ lapic_set_tpr(0); /* Setup spurious vector and enable the local APIC. */ lapic_enable(); /* Program LINT[01] LVT entries. */ lapic->lvt_lint0 = lvt_mode(la, LVT_LINT0, lapic->lvt_lint0); lapic->lvt_lint1 = lvt_mode(la, LVT_LINT1, lapic->lvt_lint1); /* Program the PMC LVT entry if present. */ if (maxlvt >= LVT_PMC) lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); /* Program timer LVT and setup handler. */ lapic->lvt_timer = lvt_mode(la, LVT_TIMER, lapic->lvt_timer); if (boot) { snprintf(buf, sizeof(buf), "cpu%d:timer", PCPU_GET(cpuid)); intrcnt_add(buf, &la->la_timer_count); } /* Setup the timer if configured. */ if (la->la_timer_mode != 0) { KASSERT(la->la_timer_period != 0, ("lapic%u: zero divisor", lapic_id())); lapic_timer_stop(); lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_enable_intr(); if (la->la_timer_mode == 1) lapic_timer_periodic(la->la_timer_period); else lapic_timer_oneshot(la->la_timer_period); } /* Program error LVT and clear any existing errors. */ lapic->lvt_error = lvt_mode(la, LVT_ERROR, lapic->lvt_error); lapic->esr = 0; /* XXX: Thermal LVT */ /* Program the CMCI LVT entry if present. */ if (maxlvt >= LVT_CMCI) lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci); intr_restore(saveintr); } void lapic_reenable_pmc(void) { #ifdef HWPMC_HOOKS uint32_t value; value = lapic->lvt_pcint; value &= ~APIC_LVT_M; lapic->lvt_pcint = value; #endif } #ifdef HWPMC_HOOKS static void lapic_update_pmc(void *dummy) { struct lapic *la; la = &lapics[lapic_id()]; lapic->lvt_pcint = lvt_mode(la, LVT_PMC, lapic->lvt_pcint); } #endif int lapic_enable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; /* Fail if the local APIC is not present. */ if (lapic == NULL) return (0); /* Fail if the PMC LVT is not present. */ maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < LVT_PMC) return (0); lvts[LVT_PMC].lvt_masked = 0; #ifdef SMP /* * If hwpmc was loaded at boot time then the APs may not be * started yet. In that case, don't forward the request to * them as they will program the lvt when they start. */ if (smp_started) smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); else #endif lapic_update_pmc(NULL); return (1); #else return (0); #endif } void lapic_disable_pmc(void) { #ifdef HWPMC_HOOKS u_int32_t maxlvt; /* Fail if the local APIC is not present. */ if (lapic == NULL) return; /* Fail if the PMC LVT is not present. */ maxlvt = (lapic->version & APIC_VER_MAXLVT) >> MAXLVTSHIFT; if (maxlvt < LVT_PMC) return; lvts[LVT_PMC].lvt_masked = 1; #ifdef SMP /* The APs should always be started when hwpmc is unloaded. */ KASSERT(mp_ncpus == 1 || smp_started, ("hwpmc unloaded too early")); #endif smp_rendezvous(NULL, lapic_update_pmc, NULL, NULL); #endif } static int lapic_et_start(struct eventtimer *et, struct bintime *first, struct bintime *period) { struct lapic *la; u_long value; if (et->et_frequency == 0) { /* Start off with a divisor of 2 (power on reset default). */ lapic_timer_divisor = 2; /* Try to calibrate the local APIC timer. 
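 * The loop below lets the timer count down from APIC_TIMER_MAX_COUNT for
 * one second (DELAY(1000000)) and takes the ticks consumed as the timer
 * frequency at the current divisor; if the counter ran all the way down
 * within that second, the divisor is doubled and the measurement retried.
 * For instance, 100000000 ticks consumed in one second means a 100 MHz
 * timer clock at that divisor.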
*/ do { lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_oneshot(APIC_TIMER_MAX_COUNT); DELAY(1000000); value = APIC_TIMER_MAX_COUNT - lapic->ccr_timer; if (value != APIC_TIMER_MAX_COUNT) break; lapic_timer_divisor <<= 1; } while (lapic_timer_divisor <= 128); if (lapic_timer_divisor > 128) panic("lapic: Divisor too big"); if (bootverbose) printf("lapic: Divisor %lu, Frequency %lu Hz\n", lapic_timer_divisor, value); et->et_frequency = value; et->et_min_period.sec = 0; et->et_min_period.frac = ((0x00000002LLU << 32) / et->et_frequency) << 32; et->et_max_period.sec = 0xfffffffeLLU / et->et_frequency; et->et_max_period.frac = ((0xfffffffeLLU << 32) / et->et_frequency) << 32; } lapic_timer_stop(); lapic_timer_set_divisor(lapic_timer_divisor); lapic_timer_enable_intr(); la = &lapics[lapic_id()]; if (period != NULL) { la->la_timer_mode = 1; la->la_timer_period = (et->et_frequency * (period->frac >> 32)) >> 32; if (period->sec != 0) la->la_timer_period += et->et_frequency * period->sec; lapic_timer_periodic(la->la_timer_period); } else { la->la_timer_mode = 2; la->la_timer_period = (et->et_frequency * (first->frac >> 32)) >> 32; if (first->sec != 0) la->la_timer_period += et->et_frequency * first->sec; lapic_timer_oneshot(la->la_timer_period); } return (0); } static int lapic_et_stop(struct eventtimer *et) { struct lapic *la = &lapics[lapic_id()]; la->la_timer_mode = 0; lapic_timer_stop(); return (0); } void lapic_disable(void) { uint32_t value; /* Software-disable the local APIC. */ value = lapic->svr; value &= ~APIC_SVR_SWEN; lapic->svr = value; } static void lapic_enable(void) { u_int32_t value; /* Program the spurious vector to enable the local APIC. */ value = lapic->svr; value &= ~(APIC_SVR_VECTOR | APIC_SVR_FOCUS); value |= (APIC_SVR_FEN | APIC_SVR_SWEN | APIC_SPURIOUS_INT); lapic->svr = value; } /* Reset the local APIC on the BSP during resume. */ static void lapic_resume(struct pic *pic) { lapic_setup(0); } int lapic_id(void) { KASSERT(lapic != NULL, ("local APIC is not mapped")); return (lapic->id >> APIC_ID_SHIFT); } int lapic_intr_pending(u_int vector) { volatile u_int32_t *irr; /* * The IRR registers are an array of 128-bit registers each of * which only describes 32 interrupts in the low 32 bits. Thus, * we divide the vector by 32 to get the 128-bit index. We then * multiply that index by 4 to get the equivalent index from * treating the IRR as an array of 32-bit registers. Finally, we * take the vector modulo 32 to determine the individual bit to * test. */ irr = &lapic->irr0; return (irr[(vector / 32) * 4] & 1 << (vector % 32)); } void lapic_set_logical_id(u_int apic_id, u_int cluster, u_int cluster_id) { struct lapic *la; KASSERT(lapics[apic_id].la_present, ("%s: APIC %u doesn't exist", __func__, apic_id)); KASSERT(cluster <= APIC_MAX_CLUSTER, ("%s: cluster %u too big", __func__, cluster)); KASSERT(cluster_id <= APIC_MAX_INTRACLUSTER_ID, ("%s: intra cluster id %u too big", __func__, cluster_id)); la = &lapics[apic_id]; la->la_cluster = cluster; la->la_cluster_id = cluster_id; } int lapic_set_lvt_mask(u_int apic_id, u_int pin, u_char masked) { if (pin > LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_masked = masked; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_masked = masked; lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u %s\n", pin, masked ?
"masked" : "unmasked"); return (0); } int lapic_set_lvt_mode(u_int apic_id, u_int pin, u_int32_t mode) { struct lvt *lvt; if (pin > LVT_MAX) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvt = &lvts[pin]; if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lvt = &lapics[apic_id].la_lvts[pin]; lvt->lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } lvt->lvt_mode = mode; switch (mode) { case APIC_LVT_DM_NMI: case APIC_LVT_DM_SMI: case APIC_LVT_DM_INIT: case APIC_LVT_DM_EXTINT: lvt->lvt_edgetrigger = 1; lvt->lvt_activehi = 1; if (mode == APIC_LVT_DM_EXTINT) lvt->lvt_masked = 1; else lvt->lvt_masked = 0; break; default: panic("Unsupported delivery mode: 0x%x\n", mode); } if (bootverbose) { printf(" Routing "); switch (mode) { case APIC_LVT_DM_NMI: printf("NMI"); break; case APIC_LVT_DM_SMI: printf("SMI"); break; case APIC_LVT_DM_INIT: printf("INIT"); break; case APIC_LVT_DM_EXTINT: printf("ExtINT"); break; } printf(" -> LINT%u\n", pin); } return (0); } int lapic_set_lvt_polarity(u_int apic_id, u_int pin, enum intr_polarity pol) { if (pin > LVT_MAX || pol == INTR_POLARITY_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_active = 1; lapics[apic_id].la_lvts[pin].lvt_activehi = (pol == INTR_POLARITY_HIGH); if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u polarity: %s\n", pin, pol == INTR_POLARITY_HIGH ? "high" : "low"); return (0); } int lapic_set_lvt_triggermode(u_int apic_id, u_int pin, enum intr_trigger trigger) { if (pin > LVT_MAX || trigger == INTR_TRIGGER_CONFORM) return (EINVAL); if (apic_id == APIC_ID_ALL) { lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); if (bootverbose) printf("lapic:"); } else { KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[pin].lvt_edgetrigger = (trigger == INTR_TRIGGER_EDGE); lapics[apic_id].la_lvts[pin].lvt_active = 1; if (bootverbose) printf("lapic%u:", apic_id); } if (bootverbose) printf(" LINT%u trigger: %s\n", pin, trigger == INTR_TRIGGER_EDGE ? "edge" : "level"); return (0); } /* * Adjust the TPR of the current CPU so that it blocks all interrupts below * the passed in vector. */ void lapic_set_tpr(u_int vector) { #ifdef CHEAP_TPR lapic->tpr = vector; #else u_int32_t tpr; tpr = lapic->tpr & ~APIC_TPR_PRIO; tpr |= vector; lapic->tpr = tpr; #endif } void lapic_eoi(void) { lapic->eoi = 0; } void lapic_handle_intr(int vector, struct trapframe *frame) { struct intsrc *isrc; if (vector == -1) panic("Couldn't get vector from ISR!"); isrc = intr_lookup_source(apic_idt_to_irq(PCPU_GET(apic_id), vector)); intr_execute_handlers(isrc, frame); } void lapic_handle_timer(struct trapframe *frame) { struct lapic *la; struct trapframe *oldframe; struct thread *td; /* Send EOI first thing. */ lapic_eoi(); #if defined(SMP) && !defined(SCHED_ULE) /* * Don't do any accounting for the disabled HTT cores, since it * will provide misleading numbers for the userland. * * No locking is necessary here, since even if we loose the race * when hlt_cpus_mask changes it is not a big deal, really. * * Don't do that for ULE, since ULE doesn't consider hlt_cpus_mask * and unlike other schedulers it actually schedules threads to * those CPUs. 
*/ if ((hlt_cpus_mask & (1 << PCPU_GET(cpuid))) != 0) return; #endif /* Look up our local APIC structure for the tick counters. */ la = &lapics[PCPU_GET(apic_id)]; (*la->la_timer_count)++; critical_enter(); if (lapic_et.et_active) { td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = frame; lapic_et.et_event_cb(&lapic_et, lapic_et.et_arg); td->td_intr_frame = oldframe; td->td_intr_nesting_level--; } critical_exit(); } static void lapic_timer_set_divisor(u_int divisor) { KASSERT(powerof2(divisor), ("lapic: invalid divisor %u", divisor)); KASSERT(ffs(divisor) <= sizeof(lapic_timer_divisors) / sizeof(u_int32_t), ("lapic: invalid divisor %u", divisor)); lapic->dcr_timer = lapic_timer_divisors[ffs(divisor) - 1]; } static void lapic_timer_oneshot(u_int count) { u_int32_t value; value = lapic->lvt_timer; value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_ONE_SHOT; lapic->lvt_timer = value; lapic->icr_timer = count; } static void lapic_timer_periodic(u_int count) { u_int32_t value; value = lapic->lvt_timer; value &= ~APIC_LVTT_TM; value |= APIC_LVTT_TM_PERIODIC; lapic->lvt_timer = value; lapic->icr_timer = count; } static void lapic_timer_stop(void) { u_int32_t value; value = lapic->lvt_timer; value &= ~APIC_LVTT_TM; value |= APIC_LVT_M; lapic->lvt_timer = value; lapic->icr_timer = 0; } static void lapic_timer_enable_intr(void) { u_int32_t value; value = lapic->lvt_timer; value &= ~APIC_LVT_M; lapic->lvt_timer = value; } void lapic_handle_cmc(void) { lapic_eoi(); cmc_intr(); } /* * Called from the mca_init() to activate the CMC interrupt if this CPU is * responsible for monitoring any MC banks for CMC events. Since mca_init() * is called prior to lapic_setup() during boot, this just needs to unmask * this CPU's LVT_CMCI entry. */ void lapic_enable_cmc(void) { u_int apic_id; apic_id = PCPU_GET(apic_id); KASSERT(lapics[apic_id].la_present, ("%s: missing APIC %u", __func__, apic_id)); lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0; lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1; if (bootverbose) printf("lapic%u: CMCI unmasked\n", apic_id); } void lapic_handle_error(void) { u_int32_t esr; /* * Read the contents of the error status register. Write to * the register first before reading from it to force the APIC * to update its value to indicate any errors that have * occurred since the previous write to the register. */ lapic->esr = 0; esr = lapic->esr; printf("CPU%d: local APIC error 0x%x\n", PCPU_GET(cpuid), esr); lapic_eoi(); } u_int apic_cpuid(u_int apic_id) { #ifdef SMP return apic_cpuids[apic_id]; #else return 0; #endif } /* Request a free IDT vector to be used by the specified IRQ. */ u_int apic_alloc_vector(u_int apic_id, u_int irq) { u_int vector; KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq)); /* * Search for a free vector. Currently we just use a very simple * algorithm to find the first free vector. */ mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { if (lapics[apic_id].la_ioint_irqs[vector] != -1) continue; lapics[apic_id].la_ioint_irqs[vector] = irq; mtx_unlock_spin(&icu_lock); return (vector + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); return (0); } /* * Request 'count' free contiguous IDT vectors to be used by 'count' * IRQs. 'count' must be a power of two and the vectors will be * aligned on a boundary of 'align'. If the request cannot be * satisfied, 0 is returned. 
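 * As an illustration (sizes made up): a device using four MSI messages
 * would ask for count = 4 and align = 4, and the first-fit scan below
 * would return the start of the first free, four-aligned run of vectors,
 * or 0 if no such run is available.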
*/ u_int apic_alloc_vectors(u_int apic_id, u_int *irqs, u_int count, u_int align) { u_int first, run, vector; KASSERT(powerof2(count), ("bad count")); KASSERT(powerof2(align), ("bad align")); KASSERT(align >= count, ("align < count")); #ifdef INVARIANTS for (run = 0; run < count; run++) KASSERT(irqs[run] < NUM_IO_INTS, ("Invalid IRQ %u at index %u", irqs[run], run)); #endif /* * Search for 'count' free vectors. As with apic_alloc_vector(), * this just uses a simple first-fit algorithm. */ run = 0; first = 0; mtx_lock_spin(&icu_lock); for (vector = 0; vector < APIC_NUM_IOINTS; vector++) { /* Vector is in use, end run. */ if (lapics[apic_id].la_ioint_irqs[vector] != -1) { run = 0; first = 0; continue; } /* Start a new run if run == 0 and vector is aligned. */ if (run == 0) { if ((vector & (align - 1)) != 0) continue; first = vector; } run++; /* Keep looping if the run isn't long enough yet. */ if (run < count) continue; /* Found a run, assign IRQs and return the first vector. */ for (vector = 0; vector < count; vector++) lapics[apic_id].la_ioint_irqs[first + vector] = irqs[vector]; mtx_unlock_spin(&icu_lock); return (first + APIC_IO_INTS); } mtx_unlock_spin(&icu_lock); printf("APIC: Couldn't find APIC vectors for %u IRQs\n", count); return (0); } /* * Enable a vector for a particular apic_id. Since all local APICs share IDT * entries and ioint_handlers, this enables the vector on all of them. Local * APICs which do not have the vector configured would report spurious * interrupts should it fire. */ void apic_enable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif setidt(vector, ioint_handlers[vector / 32], SDT_APIC, SEL_KPL, GSEL_APIC); } void apic_disable_vector(u_int apic_id, u_int vector) { KASSERT(vector != IDT_SYSCALL, ("Attempt to overwrite syscall entry")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif KASSERT(ioint_handlers[vector / 32] != NULL, ("No ISR handler for vector %u", vector)); #ifdef notyet /* * We cannot currently clear the IDT entry because other CPUs * may have a valid vector at this offset. */ setidt(vector, &IDTVEC(rsvd), SDT_APICT, SEL_KPL, GSEL_APIC); #endif } /* Release an APIC vector when it's no longer in use. */ void apic_free_vector(u_int apic_id, u_int vector, u_int irq) { struct thread *td; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); KASSERT(irq < NUM_IO_INTS, ("Invalid IRQ %u", irq)); KASSERT(lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] == irq, ("IRQ mismatch")); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif /* * Bind us to the CPU that owned the vector before freeing it so * we don't lose an interrupt delivery race. */ td = curthread; if (!rebooting) { thread_lock(td); if (sched_is_bound(td)) panic("apic_free_vector: Thread already bound.\n"); sched_bind(td, apic_cpuid(apic_id)); thread_unlock(td); } mtx_lock_spin(&icu_lock); lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS] = -1; mtx_unlock_spin(&icu_lock); if (!rebooting) { thread_lock(td); sched_unbind(td); thread_unlock(td); } } /* Map an IDT vector (APIC) to an IRQ (interrupt source).
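 * This is the inverse of the allocation above: lapic_handle_intr() uses
 * it to translate the vector taken from the trap frame back into the IRQ
 * cookie of the interrupt source, i.e. la_ioint_irqs[vector - APIC_IO_INTS].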
*/ u_int apic_idt_to_irq(u_int apic_id, u_int vector) { int irq; KASSERT(vector >= APIC_IO_INTS && vector != IDT_SYSCALL && vector <= APIC_IO_INTS + APIC_NUM_IOINTS, ("Vector %u does not map to an IRQ line", vector)); #ifdef KDTRACE_HOOKS KASSERT(vector != IDT_DTRACE_RET, ("Attempt to overwrite DTrace entry")); #endif irq = lapics[apic_id].la_ioint_irqs[vector - APIC_IO_INTS]; if (irq < 0) irq = 0; return (irq); } #ifdef DDB /* * Dump data about APIC IDT vector mappings. */ DB_SHOW_COMMAND(apic, db_show_apic) { struct intsrc *isrc; int i, verbose; u_int apic_id; u_int irq; if (strcmp(modif, "vv") == 0) verbose = 2; else if (strcmp(modif, "v") == 0) verbose = 1; else verbose = 0; for (apic_id = 0; apic_id <= MAX_APIC_ID; apic_id++) { if (lapics[apic_id].la_present == 0) continue; db_printf("Interrupts bound to lapic %u\n", apic_id); for (i = 0; i < APIC_NUM_IOINTS + 1 && !db_pager_quit; i++) { irq = lapics[apic_id].la_ioint_irqs[i]; if (irq == -1 || irq == IRQ_SYSCALL) continue; #ifdef KDTRACE_HOOKS if (irq == IRQ_DTRACE_RET) continue; #endif db_printf("vec 0x%2x -> ", i + APIC_IO_INTS); if (irq == IRQ_TIMER) db_printf("lapic timer\n"); else if (irq < NUM_IO_INTS) { isrc = intr_lookup_source(irq); if (isrc == NULL || verbose == 0) db_printf("IRQ %u\n", irq); else db_dump_intr_event(isrc->is_event, verbose == 2); } else db_printf("IRQ %u ???\n", irq); } } } static void dump_mask(const char *prefix, uint32_t v, int base) { int i, first; first = 1; for (i = 0; i < 32; i++) if (v & (1 << i)) { if (first) { db_printf("%s:", prefix); first = 0; } db_printf(" %02x", base + i); } if (!first) db_printf("\n"); } /* Show info from the lapic regs for this CPU. */ DB_SHOW_COMMAND(lapic, db_show_lapic) { uint32_t v; db_printf("lapic ID = %d\n", lapic_id()); v = lapic->version; db_printf("version = %d.%d\n", (v & APIC_VER_VERSION) >> 4, v & 0xf); db_printf("max LVT = %d\n", (v & APIC_VER_MAXLVT) >> MAXLVTSHIFT); v = lapic->svr; db_printf("SVR = %02x (%s)\n", v & APIC_SVR_VECTOR, v & APIC_SVR_ENABLE ? "enabled" : "disabled"); db_printf("TPR = %02x\n", lapic->tpr); #define dump_field(prefix, index) \ dump_mask(__XSTRING(prefix ## index), lapic->prefix ## index, \ index * 32) db_printf("In-service Interrupts:\n"); dump_field(isr, 0); dump_field(isr, 1); dump_field(isr, 2); dump_field(isr, 3); dump_field(isr, 4); dump_field(isr, 5); dump_field(isr, 6); dump_field(isr, 7); db_printf("TMR Interrupts:\n"); dump_field(tmr, 0); dump_field(tmr, 1); dump_field(tmr, 2); dump_field(tmr, 3); dump_field(tmr, 4); dump_field(tmr, 5); dump_field(tmr, 6); dump_field(tmr, 7); db_printf("IRR Interrupts:\n"); dump_field(irr, 0); dump_field(irr, 1); dump_field(irr, 2); dump_field(irr, 3); dump_field(irr, 4); dump_field(irr, 5); dump_field(irr, 6); dump_field(irr, 7); #undef dump_field } #endif /* * APIC probing support code. This includes code to manage enumerators. */ static SLIST_HEAD(, apic_enumerator) enumerators = SLIST_HEAD_INITIALIZER(enumerators); static struct apic_enumerator *best_enum; void apic_register_enumerator(struct apic_enumerator *enumerator) { #ifdef INVARIANTS struct apic_enumerator *apic_enum; SLIST_FOREACH(apic_enum, &enumerators, apic_next) { if (apic_enum == enumerator) panic("%s: Duplicate register of %s", __func__, enumerator->apic_name); } #endif SLIST_INSERT_HEAD(&enumerators, enumerator, apic_next); } /* * We have to look for CPUs very, very early because certain subsystems * want to know how many CPUs we have extremely early on in the boot * process.
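 * Each enumerator's apic_probe() returns a negative quality value and the
 * value closest to zero wins; the MP Table enumerator returns -100, for
 * example, so a hypothetical ACPI enumerator returning -50 would be
 * preferred over it.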
*/ static void apic_init(void *dummy __unused) { struct apic_enumerator *enumerator; #ifndef __amd64__ uint64_t apic_base; #endif int retval, best; /* We only support built-in local APICs. */ if (!(cpu_feature & CPUID_APIC)) return; /* Don't probe if APIC mode is disabled. */ if (resource_disabled("apic", 0)) return; /* First, probe all the enumerators to find the best match. */ best_enum = NULL; best = 0; SLIST_FOREACH(enumerator, &enumerators, apic_next) { retval = enumerator->apic_probe(); if (retval > 0) continue; if (best_enum == NULL || best < retval) { best_enum = enumerator; best = retval; } } if (best_enum == NULL) { if (bootverbose) printf("APIC: Could not find any APICs.\n"); return; } if (bootverbose) printf("APIC: Using the %s enumerator.\n", best_enum->apic_name); #ifndef __amd64__ /* * To work around an erratum, we disable the local APIC on some * CPUs during early startup. We need to turn the local APIC back * on on such CPUs now. */ if (cpu == CPU_686 && cpu_vendor_id == CPU_VENDOR_INTEL && (cpu_id & 0xff0) == 0x610) { apic_base = rdmsr(MSR_APICBASE); apic_base |= APICBASE_ENABLED; wrmsr(MSR_APICBASE, apic_base); } #endif /* Second, probe the CPUs in the system. */ retval = best_enum->apic_probe_cpus(); if (retval != 0) printf("%s: Failed to probe CPUs: returned %d\n", best_enum->apic_name, retval); #ifdef __amd64__ } SYSINIT(apic_init, SI_SUB_TUNABLES - 1, SI_ORDER_SECOND, apic_init, NULL); /* * Set up the local APIC. We have to do this prior to starting up the APs * in the SMP case. */ static void apic_setup_local(void *dummy __unused) { int retval; if (best_enum == NULL) return; #endif /* Third, initialize the local APIC. */ retval = best_enum->apic_setup_local(); if (retval != 0) printf("%s: Failed to setup the local APIC: returned %d\n", best_enum->apic_name, retval); } #ifdef __amd64__ SYSINIT(apic_setup_local, SI_SUB_CPU, SI_ORDER_SECOND, apic_setup_local, NULL); #else SYSINIT(apic_init, SI_SUB_CPU, SI_ORDER_SECOND, apic_init, NULL); #endif /* * Set up the I/O APICs. */ static void apic_setup_io(void *dummy __unused) { int retval; if (best_enum == NULL) return; retval = best_enum->apic_setup_io(); if (retval != 0) printf("%s: Failed to setup I/O APICs: returned %d\n", best_enum->apic_name, retval); #ifdef XEN return; #endif /* * Finish setting up the local APIC on the BSP once we know how to * properly program the LINT pins. */ lapic_setup(1); intr_register_pic(&lapic_pic); if (bootverbose) lapic_dump("BSP"); /* Enable the MSI "pic". */ msi_init(); } SYSINIT(apic_setup_io, SI_SUB_INTR, SI_ORDER_SECOND, apic_setup_io, NULL); #ifdef SMP /* * Inter-processor interrupt functions. The lapic_ipi_*() functions are * private to the MD code. The public interface for the rest of the * kernel is defined in mp_machdep.c. */ int lapic_ipi_wait(int delay) { int x, incr; /* * Wait delay loops for IPI to be sent. This is highly bogus, * since it is sensitive to CPU clock speed. If delay is * -1, we wait forever. */ if (delay == -1) { incr = 0; delay = 1; } else incr = 1; for (x = 0; x < delay; x += incr) { if ((lapic->icr_lo & APIC_DELSTAT_MASK) == APIC_DELSTAT_IDLE) return (1); ia32_pause(); } return (0); } void lapic_ipi_raw(register_t icrlo, u_int dest) { register_t value, saveintr; /* XXX: Need more sanity checking of icrlo?
*/ KASSERT(lapic != NULL, ("%s called too early", __func__)); KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid dest field", __func__)); KASSERT((icrlo & APIC_ICRLO_RESV_MASK) == 0, ("%s: reserved bits set in ICR LO register", __func__)); /* Set destination in ICR HI register if it is being used. */ saveintr = intr_disable(); if ((icrlo & APIC_DEST_MASK) == APIC_DEST_DESTFLD) { value = lapic->icr_hi; value &= ~APIC_ID_MASK; value |= dest << APIC_ID_SHIFT; lapic->icr_hi = value; } /* Program the contents of the IPI and dispatch it. */ value = lapic->icr_lo; value &= APIC_ICRLO_RESV_MASK; value |= icrlo; lapic->icr_lo = value; intr_restore(saveintr); } #define BEFORE_SPIN 1000000 #ifdef DETECT_DEADLOCK #define AFTER_SPIN 1000 #endif void lapic_ipi_vectored(u_int vector, int dest) { register_t icrlo, destfield; KASSERT((vector & ~APIC_VECTOR_MASK) == 0, ("%s: invalid vector %d", __func__, vector)); icrlo = APIC_DESTMODE_PHY | APIC_TRIGMOD_EDGE; /* * IPI_STOP_HARD is just a "fake" vector used to send an NMI. * Use the special NMI delivery rules if it is passed; otherwise * specify the vector. */ if (vector == IPI_STOP_HARD) icrlo |= APIC_DELMODE_NMI | APIC_LEVEL_ASSERT; else icrlo |= vector | APIC_DELMODE_FIXED | APIC_LEVEL_DEASSERT; destfield = 0; switch (dest) { case APIC_IPI_DEST_SELF: icrlo |= APIC_DEST_SELF; break; case APIC_IPI_DEST_ALL: icrlo |= APIC_DEST_ALLISELF; break; case APIC_IPI_DEST_OTHERS: icrlo |= APIC_DEST_ALLESELF; break; default: KASSERT((dest & ~(APIC_ID_MASK >> APIC_ID_SHIFT)) == 0, ("%s: invalid destination 0x%x", __func__, dest)); destfield = dest; } /* Wait for an earlier IPI to finish. */ if (!lapic_ipi_wait(BEFORE_SPIN)) { if (panicstr != NULL) return; else panic("APIC: Previous IPI is stuck"); } lapic_ipi_raw(icrlo, destfield); #ifdef DETECT_DEADLOCK /* Wait for IPI to be delivered. */ if (!lapic_ipi_wait(AFTER_SPIN)) { #ifdef needsattention /* * XXX FIXME: * * The above function waits for the message to actually be * delivered. It breaks out after an arbitrary timeout * since the message should eventually be delivered (at * least in theory) and, if it wasn't, we would catch * the failure with the check above when the next IPI is * sent. * * We could skip this wait entirely, EXCEPT it probably * protects us from other routines that assume that the * message was delivered and acted upon when this function * returns. */ printf("APIC: IPI might be stuck\n"); #else /* !needsattention */ /* Wait until the message is sent, without a timeout. */ while (lapic->icr_lo & APIC_DELSTAT_PEND) ia32_pause(); #endif /* needsattention */ } #endif /* DETECT_DEADLOCK */ } #endif /* SMP */ diff --git a/sys/x86/x86/mptable.c b/sys/x86/x86/mptable.c index cc619a04655b..cad83a2315c1 100644 --- a/sys/x86/x86/mptable.c +++ b/sys/x86/x86/mptable.c @@ -1,1055 +1,1055 @@ /*- * Copyright (c) 2003 John Baldwin * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mptable_force_htt.h" #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include /* string defined by the Intel MP Spec as identifying the MP table */ #define MP_SIG 0x5f504d5f /* _MP_ */ #ifdef __amd64__ #define MAX_LAPIC_ID 63 /* Max local APIC ID for HTT fixup */ #else #define MAX_LAPIC_ID 31 /* Max local APIC ID for HTT fixup */ #endif #ifdef PC98 #define BIOS_BASE (0xe8000) #define BIOS_SIZE (0x18000) #else #define BIOS_BASE (0xf0000) #define BIOS_SIZE (0x10000) #endif #define BIOS_COUNT (BIOS_SIZE/4) typedef void mptable_entry_handler(u_char *entry, void *arg); static basetable_entry basetable_entry_types[] = { {0, 20, "Processor"}, {1, 8, "Bus"}, {2, 8, "I/O APIC"}, {3, 8, "I/O INT"}, {4, 8, "Local INT"} }; typedef struct BUSDATA { u_char bus_id; enum busTypes bus_type; } bus_datum; typedef struct INTDATA { u_char int_type; u_short int_flags; u_char src_bus_id; u_char src_bus_irq; u_char dst_apic_id; u_char dst_apic_int; u_char int_vector; } io_int, local_int; typedef struct BUSTYPENAME { u_char type; char name[7]; } bus_type_name; /* From MP spec v1.4, table 4-8. */ static bus_type_name bus_type_table[] = { {UNKNOWN_BUSTYPE, "CBUS "}, {UNKNOWN_BUSTYPE, "CBUSII"}, {EISA, "EISA "}, {UNKNOWN_BUSTYPE, "FUTURE"}, {UNKNOWN_BUSTYPE, "INTERN"}, {ISA, "ISA "}, {UNKNOWN_BUSTYPE, "MBI "}, {UNKNOWN_BUSTYPE, "MBII "}, {MCA, "MCA "}, {UNKNOWN_BUSTYPE, "MPI "}, {UNKNOWN_BUSTYPE, "MPSA "}, {UNKNOWN_BUSTYPE, "NUBUS "}, {PCI, "PCI "}, {UNKNOWN_BUSTYPE, "PCMCIA"}, {UNKNOWN_BUSTYPE, "TC "}, {UNKNOWN_BUSTYPE, "VL "}, {UNKNOWN_BUSTYPE, "VME "}, {UNKNOWN_BUSTYPE, "XPRESS"} }; /* From MP spec v1.4, table 5-1. */ static int default_data[7][5] = { /* nbus, id0, type0, id1, type1 */ {1, 0, ISA, 255, NOBUS}, {1, 0, EISA, 255, NOBUS}, {1, 0, EISA, 255, NOBUS}, {1, 0, MCA, 255, NOBUS}, {2, 0, ISA, 1, PCI}, {2, 0, EISA, 1, PCI}, {2, 0, MCA, 1, PCI} }; struct pci_probe_table_args { u_char bus; u_char found; }; struct pci_route_interrupt_args { u_char bus; /* Source bus. */ u_char irq; /* Source slot:pin. */ int vector; /* Return value. 
*/ }; static mpfps_t mpfps; static mpcth_t mpct; static void *ioapics[MAX_APIC_ID + 1]; static bus_datum *busses; static int mptable_nioapics, mptable_nbusses, mptable_maxbusid; static int pci0 = -1; static MALLOC_DEFINE(M_MPTABLE, "mptable", "MP Table Items"); static enum intr_polarity conforming_polarity(u_char src_bus, u_char src_bus_irq); static enum intr_trigger conforming_trigger(u_char src_bus, u_char src_bus_irq); static enum intr_polarity intentry_polarity(int_entry_ptr intr); static enum intr_trigger intentry_trigger(int_entry_ptr intr); static int lookup_bus_type(char *name); static void mptable_count_items(void); static void mptable_count_items_handler(u_char *entry, void *arg); #ifdef MPTABLE_FORCE_HTT static void mptable_hyperthread_fixup(u_int id_mask); #endif static void mptable_parse_apics_and_busses(void); static void mptable_parse_apics_and_busses_handler(u_char *entry, void *arg); static void mptable_parse_default_config_ints(void); static void mptable_parse_ints(void); static void mptable_parse_ints_handler(u_char *entry, void *arg); static void mptable_parse_io_int(int_entry_ptr intr); static void mptable_parse_local_int(int_entry_ptr intr); static void mptable_pci_probe_table_handler(u_char *entry, void *arg); static void mptable_pci_route_interrupt_handler(u_char *entry, void *arg); static void mptable_pci_setup(void); static int mptable_probe(void); static int mptable_probe_cpus(void); static void mptable_probe_cpus_handler(u_char *entry, void *arg __unused); static void mptable_register(void *dummy); static int mptable_setup_local(void); static int mptable_setup_io(void); static void mptable_walk_table(mptable_entry_handler *handler, void *arg); static int search_for_sig(u_int32_t target, int count); static struct apic_enumerator mptable_enumerator = { "MPTable", mptable_probe, mptable_probe_cpus, mptable_setup_local, mptable_setup_io }; /* * look for the MP spec signature */ static int search_for_sig(u_int32_t target, int count) { int x; u_int32_t *addr = (u_int32_t *) (KERNBASE + target); for (x = 0; x < count; x += 4) if (addr[x] == MP_SIG) /* make array index a byte index */ return (target + (x * sizeof(u_int32_t))); return (-1); } static int lookup_bus_type(char *name) { int x; for (x = 0; x < MAX_BUSTYPE; ++x) if (strncmp(bus_type_table[x].name, name, 6) == 0) return (bus_type_table[x].type); return (UNKNOWN_BUSTYPE); } /* * Look for an Intel MP spec table (ie, SMP capable hardware). */ static int mptable_probe(void) { int x; u_long segment; u_int32_t target; /* see if EBDA exists */ if ((segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) != 0) { /* search first 1K of EBDA */ target = (u_int32_t) (segment << 4); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } else { /* last 1K of base memory, effective 'top of base' passed in */ target = (u_int32_t) ((basemem * 1024) - 0x400); if ((x = search_for_sig(target, 1024 / 4)) >= 0) goto found; } /* search the BIOS */ target = (u_int32_t) BIOS_BASE; if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) goto found; /* nothing found */ return (ENXIO); found: mpfps = (mpfps_t)(KERNBASE + x); /* Map in the configuration table if it exists. 
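 * The table is reached through the kernel's mapping of low physical
 * memory at KERNBASE, which is why anything at or above 1MB is rejected
 * below; as a made-up example, a pap of 0x9fc00 would be read through
 * the pointer KERNBASE + 0x9fc00.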
*/ if (mpfps->config_type != 0) { if (bootverbose) printf( "MP Table version 1.%d found using Default Configuration %d\n", mpfps->spec_rev, mpfps->config_type); if (mpfps->config_type != 5 && mpfps->config_type != 6) { printf( "MP Table Default Configuration %d is unsupported\n", mpfps->config_type); return (ENXIO); } mpct = NULL; } else { if ((uintptr_t)mpfps->pap >= 1024 * 1024) { printf("%s: Unable to map MP Configuration Table\n", __func__); return (ENXIO); } mpct = (mpcth_t)(KERNBASE + (uintptr_t)mpfps->pap); if (mpct->base_table_length + (uintptr_t)mpfps->pap >= 1024 * 1024) { printf("%s: Unable to map end of MP Config Table\n", __func__); return (ENXIO); } if (mpct->signature[0] != 'P' || mpct->signature[1] != 'C' || mpct->signature[2] != 'M' || mpct->signature[3] != 'P') { printf("%s: MP Config Table has bad signature: %c%c%c%c\n", __func__, mpct->signature[0], mpct->signature[1], mpct->signature[2], mpct->signature[3]); return (ENXIO); } if (bootverbose) printf( "MP Configuration Table version 1.%d found at %p\n", mpct->spec_rev, mpct); } return (-100); } /* * Run through the MP table enumerating CPUs. */ static int mptable_probe_cpus(void) { u_int cpu_mask; /* Is this a pre-defined config? */ if (mpfps->config_type != 0) { lapic_create(0, 1); lapic_create(1, 0); } else { cpu_mask = 0; mptable_walk_table(mptable_probe_cpus_handler, &cpu_mask); #ifdef MPTABLE_FORCE_HTT mptable_hyperthread_fixup(cpu_mask); #endif } return (0); } /* * Initialize the local APIC on the BSP. */ static int mptable_setup_local(void) { vm_paddr_t addr; /* Is this a pre-defined config? */ printf("MPTable: <"); if (mpfps->config_type != 0) { addr = DEFAULT_APIC_BASE; printf("Default Configuration %d", mpfps->config_type); } else { addr = mpct->apic_address; printf("%.*s %.*s", (int)sizeof(mpct->oem_id), mpct->oem_id, (int)sizeof(mpct->product_id), mpct->product_id); } printf(">\n"); lapic_init(addr); return (0); } /* * Run through the MP table enumerating I/O APICs. */ static int mptable_setup_io(void) { int i; u_char byte; /* First, we count individual items and allocate arrays. */ mptable_count_items(); busses = malloc((mptable_maxbusid + 1) * sizeof(bus_datum), M_MPTABLE, M_WAITOK); for (i = 0; i <= mptable_maxbusid; i++) busses[i].bus_type = NOBUS; /* Second, we run through adding I/O APIC's and busses. */ mptable_parse_apics_and_busses(); /* Third, we run through the table tweaking interrupt sources. */ mptable_parse_ints(); /* Fourth, we register all the I/O APIC's. */ for (i = 0; i <= MAX_APIC_ID; i++) if (ioapics[i] != NULL) ioapic_register(ioapics[i]); /* Fifth, we setup data structures to handle PCI interrupt routing. */ mptable_pci_setup(); /* Finally, we throw the switch to enable the I/O APIC's. */ if (mpfps->mpfb2 & MPFB2_IMCR_PRESENT) { outb(0x22, 0x70); /* select IMCR */ byte = inb(0x23); /* current contents */ byte |= 0x01; /* mask external INTR */ outb(0x23, byte); /* disconnect 8259s/NMI */ } return (0); } static void mptable_register(void *dummy __unused) { apic_register_enumerator(&mptable_enumerator); } SYSINIT(mptable_register, SI_SUB_CPU - 1, SI_ORDER_FIRST, mptable_register, NULL); /* * Call the handler routine for each entry in the MP config table. 
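 * Each handler is called with a pointer to the raw entry and the
 * opaque argument given here.  For illustration only, a hypothetical
 * handler that counts enabled processor entries (a sketch, not part
 * of this file) would plug in as:
 *
 *	static void
 *	count_cpus_handler(u_char *entry, void *arg)
 *	{
 *		proc_entry_ptr proc;
 *
 *		if (*entry != MPCT_ENTRY_PROCESSOR)
 *			return;
 *		proc = (proc_entry_ptr)entry;
 *		if (proc->cpu_flags & PROCENTRY_FLAG_EN)
 *			(*(int *)arg)++;
 *	}
 *
 *	int ncpus = 0;
 *	mptable_walk_table(count_cpus_handler, &ncpus);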
*/ static void mptable_walk_table(mptable_entry_handler *handler, void *arg) { u_int i; u_char *entry; entry = (u_char *)(mpct + 1); for (i = 0; i < mpct->entry_count; i++) { switch (*entry) { case MPCT_ENTRY_PROCESSOR: case MPCT_ENTRY_IOAPIC: case MPCT_ENTRY_BUS: case MPCT_ENTRY_INT: case MPCT_ENTRY_LOCAL_INT: break; default: panic("%s: Unknown MP Config Entry %d\n", __func__, (int)*entry); } handler(entry, arg); entry += basetable_entry_types[*entry].length; } } static void mptable_probe_cpus_handler(u_char *entry, void *arg) { proc_entry_ptr proc; u_int *cpu_mask; switch (*entry) { case MPCT_ENTRY_PROCESSOR: proc = (proc_entry_ptr)entry; if (proc->cpu_flags & PROCENTRY_FLAG_EN) { lapic_create(proc->apic_id, proc->cpu_flags & PROCENTRY_FLAG_BP); if (proc->apic_id < MAX_LAPIC_ID) { cpu_mask = (u_int *)arg; *cpu_mask |= (1ul << proc->apic_id); } } break; } } static void mptable_count_items_handler(u_char *entry, void *arg __unused) { io_apic_entry_ptr apic; bus_entry_ptr bus; switch (*entry) { case MPCT_ENTRY_BUS: bus = (bus_entry_ptr)entry; mptable_nbusses++; if (bus->bus_id > mptable_maxbusid) mptable_maxbusid = bus->bus_id; break; case MPCT_ENTRY_IOAPIC: apic = (io_apic_entry_ptr)entry; if (apic->apic_flags & IOAPICENTRY_FLAG_EN) mptable_nioapics++; break; } } /* * Count items in the table. */ static void mptable_count_items(void) { /* Is this a pre-defined config? */ if (mpfps->config_type != 0) { mptable_nioapics = 1; switch (mpfps->config_type) { case 1: case 2: case 3: case 4: mptable_nbusses = 1; break; case 5: case 6: case 7: mptable_nbusses = 2; break; default: panic("Unknown pre-defined MP Table config type %d", mpfps->config_type); } mptable_maxbusid = mptable_nbusses - 1; } else mptable_walk_table(mptable_count_items_handler, NULL); } /* * Add a bus or I/O APIC from an entry in the table. */ static void mptable_parse_apics_and_busses_handler(u_char *entry, void *arg __unused) { io_apic_entry_ptr apic; bus_entry_ptr bus; enum busTypes bus_type; int i; switch (*entry) { case MPCT_ENTRY_BUS: bus = (bus_entry_ptr)entry; bus_type = lookup_bus_type(bus->bus_type); if (bus_type == UNKNOWN_BUSTYPE) { printf("MPTable: Unknown bus %d type \"", bus->bus_id); for (i = 0; i < 6; i++) printf("%c", bus->bus_type[i]); printf("\"\n"); } busses[bus->bus_id].bus_id = bus->bus_id; busses[bus->bus_id].bus_type = bus_type; break; case MPCT_ENTRY_IOAPIC: apic = (io_apic_entry_ptr)entry; if (!(apic->apic_flags & IOAPICENTRY_FLAG_EN)) break; if (apic->apic_id > MAX_APIC_ID) panic("%s: I/O APIC ID %d too high", __func__, apic->apic_id); if (ioapics[apic->apic_id] != NULL) panic("%s: Double APIC ID %d", __func__, apic->apic_id); ioapics[apic->apic_id] = ioapic_create(apic->apic_address, apic->apic_id, -1); break; default: break; } } /* * Enumerate I/O APIC's and busses. */ static void mptable_parse_apics_and_busses(void) { /* Is this a pre-defined config? */ if (mpfps->config_type != 0) { ioapics[2] = ioapic_create(DEFAULT_IO_APIC_BASE, 2, 0); busses[0].bus_id = 0; busses[0].bus_type = default_data[mpfps->config_type - 1][2]; if (mptable_nbusses > 1) { busses[1].bus_id = 1; busses[1].bus_type = default_data[mpfps->config_type - 1][4]; } } else mptable_walk_table(mptable_parse_apics_and_busses_handler, NULL); } /* * Determine conforming polarity for a given bus type. 
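 * "Conforming" means the entry inherits the polarity that is
 * standard for its source bus: ISA and EISA interrupts are active
 * high, PCI interrupts are active low.  For example, a conforming
 * interrupt entry whose source bus is PCI resolves to
 * INTR_POLARITY_LOW below.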
*/ static enum intr_polarity conforming_polarity(u_char src_bus, u_char src_bus_irq) { KASSERT(src_bus <= mptable_maxbusid, ("bus id %d too large", src_bus)); switch (busses[src_bus].bus_type) { case ISA: case EISA: return (INTR_POLARITY_HIGH); case PCI: return (INTR_POLARITY_LOW); default: panic("%s: unknown bus type %d", __func__, busses[src_bus].bus_type); } } /* * Determine conforming trigger for a given bus type. */ static enum intr_trigger conforming_trigger(u_char src_bus, u_char src_bus_irq) { KASSERT(src_bus <= mptable_maxbusid, ("bus id %d too large", src_bus)); switch (busses[src_bus].bus_type) { case ISA: #ifndef PC98 if (elcr_found) return (elcr_read_trigger(src_bus_irq)); else #endif return (INTR_TRIGGER_EDGE); case PCI: return (INTR_TRIGGER_LEVEL); #ifndef PC98 case EISA: KASSERT(src_bus_irq < 16, ("Invalid EISA IRQ %d", src_bus_irq)); KASSERT(elcr_found, ("Missing ELCR")); return (elcr_read_trigger(src_bus_irq)); #endif default: panic("%s: unknown bus type %d", __func__, busses[src_bus].bus_type); } } static enum intr_polarity intentry_polarity(int_entry_ptr intr) { switch (intr->int_flags & INTENTRY_FLAGS_POLARITY) { case INTENTRY_FLAGS_POLARITY_CONFORM: return (conforming_polarity(intr->src_bus_id, intr->src_bus_irq)); case INTENTRY_FLAGS_POLARITY_ACTIVEHI: return (INTR_POLARITY_HIGH); case INTENTRY_FLAGS_POLARITY_ACTIVELO: return (INTR_POLARITY_LOW); default: panic("Bogus interrupt flags"); } } static enum intr_trigger intentry_trigger(int_entry_ptr intr) { switch (intr->int_flags & INTENTRY_FLAGS_TRIGGER) { case INTENTRY_FLAGS_TRIGGER_CONFORM: return (conforming_trigger(intr->src_bus_id, intr->src_bus_irq)); case INTENTRY_FLAGS_TRIGGER_EDGE: return (INTR_TRIGGER_EDGE); case INTENTRY_FLAGS_TRIGGER_LEVEL: return (INTR_TRIGGER_LEVEL); default: panic("Bogus interrupt flags"); } } /* * Parse an interrupt entry for an I/O interrupt routed to a pin on an I/O APIC. */ static void mptable_parse_io_int(int_entry_ptr intr) { void *ioapic; u_int pin, apic_id; apic_id = intr->dst_apic_id; if (intr->dst_apic_id == 0xff) { /* * An APIC ID of 0xff means that the interrupt is connected * to the specified pin on all I/O APICs in the system. If * there is only one I/O APIC, then use that APIC to route * the interrupts. If there is more than one I/O APIC, then * punt. 
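 * For example (an illustration), on a board whose only enabled I/O
 * APIC has APIC ID 2, an entry for pin 5 with dst_apic_id 0xff is
 * routed through ioapics[2]; on a board with two or more I/O APICs
 * the same entry is skipped with a console message.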
*/ if (mptable_nioapics == 1) { apic_id = 0; while (ioapics[apic_id] == NULL) apic_id++; } else { printf( "MPTable: Ignoring global interrupt entry for pin %d\n", intr->dst_apic_int); return; } } if (apic_id > MAX_APIC_ID) { printf("MPTable: Ignoring interrupt entry for ioapic%d\n", intr->dst_apic_id); return; } ioapic = ioapics[apic_id]; if (ioapic == NULL) { printf( "MPTable: Ignoring interrupt entry for missing ioapic%d\n", apic_id); return; } pin = intr->dst_apic_int; switch (intr->int_type) { case INTENTRY_TYPE_INT: switch (busses[intr->src_bus_id].bus_type) { case NOBUS: panic("interrupt from missing bus"); case ISA: case EISA: if (busses[intr->src_bus_id].bus_type == ISA) ioapic_set_bus(ioapic, pin, APIC_BUS_ISA); else ioapic_set_bus(ioapic, pin, APIC_BUS_EISA); if (intr->src_bus_irq == pin) break; ioapic_remap_vector(ioapic, pin, intr->src_bus_irq); if (ioapic_get_vector(ioapic, intr->src_bus_irq) == intr->src_bus_irq) ioapic_disable_pin(ioapic, intr->src_bus_irq); break; case PCI: ioapic_set_bus(ioapic, pin, APIC_BUS_PCI); break; default: ioapic_set_bus(ioapic, pin, APIC_BUS_UNKNOWN); break; } break; case INTENTRY_TYPE_NMI: ioapic_set_nmi(ioapic, pin); break; case INTENTRY_TYPE_SMI: ioapic_set_smi(ioapic, pin); break; case INTENTRY_TYPE_EXTINT: ioapic_set_extint(ioapic, pin); break; default: panic("%s: invalid interrupt entry type %d\n", __func__, intr->int_type); } if (intr->int_type == INTENTRY_TYPE_INT || (intr->int_flags & INTENTRY_FLAGS_TRIGGER) != INTENTRY_FLAGS_TRIGGER_CONFORM) ioapic_set_triggermode(ioapic, pin, intentry_trigger(intr)); if (intr->int_type == INTENTRY_TYPE_INT || (intr->int_flags & INTENTRY_FLAGS_POLARITY) != INTENTRY_FLAGS_POLARITY_CONFORM) ioapic_set_polarity(ioapic, pin, intentry_polarity(intr)); } /* * Parse an interrupt entry for a local APIC LVT pin. */ static void mptable_parse_local_int(int_entry_ptr intr) { u_int apic_id, pin; if (intr->dst_apic_id == 0xff) apic_id = APIC_ID_ALL; else apic_id = intr->dst_apic_id; if (intr->dst_apic_int == 0) pin = LVT_LINT0; else pin = LVT_LINT1; switch (intr->int_type) { case INTENTRY_TYPE_INT: #if 1 printf( "MPTable: Ignoring vectored local interrupt for LINTIN%d vector %d\n", intr->dst_apic_int, intr->src_bus_irq); return; #else lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_FIXED); break; #endif case INTENTRY_TYPE_NMI: lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_NMI); break; case INTENTRY_TYPE_SMI: lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_SMI); break; case INTENTRY_TYPE_EXTINT: lapic_set_lvt_mode(apic_id, pin, APIC_LVT_DM_EXTINT); break; default: panic("%s: invalid interrupt entry type %d\n", __func__, intr->int_type); } if ((intr->int_flags & INTENTRY_FLAGS_TRIGGER) != INTENTRY_FLAGS_TRIGGER_CONFORM) lapic_set_lvt_triggermode(apic_id, pin, intentry_trigger(intr)); if ((intr->int_flags & INTENTRY_FLAGS_POLARITY) != INTENTRY_FLAGS_POLARITY_CONFORM) lapic_set_lvt_polarity(apic_id, pin, intentry_polarity(intr)); } /* * Parse interrupt entries. */ static void mptable_parse_ints_handler(u_char *entry, void *arg __unused) { int_entry_ptr intr; intr = (int_entry_ptr)entry; switch (*entry) { case MPCT_ENTRY_INT: mptable_parse_io_int(intr); break; case MPCT_ENTRY_LOCAL_INT: mptable_parse_local_int(intr); break; } } /* * Configure interrupt pins for a default configuration. For details see * Table 5-2 in Section 5 of the MP Table specification. 
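 * The net effect, summarized for illustration, is:
 *
 *	pin 0      -> ExtINT (cascade from the 8259A PIC)
 *	pin 2      -> ISA IRQ 0 (the timer; pin 0 is taken by ExtINT)
 *	other pins -> ISA IRQs identity mapped (pin N gets IRQ N)
 *
 * with pin 0 additionally disabled for config 7 and pins 2 and 13
 * disabled for config 2, as done at the end of the function.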
*/ static void mptable_parse_default_config_ints(void) { struct INTENTRY entry; int pin; /* * All default configs route IRQs from bus 0 to the first 16 pins * of the first I/O APIC with an APIC ID of 2. */ entry.type = MPCT_ENTRY_INT; entry.int_flags = INTENTRY_FLAGS_POLARITY_CONFORM | INTENTRY_FLAGS_TRIGGER_CONFORM; entry.src_bus_id = 0; entry.dst_apic_id = 2; /* Run through all 16 pins. */ for (pin = 0; pin < 16; pin++) { entry.dst_apic_int = pin; switch (pin) { case 0: /* Pin 0 is an ExtINT pin. */ entry.int_type = INTENTRY_TYPE_EXTINT; break; case 2: /* IRQ 0 is routed to pin 2. */ entry.int_type = INTENTRY_TYPE_INT; entry.src_bus_irq = 0; break; default: /* All other pins are identity mapped. */ entry.int_type = INTENTRY_TYPE_INT; entry.src_bus_irq = pin; break; } mptable_parse_io_int(&entry); } /* Certain configs disable certain pins. */ if (mpfps->config_type == 7) ioapic_disable_pin(ioapics[2], 0); if (mpfps->config_type == 2) { ioapic_disable_pin(ioapics[2], 2); ioapic_disable_pin(ioapics[2], 13); } } /* * Configure the interrupt pins */ static void mptable_parse_ints(void) { /* Is this a pre-defined config? */ if (mpfps->config_type != 0) { /* Configure LINT pins. */ lapic_set_lvt_mode(APIC_ID_ALL, LVT_LINT0, APIC_LVT_DM_EXTINT); lapic_set_lvt_mode(APIC_ID_ALL, LVT_LINT1, APIC_LVT_DM_NMI); /* Configure I/O APIC pins. */ mptable_parse_default_config_ints(); } else mptable_walk_table(mptable_parse_ints_handler, NULL); } #ifdef MPTABLE_FORCE_HTT /* * Perform a hyperthreading "fix-up" to enumerate any logical CPU's * that aren't already listed in the table. * * XXX: We assume that all of the physical CPUs in the * system have the same number of logical CPUs. * * XXX: We assume that APIC ID's are allocated such that * the APIC ID's for a physical processor are aligned * with the number of logical CPU's in the processor. */ static void mptable_hyperthread_fixup(u_int id_mask) { u_int i, id, logical_cpus; /* Nothing to do if there is no HTT support. */ if ((cpu_feature & CPUID_HTT) == 0) return; logical_cpus = (cpu_procinfo & CPUID_HTT_CORES) >> 16; if (logical_cpus <= 1) return; /* * For each APIC ID of a CPU that is set in the mask, * scan the other candidate APIC ID's for this * physical processor. If any of those ID's are * already in the table, then kill the fixup. */ for (id = 0; id <= MAX_LAPIC_ID; id++) { if ((id_mask & 1 << id) == 0) continue; /* First, make sure we are on a logical_cpus boundary. */ if (id % logical_cpus != 0) return; for (i = id + 1; i < id + logical_cpus; i++) if ((id_mask & 1 << i) != 0) return; } /* * Ok, the ID's checked out, so perform the fixup by * adding the logical CPUs. */ while ((id = ffs(id_mask)) != 0) { id--; for (i = id + 1; i < id + logical_cpus; i++) { if (bootverbose) printf( "MPTable: Adding logical CPU %d from main CPU %d\n", i, id); lapic_create(i, 0); } id_mask &= ~(1 << id); } } #endif /* MPTABLE_FORCE_HTT */ /* * Support code for routing PCI interrupts using the MP Table. */ static void mptable_pci_setup(void) { int i; /* * Find the first pci bus and call it 0. Panic if pci0 is not * bus zero and there are multiple PCI busses. 
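 * For example (an illustration), a table whose only PCI bus has MP
 * bus ID 2 yields pci0 = 2, and the routing code below translates
 * the OS's PCI bus 0 to MP bus 2 by adding pci0.  A table listing
 * PCI busses 0 and 1 keeps pci0 = 0, while PCI busses 1 and 3 with
 * no bus 0 trip the panic.  Note also that the PCI interrupt
 * entries consumed below encode their source "IRQ" as
 * (slot << 2) | pin, so slot 9 INTB (pin 1) appears as IRQ 0x25.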
*/ for (i = 0; i <= mptable_maxbusid; i++) if (busses[i].bus_type == PCI) { if (pci0 == -1) pci0 = i; else if (pci0 != 0) panic( "MPTable contains multiple PCI busses but no PCI bus 0"); } } static void mptable_pci_probe_table_handler(u_char *entry, void *arg) { struct pci_probe_table_args *args; int_entry_ptr intr; if (*entry != MPCT_ENTRY_INT) return; intr = (int_entry_ptr)entry; args = (struct pci_probe_table_args *)arg; KASSERT(args->bus <= mptable_maxbusid, ("bus %d is too big", args->bus)); KASSERT(busses[args->bus].bus_type == PCI, ("probing for non-PCI bus")); if (intr->src_bus_id == args->bus) args->found = 1; } int mptable_pci_probe_table(int bus) { struct pci_probe_table_args args; if (bus < 0) return (EINVAL); if (mpct == NULL || pci0 == -1 || pci0 + bus > mptable_maxbusid) return (ENXIO); if (busses[pci0 + bus].bus_type != PCI) return (ENXIO); args.bus = pci0 + bus; args.found = 0; mptable_walk_table(mptable_pci_probe_table_handler, &args); if (args.found == 0) return (ENXIO); return (0); } static void mptable_pci_route_interrupt_handler(u_char *entry, void *arg) { struct pci_route_interrupt_args *args; int_entry_ptr intr; int vector; if (*entry != MPCT_ENTRY_INT) return; intr = (int_entry_ptr)entry; args = (struct pci_route_interrupt_args *)arg; if (intr->src_bus_id != args->bus || intr->src_bus_irq != args->irq) return; /* Make sure the APIC maps to a known APIC. */ KASSERT(ioapics[intr->dst_apic_id] != NULL, ("No I/O APIC %d to route interrupt to", intr->dst_apic_id)); /* * Look up the vector for this APIC / pin combination. If we * have previously matched an entry for this PCI IRQ but it * has the same vector as this entry, just return. Otherwise, * we use the vector for this APIC / pin combination. */ vector = ioapic_get_vector(ioapics[intr->dst_apic_id], intr->dst_apic_int); if (args->vector == vector) return; KASSERT(args->vector == -1, ("Multiple IRQs for PCI interrupt %d.%d.INT%c: %d and %d\n", args->bus, args->irq >> 2, 'A' + (args->irq & 0x3), args->vector, vector)); args->vector = vector; } int mptable_pci_route_interrupt(device_t pcib, device_t dev, int pin) { struct pci_route_interrupt_args args; int slot; /* Like ACPI, pin numbers are 0-3, not 1-4. */ pin--; KASSERT(pci0 != -1, ("do not know how to route PCI interrupts")); args.bus = pci_get_bus(dev) + pci0; slot = pci_get_slot(dev); /* * PCI interrupt entries in the MP Table encode both the slot and * pin into the IRQ with the pin being the two least significant * bits, the slot being the next five bits, and the most significant * bit being reserved. */ args.irq = slot << 2 | pin; args.vector = -1; mptable_walk_table(mptable_pci_route_interrupt_handler, &args); if (args.vector < 0) { device_printf(pcib, "unable to route slot %d INT%c\n", slot, 'A' + pin); return (PCI_INVALID_IRQ); } if (bootverbose) device_printf(pcib, "slot %d INT%c routed to irq %d\n", slot, 'A' + pin, args.vector); return (args.vector); } diff --git a/sys/x86/x86/msi.c b/sys/x86/x86/msi.c index 428894ed1a9e..381f09760719 100644 --- a/sys/x86/x86/msi.c +++ b/sys/x86/x86/msi.c @@ -1,603 +1,603 @@ /*- * Copyright (c) 2006 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Support for PCI Message Signalled Interrupts (MSI). MSI interrupts on * x86 are basically APIC messages that the northbridge delivers directly * to the local APICs as if they had come from an I/O APIC. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include #include #include /* Fields in address for Intel MSI messages. */ #define MSI_INTEL_ADDR_DEST 0x000ff000 #define MSI_INTEL_ADDR_RH 0x00000008 # define MSI_INTEL_ADDR_RH_ON 0x00000008 # define MSI_INTEL_ADDR_RH_OFF 0x00000000 #define MSI_INTEL_ADDR_DM 0x00000004 # define MSI_INTEL_ADDR_DM_PHYSICAL 0x00000000 # define MSI_INTEL_ADDR_DM_LOGICAL 0x00000004 /* Fields in data for Intel MSI messages. */ #define MSI_INTEL_DATA_TRGRMOD IOART_TRGRMOD /* Trigger mode. */ # define MSI_INTEL_DATA_TRGREDG IOART_TRGREDG # define MSI_INTEL_DATA_TRGRLVL IOART_TRGRLVL #define MSI_INTEL_DATA_LEVEL 0x00004000 /* Polarity. */ # define MSI_INTEL_DATA_DEASSERT 0x00000000 # define MSI_INTEL_DATA_ASSERT 0x00004000 #define MSI_INTEL_DATA_DELMOD IOART_DELMOD /* Delivery mode. */ # define MSI_INTEL_DATA_DELFIXED IOART_DELFIXED # define MSI_INTEL_DATA_DELLOPRI IOART_DELLOPRI # define MSI_INTEL_DATA_DELSMI IOART_DELSMI # define MSI_INTEL_DATA_DELNMI IOART_DELNMI # define MSI_INTEL_DATA_DELINIT IOART_DELINIT # define MSI_INTEL_DATA_DELEXINT IOART_DELEXINT #define MSI_INTEL_DATA_INTVEC IOART_INTVEC /* Interrupt vector. */ /* * Build Intel MSI message and data values from a source. AMD64 systems * seem to be compatible, so we use the same function for both. */ #define INTEL_ADDR(msi) \ (MSI_INTEL_ADDR_BASE | (msi)->msi_cpu << 12 | \ MSI_INTEL_ADDR_RH_OFF | MSI_INTEL_ADDR_DM_PHYSICAL) #define INTEL_DATA(msi) \ (MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED | (msi)->msi_vector) static MALLOC_DEFINE(M_MSI, "msi", "PCI MSI"); /* * MSI sources are bunched into groups. This is because MSI forces * all of the messages to share the address and data registers and * thus certain properties (such as the local APIC ID target on x86). * Each group has a 'first' source that contains information global to * the group. These fields are marked with (g) below. * * Note that local APIC ID is kind of special. Each message will be * assigned an ID by the system; however, a group will use the ID from * the first message. 
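 *
 * As a worked example of the message format above (a sketch; the
 * base constant comes from apicreg.h), a message aimed at local
 * APIC ID 3 with IDT vector 0x60 would be built as
 *
 *	address = MSI_INTEL_ADDR_BASE | (3 << 12) = 0xfee03000
 *	data    = MSI_INTEL_DATA_TRGREDG | MSI_INTEL_DATA_DELFIXED |
 *		  0x60                            = 0x00000060
 *
 * i.e. physical destination mode, edge trigger, fixed delivery.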
* * For MSI-X, each message is isolated. */ struct msi_intsrc { struct intsrc msi_intsrc; device_t msi_dev; /* Owning device. (g) */ struct msi_intsrc *msi_first; /* First source in group. */ u_int msi_irq; /* IRQ cookie. */ u_int msi_msix; /* MSI-X message. */ u_int msi_vector:8; /* IDT vector. */ u_int msi_cpu:8; /* Local APIC ID. (g) */ u_int msi_count:8; /* Messages in this group. (g) */ u_int msi_maxcount:8; /* Alignment for this group. (g) */ int *msi_irqs; /* Group's IRQ list. (g) */ }; static void msi_create_source(void); static void msi_enable_source(struct intsrc *isrc); static void msi_disable_source(struct intsrc *isrc, int eoi); static void msi_eoi_source(struct intsrc *isrc); static void msi_enable_intr(struct intsrc *isrc); static void msi_disable_intr(struct intsrc *isrc); static int msi_vector(struct intsrc *isrc); static int msi_source_pending(struct intsrc *isrc); static int msi_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol); static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id); struct pic msi_pic = { msi_enable_source, msi_disable_source, msi_eoi_source, msi_enable_intr, msi_disable_intr, msi_vector, msi_source_pending, NULL, NULL, msi_config_intr, msi_assign_cpu }; static int msi_enabled; static int msi_last_irq; static struct mtx msi_lock; static void msi_enable_source(struct intsrc *isrc) { } static void msi_disable_source(struct intsrc *isrc, int eoi) { if (eoi == PIC_EOI) lapic_eoi(); } static void msi_eoi_source(struct intsrc *isrc) { lapic_eoi(); } static void msi_enable_intr(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; apic_enable_vector(msi->msi_cpu, msi->msi_vector); } static void msi_disable_intr(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; apic_disable_vector(msi->msi_cpu, msi->msi_vector); } static int msi_vector(struct intsrc *isrc) { struct msi_intsrc *msi = (struct msi_intsrc *)isrc; return (msi->msi_irq); } static int msi_source_pending(struct intsrc *isrc) { return (0); } static int msi_config_intr(struct intsrc *isrc, enum intr_trigger trig, enum intr_polarity pol) { return (ENODEV); } static int msi_assign_cpu(struct intsrc *isrc, u_int apic_id) { struct msi_intsrc *sib, *msi = (struct msi_intsrc *)isrc; int old_vector; u_int old_id; int i, vector; /* * Only allow CPUs to be assigned to the first message for an * MSI group. */ if (msi->msi_first != msi) return (EINVAL); /* Store information to free existing irq. */ old_vector = msi->msi_vector; old_id = msi->msi_cpu; if (old_id == apic_id) return (0); /* Allocate IDT vectors on this cpu. */ if (msi->msi_count > 1) { KASSERT(msi->msi_msix == 0, ("MSI-X message group")); vector = apic_alloc_vectors(apic_id, msi->msi_irqs, msi->msi_count, msi->msi_maxcount); } else vector = apic_alloc_vector(apic_id, msi->msi_irq); if (vector == 0) return (ENOSPC); msi->msi_cpu = apic_id; msi->msi_vector = vector; if (msi->msi_intsrc.is_handlers > 0) apic_enable_vector(msi->msi_cpu, msi->msi_vector); if (bootverbose) printf("msi: Assigning %s IRQ %d to local APIC %u vector %u\n", msi->msi_msix ? 
"MSI-X" : "MSI", msi->msi_irq, msi->msi_cpu, msi->msi_vector); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); sib->msi_cpu = apic_id; sib->msi_vector = vector + i; if (sib->msi_intsrc.is_handlers > 0) apic_enable_vector(sib->msi_cpu, sib->msi_vector); if (bootverbose) printf( "msi: Assigning MSI IRQ %d to local APIC %u vector %u\n", sib->msi_irq, sib->msi_cpu, sib->msi_vector); } BUS_REMAP_INTR(device_get_parent(msi->msi_dev), msi->msi_dev, msi->msi_irq); /* * Free the old vector after the new one is established. This is done * to prevent races where we could miss an interrupt. */ if (msi->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector); apic_free_vector(old_id, old_vector, msi->msi_irq); for (i = 1; i < msi->msi_count; i++) { sib = (struct msi_intsrc *)intr_lookup_source(msi->msi_irqs[i]); if (sib->msi_intsrc.is_handlers > 0) apic_disable_vector(old_id, old_vector + i); apic_free_vector(old_id, old_vector + i, msi->msi_irqs[i]); } return (0); } void msi_init(void) { /* Check if we have a supported CPU. */ switch (cpu_vendor_id) { case CPU_VENDOR_INTEL: case CPU_VENDOR_AMD: break; case CPU_VENDOR_CENTAUR: if (CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) >= 0xf) break; /* FALLTHROUGH */ default: return; } msi_enabled = 1; intr_register_pic(&msi_pic); mtx_init(&msi_lock, "msi", NULL, MTX_DEF); } static void msi_create_source(void) { struct msi_intsrc *msi; u_int irq; mtx_lock(&msi_lock); if (msi_last_irq >= NUM_MSI_INTS) { mtx_unlock(&msi_lock); return; } irq = msi_last_irq + FIRST_MSI_INT; msi_last_irq++; mtx_unlock(&msi_lock); msi = malloc(sizeof(struct msi_intsrc), M_MSI, M_WAITOK | M_ZERO); msi->msi_intsrc.is_pic = &msi_pic; msi->msi_irq = irq; intr_register_source(&msi->msi_intsrc); nexus_add_irq(irq); } /* * Try to allocate 'count' interrupt sources with contiguous IDT values. */ int msi_alloc(device_t dev, int count, int maxcount, int *irqs) { struct msi_intsrc *msi, *fsrc; u_int cpu; int cnt, i, *mirqs, vector; if (!msi_enabled) return (ENXIO); if (count > 1) mirqs = malloc(count * sizeof(*mirqs), M_MSI, M_WAITOK); else mirqs = NULL; again: mtx_lock(&msi_lock); /* Try to find 'count' free IRQs. */ cnt = 0; for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ if (msi == NULL) break; /* If this is a free one, save its IRQ in the array. */ if (msi->msi_dev == NULL) { irqs[cnt] = i; cnt++; if (cnt == count) break; } } /* Do we need to create some new sources? */ if (cnt < count) { /* If we would exceed the max, give up. */ if (i + (count - cnt) > FIRST_MSI_INT + NUM_MSI_INTS) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENXIO); } mtx_unlock(&msi_lock); /* We need count - cnt more sources. */ while (cnt < count) { msi_create_source(); cnt++; } goto again; } /* Ok, we now have the IRQs allocated. */ KASSERT(cnt == count, ("count mismatch")); /* Allocate 'count' IDT vectors. */ cpu = intr_next_cpu(); vector = apic_alloc_vectors(cpu, irqs, count, maxcount); if (vector == 0) { mtx_unlock(&msi_lock); free(mirqs, M_MSI); return (ENOSPC); } /* Assign IDT vectors and make these messages owned by 'dev'. 
*/ fsrc = (struct msi_intsrc *)intr_lookup_source(irqs[0]); for (i = 0; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); msi->msi_cpu = cpu; msi->msi_dev = dev; msi->msi_vector = vector + i; if (bootverbose) printf( "msi: routing MSI IRQ %d to local APIC %u vector %u\n", msi->msi_irq, msi->msi_cpu, msi->msi_vector); msi->msi_first = fsrc; KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI has handlers")); } fsrc->msi_count = count; fsrc->msi_maxcount = maxcount; if (count > 1) bcopy(irqs, mirqs, count * sizeof(*mirqs)); fsrc->msi_irqs = mirqs; mtx_unlock(&msi_lock); return (0); } int msi_release(int *irqs, int count) { struct msi_intsrc *msi, *first; int i; mtx_lock(&msi_lock); first = (struct msi_intsrc *)intr_lookup_source(irqs[0]); if (first == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this isn't an MSI-X message. */ if (first->msi_msix) { mtx_unlock(&msi_lock); return (EINVAL); } /* Make sure this message is allocated to a group. */ if (first->msi_first == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } /* * Make sure this is the start of a group and that we are releasing * the entire group. */ if (first->msi_first != first || first->msi_count != count) { mtx_unlock(&msi_lock); return (EINVAL); } KASSERT(first->msi_dev != NULL, ("unowned group")); /* Clear all the extra messages in the group. */ for (i = 1; i < count; i++) { msi = (struct msi_intsrc *)intr_lookup_source(irqs[i]); KASSERT(msi->msi_first == first, ("message not in group")); KASSERT(msi->msi_dev == first->msi_dev, ("owner mismatch")); msi->msi_first = NULL; msi->msi_dev = NULL; apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq); msi->msi_vector = 0; } /* Clear out the first message. */ first->msi_first = NULL; first->msi_dev = NULL; apic_free_vector(first->msi_cpu, first->msi_vector, first->msi_irq); first->msi_vector = 0; first->msi_count = 0; first->msi_maxcount = 0; free(first->msi_irqs, M_MSI); first->msi_irqs = NULL; mtx_unlock(&msi_lock); return (0); } int msi_map(int irq, uint64_t *addr, uint32_t *data) { struct msi_intsrc *msi; mtx_lock(&msi_lock); msi = (struct msi_intsrc *)intr_lookup_source(irq); if (msi == NULL) { mtx_unlock(&msi_lock); return (ENOENT); } /* Make sure this message is allocated to a device. */ if (msi->msi_dev == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } /* * If this message isn't an MSI-X message, make sure it's part * of a group, and switch to the first message in the * group. */ if (!msi->msi_msix) { if (msi->msi_first == NULL) { mtx_unlock(&msi_lock); return (ENXIO); } msi = msi->msi_first; } *addr = INTEL_ADDR(msi); *data = INTEL_DATA(msi); mtx_unlock(&msi_lock); return (0); } int msix_alloc(device_t dev, int *irq) { struct msi_intsrc *msi; u_int cpu; int i, vector; if (!msi_enabled) return (ENXIO); again: mtx_lock(&msi_lock); /* Find a free IRQ. */ for (i = FIRST_MSI_INT; i < FIRST_MSI_INT + NUM_MSI_INTS; i++) { msi = (struct msi_intsrc *)intr_lookup_source(i); /* End of allocated sources, so break. */ if (msi == NULL) break; /* Stop at the first free source. */ if (msi->msi_dev == NULL) break; } /* Do we need to create a new source? */ if (msi == NULL) { /* If we would exceed the max, give up. */ if (i + 1 > FIRST_MSI_INT + NUM_MSI_INTS) { mtx_unlock(&msi_lock); return (ENXIO); } mtx_unlock(&msi_lock); /* Create a new source. */ msi_create_source(); goto again; } /* Allocate an IDT vector. 
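 * (A note on msi_map() above: a typical consumer fetches the
 * address/data pair and programs it into the device, e.g. as a
 * sketch:
 *
 *	uint64_t addr;
 *	uint32_t data;
 *
 *	if (msi_map(irq, &addr, &data) == 0)
 *		... write addr and data into the device's MSI-X
 *		    table entry for this vector ...
 *
 * In practice the PCI bus code performs this on a driver's behalf.)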
 */
	cpu = intr_next_cpu();
	vector = apic_alloc_vector(cpu, i);
	if (vector == 0) {
		mtx_unlock(&msi_lock);
		return (ENOSPC);
	}
	if (bootverbose)
		printf("msi: routing MSI-X IRQ %d to local APIC %u vector %u\n",
		    msi->msi_irq, cpu, vector);

	/* Setup source. */
	msi->msi_cpu = cpu;
	msi->msi_dev = dev;
	msi->msi_first = msi;
	msi->msi_vector = vector;
	msi->msi_msix = 1;
	msi->msi_count = 1;
	msi->msi_maxcount = 1;
	msi->msi_irqs = NULL;

	KASSERT(msi->msi_intsrc.is_handlers == 0, ("dead MSI-X has handlers"));
	mtx_unlock(&msi_lock);
	*irq = i;
	return (0);
}

int
msix_release(int irq)
{
	struct msi_intsrc *msi;

	mtx_lock(&msi_lock);
	msi = (struct msi_intsrc *)intr_lookup_source(irq);
	if (msi == NULL) {
		mtx_unlock(&msi_lock);
		return (ENOENT);
	}

	/* Make sure this is an MSI-X message. */
	if (!msi->msi_msix) {
		mtx_unlock(&msi_lock);
		return (EINVAL);
	}

	KASSERT(msi->msi_dev != NULL, ("unowned message"));

	/* Clear out the message. */
	msi->msi_first = NULL;
	msi->msi_dev = NULL;
	apic_free_vector(msi->msi_cpu, msi->msi_vector, msi->msi_irq);
	msi->msi_vector = 0;
	msi->msi_msix = 0;
	msi->msi_count = 0;
	msi->msi_maxcount = 0;
	mtx_unlock(&msi_lock);
	return (0);
}
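
/*
 * Lifecycle sketch (illustrative only, not part of this file): an
 * MSI-X consumer pairs the interfaces above roughly as
 *
 *	int irq;
 *	uint64_t addr;
 *	uint32_t data;
 *
 *	msix_alloc(dev, &irq);		allocate a source and vector
 *	msi_map(irq, &addr, &data);	fetch the message to program
 *	...program the device's MSI-X table entry...
 *	msix_release(irq);		tear down when finished
 *
 * with error handling omitted for brevity.
 */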