Index: stable/10/sys/amd64/amd64/mp_machdep.c =================================================================== --- stable/10/sys/amd64/amd64/mp_machdep.c (revision 331909) +++ stable/10/sys/amd64/amd64/mp_machdep.c (revision 331910) @@ -1,1737 +1,1748 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_sched.h" #include "opt_smp.h" #include #include #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern struct pcpu __pcpu[]; /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; static int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; /* Temporary variables for init_secondary() */ char *doublefault_stack; char *nmi_stack; void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct susppcb **susppcbs; /* Variables needed for SMP tlb shootdown. */ vm_offset_t smp_tlb_addr2; struct invpcid_descr smp_tlb_invpcid; volatile int smp_tlb_wait; uint64_t pcid_cr3; pmap_t smp_tlb_pmap; extern int invpcid_works; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* Default cpu_ops implementation. */ struct cpu_ops cpu_ops = { .ipi_vectored = lapic_ipi_vectored }; extern inthand_t IDTVEC(fast_syscall), IDTVEC(fast_syscall32); extern int pmap_pcid_enabled; /* * Local data and functions. */ static volatile cpuset_t ipi_nmi_pending; +volatile cpuset_t resuming_cpus; +volatile cpuset_t toresume_cpus; + /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ static volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info { int cpu_present:1; int cpu_bsp:1; int cpu_disabled:1; int cpu_hyperthread:1; } static cpu_info[MAX_APIC_ID + 1]; int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; /* Holds pending bitmap based IPIs per CPU */ volatile u_int cpu_ipi_pending[MAXCPU]; static u_int boot_address; static int cpu_logical; /* logical cpus per core */ static int cpu_cores; /* cores per package */ static void assign_cpu_ids(void); static void set_interrupt_apic_ids(void); static int start_all_aps(void); static int start_ap(int apic_id); static void release_aps(void *dummy); static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ static int hyperthreading_allowed = 1; static u_int bootMP_size; static void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } static void topo_probe_amd(void) { int core_id_bits; int id; /* AMD processors do not support HTT. */ cpu_logical = 1; if ((amd_feature2 & AMDID2_CMP) == 0) { cpu_cores = 1; return; } core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> AMDID_COREID_SIZE_SHIFT; if (core_id_bits == 0) { cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; return; } /* Fam 10h and newer should get here. */ for (id = 0; id <= MAX_APIC_ID; id++) { /* Check logical CPU availability. */ if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) continue; /* Check if logical CPU has the same package ID. */ if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) continue; cpu_cores++; } } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } static void topo_probe_0x4(void) { u_int p[4]; int pkg_id_bits; int core_id_bits; int max_cores; int max_logical; int id; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; /* * Because of uniformity assumption we examine only * those logical processors that belong to the same * package as BSP. Further, we count number of * logical processors that belong to the same core * as BSP thus deducing number of threads per core. */ if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; core_id_bits = mask_width(max_logical/max_cores); if (core_id_bits < 0) return; pkg_id_bits = core_id_bits + mask_width(max_cores); for (id = 0; id <= MAX_APIC_ID; id++) { /* Check logical CPU availability. */ if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) continue; /* Check if logical CPU has the same package ID. */ if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) continue; cpu_cores++; /* Check if logical CPU has the same package and core IDs. */ if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) cpu_logical++; } KASSERT(cpu_cores >= 1 && cpu_logical >= 1, ("topo_probe_0x4 couldn't find BSP")); cpu_cores /= cpu_logical; hyperthreading_cpus = cpu_logical; } static void topo_probe_0xb(void) { u_int p[4]; int bits; int cnt; int i; int logical; int type; int x; /* We only support three levels for now. */ for (i = 0; i < 3; i++) { cpuid_count(0x0b, i, p); /* Fall back if CPU leaf 11 doesn't really exist. */ if (i == 0 && p[1] == 0) { topo_probe_0x4(); return; } bits = p[0] & 0x1f; logical = p[1] &= 0xffff; type = (p[2] >> 8) & 0xff; if (type == 0 || logical == 0) break; /* * Because of uniformity assumption we examine only * those logical processors that belong to the same * package as BSP. */ for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { if (!cpu_info[x].cpu_present || cpu_info[x].cpu_disabled) continue; if (x >> bits == boot_cpu_id >> bits) cnt++; } if (type == CPUID_TYPE_SMT) cpu_logical = cnt; else if (type == CPUID_TYPE_CORE) cpu_cores = cnt; } if (cpu_logical == 0) cpu_logical = 1; cpu_cores /= cpu_logical; } /* * Both topology discovery code and code that consumes topology * information assume top-down uniformity of the topology. * That is, all physical packages must be identical and each * core in a package must have the same number of threads. * Topology information is queried only on BSP, on which this * code runs and for which it can query CPUID information. * Then topology is extrapolated on all packages using the * uniformity assumption. */ static void topo_probe(void) { static int cpu_topo_probed = 0; if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); if (mp_ncpus <= 1) cpu_cores = cpu_logical = 1; else if (cpu_vendor_id == CPU_VENDOR_AMD) topo_probe_amd(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) { /* * See Intel(R) 64 Architecture Processor * Topology Enumeration article for details. * * Note that 0x1 <= cpu_high < 4 case should be * compatible with topo_probe_0x4() logic when * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) * or it should trigger the fallback otherwise. */ if (cpu_high >= 0xb) topo_probe_0xb(); else if (cpu_high >= 0x1) topo_probe_0x4(); } /* * Fallback: assume each logical CPU is in separate * physical package. That is, no multi-core, no SMT. */ if (cpu_cores == 0 || cpu_logical == 0) cpu_cores = cpu_logical = 1; cpu_topo_probed = 1; } struct cpu_group * cpu_topo(void) { int cg_flags; /* * Determine whether any threading flags are * necessry. */ topo_probe(); if (cpu_logical > 1 && hyperthreading_cpus) cg_flags = CG_FLAG_HTT; else if (cpu_logical > 1) cg_flags = CG_FLAG_SMT; else cg_flags = 0; if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { printf("WARNING: Non-uniform processors.\n"); printf("WARNING: Using suboptimal topology.\n"); return (smp_topo_none()); } /* * No multi-core or hyper-threaded. */ if (cpu_logical * cpu_cores == 1) return (smp_topo_none()); /* * Only HTT no multi-core. */ if (cpu_logical > 1 && cpu_cores == 1) return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); /* * Only multi-core no HTT. */ if (cpu_cores > 1 && cpu_logical == 1) return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); /* * Both HTT and multi-core. */ return (smp_topo_2level(CG_SHARE_L2, cpu_cores, CG_SHARE_L1, cpu_logical, cg_flags)); } /* * Calculate usable address in base memory for AP trampoline code. */ u_int mp_bootaddress(u_int basemem) { bootMP_size = mptramp_end - mptramp_start; boot_address = trunc_page(basemem * 1024); /* round down to 4k boundary */ if (((basemem * 1024) - boot_address) < bootMP_size) boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ /* 3 levels of page table pages */ mptramp_pagetables = boot_address - (PAGE_SIZE * 3); return mptramp_pagetables; } void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) { mp_ncpus++; mp_maxid = mp_ncpus - 1; } if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_maxid should be already set by calls to cpu_add(). * Just sanity check its value here. */ if (mp_ncpus == 0) KASSERT(mp_maxid == 0, ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); else if (mp_ncpus == 1) mp_maxid = 0; else KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); if (mp_ncpus == 0) { /* * No CPUs were found, so this must be a UP system. Setup * the variables to represent a system with a single CPU * with an id of 0. */ mp_ncpus = 1; return (0); } /* At least one CPU was found. */ if (mp_ncpus == 1) { /* * One CPU was found, so this must be a UP system with * an I/O APIC. */ mp_maxid = 0; return (0); } /* At least two CPUs were found. */ return (1); } /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; cpu_ipi_pending[i] = 0; } /* Install an inter-CPU IPI for TLB invalidation */ if (pmap_pcid_enabled) { setidt(IPI_INVLTLB, IDTVEC(invltlb_pcid), SDT_SYSIGT, SEL_KPL, 0); setidt(IPI_INVLPG, IDTVEC(invlpg_pcid), SDT_SYSIGT, SEL_KPL, 0); } else { setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYSIGT, SEL_KPL, 0); setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYSIGT, SEL_KPL, 0); } setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for cache invalidation. */ setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYSIGT, SEL_KPL, 0); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYSIGT, SEL_KPL, 0); /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYSIGT, SEL_KPL, 0); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); set_interrupt_apic_ids(); } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { const char *hyperthread; int i; printf("FreeBSD/SMP: %d package(s) x %d core(s)", mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); if (hyperthreading_cpus > 1) printf(" x %d HTT threads", cpu_logical); else if (cpu_logical > 1) printf(" x %d SMT threads", cpu_logical); printf("\n"); /* List active CPUs first. */ printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); for (i = 1; i < mp_ncpus; i++) { if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, cpu_apic_ids[i]); } /* List disabled CPUs last. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) continue; if (cpu_info[i].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, i); } } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; struct nmi_pcpu *np; u_int64_t msr, cr0; u_int cpuid; int cpu, gsel_tss, x; struct region_descriptor ap_gdt; /* Set by the startup code for us to use */ cpu = bootAP; /* Init tss */ common_tss[cpu] = common_tss[0]; common_tss[cpu].tss_rsp0 = 0; /* not used until after switch */ common_tss[cpu].tss_iobase = sizeof(struct amd64tss) + IOPAGES * PAGE_SIZE; common_tss[cpu].tss_ist1 = (long)&doublefault_stack[PAGE_SIZE]; /* The NMI stack runs on IST2. */ np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1; common_tss[cpu].tss_ist2 = (long) np; /* Prepare private GDT */ gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu]; for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) && x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1)) ssdtosd(&gdt_segs[x], &gdt[NGDT * cpu + x]); } ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]); ap_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; ap_gdt.rd_base = (long) &gdt[NGDT * cpu]; lgdt(&ap_gdt); /* does magic intra-segment return */ /* Get per-cpu data */ pc = &__pcpu[cpu]; /* prime data page for it to use */ pcpu_init(pc, cpu, sizeof(struct pcpu)); dpcpu_init(dpcpu, cpu); pc->pc_apic_id = cpu_apic_ids[cpu]; pc->pc_prvspace = pc; pc->pc_curthread = 0; pc->pc_tssp = &common_tss[cpu]; pc->pc_commontssp = &common_tss[cpu]; pc->pc_rsp0 = 0; pc->pc_tss = (struct system_segment_descriptor *)&gdt[NGDT * cpu + GPROC0_SEL]; pc->pc_fs32p = &gdt[NGDT * cpu + GUFS32_SEL]; pc->pc_gs32p = &gdt[NGDT * cpu + GUGS32_SEL]; pc->pc_ldt = (struct system_segment_descriptor *)&gdt[NGDT * cpu + GUSERLDT_SEL]; /* Save the per-cpu pointer for use by the NMI handler. */ np->np_pcpu = (register_t) pc; wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, (u_int64_t)pc); /* XXX User value while we're in the kernel */ fix_cpuid(); lidt(&r_idt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); /* Set up the fast syscall stuff */ msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); /* Disable local APIC just to be sure. */ lapic_disable(); /* signal our startup to the BSP. */ mp_naps++; /* Spin until the BSP releases the AP's. */ while (!aps_ready) ia32_pause(); /* Initialize the PAT MSR. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX */ initializecpu(); /* set up FPU state on the AP */ fpuinit(); if (cpu_ops.cpu_init) cpu_ops.cpu_init(); /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mca_init(); mtx_lock_spin(&ap_boot_mtx); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); printf("SMP: AP CPU #%d Launched!\n", cpuid); /* Determine if we are a logical CPU. */ /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } /* * Enable global pages TLB extension * This also implicitly flushes the TLB */ load_cr4(rcr4() | CR4_PGE); if (pmap_pcid_enabled) load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (smp_started == 0) ia32_pause(); /* Start per-CPU event timers. */ cpu_initclocks_ap(); sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } /******************************************************************* * local functions and data */ /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ static void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (hyperthreading_cpus > 1 && apic_id % hyperthreading_cpus != 0) continue; intr_add_cpu(i); } } /* * Assign logical CPU IDs to local APICs. */ static void assign_cpu_ids(void) { u_int i; TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", &hyperthreading_allowed); /* Check for explicitly disabled CPUs. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) continue; if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { cpu_info[i].cpu_hyperthread = 1; /* * Don't use HT CPU if it has been disabled by a * tunable. */ if (hyperthreading_allowed == 0) { cpu_info[i].cpu_disabled = 1; continue; } } /* Don't use this CPU if it has been disabled by a tunable. */ if (resource_disabled("lapic", i)) { cpu_info[i].cpu_disabled = 1; continue; } } if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { hyperthreading_cpus = 0; cpu_logical = 1; } /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. * * To minimize confusion for userland, we attempt to number * CPUs such that all threads and cores in a package are * grouped together. For now we assume that the BSP is always * the first thread in a package and just start adding APs * starting with the BSP's APIC ID. */ mp_ncpus = 1; cpu_apic_ids[0] = boot_cpu_id; apic_cpuids[boot_cpu_id] = 0; for (i = boot_cpu_id + 1; i != boot_cpu_id; i == MAX_APIC_ID ? i = 0 : i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || cpu_info[i].cpu_disabled) continue; if (mp_ncpus < MAXCPU) { cpu_apic_ids[mp_ncpus] = i; apic_cpuids[i] = mp_ncpus; mp_ncpus++; } else cpu_info[i].cpu_disabled = 1; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } /* * start each AP in our list */ static int start_all_aps(void) { vm_offset_t va = boot_address + KERNBASE; u_int64_t *pt4, *pt3, *pt2; u_int32_t mpbioswarmvec; int apic_id, cpu, i; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* install the AP 1st level boot code */ pmap_kenter(va, boot_address); pmap_invalidate_page(kernel_pmap, va); bcopy(mptramp_start, (void *)va, bootMP_size); /* Locate the page tables, they'll be below the trampoline */ pt4 = (u_int64_t *)(uintptr_t)(mptramp_pagetables + KERNBASE); pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { /* Each slot of the level 4 pages points to the same level 3 page */ pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); pt4[i] |= PG_V | PG_RW | PG_U; /* Each slot of the level 3 pages points to the same level 2 page */ pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. */ pt2[i] = i * (2 * 1024 * 1024); pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; } /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; /* allocate and set up an idle stack data page */ bootstacks[cpu] = (void *)kmem_malloc(kernel_arena, KSTACK_PAGES * PAGE_SIZE, M_WAITOK | M_ZERO); doublefault_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, M_WAITOK | M_ZERO); bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 8; bootAP = cpu; /* attempt to start the Application Processor */ if (!start_ap(apic_id)) { /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; panic("AP #%d (PHY# %d) failed!", cpu, apic_id); } CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); /* number of APs actually started */ return mp_naps; } /* * This function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); u_int ipi_masked_global; u_int ipi_masked_page; u_int ipi_masked_range; u_int ipi_masked_range_size; SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, &ipi_masked_global, 0, ""); SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, &ipi_masked_page, 0, ""); SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, &ipi_masked_range, 0, ""); SYSCTL_UINT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, &ipi_masked_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Init and startup IPI. */ void ipi_startup(int apic_id, int vector) { /* * This attempts to follow the algorithm described in the * Intel Multiprocessor Specification v1.4 in section B.4. * For each IPI, we allow the local APIC ~20us to deliver the * IPI. If that times out, we panic. */ /* * first we do an INIT IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); lapic_ipi_wait(100); /* Explicitly deassert the INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); DELAY(10000); /* wait ~10mS */ /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver first STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver second STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ } /* * Send an IPI to specified CPU handling the bitmap logic. */ static void ipi_send_cpu(int cpu, u_int ipi) { u_int bitmap, old_pending, new_pending; KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (old_pending) return; } cpu_ops.ipi_vectored(ipi, cpu_apic_ids[cpu]); } /* * Flush the TLB on all other CPU's */ static void smp_tlb_shootdown(u_int vector, pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) { u_int ncpu; ncpu = mp_ncpus - 1; /* does not shootdown self */ if (ncpu < 1) return; /* no other cpus */ if (!(read_rflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_invpcid.addr = addr1; if (pmap == NULL) { smp_tlb_invpcid.pcid = 0; } else { smp_tlb_invpcid.pcid = pmap->pm_pcid; pcid_cr3 = pmap->pm_cr3; } smp_tlb_addr2 = addr2; smp_tlb_pmap = pmap; atomic_store_rel_int(&smp_tlb_wait, 0); ipi_all_but_self(vector); while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } static void smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) { int cpu, ncpu, othercpus; othercpus = mp_ncpus - 1; if (CPU_ISFULLSET(&mask)) { if (othercpus < 1) return; } else { CPU_CLR(PCPU_GET(cpuid), &mask); if (CPU_EMPTY(&mask)) return; } if (!(read_rflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_invpcid.addr = addr1; if (pmap == NULL) { smp_tlb_invpcid.pcid = 0; } else { smp_tlb_invpcid.pcid = pmap->pm_pcid; pcid_cr3 = pmap->pm_cr3; } smp_tlb_addr2 = addr2; smp_tlb_pmap = pmap; atomic_store_rel_int(&smp_tlb_wait, 0); if (CPU_ISFULLSET(&mask)) { ncpu = othercpus; ipi_all_but_self(vector); } else { ncpu = 0; while ((cpu = CPU_FFS(&mask)) != 0) { cpu--; CPU_CLR(cpu, &mask); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, vector); ipi_send_cpu(cpu, vector); ncpu++; } } while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_cache_flush(void) { if (smp_started) smp_tlb_shootdown(IPI_INVLCACHE, NULL, 0, 0); } void smp_invltlb(pmap_t pmap) { if (smp_started) { smp_tlb_shootdown(IPI_INVLTLB, pmap, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_global++; #endif } } void smp_invlpg(pmap_t pmap, vm_offset_t addr) { if (smp_started) { smp_tlb_shootdown(IPI_INVLPG, pmap, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif } } void smp_invlpg_range(pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_tlb_shootdown(IPI_INVLRNG, pmap, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void smp_masked_invltlb(cpuset_t mask, pmap_t pmap) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_global++; #endif } } void smp_masked_invlpg(cpuset_t mask, pmap_t pmap, vm_offset_t addr) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, pmap, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_page++; #endif } } void smp_masked_invlpg_range(cpuset_t mask, pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, pmap, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_masked_range++; ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); while ((cpu = CPU_FFS(&cpus)) != 0) { cpu--; CPU_CLR(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); if (IPI_IS_BITMAPED(ipi)) { ipi_selected(other_cpus, ipi); return; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); cpu_ops.ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int ipi_nmi_handler() { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); cpustop_handler(); return (0); } /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { u_int cpu; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); #ifdef DDB amd64_db_resume_dbreg(); #endif if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { u_int cpu; mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { fpususpend(susppcbs[cpu]->sp_fpususpend); wbinvd(); CPU_SET_ATOMIC(cpu, &suspended_cpus); + /* + * Hack for xen, which does not use resumectx() so never + * uses the next clause: set resuming_cpus early so that + * resume_cpus() can wait on the same bitmap for acpi and + * xen. resuming_cpus now means eventually_resumable_cpus. + */ + CPU_SET_ATOMIC(cpu, &resuming_cpus); } else { fpuresume(susppcbs[cpu]->sp_fpususpend); pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); - /* Indicate that we are resumed */ + /* Indicate that we are resuming */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); } - /* Wait for resume */ - while (!CPU_ISSET(cpu, &started_cpus)) + /* Wait for resume directive */ + while (!CPU_ISSET(cpu, &toresume_cpus)) ia32_pause(); if (cpu_ops.cpu_resume) cpu_ops.cpu_resume(); if (vmm_resume_p) vmm_resume_p(); /* Resume MCA and local APIC */ mca_resume(); lapic_setup(0); - CPU_CLR_ATOMIC(cpu, &started_cpus); /* Indicate that we are resumed */ + CPU_CLR_ATOMIC(cpu, &resuming_cpus); CPU_CLR_ATOMIC(cpu, &suspended_cpus); + CPU_CLR_ATOMIC(cpu, &toresume_cpus); } /* * Handlers for TLB related IPIs */ void invltlb_handler(void) { #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ invltlb(); atomic_add_int(&smp_tlb_wait, 1); } void invltlb_pcid_handler(void) { uint64_t cr3; u_int cpuid; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ if (smp_tlb_invpcid.pcid != (uint64_t)-1 && smp_tlb_invpcid.pcid != 0) { if (invpcid_works) { invpcid(&smp_tlb_invpcid, INVPCID_CTX); } else { /* Otherwise reload %cr3 twice. */ cr3 = rcr3(); if (cr3 != pcid_cr3) { load_cr3(pcid_cr3); cr3 |= CR3_PCID_SAVE; } load_cr3(cr3); } } else { invltlb_globpcid(); } if (smp_tlb_pmap != NULL) { cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &smp_tlb_pmap->pm_active)) CPU_CLR_ATOMIC(cpuid, &smp_tlb_pmap->pm_save); } atomic_add_int(&smp_tlb_wait, 1); } void invlpg_handler(void) { #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ invlpg(smp_tlb_invpcid.addr); atomic_add_int(&smp_tlb_wait, 1); } void invlpg_pcid_handler(void) { uint64_t cr3; #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ if (smp_tlb_invpcid.pcid == (uint64_t)-1) { invltlb_globpcid(); } else if (smp_tlb_invpcid.pcid == 0) { invlpg(smp_tlb_invpcid.addr); } else if (invpcid_works) { invpcid(&smp_tlb_invpcid, INVPCID_ADDR); } else { /* * PCID supported, but INVPCID is not. * Temporarily switch to the target address * space and do INVLPG. */ cr3 = rcr3(); if (cr3 != pcid_cr3) load_cr3(pcid_cr3 | CR3_PCID_SAVE); invlpg(smp_tlb_invpcid.addr); load_cr3(cr3 | CR3_PCID_SAVE); } atomic_add_int(&smp_tlb_wait, 1); } static inline void invlpg_range(vm_offset_t start, vm_offset_t end) { do { invlpg(start); start += PAGE_SIZE; } while (start < end); } void invlrng_handler(void) { struct invpcid_descr d; vm_offset_t addr; uint64_t cr3; u_int cpuid; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_invpcid.addr; if (pmap_pcid_enabled) { if (smp_tlb_invpcid.pcid == 0) { /* * kernel pmap - use invlpg to invalidate * global mapping. */ invlpg_range(addr, smp_tlb_addr2); } else if (smp_tlb_invpcid.pcid == (uint64_t)-1) { invltlb_globpcid(); if (smp_tlb_pmap != NULL) { cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &smp_tlb_pmap->pm_active)) CPU_CLR_ATOMIC(cpuid, &smp_tlb_pmap->pm_save); } } else if (invpcid_works) { d = smp_tlb_invpcid; do { invpcid(&d, INVPCID_ADDR); d.addr += PAGE_SIZE; } while (d.addr <= smp_tlb_addr2); } else { cr3 = rcr3(); if (cr3 != pcid_cr3) load_cr3(pcid_cr3 | CR3_PCID_SAVE); invlpg_range(addr, smp_tlb_addr2); load_cr3(cr3 | CR3_PCID_SAVE); } } else { invlpg_range(addr, smp_tlb_addr2); } atomic_add_int(&smp_tlb_wait, 1); } void invlcache_handler(void) { #ifdef COUNT_IPIS (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ wbinvd(); atomic_add_int(&smp_tlb_wait, 1); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. */ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); intrcnt_add(buf, &ipi_invlcache_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif Index: stable/10/sys/i386/i386/mp_machdep.c =================================================================== --- stable/10/sys/i386/i386/mp_machdep.c (revision 331909) +++ stable/10/sys/i386/i386/mp_machdep.c (revision 331910) @@ -1,1680 +1,1691 @@ /*- * Copyright (c) 1996, by Steve Passe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #if !defined(lint) #if !defined(SMP) #error How did you get here? #endif #ifndef DEV_APIC #error The apic device is required for SMP, add "device apic" to your config file. #endif #if defined(CPU_DISABLE_CMPXCHG) && !defined(COMPILING_LINT) #error SMP not supported with CPU_DISABLE_CMPXCHG #endif #endif /* not lint */ #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* * this code MUST be enabled here and in mpboot.s. * it follows the very early stages of AP boot by placing values in CMOS ram. * it NORMALLY will never be needed and thus the primitive method for enabling. * #define CHECK_POINTS */ #if defined(CHECK_POINTS) && !defined(PC98) #define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) #define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) #define CHECK_INIT(D); \ CHECK_WRITE(0x34, (D)); \ CHECK_WRITE(0x35, (D)); \ CHECK_WRITE(0x36, (D)); \ CHECK_WRITE(0x37, (D)); \ CHECK_WRITE(0x38, (D)); \ CHECK_WRITE(0x39, (D)); #define CHECK_PRINT(S); \ printf("%s: %d, %d, %d, %d, %d, %d\n", \ (S), \ CHECK_READ(0x34), \ CHECK_READ(0x35), \ CHECK_READ(0x36), \ CHECK_READ(0x37), \ CHECK_READ(0x38), \ CHECK_READ(0x39)); #else /* CHECK_POINTS */ #define CHECK_INIT(D) #define CHECK_PRINT(S) #define CHECK_WRITE(A, D) #endif /* CHECK_POINTS */ /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern struct pcpu __pcpu[]; /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; static int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; static void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct susppcb **susppcbs; /* Variables needed for SMP tlb shootdown. */ vm_offset_t smp_tlb_addr1; vm_offset_t smp_tlb_addr2; volatile int smp_tlb_wait; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; u_long *ipi_lazypmap_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* Default cpu_ops implementation. */ struct cpu_ops cpu_ops = { .ipi_vectored = lapic_ipi_vectored }; /* * Local data and functions. */ static volatile cpuset_t ipi_nmi_pending; +volatile cpuset_t resuming_cpus; +volatile cpuset_t toresume_cpus; + /* used to hold the AP's until we are ready to release them */ static struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ static volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info { int cpu_present:1; int cpu_bsp:1; int cpu_disabled:1; int cpu_hyperthread:1; } static cpu_info[MAX_APIC_ID + 1]; int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; /* Holds pending bitmap based IPIs per CPU */ volatile u_int cpu_ipi_pending[MAXCPU]; static u_int boot_address; static int cpu_logical; /* logical cpus per core */ static int cpu_cores; /* cores per package */ static void assign_cpu_ids(void); static void install_ap_tramp(void); static void set_interrupt_apic_ids(void); static int start_all_aps(void); static int start_ap(int apic_id); static void release_aps(void *dummy); static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ static int hyperthreading_allowed = 1; static void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } static void topo_probe_amd(void) { int core_id_bits; int id; /* AMD processors do not support HTT. */ cpu_logical = 1; if ((amd_feature2 & AMDID2_CMP) == 0) { cpu_cores = 1; return; } core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> AMDID_COREID_SIZE_SHIFT; if (core_id_bits == 0) { cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; return; } /* Fam 10h and newer should get here. */ for (id = 0; id <= MAX_APIC_ID; id++) { /* Check logical CPU availability. */ if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) continue; /* Check if logical CPU has the same package ID. */ if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) continue; cpu_cores++; } } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } static void topo_probe_0x4(void) { u_int p[4]; int pkg_id_bits; int core_id_bits; int max_cores; int max_logical; int id; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; /* * Because of uniformity assumption we examine only * those logical processors that belong to the same * package as BSP. Further, we count number of * logical processors that belong to the same core * as BSP thus deducing number of threads per core. */ if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; core_id_bits = mask_width(max_logical/max_cores); if (core_id_bits < 0) return; pkg_id_bits = core_id_bits + mask_width(max_cores); for (id = 0; id <= MAX_APIC_ID; id++) { /* Check logical CPU availability. */ if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) continue; /* Check if logical CPU has the same package ID. */ if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) continue; cpu_cores++; /* Check if logical CPU has the same package and core IDs. */ if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) cpu_logical++; } KASSERT(cpu_cores >= 1 && cpu_logical >= 1, ("topo_probe_0x4 couldn't find BSP")); cpu_cores /= cpu_logical; hyperthreading_cpus = cpu_logical; } static void topo_probe_0xb(void) { u_int p[4]; int bits; int cnt; int i; int logical; int type; int x; /* We only support three levels for now. */ for (i = 0; i < 3; i++) { cpuid_count(0x0b, i, p); /* Fall back if CPU leaf 11 doesn't really exist. */ if (i == 0 && p[1] == 0) { topo_probe_0x4(); return; } bits = p[0] & 0x1f; logical = p[1] &= 0xffff; type = (p[2] >> 8) & 0xff; if (type == 0 || logical == 0) break; /* * Because of uniformity assumption we examine only * those logical processors that belong to the same * package as BSP. */ for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { if (!cpu_info[x].cpu_present || cpu_info[x].cpu_disabled) continue; if (x >> bits == boot_cpu_id >> bits) cnt++; } if (type == CPUID_TYPE_SMT) cpu_logical = cnt; else if (type == CPUID_TYPE_CORE) cpu_cores = cnt; } if (cpu_logical == 0) cpu_logical = 1; cpu_cores /= cpu_logical; } /* * Both topology discovery code and code that consumes topology * information assume top-down uniformity of the topology. * That is, all physical packages must be identical and each * core in a package must have the same number of threads. * Topology information is queried only on BSP, on which this * code runs and for which it can query CPUID information. * Then topology is extrapolated on all packages using the * uniformity assumption. */ static void topo_probe(void) { static int cpu_topo_probed = 0; if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); if (mp_ncpus <= 1) cpu_cores = cpu_logical = 1; else if (cpu_vendor_id == CPU_VENDOR_AMD) topo_probe_amd(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) { /* * See Intel(R) 64 Architecture Processor * Topology Enumeration article for details. * * Note that 0x1 <= cpu_high < 4 case should be * compatible with topo_probe_0x4() logic when * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) * or it should trigger the fallback otherwise. */ if (cpu_high >= 0xb) topo_probe_0xb(); else if (cpu_high >= 0x1) topo_probe_0x4(); } /* * Fallback: assume each logical CPU is in separate * physical package. That is, no multi-core, no SMT. */ if (cpu_cores == 0 || cpu_logical == 0) cpu_cores = cpu_logical = 1; cpu_topo_probed = 1; } struct cpu_group * cpu_topo(void) { int cg_flags; /* * Determine whether any threading flags are * necessry. */ topo_probe(); if (cpu_logical > 1 && hyperthreading_cpus) cg_flags = CG_FLAG_HTT; else if (cpu_logical > 1) cg_flags = CG_FLAG_SMT; else cg_flags = 0; if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { printf("WARNING: Non-uniform processors.\n"); printf("WARNING: Using suboptimal topology.\n"); return (smp_topo_none()); } /* * No multi-core or hyper-threaded. */ if (cpu_logical * cpu_cores == 1) return (smp_topo_none()); /* * Only HTT no multi-core. */ if (cpu_logical > 1 && cpu_cores == 1) return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); /* * Only multi-core no HTT. */ if (cpu_cores > 1 && cpu_logical == 1) return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); /* * Both HTT and multi-core. */ return (smp_topo_2level(CG_SHARE_L2, cpu_cores, CG_SHARE_L1, cpu_logical, cg_flags)); } /* * Calculate usable address in base memory for AP trampoline code. */ u_int mp_bootaddress(u_int basemem) { boot_address = trunc_page(basemem); /* round down to 4k boundary */ if ((basemem - boot_address) < bootMP_size) boot_address -= PAGE_SIZE; /* not enough, lower by 4k */ return boot_address; } void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) { mp_ncpus++; mp_maxid = mp_ncpus - 1; } if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_maxid should be already set by calls to cpu_add(). * Just sanity check its value here. */ if (mp_ncpus == 0) KASSERT(mp_maxid == 0, ("%s: mp_ncpus is zero, but mp_maxid is not", __func__)); else if (mp_ncpus == 1) mp_maxid = 0; else KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); if (mp_ncpus == 0) { /* * No CPUs were found, so this must be a UP system. Setup * the variables to represent a system with a single CPU * with an id of 0. */ mp_ncpus = 1; return (0); } /* At least one CPU was found. */ if (mp_ncpus == 1) { /* * One CPU was found, so this must be a UP system with * an I/O APIC. */ mp_maxid = 0; return (0); } /* At least two CPUs were found. */ return (1); } /* * Initialize the IPI handlers and start up the AP's. */ void cpu_mp_start(void) { int i; /* Initialize the logical ID to APIC ID table. */ for (i = 0; i < MAXCPU; i++) { cpu_apic_ids[i] = -1; cpu_ipi_pending[i] = 0; } /* Install an inter-CPU IPI for TLB invalidation */ setidt(IPI_INVLTLB, IDTVEC(invltlb), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLPG, IDTVEC(invlpg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IPI_INVLRNG, IDTVEC(invlrng), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for cache invalidation. */ setidt(IPI_INVLCACHE, IDTVEC(invlcache), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for lazy pmap release */ setidt(IPI_LAZYPMAP, IDTVEC(lazypmap), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for all-CPU rendezvous */ setidt(IPI_RENDEZVOUS, IDTVEC(rendezvous), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install generic inter-CPU IPI handler */ setidt(IPI_BITMAP_VECTOR, IDTVEC(ipi_intr_bitmap_handler), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU stop/restart */ setidt(IPI_STOP, IDTVEC(cpustop), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Install an inter-CPU IPI for CPU suspend/resume */ setidt(IPI_SUSPEND, IDTVEC(cpususpend), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); /* Set boot_cpu_id if needed. */ if (boot_cpu_id == -1) { boot_cpu_id = PCPU_GET(apic_id); cpu_info[boot_cpu_id].cpu_bsp = 1; } else KASSERT(boot_cpu_id == PCPU_GET(apic_id), ("BSP's APIC ID doesn't match boot_cpu_id")); /* Probe logical/physical core configuration. */ topo_probe(); assign_cpu_ids(); /* Start each Application Processor */ start_all_aps(); set_interrupt_apic_ids(); } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { const char *hyperthread; int i; printf("FreeBSD/SMP: %d package(s) x %d core(s)", mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); if (hyperthreading_cpus > 1) printf(" x %d HTT threads", cpu_logical); else if (cpu_logical > 1) printf(" x %d SMT threads", cpu_logical); printf("\n"); /* List active CPUs first. */ printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); for (i = 1; i < mp_ncpus; i++) { if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, cpu_apic_ids[i]); } /* List disabled CPUs last. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) continue; if (cpu_info[i].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, i); } } /* * AP CPU's call this to initialize themselves. */ void init_secondary(void) { struct pcpu *pc; vm_offset_t addr; int gsel_tss; int x, myid; u_int cpuid, cr0; /* bootAP is set in start_ap() to our ID. */ myid = bootAP; /* Get per-cpu data */ pc = &__pcpu[myid]; /* prime data page for it to use */ pcpu_init(pc, myid, sizeof(struct pcpu)); dpcpu_init(dpcpu, myid); pc->pc_apic_id = cpu_apic_ids[myid]; pc->pc_prvspace = pc; pc->pc_curthread = 0; fix_cpuid(); gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) { ssdtosd(&gdt_segs[x], &gdt[myid * NGDT + x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) &gdt[myid * NGDT]; lgdt(&r_gdt); /* does magic intra-segment return */ lidt(&r_idt); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); gdt[myid * NGDT + GPROC0_SEL].sd.sd_type = SDT_SYS386TSS; PCPU_SET(common_tss.tss_esp0, 0); /* not used until after switch */ PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); PCPU_SET(tss_gdt, &gdt[myid * NGDT + GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); ltr(gsel_tss); PCPU_SET(fsgs_gdt, &gdt[myid * NGDT + GUFS_SEL].sd); /* * Set to a known state: * Set by mpboot.s: CR0_PG, CR0_PE * Set by cpu_setregs: CR0_NE, CR0_MP, CR0_TS, CR0_WP, CR0_AM */ cr0 = rcr0(); cr0 &= ~(CR0_CD | CR0_NW | CR0_EM); load_cr0(cr0); CHECK_WRITE(0x38, 5); /* Disable local APIC just to be sure. */ lapic_disable(); /* signal our startup to the BSP. */ mp_naps++; CHECK_WRITE(0x39, 6); /* Spin until the BSP releases the AP's. */ while (!aps_ready) ia32_pause(); /* BSP may have changed PTD while we were waiting */ invltlb(); for (addr = 0; addr < NKPT * NBPDR - 1; addr += PAGE_SIZE) invlpg(addr); #if defined(I586_CPU) && !defined(NO_F00F_HACK) lidt(&r_idt); #endif /* Initialize the PAT MSR if present. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX */ initializecpu(); /* set up FPU state on the AP */ npxinit(false); if (cpu_ops.cpu_init) cpu_ops.cpu_init(); /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mca_init(); mtx_lock_spin(&ap_boot_mtx); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); printf("SMP: AP CPU #%d Launched!\n", cpuid); /* Determine if we are a logical CPU. */ /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (smp_started == 0) ia32_pause(); /* Start per-CPU event timers. */ cpu_initclocks_ap(); /* Enter the scheduler. */ sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } /******************************************************************* * local functions and data */ /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ static void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (hyperthreading_cpus > 1 && apic_id % hyperthreading_cpus != 0) continue; intr_add_cpu(i); } } /* * Assign logical CPU IDs to local APICs. */ static void assign_cpu_ids(void) { u_int i; TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", &hyperthreading_allowed); /* Check for explicitly disabled CPUs. */ for (i = 0; i <= MAX_APIC_ID; i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) continue; if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { cpu_info[i].cpu_hyperthread = 1; /* * Don't use HT CPU if it has been disabled by a * tunable. */ if (hyperthreading_allowed == 0) { cpu_info[i].cpu_disabled = 1; continue; } } /* Don't use this CPU if it has been disabled by a tunable. */ if (resource_disabled("lapic", i)) { cpu_info[i].cpu_disabled = 1; continue; } } if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { hyperthreading_cpus = 0; cpu_logical = 1; } /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. * * To minimize confusion for userland, we attempt to number * CPUs such that all threads and cores in a package are * grouped together. For now we assume that the BSP is always * the first thread in a package and just start adding APs * starting with the BSP's APIC ID. */ mp_ncpus = 1; cpu_apic_ids[0] = boot_cpu_id; apic_cpuids[boot_cpu_id] = 0; for (i = boot_cpu_id + 1; i != boot_cpu_id; i == MAX_APIC_ID ? i = 0 : i++) { if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || cpu_info[i].cpu_disabled) continue; if (mp_ncpus < MAXCPU) { cpu_apic_ids[mp_ncpus] = i; apic_cpuids[i] = mp_ncpus; mp_ncpus++; } else cpu_info[i].cpu_disabled = 1; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } /* * start each AP in our list */ /* Lowest 1MB is already mapped: don't touch*/ #define TMPMAP_START 1 static int start_all_aps(void) { #ifndef PC98 u_char mpbiosreason; #endif u_int32_t mpbioswarmvec; int apic_id, cpu, i; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* install the AP 1st level boot code */ install_ap_tramp(); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); #endif /* set up temporary P==V mapping for AP boot */ /* XXX this is a hack, we should boot the AP on its own stack/PTD */ for (i = TMPMAP_START; i < NKPT; i++) PTD[i] = PTD[KPTDI + i]; invltlb(); /* start each AP */ for (cpu = 1; cpu < mp_ncpus; cpu++) { apic_id = cpu_apic_ids[cpu]; /* allocate and set up a boot stack data page */ bootstacks[cpu] = (char *)kmem_malloc(kernel_arena, KSTACK_PAGES * PAGE_SIZE, M_WAITOK | M_ZERO); dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE, M_WAITOK | M_ZERO); /* setup a vector to our boot code */ *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ #endif bootSTK = (char *)bootstacks[cpu] + KSTACK_PAGES * PAGE_SIZE - 4; bootAP = cpu; /* attempt to start the Application Processor */ CHECK_INIT(99); /* setup checkpoints */ if (!start_ap(apic_id)) { printf("AP #%d (PHY# %d) failed!\n", cpu, apic_id); CHECK_PRINT("trace"); /* show checkpoints */ /* better panic as the AP may be running loose */ printf("panic y/n? [y] "); if (cngetc() != 'n') panic("bye-bye"); } CHECK_PRINT("trace"); /* show checkpoints */ CPU_SET(cpu, &all_cpus); /* record AP in CPU map */ } /* restore the warmstart vector */ *(u_int32_t *) WARMBOOT_OFF = mpbioswarmvec; #ifndef PC98 outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); #endif /* Undo V==P hack from above */ for (i = TMPMAP_START; i < NKPT; i++) PTD[i] = 0; pmap_invalidate_range(kernel_pmap, 0, NKPT * NBPDR - 1); /* number of APs actually started */ return mp_naps; } /* * load the 1st level AP boot code into base memory. */ /* targets for relocation */ extern void bigJump(void); extern void bootCodeSeg(void); extern void bootDataSeg(void); extern void MPentry(void); extern u_int MP_GDT; extern u_int mp_gdtbase; static void install_ap_tramp(void) { int x; int size = *(int *) ((u_long) & bootMP_size); vm_offset_t va = boot_address + KERNBASE; u_char *src = (u_char *) ((u_long) bootMP); u_char *dst = (u_char *) va; u_int boot_base = (u_int) bootMP; u_int8_t *dst8; u_int16_t *dst16; u_int32_t *dst32; KASSERT (size <= PAGE_SIZE, ("'size' do not fit into PAGE_SIZE, as expected.")); pmap_kenter(va, boot_address); pmap_invalidate_page (kernel_pmap, va); for (x = 0; x < size; ++x) *dst++ = *src++; /* * modify addresses in code we just moved to basemem. unfortunately we * need fairly detailed info about mpboot.s for this to work. changes * to mpboot.s might require changes here. */ /* boot code is located in KERNEL space */ dst = (u_char *) va; /* modify the lgdt arg */ dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); *dst32 = boot_address + ((u_int) & MP_GDT - boot_base); /* modify the ljmp target for MPentry() */ dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); *dst32 = ((u_int) MPentry - KERNBASE); /* modify the target for boot code segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; /* modify the target for boot data segment */ dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); dst8 = (u_int8_t *) (dst16 + 1); *dst16 = (u_int) boot_address & 0xffff; *dst8 = ((u_int) boot_address >> 16) & 0xff; } /* * This function starts the AP (application processor) identified * by the APIC ID 'physicalCpu'. It does quite a "song and dance" * to accomplish this. This is necessary because of the nuances * of the different hardware we might encounter. It isn't pretty, * but it seems to work. */ static int start_ap(int apic_id) { int vector, ms; int cpus; /* calculate the vector */ vector = (boot_address >> 12) & 0xff; /* used as a watchpoint to signal AP startup */ cpus = mp_naps; ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to start. */ for (ms = 0; ms < 5000; ms++) { if (mp_naps > cpus) return 1; /* return SUCCESS */ DELAY(1000); } return 0; /* return FAILURE */ } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); u_int ipi_masked_global; u_int ipi_masked_page; u_int ipi_masked_range; u_int ipi_masked_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_global, CTLFLAG_RW, &ipi_masked_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_page, CTLFLAG_RW, &ipi_masked_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range, CTLFLAG_RW, &ipi_masked_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_masked_range_size, CTLFLAG_RW, &ipi_masked_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Init and startup IPI. */ void ipi_startup(int apic_id, int vector) { /* * This attempts to follow the algorithm described in the * Intel Multiprocessor Specification v1.4 in section B.4. * For each IPI, we allow the local APIC ~20us to deliver the * IPI. If that times out, we panic. */ /* * first we do an INIT IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); lapic_ipi_wait(100); /* Explicitly deassert the INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); DELAY(10000); /* wait ~10mS */ /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver first STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver second STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ } /* * Send an IPI to specified CPU handling the bitmap logic. */ static void ipi_send_cpu(int cpu, u_int ipi) { u_int bitmap, old_pending, new_pending; KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (old_pending) return; } cpu_ops.ipi_vectored(ipi, cpu_apic_ids[cpu]); } /* * Flush the TLB on all other CPU's */ static void smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2) { u_int ncpu; ncpu = mp_ncpus - 1; /* does not shootdown self */ if (ncpu < 1) return; /* no other cpus */ if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); ipi_all_but_self(vector); while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } static void smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2) { int cpu, ncpu, othercpus; othercpus = mp_ncpus - 1; if (CPU_ISFULLSET(&mask)) { if (othercpus < 1) return; } else { CPU_CLR(PCPU_GET(cpuid), &mask); if (CPU_EMPTY(&mask)) return; } if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; atomic_store_rel_int(&smp_tlb_wait, 0); if (CPU_ISFULLSET(&mask)) { ncpu = othercpus; ipi_all_but_self(vector); } else { ncpu = 0; while ((cpu = CPU_FFS(&mask)) != 0) { cpu--; CPU_CLR(cpu, &mask); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, vector); ipi_send_cpu(cpu, vector); ncpu++; } } while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_cache_flush(void) { if (smp_started) smp_tlb_shootdown(IPI_INVLCACHE, 0, 0); } void smp_invltlb(void) { if (smp_started) { smp_tlb_shootdown(IPI_INVLTLB, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_global++; #endif } } void smp_invlpg(vm_offset_t addr) { if (smp_started) { smp_tlb_shootdown(IPI_INVLPG, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif } } void smp_invlpg_range(vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_tlb_shootdown(IPI_INVLRNG, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void smp_masked_invltlb(cpuset_t mask) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_global++; #endif } } void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_masked_page++; #endif } } void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_masked_range++; ipi_masked_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_nmi_pending, &cpus); while ((cpu = CPU_FFS(&cpus)) != 0) { cpu--; CPU_CLR(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); if (IPI_IS_BITMAPED(ipi)) { ipi_selected(other_cpus, ipi); return; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_nmi_pending, &other_cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); cpu_ops.ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int ipi_nmi_handler() { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_nmi_pending); cpustop_handler(); return (0); } /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { u_int cpu; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { u_int cpu; mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { npxsuspend(susppcbs[cpu]->sp_fpususpend); wbinvd(); CPU_SET_ATOMIC(cpu, &suspended_cpus); + /* + * Hack for xen, which does not use resumectx() so never + * uses the next clause: set resuming_cpus early so that + * resume_cpus() can wait on the same bitmap for acpi and + * xen. resuming_cpus now means eventually_resumable_cpus. + */ + CPU_SET_ATOMIC(cpu, &resuming_cpus); } else { npxresume(susppcbs[cpu]->sp_fpususpend); pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); - /* Indicate that we are resumed */ + /* Indicate that we are resuming */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); } - /* Wait for resume */ - while (!CPU_ISSET(cpu, &started_cpus)) + /* Wait for resume directive */ + while (!CPU_ISSET(cpu, &toresume_cpus)) ia32_pause(); if (cpu_ops.cpu_resume) cpu_ops.cpu_resume(); /* Resume MCA and local APIC */ mca_resume(); lapic_setup(0); /* Indicate that we are resumed */ + CPU_CLR_ATOMIC(cpu, &resuming_cpus); CPU_CLR_ATOMIC(cpu, &suspended_cpus); - CPU_CLR_ATOMIC(cpu, &started_cpus); + CPU_CLR_ATOMIC(cpu, &toresume_cpus); } /* * Handlers for TLB related IPIs */ void invltlb_handler(void) { uint64_t cr3; #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ cr3 = rcr3(); load_cr3(cr3); atomic_add_int(&smp_tlb_wait, 1); } void invlpg_handler(void) { #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ invlpg(smp_tlb_addr1); atomic_add_int(&smp_tlb_wait, 1); } void invlrng_handler(void) { vm_offset_t addr; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; do { invlpg(addr); addr += PAGE_SIZE; } while (addr < smp_tlb_addr2); atomic_add_int(&smp_tlb_wait, 1); } void invlcache_handler(void) { #ifdef COUNT_IPIS (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ wbinvd(); atomic_add_int(&smp_tlb_wait, 1); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. */ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); intrcnt_add(buf, &ipi_invlcache_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:lazypmap", i); intrcnt_add(buf, &ipi_lazypmap_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif Index: stable/10/sys/kern/subr_smp.c =================================================================== --- stable/10/sys/kern/subr_smp.c (revision 331909) +++ stable/10/sys/kern/subr_smp.c (revision 331910) @@ -1,847 +1,852 @@ /*- * Copyright (c) 2001, John Baldwin . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and machine independent functions * used for the kernel SMP support. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_sched.h" #ifdef SMP volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; volatile cpuset_t suspended_cpus; cpuset_t hlt_cpus_mask; cpuset_t logical_cpus_mask; void (*cpustop_restartfunc)(void); #endif static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS); /* This is used in modules that need to work in both SMP and UP. */ cpuset_t all_cpus; int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; volatile int smp_started; u_int mp_maxid; static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL, "Kernel SMP"); SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0, "Max CPU ID."); SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus, 0, "Max number of CPUs that the system was compiled for."); SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD | CTLTYPE_INT, NULL, 0, sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode"); int smp_disabled = 0; /* has smp been disabled? */ SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD, &smp_disabled, 0, "SMP has been disabled from the loader"); TUNABLE_INT("kern.smp.disabled", &smp_disabled); int smp_cpus = 1; /* how many cpu's running */ SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0, "Number of CPUs online"); int smp_topology = 0; /* Which topology we're using. */ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RD, &smp_topology, 0, "Topology override setting; 0 is default provided by hardware."); TUNABLE_INT("kern.smp.topology", &smp_topology); #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, &forward_signal_enabled, 0, "Forwarding of a signal to a process on a different CPU"); /* Variables needed for SMP rendezvous. */ static volatile int smp_rv_ncpus; static void (*volatile smp_rv_setup_func)(void *arg); static void (*volatile smp_rv_action_func)(void *arg); static void (*volatile smp_rv_teardown_func)(void *arg); static void *volatile smp_rv_func_arg; static volatile int smp_rv_waiters[4]; /* * Shared mutex to restrict busywaits between smp_rendezvous() and * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these * functions trigger at once and cause multiple CPUs to busywait with * interrupts disabled. */ struct mtx smp_ipi_mtx; /* * Let the MD SMP code initialize mp_maxid very early if it can. */ static void mp_setmaxid(void *dummy) { cpu_mp_setmaxid(); } SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL); /* * Call the MD SMP initialization code. */ static void mp_start(void *dummy) { mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN); /* Probe for MP hardware. */ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncpus = 1; CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } cpu_mp_start(); printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", mp_ncpus); cpu_mp_announce(); } SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL); void forward_signal(struct thread *td) { int id; /* * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); if (!smp_started || cold || panicstr) return; if (!forward_signal_enabled) return; /* No need to IPI ourself. */ if (td == curthread) return; id = td->td_oncpu; if (id == NOCPU) return; ipi_cpu(id, IPI_AST); } /* * When called the executing CPU will send an IPI to all other CPUs * requesting that they halt execution. * * Usually (but not necessarily) called with 'other_cpus' as its arg. * * - Signals all CPUs in map to stop. * - Waits for each to stop. * * Returns: * -1: error * 0: NA * 1: ok * */ static int generic_stop_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif static volatile u_int stopping_cpu = NOCPU; int i; volatile cpuset_t *cpus; KASSERT( #if defined(__amd64__) || defined(__i386__) type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND, #else type == IPI_STOP || type == IPI_STOP_HARD, #endif ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR2(KTR_SMP, "stop_cpus(%s) with %u type", cpusetobj_strprint(cpusetbuf, &map), type); #if defined(__amd64__) || defined(__i386__) /* * When suspending, ensure there are are no IPIs in progress. * IPIs that have been issued, but not yet delivered (e.g. * not pending on a vCPU when running under virtualization) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery. */ if (type == IPI_SUSPEND) mtx_lock_spin(&smp_ipi_mtx); #endif if (stopping_cpu != PCPU_GET(cpuid)) while (atomic_cmpset_int(&stopping_cpu, NOCPU, PCPU_GET(cpuid)) == 0) while (stopping_cpu != NOCPU) cpu_spinwait(); /* spin */ /* send the stop IPI to all CPUs in map */ ipi_selected(map, type); #if defined(__amd64__) || defined(__i386__) if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; i = 0; while (!CPU_SUBSET(cpus, &map)) { /* spin */ cpu_spinwait(); i++; if (i == 100000000) { printf("timeout stopping cpus\n"); break; } } #if defined(__amd64__) || defined(__i386__) if (type == IPI_SUSPEND) mtx_unlock_spin(&smp_ipi_mtx); #endif stopping_cpu = NOCPU; return (1); } int stop_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP)); } int stop_cpus_hard(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP_HARD)); } #if defined(__amd64__) || defined(__i386__) int suspend_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_SUSPEND)); } #endif /* * Called by a CPU to restart stopped CPUs. * * Usually (but not necessarily) called with 'stopped_cpus' as its arg. * * - Signals all CPUs in map to restart. * - Waits for each to restart. * * Returns: * -1: error * 0: NA * 1: ok */ static int generic_restart_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif volatile cpuset_t *cpus; KASSERT( #if defined(__amd64__) || defined(__i386__) type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND, #else type == IPI_STOP || type == IPI_STOP_HARD, #endif ("%s: invalid stop type", __func__)); if (!smp_started) return 0; CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); #if defined(__amd64__) || defined(__i386__) if (type == IPI_SUSPEND) - cpus = &suspended_cpus; + cpus = &resuming_cpus; else #endif cpus = &stopped_cpus; /* signal other cpus to restart */ - CPU_COPY_STORE_REL(&map, &started_cpus); +#if defined(__amd64__) || defined(__i386__) + if (type == IPI_SUSPEND) + CPU_COPY_STORE_REL(&map, &toresume_cpus); + else +#endif + CPU_COPY_STORE_REL(&map, &started_cpus); /* wait for each to clear its bit */ while (CPU_OVERLAP(cpus, &map)) cpu_spinwait(); return 1; } int restart_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_STOP)); } #if defined(__amd64__) || defined(__i386__) int resume_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_SUSPEND)); } #endif /* * All-CPU rendezvous. CPUs are signalled, all execute the setup function * (if specified), rendezvous, execute the action function (if specified), * rendezvous again, execute the teardown function (if specified), and then * resume. * * Note that the supplied external functions _must_ be reentrant and aware * that they are running in parallel and in an unknown lock context. */ void smp_rendezvous_action(void) { struct thread *td; void *local_func_arg; void (*local_setup_func)(void*); void (*local_action_func)(void*); void (*local_teardown_func)(void*); #ifdef INVARIANTS int owepreempt; #endif /* Ensure we have up-to-date values. */ atomic_add_acq_int(&smp_rv_waiters[0], 1); while (smp_rv_waiters[0] < smp_rv_ncpus) cpu_spinwait(); /* Fetch rendezvous parameters after acquire barrier. */ local_func_arg = smp_rv_func_arg; local_setup_func = smp_rv_setup_func; local_action_func = smp_rv_action_func; local_teardown_func = smp_rv_teardown_func; /* * Use a nested critical section to prevent any preemptions * from occurring during a rendezvous action routine. * Specifically, if a rendezvous handler is invoked via an IPI * and the interrupted thread was in the critical_exit() * function after setting td_critnest to 0 but before * performing a deferred preemption, this routine can be * invoked with td_critnest set to 0 and td_owepreempt true. * In that case, a critical_exit() during the rendezvous * action would trigger a preemption which is not permitted in * a rendezvous action. To fix this, wrap all of the * rendezvous action handlers in a critical section. We * cannot use a regular critical section however as having * critical_exit() preempt from this routine would also be * problematic (the preemption must not occur before the IPI * has been acknowledged via an EOI). Instead, we * intentionally ignore td_owepreempt when leaving the * critical section. This should be harmless because we do * not permit rendezvous action routines to schedule threads, * and thus td_owepreempt should never transition from 0 to 1 * during this routine. */ td = curthread; td->td_critnest++; #ifdef INVARIANTS owepreempt = td->td_owepreempt; #endif /* * If requested, run a setup function before the main action * function. Ensure all CPUs have completed the setup * function before moving on to the action function. */ if (local_setup_func != smp_no_rendevous_barrier) { if (smp_rv_setup_func != NULL) smp_rv_setup_func(smp_rv_func_arg); atomic_add_int(&smp_rv_waiters[1], 1); while (smp_rv_waiters[1] < smp_rv_ncpus) cpu_spinwait(); } if (local_action_func != NULL) local_action_func(local_func_arg); if (local_teardown_func != smp_no_rendevous_barrier) { /* * Signal that the main action has been completed. If a * full exit rendezvous is requested, then all CPUs will * wait here until all CPUs have finished the main action. */ atomic_add_int(&smp_rv_waiters[2], 1); while (smp_rv_waiters[2] < smp_rv_ncpus) cpu_spinwait(); if (local_teardown_func != NULL) local_teardown_func(local_func_arg); } /* * Signal that the rendezvous is fully completed by this CPU. * This means that no member of smp_rv_* pseudo-structure will be * accessed by this target CPU after this point; in particular, * memory pointed by smp_rv_func_arg. */ atomic_add_int(&smp_rv_waiters[3], 1); td->td_critnest--; KASSERT(owepreempt == td->td_owepreempt, ("rendezvous action changed td_owepreempt")); } void smp_rendezvous_cpus(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { int curcpumap, i, ncpus = 0; /* Look comments in the !SMP case. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } CPU_FOREACH(i) { if (CPU_ISSET(i, &map)) ncpus++; } if (ncpus == 0) panic("ncpus is 0 with non-zero map"); mtx_lock_spin(&smp_ipi_mtx); /* Pass rendezvous parameters via global variables. */ smp_rv_ncpus = ncpus; smp_rv_setup_func = setup_func; smp_rv_action_func = action_func; smp_rv_teardown_func = teardown_func; smp_rv_func_arg = arg; smp_rv_waiters[1] = 0; smp_rv_waiters[2] = 0; smp_rv_waiters[3] = 0; atomic_store_rel_int(&smp_rv_waiters[0], 0); /* * Signal other processors, which will enter the IPI with * interrupts off. */ curcpumap = CPU_ISSET(curcpu, &map); CPU_CLR(curcpu, &map); ipi_selected(map, IPI_RENDEZVOUS); /* Check if the current CPU is in the map */ if (curcpumap != 0) smp_rendezvous_action(); /* * Ensure that the master CPU waits for all the other * CPUs to finish the rendezvous, so that smp_rv_* * pseudo-structure and the arg are guaranteed to not * be in use. */ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus) cpu_spinwait(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_rendezvous(void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } static struct cpu_group group[MAXCPU]; struct cpu_group * smp_topo(void) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; struct cpu_group *top; /* * Check for a fake topology request for debugging purposes. */ switch (smp_topology) { case 1: /* Dual core with no sharing. */ top = smp_topo_1level(CG_SHARE_NONE, 2, 0); break; case 2: /* No topology, all cpus are equal. */ top = smp_topo_none(); break; case 3: /* Dual core with shared L2. */ top = smp_topo_1level(CG_SHARE_L2, 2, 0); break; case 4: /* quad core, shared l3 among each package, private l2. */ top = smp_topo_1level(CG_SHARE_L3, 4, 0); break; case 5: /* quad core, 2 dualcore parts on each package share l2. */ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); break; case 6: /* Single-core 2xHTT */ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); break; case 7: /* quad core with a shared l3, 8 threads sharing L2. */ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, CG_FLAG_SMT); break; default: /* Default, ask the system what it wants. */ top = cpu_topo(); break; } /* * Verify the returned topology. */ if (top->cg_count != mp_ncpus) panic("Built bad topology at %p. CPU count %d != %d", top, top->cg_count, mp_ncpus); if (CPU_CMP(&top->cg_mask, &all_cpus)) panic("Built bad topology at %p. CPU mask (%s) != (%s)", top, cpusetobj_strprint(cpusetbuf, &top->cg_mask), cpusetobj_strprint(cpusetbuf2, &all_cpus)); return (top); } struct cpu_group * smp_topo_none(void) { struct cpu_group *top; top = &group[0]; top->cg_parent = NULL; top->cg_child = NULL; top->cg_mask = all_cpus; top->cg_count = mp_ncpus; top->cg_children = 0; top->cg_level = CG_SHARE_NONE; top->cg_flags = 0; return (top); } static int smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, int count, int flags, int start) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; cpuset_t mask; int i; CPU_ZERO(&mask); for (i = 0; i < count; i++, start++) CPU_SET(start, &mask); child->cg_parent = parent; child->cg_child = NULL; child->cg_children = 0; child->cg_level = share; child->cg_count = count; child->cg_flags = flags; child->cg_mask = mask; parent->cg_children++; for (; parent != NULL; parent = parent->cg_parent) { if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask)) panic("Duplicate children in %p. mask (%s) child (%s)", parent, cpusetobj_strprint(cpusetbuf, &parent->cg_mask), cpusetobj_strprint(cpusetbuf2, &child->cg_mask)); CPU_OR(&parent->cg_mask, &child->cg_mask); parent->cg_count += child->cg_count; } return (start); } struct cpu_group * smp_topo_1level(int share, int count, int flags) { struct cpu_group *child; struct cpu_group *top; int packages; int cpu; int i; cpu = 0; top = &group[0]; packages = mp_ncpus / count; top->cg_child = child = &group[1]; top->cg_level = CG_SHARE_NONE; for (i = 0; i < packages; i++, child++) cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); return (top); } struct cpu_group * smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags) { struct cpu_group *top; struct cpu_group *l1g; struct cpu_group *l2g; int cpu; int i; int j; cpu = 0; top = &group[0]; l2g = &group[1]; top->cg_child = l2g; top->cg_level = CG_SHARE_NONE; top->cg_children = mp_ncpus / (l2count * l1count); l1g = l2g + top->cg_children; for (i = 0; i < top->cg_children; i++, l2g++) { l2g->cg_parent = top; l2g->cg_child = l1g; l2g->cg_level = l2share; for (j = 0; j < l2count; j++, l1g++) cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, l1flags, cpu); } return (top); } struct cpu_group * smp_topo_find(struct cpu_group *top, int cpu) { struct cpu_group *cg; cpuset_t mask; int children; int i; CPU_SETOF(cpu, &mask); cg = top; for (;;) { if (!CPU_OVERLAP(&cg->cg_mask, &mask)) return (NULL); if (cg->cg_children == 0) return (cg); children = cg->cg_children; for (i = 0, cg = cg->cg_child; i < children; cg++, i++) if (CPU_OVERLAP(&cg->cg_mask, &mask)) break; } return (NULL); } #else /* !SMP */ void smp_rendezvous_cpus(cpuset_t map, void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* * In the !SMP case we just need to ensure the same initial conditions * as the SMP case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } void smp_rendezvous(void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* Look comments in the smp_rendezvous_cpus() case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } /* * Provide dummy SMP support for UP kernels. Modules that need to use SMP * APIs will still work using this dummy support. */ static void mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_maxid = PCPU_GET(cpuid); CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setvariables_for_up, NULL); #endif /* SMP */ void smp_no_rendevous_barrier(void *dummy) { #ifdef SMP KASSERT((!smp_started),("smp_no_rendevous called and smp is started")); #endif } /* * Wait specified idle threads to switch once. This ensures that even * preempted threads have cycled through the switch function once, * exiting their codepaths. This allows us to change global pointers * with no other synchronization. */ int quiesce_cpus(cpuset_t map, const char *wmesg, int prio) { struct pcpu *pcpu; u_int gen[MAXCPU]; int error; int cpu; error = 0; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); gen[cpu] = pcpu->pc_idlethread->td_generation; } for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); while (gen[cpu] == pcpu->pc_idlethread->td_generation) { error = tsleep(quiesce_cpus, prio, wmesg, 1); if (error != EWOULDBLOCK) goto out; error = 0; } } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); return (error); } int quiesce_all_cpus(const char *wmesg, int prio) { return quiesce_cpus(all_cpus, wmesg, prio); } /* Extra care is taken with this sysctl because the data type is volatile */ static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS) { int error, active; active = smp_started; error = SYSCTL_OUT(req, &active, sizeof(active)); return (error); } Index: stable/10/sys/sys/smp.h =================================================================== --- stable/10/sys/sys/smp.h (revision 331909) +++ stable/10/sys/sys/smp.h (revision 331910) @@ -1,190 +1,193 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ */ #ifndef _SYS_SMP_H_ #define _SYS_SMP_H_ #ifdef _KERNEL #ifndef LOCORE #include /* * Topology of a NUMA or HTT system. * * The top level topology is an array of pointers to groups. Each group * contains a bitmask of cpus in its group or subgroups. It may also * contain a pointer to an array of child groups. * * The bitmasks at non leaf groups may be used by consumers who support * a smaller depth than the hardware provides. * * The topology may be omitted by systems where all CPUs are equal. */ struct cpu_group { struct cpu_group *cg_parent; /* Our parent group. */ struct cpu_group *cg_child; /* Optional children groups. */ cpuset_t cg_mask; /* Mask of cpus in this group. */ int32_t cg_count; /* Count of cpus in this group. */ int16_t cg_children; /* Number of children groups. */ int8_t cg_level; /* Shared cache level. */ int8_t cg_flags; /* Traversal modifiers. */ }; typedef struct cpu_group *cpu_group_t; /* * Defines common resources for CPUs in the group. The highest level * resource should be used when multiple are shared. */ #define CG_SHARE_NONE 0 #define CG_SHARE_L1 1 #define CG_SHARE_L2 2 #define CG_SHARE_L3 3 /* * Behavior modifiers for load balancing and affinity. */ #define CG_FLAG_HTT 0x01 /* Schedule the alternate core last. */ #define CG_FLAG_SMT 0x02 /* New age htt, less crippled. */ #define CG_FLAG_THREAD (CG_FLAG_HTT | CG_FLAG_SMT) /* Any threading. */ /* * Convenience routines for building topologies. */ #ifdef SMP struct cpu_group *smp_topo(void); struct cpu_group *smp_topo_none(void); struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_find(struct cpu_group *top, int cpu); extern void (*cpustop_restartfunc)(void); extern int smp_cpus; -extern volatile cpuset_t started_cpus; -extern volatile cpuset_t stopped_cpus; -extern volatile cpuset_t suspended_cpus; -extern cpuset_t hlt_cpus_mask; +/* The suspend/resume cpusets are x86 only, but minimize ifdefs. */ +extern volatile cpuset_t resuming_cpus; /* woken up cpus in suspend pen */ +extern volatile cpuset_t started_cpus; /* cpus to let out of stop pen */ +extern volatile cpuset_t stopped_cpus; /* cpus in stop pen */ +extern volatile cpuset_t suspended_cpus; /* cpus [near] sleeping in susp pen */ +extern volatile cpuset_t toresume_cpus; /* cpus to let out of suspend pen */ +extern cpuset_t hlt_cpus_mask; /* XXX 'mask' is detail in old impl */ extern cpuset_t logical_cpus_mask; #endif /* SMP */ extern u_int mp_maxid; extern int mp_maxcpus; extern int mp_ncpus; extern volatile int smp_started; extern cpuset_t all_cpus; /* * Macro allowing us to determine whether a CPU is absent at any given * time, thus permitting us to configure sparse maps of cpuid-dependent * (per-CPU) structures. */ #define CPU_ABSENT(x_cpu) (!CPU_ISSET(x_cpu, &all_cpus)) /* * Macros to iterate over non-absent CPUs. CPU_FOREACH() takes an * integer iterator and iterates over the available set of CPUs. * CPU_FIRST() returns the id of the first non-absent CPU. CPU_NEXT() * returns the id of the next non-absent CPU. It will wrap back to * CPU_FIRST() once the end of the list is reached. The iterators are * currently implemented via inline functions. */ #define CPU_FOREACH(i) \ for ((i) = 0; (i) <= mp_maxid; (i)++) \ if (!CPU_ABSENT((i))) static __inline int cpu_first(void) { int i; for (i = 0;; i++) if (!CPU_ABSENT(i)) return (i); } static __inline int cpu_next(int i) { for (;;) { i++; if (i > mp_maxid) i = 0; if (!CPU_ABSENT(i)) return (i); } } #define CPU_FIRST() cpu_first() #define CPU_NEXT(i) cpu_next((i)) #ifdef SMP /* * Machine dependent functions used to initialize MP support. * * The cpu_mp_probe() should check to see if MP support is present and return * zero if it is not or non-zero if it is. If MP support is present, then * cpu_mp_start() will be called so that MP can be enabled. This function * should do things such as startup secondary processors. It should also * setup mp_ncpus, all_cpus, and smp_cpus. It should also ensure that * smp_started is initialized at the appropriate time. * Once cpu_mp_start() returns, machine independent MP startup code will be * executed and a simple message will be output to the console. Finally, * cpu_mp_announce() will be called so that machine dependent messages about * the MP support may be output to the console if desired. * * The cpu_setmaxid() function is called very early during the boot process * so that the MD code may set mp_maxid to provide an upper bound on CPU IDs * that other subsystems may use. If a platform is not able to determine * the exact maximum ID that early, then it may set mp_maxid to MAXCPU - 1. */ struct thread; struct cpu_group *cpu_topo(void); void cpu_mp_announce(void); int cpu_mp_probe(void); void cpu_mp_setmaxid(void); void cpu_mp_start(void); void forward_signal(struct thread *); int restart_cpus(cpuset_t); int stop_cpus(cpuset_t); int stop_cpus_hard(cpuset_t); #if defined(__amd64__) || defined(__i386__) int suspend_cpus(cpuset_t); int resume_cpus(cpuset_t); #endif void smp_rendezvous_action(void); extern struct mtx smp_ipi_mtx; #endif /* SMP */ int quiesce_all_cpus(const char *, int); int quiesce_cpus(cpuset_t, const char *, int); void smp_no_rendevous_barrier(void *); void smp_rendezvous(void (*)(void *), void (*)(void *), void (*)(void *), void *arg); void smp_rendezvous_cpus(cpuset_t, void (*)(void *), void (*)(void *), void (*)(void *), void *arg); #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* _SYS_SMP_H_ */ Index: stable/10/sys/x86/acpica/acpi_wakeup.c =================================================================== --- stable/10/sys/x86/acpica/acpi_wakeup.c (revision 331909) +++ stable/10/sys/x86/acpica/acpi_wakeup.c (revision 331910) @@ -1,408 +1,408 @@ /*- * Copyright (c) 2001 Takanori Watanabe * Copyright (c) 2001-2012 Mitsuru IWASAKI * Copyright (c) 2003 Peter Wemm * Copyright (c) 2008-2012 Jung-uk Kim * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifdef __i386__ #include "opt_npx.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #include #include #endif #include #include #include "acpi_wakecode.h" #include "acpi_wakedata.h" /* Make sure the code is less than a page and leave room for the stack. */ CTASSERT(sizeof(wakecode) < PAGE_SIZE - 1024); extern int acpi_resume_beep; extern int acpi_reset_video; #ifdef SMP extern struct susppcb **susppcbs; static cpuset_t suspcpus; #else static struct susppcb **susppcbs; #endif static void *acpi_alloc_wakeup_handler(void); static void acpi_stop_beep(void *); #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *, int); static void acpi_wakeup_cpus(struct acpi_softc *); #endif #ifdef __amd64__ #define ACPI_PAGETABLES 3 #else #define ACPI_PAGETABLES 0 #endif #define WAKECODE_VADDR(sc) \ ((sc)->acpi_wakeaddr + (ACPI_PAGETABLES * PAGE_SIZE)) #define WAKECODE_PADDR(sc) \ ((sc)->acpi_wakephys + (ACPI_PAGETABLES * PAGE_SIZE)) #define WAKECODE_FIXUP(offset, type, val) do { \ type *addr; \ addr = (type *)(WAKECODE_VADDR(sc) + offset); \ *addr = val; \ } while (0) static void acpi_stop_beep(void *arg) { if (acpi_resume_beep != 0) timer_spkr_release(); } #ifdef SMP static int acpi_wakeup_ap(struct acpi_softc *sc, int cpu) { struct pcb *pcb; int vector = (WAKECODE_PADDR(sc) >> 12) & 0xff; int apic_id = cpu_apic_ids[cpu]; int ms; pcb = &susppcbs[cpu]->sp_pcb; WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb); WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base); ipi_startup(apic_id, vector); /* Wait up to 5 seconds for it to resume. */ for (ms = 0; ms < 5000; ms++) { if (!CPU_ISSET(cpu, &suspended_cpus)) return (1); /* return SUCCESS */ DELAY(1000); } return (0); /* return FAILURE */ } #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) static void acpi_wakeup_cpus(struct acpi_softc *sc) { uint32_t mpbioswarmvec; int cpu; u_char mpbiosreason; /* save the current value of the warm-start vector */ mpbioswarmvec = *((uint32_t *)WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET; *((volatile u_short *)WARMBOOT_SEG) = WAKECODE_PADDR(sc) >> 4; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ /* Wake up each AP. */ for (cpu = 1; cpu < mp_ncpus; cpu++) { if (!CPU_ISSET(cpu, &suspcpus)) continue; if (acpi_wakeup_ap(sc, cpu) == 0) { /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; panic("acpi_wakeup: failed to resume AP #%d (PHY #%d)", cpu, cpu_apic_ids[cpu]); } } /* restore the warmstart vector */ *(uint32_t *)WARMBOOT_OFF = mpbioswarmvec; outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); } #endif int acpi_sleep_machdep(struct acpi_softc *sc, int state) { ACPI_STATUS status; struct pcb *pcb; if (sc->acpi_wakeaddr == 0ul) return (-1); /* couldn't alloc wake memory */ #ifdef SMP suspcpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &suspcpus); #endif if (acpi_resume_beep != 0) timer_spkr_acquire(); AcpiSetFirmwareWakingVector(WAKECODE_PADDR(sc), 0); intr_suspend(); pcb = &susppcbs[0]->sp_pcb; if (savectx(pcb)) { #ifdef __amd64__ fpususpend(susppcbs[0]->sp_fpususpend); #elif defined(DEV_NPX) npxsuspend(susppcbs[0]->sp_fpususpend); #endif #ifdef SMP if (!CPU_EMPTY(&suspcpus) && suspend_cpus(suspcpus) == 0) { device_printf(sc->acpi_dev, "Failed to suspend APs\n"); return (0); /* couldn't sleep */ } #endif WAKECODE_FIXUP(resume_beep, uint8_t, (acpi_resume_beep != 0)); WAKECODE_FIXUP(reset_video, uint8_t, (acpi_reset_video != 0)); #ifndef __amd64__ WAKECODE_FIXUP(wakeup_cr4, register_t, pcb->pcb_cr4); #endif WAKECODE_FIXUP(wakeup_pcb, struct pcb *, pcb); WAKECODE_FIXUP(wakeup_gdt, uint16_t, pcb->pcb_gdt.rd_limit); WAKECODE_FIXUP(wakeup_gdt + 2, uint64_t, pcb->pcb_gdt.rd_base); /* Call ACPICA to enter the desired sleep state */ if (state == ACPI_STATE_S4 && sc->acpi_s4bios) status = AcpiEnterSleepStateS4bios(); else status = AcpiEnterSleepState(state); if (ACPI_FAILURE(status)) { device_printf(sc->acpi_dev, "AcpiEnterSleepState failed - %s\n", AcpiFormatException(status)); return (0); /* couldn't sleep */ } for (;;) ia32_pause(); } else { #ifdef __amd64__ fpuresume(susppcbs[0]->sp_fpususpend); #elif defined(DEV_NPX) npxresume(susppcbs[0]->sp_fpususpend); #endif } return (1); /* wakeup successfully */ } int acpi_wakeup_machdep(struct acpi_softc *sc, int state, int sleep_result, int intr_enabled) { if (sleep_result == -1) return (sleep_result); if (!intr_enabled) { /* Wakeup MD procedures in interrupt disabled context */ if (sleep_result == 1) { pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); #ifdef SMP if (!CPU_EMPTY(&suspcpus)) acpi_wakeup_cpus(sc); #endif } #ifdef SMP if (!CPU_EMPTY(&suspcpus)) - restart_cpus(suspcpus); + resume_cpus(suspcpus); #endif mca_resume(); #ifdef __amd64__ if (vmm_resume_p != NULL) vmm_resume_p(); #endif intr_resume(/*suspend_cancelled*/false); AcpiSetFirmwareWakingVector(0, 0); } else { /* Wakeup MD procedures in interrupt enabled context */ if (sleep_result == 1 && mem_range_softc.mr_op != NULL && mem_range_softc.mr_op->reinit != NULL) mem_range_softc.mr_op->reinit(&mem_range_softc); } return (sleep_result); } static void * acpi_alloc_wakeup_handler(void) { void *wakeaddr; int i; /* * Specify the region for our wakeup code. We want it in the low 1 MB * region, excluding real mode IVT (0-0x3ff), BDA (0x400-0x4ff), EBDA * (less than 128KB, below 0xa0000, must be excluded by SMAP and DSDT), * and ROM area (0xa0000 and above). The temporary page tables must be * page-aligned. */ wakeaddr = contigmalloc((ACPI_PAGETABLES + 1) * PAGE_SIZE, M_DEVBUF, M_WAITOK, 0x500, 0xa0000, PAGE_SIZE, 0ul); if (wakeaddr == NULL) { printf("%s: can't alloc wake memory\n", __func__); return (NULL); } if (EVENTHANDLER_REGISTER(power_resume, acpi_stop_beep, NULL, EVENTHANDLER_PRI_LAST) == NULL) { printf("%s: can't register event handler\n", __func__); contigfree(wakeaddr, (ACPI_PAGETABLES + 1) * PAGE_SIZE, M_DEVBUF); return (NULL); } susppcbs = malloc(mp_ncpus * sizeof(*susppcbs), M_DEVBUF, M_WAITOK); for (i = 0; i < mp_ncpus; i++) { susppcbs[i] = malloc(sizeof(**susppcbs), M_DEVBUF, M_WAITOK); susppcbs[i]->sp_fpususpend = alloc_fpusave(M_WAITOK); } return (wakeaddr); } void acpi_install_wakeup_handler(struct acpi_softc *sc) { static void *wakeaddr = NULL; #ifdef __amd64__ uint64_t *pt4, *pt3, *pt2; int i; #endif if (wakeaddr != NULL) return; wakeaddr = acpi_alloc_wakeup_handler(); if (wakeaddr == NULL) return; sc->acpi_wakeaddr = (vm_offset_t)wakeaddr; sc->acpi_wakephys = vtophys(wakeaddr); bcopy(wakecode, (void *)WAKECODE_VADDR(sc), sizeof(wakecode)); /* Patch GDT base address, ljmp targets. */ WAKECODE_FIXUP((bootgdtdesc + 2), uint32_t, WAKECODE_PADDR(sc) + bootgdt); WAKECODE_FIXUP((wakeup_sw32 + 2), uint32_t, WAKECODE_PADDR(sc) + wakeup_32); #ifdef __amd64__ WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t, WAKECODE_PADDR(sc) + wakeup_64); WAKECODE_FIXUP(wakeup_pagetables, uint32_t, sc->acpi_wakephys); #endif /* Save pointers to some global data. */ WAKECODE_FIXUP(wakeup_ret, void *, resumectx); #ifndef __amd64__ #if defined(PAE) || defined(PAE_TABLES) WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdpt)); #else WAKECODE_FIXUP(wakeup_cr3, register_t, vtophys(kernel_pmap->pm_pdir)); #endif #else /* Build temporary page tables below realmode code. */ pt4 = wakeaddr; pt3 = pt4 + (PAGE_SIZE) / sizeof(uint64_t); pt2 = pt3 + (PAGE_SIZE) / sizeof(uint64_t); /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { /* * Each slot of the level 4 pages points * to the same level 3 page */ pt4[i] = (uint64_t)(sc->acpi_wakephys + PAGE_SIZE); pt4[i] |= PG_V | PG_RW | PG_U; /* * Each slot of the level 3 pages points * to the same level 2 page */ pt3[i] = (uint64_t)(sc->acpi_wakephys + (2 * PAGE_SIZE)); pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. */ pt2[i] = i * (2 * 1024 * 1024); pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; } #endif if (bootverbose) device_printf(sc->acpi_dev, "wakeup code va %#jx pa %#jx\n", (uintmax_t)sc->acpi_wakeaddr, (uintmax_t)sc->acpi_wakephys); } Index: stable/10 =================================================================== --- stable/10 (revision 331909) +++ stable/10 (revision 331910) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r327056