Index: head/sys/kern/subr_smp.c =================================================================== --- head/sys/kern/subr_smp.c (revision 297557) +++ head/sys/kern/subr_smp.c (revision 297558) @@ -1,863 +1,1109 @@ /*- * Copyright (c) 2001, John Baldwin . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and machine independent functions * used for the kernel SMP support. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include "opt_sched.h" #ifdef SMP +MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data"); +#endif + +#ifdef SMP volatile cpuset_t stopped_cpus; volatile cpuset_t started_cpus; volatile cpuset_t suspended_cpus; cpuset_t hlt_cpus_mask; cpuset_t logical_cpus_mask; void (*cpustop_restartfunc)(void); #endif static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS); /* This is used in modules that need to work in both SMP and UP. */ cpuset_t all_cpus; int mp_ncpus; /* export this for libkvm consumers. */ int mp_maxcpus = MAXCPU; volatile int smp_started; u_int mp_maxid; static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL, "Kernel SMP"); SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0, "Max CPU ID."); SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus, 0, "Max number of CPUs that the system was compiled for."); SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD | CTLTYPE_INT, NULL, 0, sysctl_kern_smp_active, "I", "Indicates system is running in SMP mode"); int smp_disabled = 0; /* has smp been disabled? */ SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD, &smp_disabled, 0, "SMP has been disabled from the loader"); int smp_cpus = 1; /* how many cpu's running */ SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0, "Number of CPUs online"); int smp_topology = 0; /* Which topology we're using. 
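 * Values 1 through 7 select a fake topology for debugging
 * (see smp_topo()); 0 uses the topology reported by the hardware.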
*/ SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0, "Topology override setting; 0 is default provided by hardware."); #ifdef SMP /* Enable forwarding of a signal to a process running on a different CPU */ static int forward_signal_enabled = 1; SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, &forward_signal_enabled, 0, "Forwarding of a signal to a process on a different CPU"); /* Variables needed for SMP rendezvous. */ static volatile int smp_rv_ncpus; static void (*volatile smp_rv_setup_func)(void *arg); static void (*volatile smp_rv_action_func)(void *arg); static void (*volatile smp_rv_teardown_func)(void *arg); static void *volatile smp_rv_func_arg; static volatile int smp_rv_waiters[4]; /* * Shared mutex to restrict busywaits between smp_rendezvous() and * smp(_targeted)_tlb_shootdown(). A deadlock occurs if both of these * functions trigger at once and cause multiple CPUs to busywait with * interrupts disabled. */ struct mtx smp_ipi_mtx; /* * Let the MD SMP code initialize mp_maxid very early if it can. */ static void mp_setmaxid(void *dummy) { cpu_mp_setmaxid(); KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__)); KASSERT(mp_ncpus > 1 || mp_maxid == 0, ("%s: one CPU but mp_maxid is not zero", __func__)); KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); } SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL); /* * Call the MD SMP initialization code. */ static void mp_start(void *dummy) { mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN); /* Probe for MP hardware. */ if (smp_disabled != 0 || cpu_mp_probe() == 0) { mp_ncpus = 1; CPU_SETOF(PCPU_GET(cpuid), &all_cpus); return; } cpu_mp_start(); printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n", mp_ncpus); cpu_mp_announce(); } SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL); void forward_signal(struct thread *td) { int id; /* * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on * this thread, so all we need to do is poke it if it is currently * executing so that it executes ast(). */ THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("forward_signal: thread is not TDS_RUNNING")); CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc); if (!smp_started || cold || panicstr) return; if (!forward_signal_enabled) return; /* No need to IPI ourself. */ if (td == curthread) return; id = td->td_oncpu; if (id == NOCPU) return; ipi_cpu(id, IPI_AST); } /* * When called the executing CPU will send an IPI to all other CPUs * requesting that they halt execution. * * Usually (but not necessarily) called with 'other_cpus' as its arg. * * - Signals all CPUs in map to stop. * - Waits for each to stop. * * Returns: * -1: error * 0: NA * 1: ok * */ static int generic_stop_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif static volatile u_int stopping_cpu = NOCPU; int i; volatile cpuset_t *cpus; KASSERT( #if defined(__amd64__) || defined(__i386__) type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND, #else type == IPI_STOP || type == IPI_STOP_HARD, #endif ("%s: invalid stop type", __func__)); if (!smp_started) return (0); CTR2(KTR_SMP, "stop_cpus(%s) with %u type", cpusetobj_strprint(cpusetbuf, &map), type); #if defined(__amd64__) || defined(__i386__) /* * When suspending, ensure there are are no IPIs in progress. * IPIs that have been issued, but not yet delivered (e.g. 
* not pending on a vCPU when running under virtualization) * will be lost, violating FreeBSD's assumption of reliable * IPI delivery. */ if (type == IPI_SUSPEND) mtx_lock_spin(&smp_ipi_mtx); #endif if (stopping_cpu != PCPU_GET(cpuid)) while (atomic_cmpset_int(&stopping_cpu, NOCPU, PCPU_GET(cpuid)) == 0) while (stopping_cpu != NOCPU) cpu_spinwait(); /* spin */ /* send the stop IPI to all CPUs in map */ ipi_selected(map, type); #if defined(__amd64__) || defined(__i386__) if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; i = 0; while (!CPU_SUBSET(cpus, &map)) { /* spin */ cpu_spinwait(); i++; if (i == 100000000) { printf("timeout stopping cpus\n"); break; } } #if defined(__amd64__) || defined(__i386__) if (type == IPI_SUSPEND) mtx_unlock_spin(&smp_ipi_mtx); #endif stopping_cpu = NOCPU; return (1); } int stop_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP)); } int stop_cpus_hard(cpuset_t map) { return (generic_stop_cpus(map, IPI_STOP_HARD)); } #if defined(__amd64__) || defined(__i386__) int suspend_cpus(cpuset_t map) { return (generic_stop_cpus(map, IPI_SUSPEND)); } #endif /* * Called by a CPU to restart stopped CPUs. * * Usually (but not necessarily) called with 'stopped_cpus' as its arg. * * - Signals all CPUs in map to restart. * - Waits for each to restart. * * Returns: * -1: error * 0: NA * 1: ok */ static int generic_restart_cpus(cpuset_t map, u_int type) { #ifdef KTR char cpusetbuf[CPUSETBUFSIZ]; #endif volatile cpuset_t *cpus; KASSERT( #if defined(__amd64__) || defined(__i386__) type == IPI_STOP || type == IPI_STOP_HARD || type == IPI_SUSPEND, #else type == IPI_STOP || type == IPI_STOP_HARD, #endif ("%s: invalid stop type", __func__)); if (!smp_started) return 0; CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map)); #if defined(__amd64__) || defined(__i386__) if (type == IPI_SUSPEND) cpus = &suspended_cpus; else #endif cpus = &stopped_cpus; /* signal other cpus to restart */ CPU_COPY_STORE_REL(&map, &started_cpus); /* wait for each to clear its bit */ while (CPU_OVERLAP(cpus, &map)) cpu_spinwait(); return 1; } int restart_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_STOP)); } #if defined(__amd64__) || defined(__i386__) int resume_cpus(cpuset_t map) { return (generic_restart_cpus(map, IPI_SUSPEND)); } #endif /* * All-CPU rendezvous. CPUs are signalled, all execute the setup function * (if specified), rendezvous, execute the action function (if specified), * rendezvous again, execute the teardown function (if specified), and then * resume. * * Note that the supplied external functions _must_ be reentrant and aware * that they are running in parallel and in an unknown lock context. */ void smp_rendezvous_action(void) { struct thread *td; void *local_func_arg; void (*local_setup_func)(void*); void (*local_action_func)(void*); void (*local_teardown_func)(void*); #ifdef INVARIANTS int owepreempt; #endif /* Ensure we have up-to-date values. */ atomic_add_acq_int(&smp_rv_waiters[0], 1); while (smp_rv_waiters[0] < smp_rv_ncpus) cpu_spinwait(); /* Fetch rendezvous parameters after acquire barrier. */ local_func_arg = smp_rv_func_arg; local_setup_func = smp_rv_setup_func; local_action_func = smp_rv_action_func; local_teardown_func = smp_rv_teardown_func; /* * Use a nested critical section to prevent any preemptions * from occurring during a rendezvous action routine. 
* Specifically, if a rendezvous handler is invoked via an IPI * and the interrupted thread was in the critical_exit() * function after setting td_critnest to 0 but before * performing a deferred preemption, this routine can be * invoked with td_critnest set to 0 and td_owepreempt true. * In that case, a critical_exit() during the rendezvous * action would trigger a preemption which is not permitted in * a rendezvous action. To fix this, wrap all of the * rendezvous action handlers in a critical section. We * cannot use a regular critical section however as having * critical_exit() preempt from this routine would also be * problematic (the preemption must not occur before the IPI * has been acknowledged via an EOI). Instead, we * intentionally ignore td_owepreempt when leaving the * critical section. This should be harmless because we do * not permit rendezvous action routines to schedule threads, * and thus td_owepreempt should never transition from 0 to 1 * during this routine. */ td = curthread; td->td_critnest++; #ifdef INVARIANTS owepreempt = td->td_owepreempt; #endif /* * If requested, run a setup function before the main action * function. Ensure all CPUs have completed the setup * function before moving on to the action function. */ if (local_setup_func != smp_no_rendevous_barrier) { if (smp_rv_setup_func != NULL) smp_rv_setup_func(smp_rv_func_arg); atomic_add_int(&smp_rv_waiters[1], 1); while (smp_rv_waiters[1] < smp_rv_ncpus) cpu_spinwait(); } if (local_action_func != NULL) local_action_func(local_func_arg); if (local_teardown_func != smp_no_rendevous_barrier) { /* * Signal that the main action has been completed. If a * full exit rendezvous is requested, then all CPUs will * wait here until all CPUs have finished the main action. */ atomic_add_int(&smp_rv_waiters[2], 1); while (smp_rv_waiters[2] < smp_rv_ncpus) cpu_spinwait(); if (local_teardown_func != NULL) local_teardown_func(local_func_arg); } /* * Signal that the rendezvous is fully completed by this CPU. * This means that no member of smp_rv_* pseudo-structure will be * accessed by this target CPU after this point; in particular, * memory pointed by smp_rv_func_arg. * * The release semantic ensures that all accesses performed by * the current CPU are visible when smp_rendezvous_cpus() * returns, by synchronizing with the * atomic_load_acq_int(&smp_rv_waiters[3]). */ atomic_add_rel_int(&smp_rv_waiters[3], 1); td->td_critnest--; KASSERT(owepreempt == td->td_owepreempt, ("rendezvous action changed td_owepreempt")); } void smp_rendezvous_cpus(cpuset_t map, void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { int curcpumap, i, ncpus = 0; /* Look comments in the !SMP case. */ if (!smp_started) { spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); return; } CPU_FOREACH(i) { if (CPU_ISSET(i, &map)) ncpus++; } if (ncpus == 0) panic("ncpus is 0 with non-zero map"); mtx_lock_spin(&smp_ipi_mtx); /* Pass rendezvous parameters via global variables. */ smp_rv_ncpus = ncpus; smp_rv_setup_func = setup_func; smp_rv_action_func = action_func; smp_rv_teardown_func = teardown_func; smp_rv_func_arg = arg; smp_rv_waiters[1] = 0; smp_rv_waiters[2] = 0; smp_rv_waiters[3] = 0; atomic_store_rel_int(&smp_rv_waiters[0], 0); /* * Signal other processors, which will enter the IPI with * interrupts off. 
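 * The current CPU is removed from the map below; if it was a member,
 * it calls smp_rendezvous_action() directly instead of IPIing itself.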
*/ curcpumap = CPU_ISSET(curcpu, &map); CPU_CLR(curcpu, &map); ipi_selected(map, IPI_RENDEZVOUS); /* Check if the current CPU is in the map */ if (curcpumap != 0) smp_rendezvous_action(); /* * Ensure that the master CPU waits for all the other * CPUs to finish the rendezvous, so that smp_rv_* * pseudo-structure and the arg are guaranteed to not * be in use. * * Load acquire synchronizes with the release add in * smp_rendezvous_action(), which ensures that our caller sees * all memory actions done by the called functions on other * CPUs. */ while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus) cpu_spinwait(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_rendezvous(void (* setup_func)(void *), void (* action_func)(void *), void (* teardown_func)(void *), void *arg) { smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg); } -static struct cpu_group group[MAXCPU]; +static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1]; struct cpu_group * smp_topo(void) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; struct cpu_group *top; /* * Check for a fake topology request for debugging purposes. */ switch (smp_topology) { case 1: /* Dual core with no sharing. */ top = smp_topo_1level(CG_SHARE_NONE, 2, 0); break; case 2: /* No topology, all cpus are equal. */ top = smp_topo_none(); break; case 3: /* Dual core with shared L2. */ top = smp_topo_1level(CG_SHARE_L2, 2, 0); break; case 4: /* quad core, shared l3 among each package, private l2. */ top = smp_topo_1level(CG_SHARE_L3, 4, 0); break; case 5: /* quad core, 2 dualcore parts on each package share l2. */ top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0); break; case 6: /* Single-core 2xHTT */ top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT); break; case 7: /* quad core with a shared l3, 8 threads sharing L2. */ top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8, CG_FLAG_SMT); break; default: /* Default, ask the system what it wants. */ top = cpu_topo(); break; } /* * Verify the returned topology. */ if (top->cg_count != mp_ncpus) panic("Built bad topology at %p. CPU count %d != %d", top, top->cg_count, mp_ncpus); if (CPU_CMP(&top->cg_mask, &all_cpus)) panic("Built bad topology at %p. CPU mask (%s) != (%s)", top, cpusetobj_strprint(cpusetbuf, &top->cg_mask), cpusetobj_strprint(cpusetbuf2, &all_cpus)); return (top); } struct cpu_group * +smp_topo_alloc(u_int count) +{ + static u_int index; + u_int curr; + + curr = index; + index += count; + return (&group[curr]); +} + +struct cpu_group * smp_topo_none(void) { struct cpu_group *top; top = &group[0]; top->cg_parent = NULL; top->cg_child = NULL; top->cg_mask = all_cpus; top->cg_count = mp_ncpus; top->cg_children = 0; top->cg_level = CG_SHARE_NONE; top->cg_flags = 0; return (top); } static int smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share, int count, int flags, int start) { char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ]; cpuset_t mask; int i; CPU_ZERO(&mask); for (i = 0; i < count; i++, start++) CPU_SET(start, &mask); child->cg_parent = parent; child->cg_child = NULL; child->cg_children = 0; child->cg_level = share; child->cg_count = count; child->cg_flags = flags; child->cg_mask = mask; parent->cg_children++; for (; parent != NULL; parent = parent->cg_parent) { if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask)) panic("Duplicate children in %p. 
mask (%s) child (%s)", parent, cpusetobj_strprint(cpusetbuf, &parent->cg_mask), cpusetobj_strprint(cpusetbuf2, &child->cg_mask)); CPU_OR(&parent->cg_mask, &child->cg_mask); parent->cg_count += child->cg_count; } return (start); } struct cpu_group * smp_topo_1level(int share, int count, int flags) { struct cpu_group *child; struct cpu_group *top; int packages; int cpu; int i; cpu = 0; top = &group[0]; packages = mp_ncpus / count; top->cg_child = child = &group[1]; top->cg_level = CG_SHARE_NONE; for (i = 0; i < packages; i++, child++) cpu = smp_topo_addleaf(top, child, share, count, flags, cpu); return (top); } struct cpu_group * smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags) { struct cpu_group *top; struct cpu_group *l1g; struct cpu_group *l2g; int cpu; int i; int j; cpu = 0; top = &group[0]; l2g = &group[1]; top->cg_child = l2g; top->cg_level = CG_SHARE_NONE; top->cg_children = mp_ncpus / (l2count * l1count); l1g = l2g + top->cg_children; for (i = 0; i < top->cg_children; i++, l2g++) { l2g->cg_parent = top; l2g->cg_child = l1g; l2g->cg_level = l2share; for (j = 0; j < l2count; j++, l1g++) cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count, l1flags, cpu); } return (top); } struct cpu_group * smp_topo_find(struct cpu_group *top, int cpu) { struct cpu_group *cg; cpuset_t mask; int children; int i; CPU_SETOF(cpu, &mask); cg = top; for (;;) { if (!CPU_OVERLAP(&cg->cg_mask, &mask)) return (NULL); if (cg->cg_children == 0) return (cg); children = cg->cg_children; for (i = 0, cg = cg->cg_child; i < children; cg++, i++) if (CPU_OVERLAP(&cg->cg_mask, &mask)) break; } return (NULL); } #else /* !SMP */ void smp_rendezvous_cpus(cpuset_t map, void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* * In the !SMP case we just need to ensure the same initial conditions * as the SMP case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } void smp_rendezvous(void (*setup_func)(void *), void (*action_func)(void *), void (*teardown_func)(void *), void *arg) { /* Look comments in the smp_rendezvous_cpus() case. */ spinlock_enter(); if (setup_func != NULL) setup_func(arg); if (action_func != NULL) action_func(arg); if (teardown_func != NULL) teardown_func(arg); spinlock_exit(); } /* * Provide dummy SMP support for UP kernels. Modules that need to use SMP * APIs will still work using this dummy support. */ static void mp_setvariables_for_up(void *dummy) { mp_ncpus = 1; mp_maxid = PCPU_GET(cpuid); CPU_SETOF(mp_maxid, &all_cpus); KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero")); } SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setvariables_for_up, NULL); #endif /* SMP */ void smp_no_rendevous_barrier(void *dummy) { #ifdef SMP KASSERT((!smp_started),("smp_no_rendevous called and smp is started")); #endif } /* * Wait specified idle threads to switch once. This ensures that even * preempted threads have cycled through the switch function once, * exiting their codepaths. This allows us to change global pointers * with no other synchronization. 
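 *
 * A minimal usage sketch (illustrative only; "oldp", "newp", the wait
 * message and the malloc type are hypothetical, not part of this
 * interface):
 *
 *	oldp = global_ptr;
 *	global_ptr = newp;
 *	(void)quiesce_all_cpus("gquiesce", 0);
 *	free(oldp, M_TEMP);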
*/ int quiesce_cpus(cpuset_t map, const char *wmesg, int prio) { struct pcpu *pcpu; u_int gen[MAXCPU]; int error; int cpu; error = 0; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); gen[cpu] = pcpu->pc_idlethread->td_generation; } for (cpu = 0; cpu <= mp_maxid; cpu++) { if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu)) continue; pcpu = pcpu_find(cpu); thread_lock(curthread); sched_bind(curthread, cpu); thread_unlock(curthread); while (gen[cpu] == pcpu->pc_idlethread->td_generation) { error = tsleep(quiesce_cpus, prio, wmesg, 1); if (error != EWOULDBLOCK) goto out; error = 0; } } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); return (error); } int quiesce_all_cpus(const char *wmesg, int prio) { return quiesce_cpus(all_cpus, wmesg, prio); } /* Extra care is taken with this sysctl because the data type is volatile */ static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS) { int error, active; active = smp_started; error = SYSCTL_OUT(req, &active, sizeof(active)); return (error); } + + +#ifdef SMP +void +topo_init_node(struct topo_node *node) +{ + + bzero(node, sizeof(*node)); + TAILQ_INIT(&node->children); +} + +void +topo_init_root(struct topo_node *root) +{ + + topo_init_node(root); + root->type = TOPO_TYPE_SYSTEM; +} + +struct topo_node * +topo_add_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype) +{ + struct topo_node *node; + + TAILQ_FOREACH_REVERSE(node, &parent->children, + topo_children, siblings) { + if (node->hwid == hwid + && node->type == type && node->subtype == subtype) { + return (node); + } + } + + node = malloc(sizeof(*node), M_TOPO, M_WAITOK); + topo_init_node(node); + node->parent = parent; + node->hwid = hwid; + node->type = type; + node->subtype = subtype; + TAILQ_INSERT_TAIL(&parent->children, node, siblings); + parent->nchildren++; + + return (node); +} + +struct topo_node * +topo_find_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype) +{ + + struct topo_node *node; + + TAILQ_FOREACH(node, &parent->children, siblings) { + if (node->hwid == hwid + && node->type == type && node->subtype == subtype) { + return (node); + } + } + + return (NULL); +} + +void +topo_promote_child(struct topo_node *child) +{ + struct topo_node *next; + struct topo_node *node; + struct topo_node *parent; + + parent = child->parent; + next = TAILQ_NEXT(child, siblings); + TAILQ_REMOVE(&parent->children, child, siblings); + TAILQ_INSERT_HEAD(&parent->children, child, siblings); + + while (next != NULL) { + node = next; + next = TAILQ_NEXT(node, siblings); + TAILQ_REMOVE(&parent->children, node, siblings); + TAILQ_INSERT_AFTER(&parent->children, child, node, siblings); + child = node; + } +} + +struct topo_node * +topo_next_node(struct topo_node *top, struct topo_node *node) +{ + struct topo_node *next; + + if ((next = TAILQ_FIRST(&node->children)) != NULL) + return (next); + + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + while ((node = node->parent) != top) + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + return (NULL); +} + +struct topo_node * +topo_next_nonchild_node(struct topo_node *top, struct topo_node *node) +{ + struct topo_node *next; + + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + while ((node = node->parent) != top) + if ((next = TAILQ_NEXT(node, siblings)) != NULL) + return (next); + + return (NULL); +} + +void +topo_set_pu_id(struct 
topo_node *node, cpuid_t id) +{ + + KASSERT(node->type == TOPO_TYPE_PU, + ("topo_set_pu_id: wrong node type: %u", node->type)); + KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0, + ("topo_set_pu_id: cpuset already not empty")); + node->id = id; + CPU_SET(id, &node->cpuset); + node->cpu_count = 1; + node->subtype = 1; + + while ((node = node->parent) != NULL) { + if (CPU_ISSET(id, &node->cpuset)) + break; + CPU_SET(id, &node->cpuset); + node->cpu_count++; + } +} + +int +topo_analyze(struct topo_node *topo_root, int all, + int *pkg_count, int *cores_per_pkg, int *thrs_per_core) +{ + struct topo_node *pkg_node; + struct topo_node *core_node; + struct topo_node *pu_node; + int thrs_per_pkg; + int cpp_counter; + int tpc_counter; + int tpp_counter; + + *pkg_count = 0; + *cores_per_pkg = -1; + *thrs_per_core = -1; + thrs_per_pkg = -1; + pkg_node = topo_root; + while (pkg_node != NULL) { + if (pkg_node->type != TOPO_TYPE_PKG) { + pkg_node = topo_next_node(topo_root, pkg_node); + continue; + } + if (!all && CPU_EMPTY(&pkg_node->cpuset)) { + pkg_node = topo_next_nonchild_node(topo_root, pkg_node); + continue; + } + + (*pkg_count)++; + + cpp_counter = 0; + tpp_counter = 0; + core_node = pkg_node; + while (core_node != NULL) { + if (core_node->type == TOPO_TYPE_CORE) { + if (!all && CPU_EMPTY(&core_node->cpuset)) { + core_node = + topo_next_nonchild_node(pkg_node, + core_node); + continue; + } + + cpp_counter++; + + tpc_counter = 0; + pu_node = core_node; + while (pu_node != NULL) { + if (pu_node->type == TOPO_TYPE_PU && + (all || !CPU_EMPTY(&pu_node->cpuset))) + tpc_counter++; + pu_node = topo_next_node(core_node, + pu_node); + } + + if (*thrs_per_core == -1) + *thrs_per_core = tpc_counter; + else if (*thrs_per_core != tpc_counter) + return (0); + + core_node = topo_next_nonchild_node(pkg_node, + core_node); + } else { + /* PU node directly under PKG. */ + if (core_node->type == TOPO_TYPE_PU && + (all || !CPU_EMPTY(&core_node->cpuset))) + tpp_counter++; + core_node = topo_next_node(pkg_node, + core_node); + } + } + + if (*cores_per_pkg == -1) + *cores_per_pkg = cpp_counter; + else if (*cores_per_pkg != cpp_counter) + return (0); + if (thrs_per_pkg == -1) + thrs_per_pkg = tpp_counter; + else if (thrs_per_pkg != tpp_counter) + return (0); + + pkg_node = topo_next_nonchild_node(topo_root, pkg_node); + } + + KASSERT(*pkg_count > 0, + ("bug in topology or analysis")); + if (*cores_per_pkg == 0) { + KASSERT(*thrs_per_core == -1 && thrs_per_pkg > 0, + ("bug in topology or analysis")); + *thrs_per_core = thrs_per_pkg; + } + + return (1); +} +#endif /* SMP */ Index: head/sys/sys/smp.h =================================================================== --- head/sys/sys/smp.h (revision 297557) +++ head/sys/sys/smp.h (revision 297558) @@ -1,191 +1,255 @@ /*- * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ */ #ifndef _SYS_SMP_H_ #define _SYS_SMP_H_ #ifdef _KERNEL #ifndef LOCORE #include +#include /* - * Topology of a NUMA or HTT system. + * Types of nodes in the topological tree. + */ +typedef enum { + /* No node has this type; can be used in topo API calls. 
*/
+	TOPO_TYPE_DUMMY,
+	/* Processing unit aka computing unit aka logical CPU. */
+	TOPO_TYPE_PU,
+	/* Physical subdivision of a package. */
+	TOPO_TYPE_CORE,
+	/* CPU L1/L2/L3 cache. */
+	TOPO_TYPE_CACHE,
+	/* Package aka chip, equivalent to socket. */
+	TOPO_TYPE_PKG,
+	/* NUMA node. */
+	TOPO_TYPE_NODE,
+	/* Other logical or physical grouping of PUs. */
+	/* E.g. PUs on the same die, or PUs sharing an FPU. */
+	TOPO_TYPE_GROUP,
+	/* The whole system. */
+	TOPO_TYPE_SYSTEM
+} topo_node_type;
+
+/* Hardware identifier of a topology component. */
+typedef unsigned int hwid_t;
+/* Logical CPU identifier. */
+typedef int cpuid_t;
+
+/* A node in the topology. */
+struct topo_node {
+	struct topo_node			*parent;
+	TAILQ_HEAD(topo_children, topo_node)	children;
+	TAILQ_ENTRY(topo_node)			siblings;
+	cpuset_t				cpuset;
+	topo_node_type				type;
+	uintptr_t				subtype;
+	hwid_t					hwid;
+	cpuid_t					id;
+	int					nchildren;
+	int					cpu_count;
+};
+
+/*
+ * Scheduling topology of a NUMA or SMP system.
  *
  * The top level topology is an array of pointers to groups.  Each group
  * contains a bitmask of cpus in its group or subgroups.  It may also
  * contain a pointer to an array of child groups.
  *
  * The bitmasks at non leaf groups may be used by consumers who support
  * a smaller depth than the hardware provides.
  *
  * The topology may be omitted by systems where all CPUs are equal.
  */
 struct cpu_group {
 	struct cpu_group *cg_parent;	/* Our parent group. */
 	struct cpu_group *cg_child;	/* Optional children groups. */
 	cpuset_t	cg_mask;	/* Mask of cpus in this group. */
 	int32_t		cg_count;	/* Count of cpus in this group. */
 	int16_t		cg_children;	/* Number of children groups. */
 	int8_t		cg_level;	/* Shared cache level. */
 	int8_t		cg_flags;	/* Traversal modifiers. */
 };
 
 typedef struct cpu_group *cpu_group_t;
 
 /*
  * Defines common resources for CPUs in the group.  The highest level
  * resource should be used when multiple are shared.
  */
 #define	CG_SHARE_NONE	0
 #define	CG_SHARE_L1	1
 #define	CG_SHARE_L2	2
 #define	CG_SHARE_L3	3
+#define	MAX_CACHE_LEVELS	CG_SHARE_L3
+
 /*
  * Behavior modifiers for load balancing and affinity.
  */
 #define	CG_FLAG_HTT	0x01		/* Schedule the alternate core last. */
 #define	CG_FLAG_SMT	0x02		/* New age htt, less crippled. */
 #define	CG_FLAG_THREAD	(CG_FLAG_HTT | CG_FLAG_SMT)	/* Any threading. */
 
 /*
- * Convenience routines for building topologies.
+ * Convenience routines for building and traversing topologies.
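+ *
+ * A minimal sketch of how an MD back end might build a tree with these
+ * routines (illustrative only; pkg_hwid, core_hwid, pu_hwid and cpu_id
+ * are hypothetical):
+ *
+ *	static struct topo_node root;
+ *	struct topo_node *pkg, *core, *pu;
+ *
+ *	topo_init_root(&root);
+ *	pkg = topo_add_node_by_hwid(&root, pkg_hwid, TOPO_TYPE_PKG, 0);
+ *	core = topo_add_node_by_hwid(pkg, core_hwid, TOPO_TYPE_CORE, 0);
+ *	pu = topo_add_node_by_hwid(core, pu_hwid, TOPO_TYPE_PU, 0);
+ *	topo_set_pu_id(pu, cpu_id);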
*/ #ifdef SMP +void topo_init_node(struct topo_node *node); +void topo_init_root(struct topo_node *root); +struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype); +struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, + topo_node_type type, uintptr_t subtype); +void topo_promote_child(struct topo_node *child); +struct topo_node * topo_next_node(struct topo_node *top, + struct topo_node *node); +struct topo_node * topo_next_nonchild_node(struct topo_node *top, + struct topo_node *node); +void topo_set_pu_id(struct topo_node *node, cpuid_t id); +int topo_analyze(struct topo_node *topo_root, int all, int *pkg_count, + int *cores_per_pkg, int *thrs_per_core); + +#define TOPO_FOREACH(i, root) \ + for (i = root; i != NULL; i = topo_next_node(root, i)) + struct cpu_group *smp_topo(void); +struct cpu_group *smp_topo_alloc(u_int count); struct cpu_group *smp_topo_none(void); struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_find(struct cpu_group *top, int cpu); extern void (*cpustop_restartfunc)(void); extern int smp_cpus; extern volatile cpuset_t started_cpus; extern volatile cpuset_t stopped_cpus; extern volatile cpuset_t suspended_cpus; extern cpuset_t hlt_cpus_mask; extern cpuset_t logical_cpus_mask; #endif /* SMP */ extern u_int mp_maxid; extern int mp_maxcpus; extern int mp_ncpus; extern volatile int smp_started; extern cpuset_t all_cpus; extern cpuset_t cpuset_domain[MAXMEMDOM]; /* CPUs in each NUMA domain. */ /* * Macro allowing us to determine whether a CPU is absent at any given * time, thus permitting us to configure sparse maps of cpuid-dependent * (per-CPU) structures. */ #define CPU_ABSENT(x_cpu) (!CPU_ISSET(x_cpu, &all_cpus)) /* * Macros to iterate over non-absent CPUs. CPU_FOREACH() takes an * integer iterator and iterates over the available set of CPUs. * CPU_FIRST() returns the id of the first non-absent CPU. CPU_NEXT() * returns the id of the next non-absent CPU. It will wrap back to * CPU_FIRST() once the end of the list is reached. The iterators are * currently implemented via inline functions. */ #define CPU_FOREACH(i) \ for ((i) = 0; (i) <= mp_maxid; (i)++) \ if (!CPU_ABSENT((i))) static __inline int cpu_first(void) { int i; for (i = 0;; i++) if (!CPU_ABSENT(i)) return (i); } static __inline int cpu_next(int i) { for (;;) { i++; if (i > mp_maxid) i = 0; if (!CPU_ABSENT(i)) return (i); } } #define CPU_FIRST() cpu_first() #define CPU_NEXT(i) cpu_next((i)) #ifdef SMP /* * Machine dependent functions used to initialize MP support. * * The cpu_mp_probe() should check to see if MP support is present and return * zero if it is not or non-zero if it is. If MP support is present, then * cpu_mp_start() will be called so that MP can be enabled. This function * should do things such as startup secondary processors. It should also * setup mp_ncpus, all_cpus, and smp_cpus. It should also ensure that * smp_started is initialized at the appropriate time. * Once cpu_mp_start() returns, machine independent MP startup code will be * executed and a simple message will be output to the console. Finally, * cpu_mp_announce() will be called so that machine dependent messages about * the MP support may be output to the console if desired. 
* * The cpu_setmaxid() function is called very early during the boot process * so that the MD code may set mp_maxid to provide an upper bound on CPU IDs * that other subsystems may use. If a platform is not able to determine * the exact maximum ID that early, then it may set mp_maxid to MAXCPU - 1. */ struct thread; struct cpu_group *cpu_topo(void); void cpu_mp_announce(void); int cpu_mp_probe(void); void cpu_mp_setmaxid(void); void cpu_mp_start(void); void forward_signal(struct thread *); int restart_cpus(cpuset_t); int stop_cpus(cpuset_t); int stop_cpus_hard(cpuset_t); #if defined(__amd64__) || defined(__i386__) int suspend_cpus(cpuset_t); int resume_cpus(cpuset_t); #endif void smp_rendezvous_action(void); extern struct mtx smp_ipi_mtx; #endif /* SMP */ int quiesce_all_cpus(const char *, int); int quiesce_cpus(cpuset_t, const char *, int); void smp_no_rendevous_barrier(void *); void smp_rendezvous(void (*)(void *), void (*)(void *), void (*)(void *), void *arg); void smp_rendezvous_cpus(cpuset_t, void (*)(void *), void (*)(void *), void (*)(void *), void *arg); #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* _SYS_SMP_H_ */ Index: head/sys/x86/x86/mp_x86.c =================================================================== --- head/sys/x86/x86/mp_x86.c (revision 297557) +++ head/sys/x86/x86/mp_x86.c (revision 297558) @@ -1,1241 +1,1456 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #ifdef __i386__ #include "opt_apic.h" #endif #include "opt_cpu.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #include #include #include #include /* cngetc() */ #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define WARMBOOT_TARGET 0 #define WARMBOOT_OFF (KERNBASE + 0x0467) #define WARMBOOT_SEG (KERNBASE + 0x0469) #define CMOS_REG (0x70) #define CMOS_DATA (0x71) #define BIOS_RESET (0x0f) #define BIOS_WARM (0x0a) /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ extern struct pcpu __pcpu[]; /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct susppcb **susppcbs; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* Default cpu_ops implementation. */ struct cpu_ops cpu_ops; /* * Local data and functions. */ static volatile cpuset_t ipi_stop_nmi_pending; /* used to hold the AP's until we are ready to release them */ struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info cpu_info[MAX_APIC_ID + 1]; -int cpu_apic_ids[MAXCPU]; int apic_cpuids[MAX_APIC_ID + 1]; +int cpu_apic_ids[MAXCPU]; /* Holds pending bitmap based IPIs per CPU */ volatile u_int cpu_ipi_pending[MAXCPU]; -int cpu_logical; /* logical cpus per core */ -int cpu_cores; /* cores per package */ - static void release_aps(void *dummy); -static u_int hyperthreading_cpus; /* logical cpus sharing L1 cache */ static int hyperthreading_allowed = 1; +SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, + &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); +static struct topo_node topo_root; + +static int pkg_id_shift; +static int core_id_shift; +static int disabled_cpus; + +struct cache_info { + int id_shift; + int present; +} static caches[MAX_CACHE_LEVELS]; + void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } -static void -topo_probe_amd(void) +/* + * Round up to the next power of two, if necessary, and then + * take log2. + * Returns -1 if argument is zero. + */ +static __inline int +mask_width(u_int x) { - int core_id_bits; - int id; - /* AMD processors do not support HTT. 
*/ - cpu_logical = 1; + return (fls(x << (1 - powerof2(x))) - 1); +} - if ((amd_feature2 & AMDID2_CMP) == 0) { - cpu_cores = 1; - return; +static int +add_deterministic_cache(int type, int level, int share_count) +{ + + if (type == 0) + return (0); + if (type > 3) { + printf("unexpected cache type %d\n", type); + return (1); } + if (type == 2) /* ignore instruction cache */ + return (1); + if (level == 0 || level > MAX_CACHE_LEVELS) { + printf("unexpected cache level %d\n", type); + return (1); + } - core_id_bits = (cpu_procinfo2 & AMDID_COREID_SIZE) >> - AMDID_COREID_SIZE_SHIFT; - if (core_id_bits == 0) { - cpu_cores = (cpu_procinfo2 & AMDID_CMP_CORES) + 1; - return; + if (caches[level - 1].present) { + printf("WARNING: multiple entries for L%u data cache\n", level); + printf("%u => %u\n", caches[level - 1].id_shift, + mask_width(share_count)); } + caches[level - 1].id_shift = mask_width(share_count); + caches[level - 1].present = 1; - /* Fam 10h and newer should get here. */ - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. */ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> core_id_bits) != (boot_cpu_id >> core_id_bits)) - continue; - cpu_cores++; + if (caches[level - 1].id_shift > pkg_id_shift) { + printf("WARNING: L%u data cache covers more " + "APIC IDs than a package\n", level); + printf("%u > %u\n", caches[level - 1].id_shift, pkg_id_shift); + caches[level - 1].id_shift = pkg_id_shift; } + if (caches[level - 1].id_shift < core_id_shift) { + printf("WARNING: L%u data cache covers less " + "APIC IDs than a core\n", level); + printf("%u < %u\n", caches[level - 1].id_shift, core_id_shift); + caches[level - 1].id_shift = core_id_shift; + } + + return (1); } -/* - * Round up to the next power of two, if necessary, and then - * take log2. - * Returns -1 if argument is zero. - */ -static __inline int -mask_width(u_int x) +static void +topo_probe_amd(void) { + u_int p[4]; + int level; + int share_count; + int type; + int i; - return (fls(x << (1 - powerof2(x))) - 1); + /* No multi-core capability. */ + if ((amd_feature2 & AMDID2_CMP) == 0) + return; + + /* For families 10h and newer. */ + pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> + AMDID_COREID_SIZE_SHIFT; + + /* For 0Fh family. */ + if (pkg_id_shift == 0) + pkg_id_shift = + mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); + + if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { + for (i = 0; ; i++) { + cpuid_count(0x8000001d, i, p); + type = p[0] & 0x1f; + level = (p[0] >> 5) & 0x7; + share_count = 1 + ((p[0] >> 14) & 0xfff); + + if (!add_deterministic_cache(type, level, share_count)) + break; + } + } else { + if (cpu_exthigh >= 0x80000005) { + cpuid_count(0x80000005, 0, p); + if (((p[2] >> 24) & 0xff) != 0) { + caches[0].id_shift = 0; + caches[0].present = 1; + } + } + if (cpu_exthigh >= 0x80000006) { + cpuid_count(0x80000006, 0, p); + if (((p[2] >> 16) & 0xffff) != 0) { + caches[1].id_shift = 0; + caches[1].present = 1; + } + if (((p[3] >> 18) & 0x3fff) != 0) { + + /* + * TODO: Account for dual-node processors + * where each node within a package has its own + * L3 cache. + */ + caches[2].id_shift = pkg_id_shift; + caches[2].present = 1; + } + } + } } static void -topo_probe_0x4(void) +topo_probe_intel_0x4(void) { u_int p[4]; - int pkg_id_bits; - int core_id_bits; int max_cores; int max_logical; - int id; /* Both zero and one here mean one logical processor per package. 
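 * From these two values the code below derives the APIC ID bit fields:
 * core_id_shift = mask_width(max_logical / max_cores) and
 * pkg_id_shift = core_id_shift + mask_width(max_cores).  For example,
 * a package reporting 8 logical CPUs and 4 cores yields
 * core_id_shift = 1 and pkg_id_shift = 3.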
*/ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. Further, we count number of - * logical processors that belong to the same core - * as BSP thus deducing number of threads per core. - */ if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; - core_id_bits = mask_width(max_logical/max_cores); - if (core_id_bits < 0) - return; - pkg_id_bits = core_id_bits + mask_width(max_cores); - for (id = 0; id <= MAX_APIC_ID; id++) { - /* Check logical CPU availability. */ - if (!cpu_info[id].cpu_present || cpu_info[id].cpu_disabled) - continue; - /* Check if logical CPU has the same package ID. */ - if ((id >> pkg_id_bits) != (boot_cpu_id >> pkg_id_bits)) - continue; - cpu_cores++; - /* Check if logical CPU has the same package and core IDs. */ - if ((id >> core_id_bits) == (boot_cpu_id >> core_id_bits)) - cpu_logical++; - } - - KASSERT(cpu_cores >= 1 && cpu_logical >= 1, - ("topo_probe_0x4 couldn't find BSP")); - - cpu_cores /= cpu_logical; - hyperthreading_cpus = cpu_logical; + core_id_shift = mask_width(max_logical/max_cores); + KASSERT(core_id_shift >= 0, + ("intel topo: max_cores > max_logical\n")); + pkg_id_shift = core_id_shift + mask_width(max_cores); } static void -topo_probe_0xb(void) +topo_probe_intel_0xb(void) { u_int p[4]; int bits; - int cnt; - int i; - int logical; int type; - int x; + int i; + /* Fall back if CPU leaf 11 doesn't really exist. */ + cpuid_count(0x0b, 0, p); + if (p[1] == 0) { + topo_probe_intel_0x4(); + return; + } + /* We only support three levels for now. */ - for (i = 0; i < 3; i++) { + for (i = 0; ; i++) { cpuid_count(0x0b, i, p); - /* Fall back if CPU leaf 11 doesn't really exist. */ - if (i == 0 && p[1] == 0) { - topo_probe_0x4(); - return; - } - bits = p[0] & 0x1f; - logical = p[1] &= 0xffff; type = (p[2] >> 8) & 0xff; - if (type == 0 || logical == 0) + + if (type == 0) break; - /* - * Because of uniformity assumption we examine only - * those logical processors that belong to the same - * package as BSP. - */ - for (cnt = 0, x = 0; x <= MAX_APIC_ID; x++) { - if (!cpu_info[x].cpu_present || - cpu_info[x].cpu_disabled) - continue; - if (x >> bits == boot_cpu_id >> bits) - cnt++; - } + + /* TODO: check for duplicate (re-)assignment */ if (type == CPUID_TYPE_SMT) - cpu_logical = cnt; + core_id_shift = bits; else if (type == CPUID_TYPE_CORE) - cpu_cores = cnt; + pkg_id_shift = bits; + else + printf("unknown CPU level type %d\n", type); } - if (cpu_logical == 0) - cpu_logical = 1; - cpu_cores /= cpu_logical; + + if (pkg_id_shift < core_id_shift) { + printf("WARNING: core covers more APIC IDs than a package\n"); + core_id_shift = pkg_id_shift; + } } +static void +topo_probe_intel_caches(void) +{ + u_int p[4]; + int level; + int share_count; + int type; + int i; + + if (cpu_high < 0x4) { + /* + * Available cache level and sizes can be determined + * via CPUID leaf 2, but that requires a huge table of hardcoded + * values, so for now just assume L1 and L2 caches potentially + * shared only by HTT processing units, if HTT is present. 
+ */ + caches[0].id_shift = pkg_id_shift; + caches[0].present = 1; + caches[1].id_shift = pkg_id_shift; + caches[1].present = 1; + return; + } + + for (i = 0; ; i++) { + cpuid_count(0x4, i, p); + type = p[0] & 0x1f; + level = (p[0] >> 5) & 0x7; + share_count = 1 + ((p[0] >> 14) & 0xfff); + + if (!add_deterministic_cache(type, level, share_count)) + break; + } +} + +static void +topo_probe_intel(void) +{ + + /* + * See Intel(R) 64 Architecture Processor + * Topology Enumeration article for details. + * + * Note that 0x1 <= cpu_high < 4 case should be + * compatible with topo_probe_intel_0x4() logic when + * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) + * or it should trigger the fallback otherwise. + */ + if (cpu_high >= 0xb) + topo_probe_intel_0xb(); + else if (cpu_high >= 0x1) + topo_probe_intel_0x4(); + + topo_probe_intel_caches(); +} + /* - * Both topology discovery code and code that consumes topology - * information assume top-down uniformity of the topology. - * That is, all physical packages must be identical and each - * core in a package must have the same number of threads. * Topology information is queried only on BSP, on which this * code runs and for which it can query CPUID information. - * Then topology is extrapolated on all packages using the - * uniformity assumption. + * Then topology is extrapolated on all packages using an + * assumption that APIC ID to hardware component ID mapping is + * homogenious. + * That doesn't necesserily imply that the topology is uniform. */ void topo_probe(void) { static int cpu_topo_probed = 0; + struct x86_topo_layer { + int type; + int subtype; + int id_shift; + } topo_layers[MAX_CACHE_LEVELS + 3]; + struct topo_node *parent; + struct topo_node *node; + int layer; + int nlayers; + int node_id; + int i; if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); + if (mp_ncpus <= 1) - cpu_cores = cpu_logical = 1; + ; /* nothing */ else if (cpu_vendor_id == CPU_VENDOR_AMD) topo_probe_amd(); - else if (cpu_vendor_id == CPU_VENDOR_INTEL) { - /* - * See Intel(R) 64 Architecture Processor - * Topology Enumeration article for details. - * - * Note that 0x1 <= cpu_high < 4 case should be - * compatible with topo_probe_0x4() logic when - * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) - * or it should trigger the fallback otherwise. - */ - if (cpu_high >= 0xb) - topo_probe_0xb(); - else if (cpu_high >= 0x1) - topo_probe_0x4(); - } + else if (cpu_vendor_id == CPU_VENDOR_INTEL) + topo_probe_intel(); + KASSERT(pkg_id_shift >= core_id_shift, + ("bug in APIC topology discovery")); + + nlayers = 0; + bzero(topo_layers, sizeof(topo_layers)); + + topo_layers[nlayers].type = TOPO_TYPE_PKG; + topo_layers[nlayers].id_shift = pkg_id_shift; + if (bootverbose) + printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); + nlayers++; + /* - * Fallback: assume each logical CPU is in separate - * physical package. That is, no multi-core, no SMT. + * Consider all caches to be within a package/chip + * and "in front" of all sub-components like + * cores and hardware threads. 
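+ * The resulting layer order is PKG, then any present L3/L2/L1 caches,
+ * then CORE (when distinct from PKG), then PU.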
*/ - if (cpu_cores == 0 || cpu_logical == 0) - cpu_cores = cpu_logical = 1; + for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { + if (caches[i].present) { + KASSERT(caches[i].id_shift <= pkg_id_shift, + ("bug in APIC topology discovery")); + KASSERT(caches[i].id_shift >= core_id_shift, + ("bug in APIC topology discovery")); + + topo_layers[nlayers].type = TOPO_TYPE_CACHE; + topo_layers[nlayers].subtype = i + 1; + topo_layers[nlayers].id_shift = caches[i].id_shift; + if (bootverbose) + printf("L%u cache ID shift: %u\n", + topo_layers[nlayers].subtype, + topo_layers[nlayers].id_shift); + nlayers++; + } + } + + if (pkg_id_shift > core_id_shift) { + topo_layers[nlayers].type = TOPO_TYPE_CORE; + topo_layers[nlayers].id_shift = core_id_shift; + if (bootverbose) + printf("Core ID shift: %u\n", + topo_layers[nlayers].id_shift); + nlayers++; + } + + topo_layers[nlayers].type = TOPO_TYPE_PU; + topo_layers[nlayers].id_shift = 0; + nlayers++; + + topo_init_root(&topo_root); + for (i = 0; i <= MAX_APIC_ID; ++i) { + if (!cpu_info[i].cpu_present) + continue; + + parent = &topo_root; + for (layer = 0; layer < nlayers; ++layer) { + node_id = i >> topo_layers[layer].id_shift; + parent = topo_add_node_by_hwid(parent, node_id, + topo_layers[layer].type, + topo_layers[layer].subtype); + } + } + + parent = &topo_root; + for (layer = 0; layer < nlayers; ++layer) { + node_id = boot_cpu_id >> topo_layers[layer].id_shift; + node = topo_find_node_by_hwid(parent, node_id, + topo_layers[layer].type, + topo_layers[layer].subtype); + topo_promote_child(node); + parent = node; + } + cpu_topo_probed = 1; } -struct cpu_group * -cpu_topo(void) +/* + * Assign logical CPU IDs to local APICs. + */ +void +assign_cpu_ids(void) { - int cg_flags; + struct topo_node *node; + u_int smt_mask; + smt_mask = (1u << core_id_shift) - 1; + /* - * Determine whether any threading flags are - * necessry. + * Assign CPU IDs to local APIC IDs and disable any CPUs + * beyond MAXCPU. CPU 0 is always assigned to the BSP. */ - topo_probe(); - if (cpu_logical > 1 && hyperthreading_cpus) - cg_flags = CG_FLAG_HTT; - else if (cpu_logical > 1) - cg_flags = CG_FLAG_SMT; + mp_ncpus = 0; + TOPO_FOREACH(node, &topo_root) { + if (node->type != TOPO_TYPE_PU) + continue; + + if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) + cpu_info[node->hwid].cpu_hyperthread = 1; + + if (resource_disabled("lapic", node->hwid)) { + if (node->hwid != boot_cpu_id) + cpu_info[node->hwid].cpu_disabled = 1; + else + printf("Cannot disable BSP, APIC ID = %d\n", + node->hwid); + } + + if (!hyperthreading_allowed && + cpu_info[node->hwid].cpu_hyperthread) + cpu_info[node->hwid].cpu_disabled = 1; + + if (mp_ncpus >= MAXCPU) + cpu_info[node->hwid].cpu_disabled = 1; + + if (cpu_info[node->hwid].cpu_disabled) { + disabled_cpus++; + continue; + } + + cpu_apic_ids[mp_ncpus] = node->hwid; + apic_cpuids[node->hwid] = mp_ncpus; + topo_set_pu_id(node, mp_ncpus); + mp_ncpus++; + } + + KASSERT(mp_maxid >= mp_ncpus - 1, + ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, + mp_ncpus)); +} + +/* + * Print various information about the SMP system hardware and setup. 
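+ * For example: "FreeBSD/SMP: 2 package(s) x 4 core(s) x 2 hardware threads".
+ * When some CPUs are disabled, a second "FreeBSD/SMP Online:" line reports
+ * the online subset in the same format.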
+ */ +void +cpu_mp_announce(void) +{ + struct topo_node *node; + const char *hyperthread; + int pkg_count; + int cores_per_pkg; + int thrs_per_core; + + printf("FreeBSD/SMP: "); + if (topo_analyze(&topo_root, 1, &pkg_count, + &cores_per_pkg, &thrs_per_core)) { + printf("%d package(s)", pkg_count); + if (cores_per_pkg > 0) + printf(" x %d core(s)", cores_per_pkg); + if (thrs_per_core > 1) + printf(" x %d hardware threads", thrs_per_core); + } else { + printf("Non-uniform topology"); + } + printf("\n"); + + if (disabled_cpus) { + printf("FreeBSD/SMP Online: "); + if (topo_analyze(&topo_root, 0, &pkg_count, + &cores_per_pkg, &thrs_per_core)) { + printf("%d package(s)", pkg_count); + if (cores_per_pkg > 0) + printf(" x %d core(s)", cores_per_pkg); + if (thrs_per_core > 1) + printf(" x %d hardware threads", thrs_per_core); + } else { + printf("Non-uniform topology"); + } + printf("\n"); + } + + if (!bootverbose) + return; + + TOPO_FOREACH(node, &topo_root) { + switch (node->type) { + case TOPO_TYPE_PKG: + printf("Package HW ID = %u (%#x)\n", + node->hwid, node->hwid); + break; + case TOPO_TYPE_CORE: + printf("\tCore HW ID = %u (%#x)\n", + node->hwid, node->hwid); + break; + case TOPO_TYPE_PU: + if (cpu_info[node->hwid].cpu_hyperthread) + hyperthread = "/HT"; + else + hyperthread = ""; + + if (node->subtype == 0) + printf("\t\tCPU (AP%s): APIC ID: %u (%#x)" + "(disabled)\n", hyperthread, node->hwid, + node->hwid); + else if (node->id == 0) + printf("\t\tCPU0 (BSP): APIC ID: %u (%#x)\n", + node->hwid, node->hwid); + else + printf("\t\tCPU%u (AP%s): APIC ID: %u (%#x)\n", + node->id, hyperthread, node->hwid, + node->hwid); + break; + default: + /* ignored */ + break; + } + } +} + +static void +x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) +{ + struct topo_node *node; + int nchildren; + int ncores; + int i; + + KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE, + ("x86topo_add_sched_group: bad type: %u", root->type)); + CPU_COPY(&root->cpuset, &cg_root->cg_mask); + cg_root->cg_count = root->cpu_count; + if (root->type == TOPO_TYPE_SYSTEM) + cg_root->cg_level = CG_SHARE_NONE; else - cg_flags = 0; - if (mp_ncpus % (cpu_cores * cpu_logical) != 0) { - printf("WARNING: Non-uniform processors.\n"); - printf("WARNING: Using suboptimal topology.\n"); - return (smp_topo_none()); + cg_root->cg_level = root->subtype; + + ncores = 0; + node = root; + while (node != NULL) { + if (node->type != TOPO_TYPE_CORE) { + node = topo_next_node(root, node); + continue; + } + + ncores++; + node = topo_next_nonchild_node(root, node); } - /* - * No multi-core or hyper-threaded. 
- */ - if (cpu_logical * cpu_cores == 1) + + if (cg_root->cg_level != CG_SHARE_NONE && + root->cpu_count > 1 && ncores < 2) + cg_root->cg_flags = CG_FLAG_SMT; + + nchildren = 0; + node = root; + while (node != NULL) { + if (node->type != TOPO_TYPE_CACHE || + (root->type != TOPO_TYPE_SYSTEM && + CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { + node = topo_next_node(root, node); + continue; + } + nchildren++; + node = topo_next_nonchild_node(root, node); + } + + cg_root->cg_child = smp_topo_alloc(nchildren); + cg_root->cg_children = nchildren; + + node = root; + i = 0; + while (node != NULL) { + if (node->type != TOPO_TYPE_CACHE || + (root->type != TOPO_TYPE_SYSTEM && + CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { + node = topo_next_node(root, node); + continue; + } + cg_root->cg_child[i].cg_parent = cg_root; + x86topo_add_sched_group(node, &cg_root->cg_child[i]); + i++; + node = topo_next_nonchild_node(root, node); + } +} + +struct cpu_group * +cpu_topo(void) +{ + struct cpu_group *cg_root; + + if (mp_ncpus <= 1) return (smp_topo_none()); - /* - * Only HTT no multi-core. - */ - if (cpu_logical > 1 && cpu_cores == 1) - return (smp_topo_1level(CG_SHARE_L1, cpu_logical, cg_flags)); - /* - * Only multi-core no HTT. - */ - if (cpu_cores > 1 && cpu_logical == 1) - return (smp_topo_1level(CG_SHARE_L2, cpu_cores, cg_flags)); - /* - * Both HTT and multi-core. - */ - return (smp_topo_2level(CG_SHARE_L2, cpu_cores, - CG_SHARE_L1, cpu_logical, cg_flags)); + + cg_root = smp_topo_alloc(1); + x86topo_add_sched_group(&topo_root, cg_root); + return (cg_root); } void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > MAX_APIC_ID) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %d added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %d claims to be BSP, but CPU %d already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (mp_ncpus < MAXCPU) { mp_ncpus++; mp_maxid = mp_ncpus - 1; } if (bootverbose) printf("SMP: Added CPU %d (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). * If there were no calls to cpu_add() assume this is a UP system. */ if (mp_ncpus == 0) mp_ncpus = 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); return (mp_ncpus > 1); } /* - * Print various information about the SMP system hardware and setup. + * AP CPU's call this to initialize themselves. */ void -cpu_mp_announce(void) -{ - const char *hyperthread; - int i; - - printf("FreeBSD/SMP: %d package(s) x %d core(s)", - mp_ncpus / (cpu_cores * cpu_logical), cpu_cores); - if (hyperthreading_cpus > 1) - printf(" x %d HTT threads", cpu_logical); - else if (cpu_logical > 1) - printf(" x %d SMT threads", cpu_logical); - printf("\n"); - - /* List active CPUs first. */ - printf(" cpu0 (BSP): APIC ID: %2d\n", boot_cpu_id); - for (i = 1; i < mp_ncpus; i++) { - if (cpu_info[cpu_apic_ids[i]].cpu_hyperthread) - hyperthread = "/HT"; - else - hyperthread = ""; - printf(" cpu%d (AP%s): APIC ID: %2d\n", i, hyperthread, - cpu_apic_ids[i]); - } - - /* List disabled CPUs last. 
*/ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || !cpu_info[i].cpu_disabled) - continue; - if (cpu_info[i].cpu_hyperthread) - hyperthread = "/HT"; - else - hyperthread = ""; - printf(" cpu (AP%s): APIC ID: %2d (disabled)\n", hyperthread, - i); - } -} - -void init_secondary_tail(void) { u_int cpuid; /* * On real hardware, switch to x2apic mode if possible. Do it * after aps_ready was signalled, to avoid manipulating the * mode while BSP might still want to send some IPI to us * (second startup IPI is ignored on modern hardware etc). */ lapic_xapic_mode(); /* Initialize the PAT MSR. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX */ initializecpu(); /* set up FPU state on the AP */ #ifdef __amd64__ fpuinit(); #else npxinit(false); #endif if (cpu_ops.cpu_init) cpu_ops.cpu_init(); /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mca_init(); mtx_lock_spin(&ap_boot_mtx); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); printf("SMP: AP CPU #%d Launched!\n", cpuid); /* Determine if we are a logical CPU. */ - /* XXX Calculation depends on cpu_logical being a power of 2, e.g. 2 */ - if (cpu_logical > 1 && PCPU_GET(apic_id) % cpu_logical != 0) + if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } #ifdef __amd64__ /* * Enable global pages TLB extension * This also implicitly flushes the TLB */ load_cr4(rcr4() | CR4_PGE); if (pmap_pcid_enabled) load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); #endif mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (atomic_load_acq_int(&smp_started) == 0) ia32_pause(); /* Start per-CPU event timers. */ cpu_initclocks_ap(); sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } /******************************************************************* * local functions and data */ /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ - if (cpu_logical > 1 && - apic_id % cpu_logical != 0) + if (cpu_info[apic_id].cpu_hyperthread) continue; intr_add_cpu(i); } } -/* - * Assign logical CPU IDs to local APICs. - */ -void -assign_cpu_ids(void) -{ - u_int i; - - TUNABLE_INT_FETCH("machdep.hyperthreading_allowed", - &hyperthreading_allowed); - - /* Check for explicitly disabled CPUs. 
*/ - for (i = 0; i <= MAX_APIC_ID; i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp) - continue; - - if (hyperthreading_cpus > 1 && i % hyperthreading_cpus != 0) { - cpu_info[i].cpu_hyperthread = 1; - - /* - * Don't use HT CPU if it has been disabled by a - * tunable. - */ - if (hyperthreading_allowed == 0) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - /* Don't use this CPU if it has been disabled by a tunable. */ - if (resource_disabled("lapic", i)) { - cpu_info[i].cpu_disabled = 1; - continue; - } - } - - if (hyperthreading_allowed == 0 && hyperthreading_cpus > 1) { - hyperthreading_cpus = 0; - cpu_logical = 1; - } - - /* - * Assign CPU IDs to local APIC IDs and disable any CPUs - * beyond MAXCPU. CPU 0 is always assigned to the BSP. - * - * To minimize confusion for userland, we attempt to number - * CPUs such that all threads and cores in a package are - * grouped together. For now we assume that the BSP is always - * the first thread in a package and just start adding APs - * starting with the BSP's APIC ID. - */ - mp_ncpus = 1; - cpu_apic_ids[0] = boot_cpu_id; - apic_cpuids[boot_cpu_id] = 0; - for (i = boot_cpu_id + 1; i != boot_cpu_id; - i == MAX_APIC_ID ? i = 0 : i++) { - if (!cpu_info[i].cpu_present || cpu_info[i].cpu_bsp || - cpu_info[i].cpu_disabled) - continue; - - if (mp_ncpus < MAXCPU) { - cpu_apic_ids[mp_ncpus] = i; - apic_cpuids[i] = mp_ncpus; - mp_ncpus++; - } else - cpu_info[i].cpu_disabled = 1; - } - KASSERT(mp_maxid >= mp_ncpus - 1, - ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, - mp_ncpus)); -} #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Init and startup IPI. */ void ipi_startup(int apic_id, int vector) { /* * This attempts to follow the algorithm described in the * Intel Multiprocessor Specification v1.4 in section B.4. * For each IPI, we allow the local APIC ~20us to deliver the * IPI. If that times out, we panic. */ /* * first we do an INIT IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); lapic_ipi_wait(100); /* Explicitly deassert the INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); DELAY(10000); /* wait ~10mS */ /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. 
OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver first STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver second STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ } /* * Send an IPI to specified CPU handling the bitmap logic. */ void ipi_send_cpu(int cpu, u_int ipi) { u_int bitmap, old_pending, new_pending; KASSERT(cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { bitmap = 1 << ipi; ipi = IPI_BITMAP_VECTOR; do { old_pending = cpu_ipi_pending[cpu]; new_pending = old_pending | bitmap; } while (!atomic_cmpset_int(&cpu_ipi_pending[cpu], old_pending, new_pending)); if (old_pending) return; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; ipi_bitmap = atomic_readandclear_int(&cpu_ipi_pending[cpu]); if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); while ((cpu = CPU_FFS(&cpus)) != 0) { cpu--; CPU_CLR(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); if (IPI_IS_BITMAPED(ipi)) { ipi_selected(other_cpus, ipi); return; } /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. 
*/ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } int ipi_nmi_handler(void) { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); cpustop_handler(); return (0); } /* * Handle an IPI_STOP by saving our current context and spinning until we * are resumed. */ void cpustop_handler(void) { u_int cpu; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); #if defined(__amd64__) && defined(DDB) amd64_db_resume_dbreg(); #endif if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { u_int cpu; mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { #ifdef __amd64__ fpususpend(susppcbs[cpu]->sp_fpususpend); #else npxsuspend(susppcbs[cpu]->sp_fpususpend); #endif wbinvd(); CPU_SET_ATOMIC(cpu, &suspended_cpus); } else { #ifdef __amd64__ fpuresume(susppcbs[cpu]->sp_fpususpend); #else npxresume(susppcbs[cpu]->sp_fpususpend); #endif pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); /* Indicate that we are resumed */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); } /* Wait for resume */ while (!CPU_ISSET(cpu, &started_cpus)) ia32_pause(); if (cpu_ops.cpu_resume) cpu_ops.cpu_resume(); #ifdef __amd64__ if (vmm_resume_p) vmm_resume_p(); #endif /* Resume MCA and local APIC */ lapic_xapic_mode(); mca_resume(); lapic_setup(0); /* Indicate that we are resumed */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); CPU_CLR_ATOMIC(cpu, &started_cpus); } void invlcache_handler(void) { #ifdef COUNT_IPIS (*ipi_invlcache_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ wbinvd(); atomic_add_int(&smp_tlb_wait, 1); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. 
*/ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); intrcnt_add(buf, &ipi_invlcache_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif /* * Flush the TLB on other CPU's */ /* Variables needed for SMP tlb shootdown. */ static vm_offset_t smp_tlb_addr1, smp_tlb_addr2; pmap_t smp_tlb_pmap; volatile int smp_tlb_wait; #ifdef __amd64__ #define read_eflags() read_rflags() #endif static void smp_targeted_tlb_shootdown(cpuset_t mask, u_int vector, pmap_t pmap, vm_offset_t addr1, vm_offset_t addr2) { int cpu, ncpu, othercpus; othercpus = mp_ncpus - 1; /* does not shootdown self */ /* * Check for other cpus. Return if none. */ if (CPU_ISFULLSET(&mask)) { if (othercpus < 1) return; } else { CPU_CLR(PCPU_GET(cpuid), &mask); if (CPU_EMPTY(&mask)) return; } if (!(read_eflags() & PSL_I)) panic("%s: interrupts disabled", __func__); mtx_lock_spin(&smp_ipi_mtx); smp_tlb_addr1 = addr1; smp_tlb_addr2 = addr2; smp_tlb_pmap = pmap; smp_tlb_wait = 0; if (CPU_ISFULLSET(&mask)) { ncpu = othercpus; ipi_all_but_self(vector); } else { ncpu = 0; while ((cpu = CPU_FFS(&mask)) != 0) { cpu--; CPU_CLR(cpu, &mask); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, vector); ipi_send_cpu(cpu, vector); ncpu++; } } while (smp_tlb_wait < ncpu) ia32_pause(); mtx_unlock_spin(&smp_ipi_mtx); } void smp_masked_invltlb(cpuset_t mask, pmap_t pmap) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLTLB, pmap, 0, 0); #ifdef COUNT_XINVLTLB_HITS ipi_global++; #endif } } void smp_masked_invlpg(cpuset_t mask, vm_offset_t addr) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLPG, NULL, addr, 0); #ifdef COUNT_XINVLTLB_HITS ipi_page++; #endif } } void smp_masked_invlpg_range(cpuset_t mask, vm_offset_t addr1, vm_offset_t addr2) { if (smp_started) { smp_targeted_tlb_shootdown(mask, IPI_INVLRNG, NULL, addr1, addr2); #ifdef COUNT_XINVLTLB_HITS ipi_range++; ipi_range_size += (addr2 - addr1) / PAGE_SIZE; #endif } } void smp_cache_flush(void) { if (smp_started) { smp_targeted_tlb_shootdown(all_cpus, IPI_INVLCACHE, NULL, 0, 0); } } /* * Handlers for TLB related IPIs */ void invltlb_handler(void) { #ifdef COUNT_XINVLTLB_HITS xhits_gbl[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invltlb_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ if (smp_tlb_pmap == kernel_pmap) invltlb_glob(); else invltlb(); atomic_add_int(&smp_tlb_wait, 1); } void invlpg_handler(void) { #ifdef COUNT_XINVLTLB_HITS xhits_pg[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ invlpg(smp_tlb_addr1); atomic_add_int(&smp_tlb_wait, 1); } void invlrng_handler(void) { vm_offset_t addr; #ifdef COUNT_XINVLTLB_HITS xhits_rng[PCPU_GET(cpuid)]++; #endif /* COUNT_XINVLTLB_HITS */ #ifdef COUNT_IPIS 
(*ipi_invlrng_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; do { invlpg(addr); addr += PAGE_SIZE; } while (addr < smp_tlb_addr2); atomic_add_int(&smp_tlb_wait, 1); }
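
The cpu_topo()/x86topo_add_sched_group() path in this diff replaces the old fixed packages x cores x threads assumption with a recursive walk of the topo_node tree: each system or shared-cache node becomes a cpu_group, and a level with more than one CPU but fewer than two cores is tagged CG_FLAG_SMT. The standalone sketch below only models that tree-to-group conversion; the node and group types are hypothetical stand-ins for topo_node/cpu_group, and it looks only at direct children where the kernel scans the subtree for the next cache level.

#include <stdio.h>
#include <stdlib.h>

enum node_type { NODE_SYSTEM, NODE_CACHE, NODE_CORE, NODE_PU };

struct node {
	enum node_type	  type;
	int		  cpu_count;	/* logical CPUs under this node */
	int		  ncores;	/* cores under this node */
	int		  nchildren;
	struct node	**children;
};

struct group {
	int		 cpu_count;
	int		 smt;		/* >1 CPUs but <2 cores: SMT level */
	int		 nchildren;
	struct group	*children;
};

/*
 * Mirror every cache child of 'n' into a child group of 'g'.
 * (Simplified: the kernel walks the whole subtree for the next
 * cache level; direct children are enough for this illustration.)
 */
static void
build_group(const struct node *n, struct group *g)
{
	int i, gi;

	g->cpu_count = n->cpu_count;
	g->smt = (n->type != NODE_SYSTEM && n->cpu_count > 1 && n->ncores < 2);

	g->nchildren = 0;
	for (i = 0; i < n->nchildren; i++)
		if (n->children[i]->type == NODE_CACHE)
			g->nchildren++;

	g->children = calloc(g->nchildren, sizeof(*g->children));
	for (i = 0, gi = 0; i < n->nchildren; i++)
		if (n->children[i]->type == NODE_CACHE)
			build_group(n->children[i], &g->children[gi++]);
}

int
main(void)
{
	/* One shared L2 domain holding two single-threaded cores. */
	struct node core0 = { NODE_CORE, 1, 1, 0, NULL };
	struct node core1 = { NODE_CORE, 1, 1, 0, NULL };
	struct node *l2kids[] = { &core0, &core1 };
	struct node l2 = { NODE_CACHE, 2, 2, 2, l2kids };
	struct node *syskids[] = { &l2 };
	struct node sys = { NODE_SYSTEM, 2, 2, 1, syskids };
	struct group root;

	build_group(&sys, &root);
	printf("root: %d CPUs, %d child group(s)\n", root.cpu_count,
	    root.nchildren);
	return (0);
}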
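
init_secondary_tail() ends with the last AP to come up publishing smp_started via atomic_store_rel_int(), while every AP spins on atomic_load_acq_int(&smp_started) (and release_aps() on the BSP spins on the same flag) before proceeding, so that everything written before the release store is visible to the spinners. A minimal userland model of that release/acquire handshake, using C11 atomics and pthreads instead of the kernel primitives:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int started;	/* models smp_started */
static int shared_setup;	/* models state built before the release store */

/* "AP": waits until the starter publishes the flag with release semantics. */
static void *
ap_thread(void *arg)
{
	(void)arg;
	while (atomic_load_explicit(&started, memory_order_acquire) == 0)
		;			/* models the ia32_pause() spin */
	/* The acquire load guarantees we observe shared_setup == 42 here. */
	printf("AP sees setup = %d\n", shared_setup);
	return (NULL);
}

int
main(void)
{
	pthread_t ap;

	pthread_create(&ap, NULL, ap_thread, NULL);
	shared_setup = 42;		/* work done before the release store */
	atomic_store_explicit(&started, 1, memory_order_release);
	pthread_join(ap, NULL);
	return (0);
}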
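
ipi_startup() follows the Intel Multiprocessor Specification 1.4, section B.4: assert and then explicitly deassert an INIT IPI, wait roughly 10 ms, then send two STARTUP IPIs about 200 us apart, waiting for local-APIC delivery and panicking on timeout. The sketch below distills only the shape of that protocol; send_init_ipi(), send_startup_ipi(), ipi_delivered() and delay_us() are logging stand-ins, not the lapic_ipi_raw()/lapic_ipi_wait() interfaces.

#include <stdio.h>
#include <unistd.h>

/* Stand-ins for the local APIC calls in the diff; they only log. */
static void send_init_ipi(int id, int assert) { printf("INIT %s -> APIC %d\n", assert ? "assert" : "deassert", id); }
static void send_startup_ipi(int id, int vec) { printf("STARTUP vec %#x -> APIC %d\n", vec, id); }
static int  ipi_delivered(void) { return (1); }	/* pretend the APIC accepted it */
static void delay_us(int us) { usleep(us); }

static void
start_ap(int apic_id, int vector)
{
	/* 1. INIT: assert, then explicitly deassert, then settle ~10 ms. */
	send_init_ipi(apic_id, 1);
	ipi_delivered();
	send_init_ipi(apic_id, 0);
	delay_us(10000);

	/*
	 * 2./3. Two STARTUP IPIs ~200 us apart: a latched INIT (P5 erratum)
	 * may cancel the first, and the APIC ignores the second if the
	 * first one already started the AP.
	 */
	for (int i = 0; i < 2; i++) {
		send_startup_ipi(apic_id, vector);
		if (!ipi_delivered())
			return;		/* the kernel panics at this point */
		delay_us(200);
	}
}

int
main(void)
{
	start_ap(1, 0x9);	/* arbitrary APIC ID and start vector */
	return (0);
}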
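
ipi_send_cpu() coalesces the bitmapped IPIs (preempt, AST, hardclock) into a single per-CPU pending word: the sender ORs its bit in with a compare-and-swap loop and only raises IPI_BITMAP_VECTOR when the word was previously empty, and ipi_bitmap_handler() atomically reads and clears the whole word, then dispatches each set bit. A userland model of that coalescing, using C11 atomic_fetch_or()/atomic_exchange() in place of the atomic_cmpset_int() loop and atomic_readandclear_int(); the bit positions are arbitrary here.

#include <stdatomic.h>
#include <stdio.h>

#define	IPI_PREEMPT	0	/* arbitrary bit positions for the sketch */
#define	IPI_AST		1
#define	IPI_HARDCLOCK	2

static atomic_uint pending;	/* models cpu_ipi_pending[cpu] */

/*
 * Sender side: set our bit; only the transition from empty to non-empty
 * needs to raise the single bitmap interrupt vector.
 */
static int
post_bitmapped_ipi(int ipi)
{
	unsigned int old;

	old = atomic_fetch_or(&pending, 1u << ipi);
	return (old == 0);	/* 1: caller must send the real IPI */
}

/* Handler side: grab and clear every pending bit at once, then dispatch. */
static void
bitmap_handler(void)
{
	unsigned int bits;

	bits = atomic_exchange(&pending, 0);
	if (bits & (1u << IPI_PREEMPT))
		printf("preempt\n");
	if (bits & (1u << IPI_AST))
		printf("ast\n");
	if (bits & (1u << IPI_HARDCLOCK))
		printf("hardclock\n");
}

int
main(void)
{
	if (post_bitmapped_ipi(IPI_PREEMPT))
		printf("would send IPI_BITMAP_VECTOR\n");
	if (post_bitmapped_ipi(IPI_AST))	/* coalesced: no second IPI */
		printf("would send IPI_BITMAP_VECTOR\n");
	bitmap_handler();
	return (0);
}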
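
cpustop_handler() marks the current CPU in stopped_cpus, spins with ia32_pause() until the controlling CPU sets its bit in started_cpus, and then clears both bits; cpususpend_handler() uses the same started_cpus handshake around saving and restoring its context. The userland model below reproduces that bitmask handshake with two threads; the atomic unsigned masks and helpers are stand-ins for cpuset_t and the CPU_*_ATOMIC macros.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint stopped_mask;	/* models stopped_cpus */
static atomic_uint started_mask;	/* models started_cpus */

/* Target CPU: announce that we are stopped, wait for permission to restart. */
static void *
stopped_cpu(void *arg)
{
	unsigned int bit = 1u << *(int *)arg;

	atomic_fetch_or(&stopped_mask, bit);
	while ((atomic_load(&started_mask) & bit) == 0)
		;				/* models the ia32_pause() spin */
	atomic_fetch_and(&started_mask, ~bit);
	atomic_fetch_and(&stopped_mask, ~bit);
	printf("cpu restarted\n");
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	int cpu = 1;
	unsigned int bit = 1u << cpu;

	pthread_create(&t, NULL, stopped_cpu, &cpu);

	/* Controlling CPU: wait until the target reports itself stopped... */
	while ((atomic_load(&stopped_mask) & bit) == 0)
		;
	/* ...do whatever required the CPU to be stopped, then release it. */
	atomic_fetch_or(&started_mask, bit);
	pthread_join(t, NULL);
	return (0);
}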
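
smp_targeted_tlb_shootdown() publishes the address range and pmap in smp_tlb_addr1/smp_tlb_addr2/smp_tlb_pmap under smp_ipi_mtx, sends one invalidation IPI per target, and spins until smp_tlb_wait has been incremented once per target; each handler (invltlb_handler(), invlpg_handler(), invlrng_handler(), invlcache_handler()) does its local invalidation and then bumps the counter with atomic_add_int(). A userland model of that completion counter, with C11 atomics and threads standing in for the IPIs:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define	NTARGETS	4

static atomic_int tlb_wait;		/* models smp_tlb_wait */

/* Target side: stand-in for an invalidation handler. */
static void *
shootdown_handler(void *arg)
{
	(void)arg;
	/* ... local TLB invalidation would happen here ... */
	atomic_fetch_add(&tlb_wait, 1);	/* models atomic_add_int() */
	return (NULL);
}

int
main(void)
{
	pthread_t t[NTARGETS];
	int i;

	atomic_store(&tlb_wait, 0);
	for (i = 0; i < NTARGETS; i++)	/* models sending one IPI per CPU */
		pthread_create(&t[i], NULL, shootdown_handler, NULL);

	/* Initiator side: spin until every target has acknowledged. */
	while (atomic_load(&tlb_wait) < NTARGETS)
		;			/* models the ia32_pause() spin */
	printf("all %d targets done\n", NTARGETS);

	for (i = 0; i < NTARGETS; i++)
		pthread_join(t[i], NULL);
	return (0);
}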