diff --git a/sys/arm/arm/mp_machdep.c b/sys/arm/arm/mp_machdep.c index 6368f7b72da9..4089af5929eb 100644 --- a/sys/arm/arm/mp_machdep.c +++ b/sys/arm/arm/mp_machdep.c @@ -1,396 +1,397 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 Semihalf. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_ddb.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VFP #include #endif #ifdef CPU_MV_PJ4B #include #endif /* used to hold the AP's until we are ready to release them */ struct mtx ap_boot_mtx; struct pcb stoppcbs[MAXCPU]; /* # of Applications processors */ volatile int mp_naps; /* Set to 1 once we're ready to let the APs out of the pen. */ volatile int aps_ready = 0; void set_stackptrs(int cpu); /* Temporary variables for init_secondary() */ void *dpcpu[MAXCPU - 1]; /* Determine if we running MP machine */ int cpu_mp_probe(void) { KASSERT(mp_ncpus != 0, ("cpu_mp_probe: mp_ncpus is unset")); CPU_SETOF(0, &all_cpus); return (mp_ncpus > 1); } /* Start Application Processor via platform specific function */ static int check_ap(void) { uint32_t ms; for (ms = 0; ms < 2000; ++ms) { if ((mp_naps + 1) == mp_ncpus) return (0); /* success */ else DELAY(1000); } return (-2); } /* Initialize and fire up non-boot processors */ void cpu_mp_start(void) { int error, i; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* Reserve memory for application processors */ for(i = 0; i < (mp_ncpus - 1); i++) dpcpu[i] = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO); dcache_wbinv_poc_all(); /* Initialize boot code and start up processors */ platform_mp_start_ap(); /* Check if ap's started properly */ error = check_ap(); if (error) printf("WARNING: Some AP's failed to start\n"); else for (i = 1; i < mp_ncpus; i++) CPU_SET(i, &all_cpus); } /* Introduce rest of cores to the world */ void cpu_mp_announce(void) { } void init_secondary(int cpu) { struct pcpu *pc; uint32_t loop_counter; pmap_set_tex(); cpuinfo_reinit_mmu(pmap_kern_ttb); cpu_setup(); /* Provide stack pointers for other processor modes. */ set_stackptrs(cpu); enable_interrupts(PSR_A); pc = &__pcpu[cpu]; /* * pcpu_init() updates queue, so it should not be executed in parallel * on several cores */ while(mp_naps < (cpu - 1)) ; pcpu_init(pc, cpu, sizeof(struct pcpu)); pc->pc_mpidr = cp15_mpidr_get() & 0xFFFFFF; dpcpu_init(dpcpu[cpu - 1], cpu); #if defined(DDB) dbg_monitor_init_secondary(); #endif /* Signal our startup to BSP */ atomic_add_rel_32(&mp_naps, 1); /* Spin until the BSP releases the APs */ while (!atomic_load_acq_int(&aps_ready)) { #if __ARM_ARCH >= 7 __asm __volatile("wfe"); #endif } /* Initialize curthread */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); pc->pc_curthread = pc->pc_idlethread; pc->pc_curpcb = pc->pc_idlethread->td_pcb; set_curthread(pc->pc_idlethread); + schedinit_ap(); #ifdef VFP vfp_init(); #endif /* Configure the interrupt controller */ intr_pic_init_secondary(); /* Apply possible BP hardening */ cpuinfo_init_bp_hardening(); mtx_lock_spin(&ap_boot_mtx); atomic_add_rel_32(&smp_cpus, 1); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } mtx_unlock_spin(&ap_boot_mtx); loop_counter = 0; while (smp_started == 0) { DELAY(100); loop_counter++; if (loop_counter == 1000) CTR0(KTR_SMP, "AP still wait for smp_started"); } /* Start per-CPU event timers. */ cpu_initclocks_ap(); CTR0(KTR_SMP, "go into scheduler"); /* Enter the scheduler */ sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } static void ipi_rendezvous(void *dummy __unused) { CTR0(KTR_SMP, "IPI_RENDEZVOUS"); smp_rendezvous_action(); } static void ipi_ast(void *dummy __unused) { CTR0(KTR_SMP, "IPI_AST"); } static void ipi_stop(void *dummy __unused) { u_int cpu; /* * IPI_STOP_HARD is mapped to IPI_STOP. */ CTR0(KTR_SMP, "IPI_STOP or IPI_STOP_HARD"); cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); /* * CPUs are stopped when entering the debugger and at * system shutdown, both events which can precede a * panic dump. For the dump to be correct, all caches * must be flushed and invalidated, but on ARM there's * no way to broadcast a wbinv_all to other cores. * Instead, we have each core do the local wbinv_all as * part of stopping the core. The core requesting the * stop will do the l2 cache flush after all other cores * have done their l1 flushes and stopped. */ dcache_wbinv_poc_all(); /* Indicate we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) cpu_spinwait(); CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); #ifdef DDB dbg_resume_dbreg(); #endif CTR0(KTR_SMP, "IPI_STOP (restart)"); } static void ipi_preempt(void *arg) { struct trapframe *oldframe; struct thread *td; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = (struct trapframe *)arg; CTR1(KTR_SMP, "%s: IPI_PREEMPT", __func__); sched_preempt(td); td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } static void ipi_hardclock(void *arg) { struct trapframe *oldframe; struct thread *td; critical_enter(); td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = (struct trapframe *)arg; CTR1(KTR_SMP, "%s: IPI_HARDCLOCK", __func__); hardclockintr(); td->td_intr_frame = oldframe; td->td_intr_nesting_level--; critical_exit(); } static void release_aps(void *dummy __unused) { uint32_t loop_counter; if (mp_ncpus == 1) return; intr_pic_ipi_setup(IPI_RENDEZVOUS, "rendezvous", ipi_rendezvous, NULL); intr_pic_ipi_setup(IPI_AST, "ast", ipi_ast, NULL); intr_pic_ipi_setup(IPI_STOP, "stop", ipi_stop, NULL); intr_pic_ipi_setup(IPI_PREEMPT, "preempt", ipi_preempt, NULL); intr_pic_ipi_setup(IPI_HARDCLOCK, "hardclock", ipi_hardclock, NULL); atomic_store_rel_int(&aps_ready, 1); /* Wake the other threads up */ dsb(); sev(); printf("Release APs\n"); for (loop_counter = 0; loop_counter < 2000; loop_counter++) { if (smp_started) return; DELAY(1000); } printf("AP's not started\n"); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); struct cpu_group * cpu_topo(void) { return (smp_topo_1level(CG_SHARE_L2, mp_ncpus, 0)); } void cpu_mp_setmaxid(void) { platform_mp_setmaxid(); } /* Sending IPI */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); intr_ipi_send(other_cpus, ipi); } void ipi_cpu(int cpu, u_int ipi) { cpuset_t cpus; CPU_ZERO(&cpus); CPU_SET(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d, ipi: %x", __func__, cpu, ipi); intr_ipi_send(cpus, ipi); } void ipi_selected(cpuset_t cpus, u_int ipi) { CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); intr_ipi_send(cpus, ipi); } diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c index 15e05ef46262..b42f65b9e399 100644 --- a/sys/arm64/arm64/mp_machdep.c +++ b/sys/arm64/arm64/mp_machdep.c @@ -1,943 +1,944 @@ /*- * Copyright (c) 2015-2016 The FreeBSD Foundation * * This software was developed by Andrew Turner under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include "opt_acpi.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_platform.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef VFP #include #endif #ifdef DEV_ACPI #include #include #endif #ifdef FDT #include #include #include #include #endif #include #include "pic_if.h" #define MP_QUIRK_CPULIST 0x01 /* The list of cpus may be wrong, */ /* don't panic if one fails to start */ static uint32_t mp_quirks; #ifdef FDT static struct { const char *compat; uint32_t quirks; } fdt_quirks[] = { { "arm,foundation-aarch64", MP_QUIRK_CPULIST }, { "arm,fvp-base", MP_QUIRK_CPULIST }, /* This is incorrect in some DTS files */ { "arm,vfp-base", MP_QUIRK_CPULIST }, { NULL, 0 }, }; #endif typedef void intr_ipi_send_t(void *, cpuset_t, u_int); typedef void intr_ipi_handler_t(void *); #define INTR_IPI_NAMELEN (MAXCOMLEN + 1) struct intr_ipi { intr_ipi_handler_t * ii_handler; void * ii_handler_arg; intr_ipi_send_t * ii_send; void * ii_send_arg; char ii_name[INTR_IPI_NAMELEN]; u_long * ii_count; }; static struct intr_ipi ipi_sources[INTR_IPI_COUNT]; static struct intr_ipi *intr_ipi_lookup(u_int); static void intr_pic_ipi_setup(u_int, const char *, intr_ipi_handler_t *, void *); static void ipi_ast(void *); static void ipi_hardclock(void *); static void ipi_preempt(void *); static void ipi_rendezvous(void *); static void ipi_stop(void *); struct pcb stoppcbs[MAXCPU]; #ifdef FDT static u_int fdt_cpuid; #endif void mpentry(unsigned long cpuid); void init_secondary(uint64_t); /* Synchronize AP startup. */ static struct mtx ap_boot_mtx; /* Stacks for AP initialization, discarded once idle threads are started. */ void *bootstack; static void *bootstacks[MAXCPU]; /* Count of started APs, used to synchronize access to bootstack. */ static volatile int aps_started; /* Set to 1 once we're ready to let the APs out of the pen. */ static volatile int aps_ready; /* Temporary variables for init_secondary() */ void *dpcpu[MAXCPU - 1]; static bool is_boot_cpu(uint64_t target_cpu) { return (cpuid_to_pcpu[0]->pc_mpidr == (target_cpu & CPU_AFF_MASK)); } static void release_aps(void *dummy __unused) { int i, started; /* Only release CPUs if they exist */ if (mp_ncpus == 1) return; intr_pic_ipi_setup(IPI_AST, "ast", ipi_ast, NULL); intr_pic_ipi_setup(IPI_PREEMPT, "preempt", ipi_preempt, NULL); intr_pic_ipi_setup(IPI_RENDEZVOUS, "rendezvous", ipi_rendezvous, NULL); intr_pic_ipi_setup(IPI_STOP, "stop", ipi_stop, NULL); intr_pic_ipi_setup(IPI_STOP_HARD, "stop hard", ipi_stop, NULL); intr_pic_ipi_setup(IPI_HARDCLOCK, "hardclock", ipi_hardclock, NULL); atomic_store_rel_int(&aps_ready, 1); /* Wake up the other CPUs */ __asm __volatile( "dsb ishst \n" "sev \n" ::: "memory"); printf("Release APs..."); started = 0; for (i = 0; i < 2000; i++) { if (smp_started) { printf("done\n"); return; } /* * Don't time out while we are making progress. Some large * systems can take a while to start all CPUs. */ if (smp_cpus > started) { i = 0; started = smp_cpus; } DELAY(1000); } printf("APs not started\n"); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); void init_secondary(uint64_t cpu) { struct pcpu *pcpup; pmap_t pmap0; u_int mpidr; /* * Verify that the value passed in 'cpu' argument (aka context_id) is * valid. Some older U-Boot based PSCI implementations are buggy, * they can pass random value in it. */ mpidr = READ_SPECIALREG(mpidr_el1) & CPU_AFF_MASK; if (cpu >= MAXCPU || cpuid_to_pcpu[cpu] == NULL || cpuid_to_pcpu[cpu]->pc_mpidr != mpidr) { for (cpu = 0; cpu < mp_maxid; cpu++) if (cpuid_to_pcpu[cpu] != NULL && cpuid_to_pcpu[cpu]->pc_mpidr == mpidr) break; if ( cpu >= MAXCPU) panic("MPIDR for this CPU is not in pcpu table"); } pcpup = cpuid_to_pcpu[cpu]; /* * Set the pcpu pointer with a backup in tpidr_el1 to be * loaded when entering the kernel from userland. */ __asm __volatile( "mov x18, %0 \n" "msr tpidr_el1, %0" :: "r"(pcpup)); /* * Identify current CPU. This is necessary to setup * affinity registers and to provide support for * runtime chip identification. * * We need this before signalling the CPU is ready to * let the boot CPU use the results. */ pcpup->pc_midr = get_midr(); identify_cpu(cpu); /* Ensure the stores in identify_cpu have completed */ atomic_thread_fence_acq_rel(); /* Signal the BSP and spin until it has released all APs. */ atomic_add_int(&aps_started, 1); while (!atomic_load_int(&aps_ready)) __asm __volatile("wfe"); /* Initialize curthread */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); pcpup->pc_curthread = pcpup->pc_idlethread; + schedinit_ap(); /* Initialize curpmap to match TTBR0's current setting. */ pmap0 = vmspace_pmap(&vmspace0); KASSERT(pmap_to_ttbr0(pmap0) == READ_SPECIALREG(ttbr0_el1), ("pmap0 doesn't match cpu %ld's ttbr0", cpu)); pcpup->pc_curpmap = pmap0; install_cpu_errata(); intr_pic_init_secondary(); /* Start per-CPU event timers. */ cpu_initclocks_ap(); #ifdef VFP vfp_init(); #endif dbg_init(); pan_enable(); mtx_lock_spin(&ap_boot_mtx); atomic_add_rel_32(&smp_cpus, 1); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } mtx_unlock_spin(&ap_boot_mtx); kcsan_cpu_init(cpu); /* * Assert that smp_after_idle_runnable condition is reasonable. */ MPASS(PCPU_GET(curpcb) == NULL); /* Enter the scheduler */ sched_throw(NULL); panic("scheduler returned us to init_secondary"); /* NOTREACHED */ } static void smp_after_idle_runnable(void *arg __unused) { struct pcpu *pc; int cpu; for (cpu = 1; cpu < mp_ncpus; cpu++) { if (bootstacks[cpu] != NULL) { pc = pcpu_find(cpu); while (atomic_load_ptr(&pc->pc_curpcb) == NULL) cpu_spinwait(); kmem_free((vm_offset_t)bootstacks[cpu], PAGE_SIZE); } } } SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, smp_after_idle_runnable, NULL); /* * Send IPI thru interrupt controller. */ static void pic_ipi_send(void *arg, cpuset_t cpus, u_int ipi) { KASSERT(intr_irq_root_dev != NULL, ("%s: no root attached", __func__)); /* * Ensure that this CPU's stores will be visible to IPI * recipients before starting to send the interrupts. */ dsb(ishst); PIC_IPI_SEND(intr_irq_root_dev, arg, cpus, ipi); } /* * Setup IPI handler on interrupt controller. * * Not SMP coherent. */ static void intr_pic_ipi_setup(u_int ipi, const char *name, intr_ipi_handler_t *hand, void *arg) { struct intr_irqsrc *isrc; struct intr_ipi *ii; int error; KASSERT(intr_irq_root_dev != NULL, ("%s: no root attached", __func__)); KASSERT(hand != NULL, ("%s: ipi %u no handler", __func__, ipi)); error = PIC_IPI_SETUP(intr_irq_root_dev, ipi, &isrc); if (error != 0) return; isrc->isrc_handlers++; ii = intr_ipi_lookup(ipi); KASSERT(ii->ii_count == NULL, ("%s: ipi %u reused", __func__, ipi)); ii->ii_handler = hand; ii->ii_handler_arg = arg; ii->ii_send = pic_ipi_send; ii->ii_send_arg = isrc; strlcpy(ii->ii_name, name, INTR_IPI_NAMELEN); ii->ii_count = intr_ipi_setup_counters(name); PIC_ENABLE_INTR(intr_irq_root_dev, isrc); } static void intr_ipi_send(cpuset_t cpus, u_int ipi) { struct intr_ipi *ii; ii = intr_ipi_lookup(ipi); if (ii->ii_count == NULL) panic("%s: not setup IPI %u", __func__, ipi); ii->ii_send(ii->ii_send_arg, cpus, ipi); } static void ipi_ast(void *dummy __unused) { CTR0(KTR_SMP, "IPI_AST"); } static void ipi_hardclock(void *dummy __unused) { CTR1(KTR_SMP, "%s: IPI_HARDCLOCK", __func__); hardclockintr(); } static void ipi_preempt(void *dummy __unused) { CTR1(KTR_SMP, "%s: IPI_PREEMPT", __func__); sched_preempt(curthread); } static void ipi_rendezvous(void *dummy __unused) { CTR0(KTR_SMP, "IPI_RENDEZVOUS"); smp_rendezvous_action(); } static void ipi_stop(void *dummy __unused) { u_int cpu; CTR0(KTR_SMP, "IPI_STOP"); cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); /* Indicate we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) cpu_spinwait(); #ifdef DDB dbg_register_sync(NULL); #endif CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); CTR0(KTR_SMP, "IPI_STOP (restart)"); } struct cpu_group * cpu_topo(void) { struct cpu_group *dom, *root; int i; root = smp_topo_alloc(1); dom = smp_topo_alloc(vm_ndomains); root->cg_parent = NULL; root->cg_child = dom; CPU_COPY(&all_cpus, &root->cg_mask); root->cg_count = mp_ncpus; root->cg_children = vm_ndomains; root->cg_level = CG_SHARE_NONE; root->cg_flags = 0; /* * Redundant layers will be collapsed by the caller so we don't need a * special case for a single domain. */ for (i = 0; i < vm_ndomains; i++, dom++) { dom->cg_parent = root; dom->cg_child = NULL; CPU_COPY(&cpuset_domain[i], &dom->cg_mask); dom->cg_count = CPU_COUNT(&dom->cg_mask); dom->cg_children = 0; dom->cg_level = CG_SHARE_L3; dom->cg_flags = 0; } return (root); } /* Determine if we running MP machine */ int cpu_mp_probe(void) { /* ARM64TODO: Read the u bit of mpidr_el1 to determine this */ return (1); } /* * Starts a given CPU. If the CPU is already running, i.e. it is the boot CPU, * do nothing. Returns true if the CPU is present and running. */ static bool start_cpu(u_int cpuid, uint64_t target_cpu, int domain) { struct pcpu *pcpup; vm_paddr_t pa; int err, naps; /* Check we are able to start this cpu */ if (cpuid > mp_maxid) return (false); /* Skip boot CPU */ if (is_boot_cpu(target_cpu)) return (true); KASSERT(cpuid < MAXCPU, ("Too many CPUs")); pcpup = (void *)kmem_malloc_domainset(DOMAINSET_PREF(domain), sizeof(*pcpup), M_WAITOK | M_ZERO); pcpu_init(pcpup, cpuid, sizeof(struct pcpu)); pcpup->pc_mpidr = target_cpu & CPU_AFF_MASK; dpcpu[cpuid - 1] = (void *)kmem_malloc_domainset( DOMAINSET_PREF(domain), DPCPU_SIZE, M_WAITOK | M_ZERO); dpcpu_init(dpcpu[cpuid - 1], cpuid); bootstacks[cpuid] = (void *)kmem_malloc_domainset( DOMAINSET_PREF(domain), PAGE_SIZE, M_WAITOK | M_ZERO); naps = atomic_load_int(&aps_started); bootstack = (char *)bootstacks[cpuid] + PAGE_SIZE; printf("Starting CPU %u (%lx)\n", cpuid, target_cpu); pa = pmap_extract(kernel_pmap, (vm_offset_t)mpentry); err = psci_cpu_on(target_cpu, pa, cpuid); if (err != PSCI_RETVAL_SUCCESS) { /* * Panic here if INVARIANTS are enabled and PSCI failed to * start the requested CPU. psci_cpu_on() returns PSCI_MISSING * to indicate we are unable to use it to start the given CPU. */ KASSERT(err == PSCI_MISSING || (mp_quirks & MP_QUIRK_CPULIST) == MP_QUIRK_CPULIST, ("Failed to start CPU %u (%lx), error %d\n", cpuid, target_cpu, err)); pcpu_destroy(pcpup); kmem_free((vm_offset_t)dpcpu[cpuid - 1], DPCPU_SIZE); dpcpu[cpuid - 1] = NULL; kmem_free((vm_offset_t)bootstacks[cpuid], PAGE_SIZE); bootstacks[cpuid] = NULL; mp_ncpus--; return (false); } /* Wait for the AP to switch to its boot stack. */ while (atomic_load_int(&aps_started) < naps + 1) cpu_spinwait(); CPU_SET(cpuid, &all_cpus); return (true); } #ifdef DEV_ACPI static void madt_handler(ACPI_SUBTABLE_HEADER *entry, void *arg) { ACPI_MADT_GENERIC_INTERRUPT *intr; u_int *cpuid; u_int id; int domain; switch(entry->Type) { case ACPI_MADT_TYPE_GENERIC_INTERRUPT: intr = (ACPI_MADT_GENERIC_INTERRUPT *)entry; cpuid = arg; if (is_boot_cpu(intr->ArmMpidr)) id = 0; else id = *cpuid; domain = 0; #ifdef NUMA if (vm_ndomains > 1) domain = acpi_pxm_get_cpu_locality(intr->Uid); #endif if (start_cpu(id, intr->ArmMpidr, domain)) { MPASS(cpuid_to_pcpu[id] != NULL); cpuid_to_pcpu[id]->pc_acpi_id = intr->Uid; /* * Don't increment for the boot CPU, its CPU ID is * reserved. */ if (!is_boot_cpu(intr->ArmMpidr)) (*cpuid)++; } break; default: break; } } static void cpu_init_acpi(void) { ACPI_TABLE_MADT *madt; vm_paddr_t physaddr; u_int cpuid; physaddr = acpi_find_table(ACPI_SIG_MADT); if (physaddr == 0) return; madt = acpi_map_table(physaddr, ACPI_SIG_MADT); if (madt == NULL) { printf("Unable to map the MADT, not starting APs\n"); return; } /* Boot CPU is always 0 */ cpuid = 1; acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length, madt_handler, &cpuid); acpi_unmap_table(madt); #if MAXMEMDOM > 1 acpi_pxm_set_cpu_locality(); #endif } #endif #ifdef FDT static boolean_t start_cpu_fdt(u_int id, phandle_t node, u_int addr_size, pcell_t *reg) { uint64_t target_cpu; int domain; int cpuid; target_cpu = reg[0]; if (addr_size == 2) { target_cpu <<= 32; target_cpu |= reg[1]; } if (is_boot_cpu(target_cpu)) cpuid = 0; else cpuid = fdt_cpuid; if (!start_cpu(cpuid, target_cpu, 0)) return (FALSE); /* * Don't increment for the boot CPU, its CPU ID is reserved. */ if (!is_boot_cpu(target_cpu)) fdt_cpuid++; /* Try to read the numa node of this cpu */ if (vm_ndomains == 1 || OF_getencprop(node, "numa-node-id", &domain, sizeof(domain)) <= 0) domain = 0; cpuid_to_pcpu[cpuid]->pc_domain = domain; if (domain < MAXMEMDOM) CPU_SET(cpuid, &cpuset_domain[domain]); return (TRUE); } static void cpu_init_fdt(void) { phandle_t node; int i; node = OF_peer(0); for (i = 0; fdt_quirks[i].compat != NULL; i++) { if (ofw_bus_node_is_compatible(node, fdt_quirks[i].compat) != 0) { mp_quirks = fdt_quirks[i].quirks; } } fdt_cpuid = 1; ofw_cpu_early_foreach(start_cpu_fdt, true); } #endif /* Initialize and fire up non-boot processors */ void cpu_mp_start(void) { mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); /* CPU 0 is always boot CPU. */ CPU_SET(0, &all_cpus); cpuid_to_pcpu[0]->pc_mpidr = READ_SPECIALREG(mpidr_el1) & CPU_AFF_MASK; switch(arm64_bus_method) { #ifdef DEV_ACPI case ARM64_BUS_ACPI: mp_quirks = MP_QUIRK_CPULIST; cpu_init_acpi(); break; #endif #ifdef FDT case ARM64_BUS_FDT: cpu_init_fdt(); break; #endif default: break; } } /* Introduce rest of cores to the world */ void cpu_mp_announce(void) { } #ifdef DEV_ACPI static void cpu_count_acpi_handler(ACPI_SUBTABLE_HEADER *entry, void *arg) { ACPI_MADT_GENERIC_INTERRUPT *intr; u_int *cores = arg; switch(entry->Type) { case ACPI_MADT_TYPE_GENERIC_INTERRUPT: intr = (ACPI_MADT_GENERIC_INTERRUPT *)entry; (*cores)++; break; default: break; } } static u_int cpu_count_acpi(void) { ACPI_TABLE_MADT *madt; vm_paddr_t physaddr; u_int cores; physaddr = acpi_find_table(ACPI_SIG_MADT); if (physaddr == 0) return (0); madt = acpi_map_table(physaddr, ACPI_SIG_MADT); if (madt == NULL) { printf("Unable to map the MADT, not starting APs\n"); return (0); } cores = 0; acpi_walk_subtables(madt + 1, (char *)madt + madt->Header.Length, cpu_count_acpi_handler, &cores); acpi_unmap_table(madt); return (cores); } #endif void cpu_mp_setmaxid(void) { int cores; mp_ncpus = 1; mp_maxid = 0; switch(arm64_bus_method) { #ifdef DEV_ACPI case ARM64_BUS_ACPI: cores = cpu_count_acpi(); if (cores > 0) { cores = MIN(cores, MAXCPU); if (bootverbose) printf("Found %d CPUs in the ACPI tables\n", cores); mp_ncpus = cores; mp_maxid = cores - 1; } break; #endif #ifdef FDT case ARM64_BUS_FDT: cores = ofw_cpu_early_foreach(NULL, false); if (cores > 0) { cores = MIN(cores, MAXCPU); if (bootverbose) printf("Found %d CPUs in the device tree\n", cores); mp_ncpus = cores; mp_maxid = cores - 1; } break; #endif default: if (bootverbose) printf("No CPU data, limiting to 1 core\n"); break; } if (TUNABLE_INT_FETCH("hw.ncpu", &cores)) { if (cores > 0 && cores < mp_ncpus) { mp_ncpus = cores; mp_maxid = cores - 1; } } } /* * Lookup IPI source. */ static struct intr_ipi * intr_ipi_lookup(u_int ipi) { if (ipi >= INTR_IPI_COUNT) panic("%s: no such IPI %u", __func__, ipi); return (&ipi_sources[ipi]); } /* * interrupt controller dispatch function for IPIs. It should * be called straight from the interrupt controller, when associated * interrupt source is learned. Or from anybody who has an interrupt * source mapped. */ void intr_ipi_dispatch(u_int ipi, struct trapframe *tf) { void *arg; struct intr_ipi *ii; ii = intr_ipi_lookup(ipi); if (ii->ii_count == NULL) panic("%s: not setup IPI %u", __func__, ipi); intr_ipi_increment_count(ii->ii_count, PCPU_GET(cpuid)); /* * Supply ipi filter with trapframe argument * if none is registered. */ arg = ii->ii_handler_arg != NULL ? ii->ii_handler_arg : tf; ii->ii_handler(arg); } #ifdef notyet /* * Map IPI into interrupt controller. * * Not SMP coherent. */ static int ipi_map(struct intr_irqsrc *isrc, u_int ipi) { boolean_t is_percpu; int error; if (ipi >= INTR_IPI_COUNT) panic("%s: no such IPI %u", __func__, ipi); KASSERT(intr_irq_root_dev != NULL, ("%s: no root attached", __func__)); isrc->isrc_type = INTR_ISRCT_NAMESPACE; isrc->isrc_nspc_type = INTR_IRQ_NSPC_IPI; isrc->isrc_nspc_num = ipi_next_num; error = PIC_REGISTER(intr_irq_root_dev, isrc, &is_percpu); if (error == 0) { isrc->isrc_dev = intr_irq_root_dev; ipi_next_num++; } return (error); } /* * Setup IPI handler to interrupt source. * * Note that there could be more ways how to send and receive IPIs * on a platform like fast interrupts for example. In that case, * one can call this function with ASIF_NOALLOC flag set and then * call intr_ipi_dispatch() when appropriate. * * Not SMP coherent. */ int intr_ipi_set_handler(u_int ipi, const char *name, intr_ipi_filter_t *filter, void *arg, u_int flags) { struct intr_irqsrc *isrc; int error; if (filter == NULL) return(EINVAL); isrc = intr_ipi_lookup(ipi); if (isrc->isrc_ipifilter != NULL) return (EEXIST); if ((flags & AISHF_NOALLOC) == 0) { error = ipi_map(isrc, ipi); if (error != 0) return (error); } isrc->isrc_ipifilter = filter; isrc->isrc_arg = arg; isrc->isrc_handlers = 1; isrc->isrc_count = intr_ipi_setup_counters(name); isrc->isrc_index = 0; /* it should not be used in IPI case */ if (isrc->isrc_dev != NULL) { PIC_ENABLE_INTR(isrc->isrc_dev, isrc); PIC_ENABLE_SOURCE(isrc->isrc_dev, isrc); } return (0); } #endif /* Sending IPI */ void ipi_all_but_self(u_int ipi) { cpuset_t cpus; cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &cpus); CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); intr_ipi_send(cpus, ipi); } void ipi_cpu(int cpu, u_int ipi) { cpuset_t cpus; CPU_ZERO(&cpus); CPU_SET(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d, ipi: %x", __func__, cpu, ipi); intr_ipi_send(cpus, ipi); } void ipi_selected(cpuset_t cpus, u_int ipi) { CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); intr_ipi_send(cpus, ipi); } diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 7146704fc531..7591bef4b14e 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -1,1797 +1,1804 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int __read_mostly dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #ifdef SMP #define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus) #else #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #endif #define NICE_WEIGHT 1 /* Priorities per nice level. */ #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) /* * The schedulable entity that runs a context. * This is an extension to the thread structure and is tailored to * the requirements of this scheduler. * All fields are protected by the scheduler lock. */ struct td_sched { fixpt_t ts_pctcpu; /* %cpu during p_swtime. */ u_int ts_estcpu; /* Estimated cpu utilization. */ int ts_cpticks; /* Ticks of cpu time. */ int ts_slptime; /* Seconds !RUNNING. */ int ts_slice; /* Remaining part of time slice. */ int ts_flags; struct runq *ts_runq; /* runq the thread is currently on */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in td_flags */ #define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */ #define TDF_BOUND TDF_SCHED1 /* Bound to one CPU. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* flags kept in ts_flags */ #define TSF_AFFINITY 0x0001 /* Has a non-"full" CPU set. */ #define SKE_RUNQ_PCPU(ts) \ ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); static struct mtx sched_lock; static int realstathz = 127; /* stathz is sometimes 0 and run off of hz. */ static int sched_tdcnt; /* Total runnable threads in the system. */ static int sched_slice = 12; /* Thread run time before rescheduling. */ static void setup_runqs(void); static void schedcpu(void); static void schedcpu_thread(void); static void sched_priority(struct thread *td, u_char prio); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct thread *td); static void resetpriority(struct thread *td); static void resetpriority_thread(struct thread *td); #ifdef SMP static int sched_pickcpu(struct thread *td); static int forward_wakeup(int cpunum); static void kick_other_cpu(int pri, int cpuid); #endif static struct kproc_desc sched_kp = { "schedcpu", schedcpu_thread, NULL }; SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start, &sched_kp); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); /* * Global run queue. */ static struct runq runq; #ifdef SMP /* * Per-CPU run queues */ static struct runq runq_pcpu[MAXCPU]; long runq_length[MAXCPU]; static cpuset_t idle_cpus_mask; #endif struct pcpuidlestat { u_int idlecalls; u_int oldidlecalls; }; DPCPU_DEFINE_STATIC(struct pcpuidlestat, idlestat); static void setup_runqs(void) { #ifdef SMP int i; for (i = 0; i < MAXCPU; ++i) runq_init(&runq_pcpu[i]); #endif runq_init(&runq); } static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); #ifdef SMP /* Enable forwarding of wakeups to all other cpus */ static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Kernel SMP"); static int runq_fuzz = 1; SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, ""); static int forward_wakeup_enabled = 1; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW, &forward_wakeup_enabled, 0, "Forwarding of wakeup to idle CPUs"); static int forward_wakeups_requested = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD, &forward_wakeups_requested, 0, "Requests for Forwarding of wakeup to idle CPUs"); static int forward_wakeups_delivered = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD, &forward_wakeups_delivered, 0, "Completed Forwarding of wakeup to idle CPUs"); static int forward_wakeup_use_mask = 1; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW, &forward_wakeup_use_mask, 0, "Use the mask of idle cpus"); static int forward_wakeup_use_loop = 0; SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW, &forward_wakeup_use_loop, 0, "Use a loop to find idle cpus"); #endif #if 0 static int sched_followon = 0; SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW, &sched_followon, 0, "allow threads to share a quantum"); #endif SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); static __inline void sched_load_add(void) { sched_tdcnt++; KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt); SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt); } static __inline void sched_load_rem(void) { sched_tdcnt--; KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt); SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt); } /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ static void maybe_resched(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* * This function is called when a thread is about to be put on run queue * because it has been made runnable or its priority has been adjusted. It * determines if the new thread should preempt the current thread. If so, * it sets td_owepreempt to request a preemption. */ int maybe_preempt(struct thread *td) { #ifdef PREEMPTION struct thread *ctd; int cpri, pri; /* * The new thread should not preempt the current thread if any of the * following conditions are true: * * - The kernel is in the throes of crashing (panicstr). * - The current thread has a higher (numerically lower) or * equivalent priority. Note that this prevents curthread from * trying to preempt to itself. * - The current thread has an inhibitor set or is in the process of * exiting. In this case, the current thread is about to switch * out anyways, so there's no point in preempting. If we did, * the current thread would not be properly resumed as well, so * just avoid that whole landmine. * - If the new thread's priority is not a realtime priority and * the current thread's priority is not an idle priority and * FULL_PREEMPTION is disabled. * * If all of these conditions are false, but the current thread is in * a nested critical section, then we have to defer the preemption * until we exit the critical section. Otherwise, switch immediately * to the new thread. */ ctd = curthread; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("maybe_preempt: trying to run inhibited thread")); pri = td->td_priority; cpri = ctd->td_priority; if (KERNEL_PANICKED() || pri >= cpri /* || dumping */ || TD_IS_INHIBITED(ctd)) return (0); #ifndef FULL_PREEMPTION if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE) return (0); #endif CTR0(KTR_PROC, "maybe_preempt: scheduling preemption"); ctd->td_owepreempt = 1; return (1); #else return (0); #endif } /* * Constants for digital decay and forget: * 90% of (ts_estcpu) usage in 5 * loadav time * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates ts_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of ts_estcpu in (5 * loadavg) seconds. * That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * ts_estcpu *= decay; * will compute * ts_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "Decay factor used for updating %CPU"); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(void) { fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct td_sched *ts; int awake; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } FOREACH_THREAD_IN_PROC(p, td) { awake = 0; ts = td_get_sched(td); thread_lock(td); /* * Increment sleep time (if sleeping). We * ignore overflow, as above. */ /* * The td_sched slptimes are not touched in wakeup * because the thread may not HAVE everything in * memory? XXX I think this is out of date. */ if (TD_ON_RUNQ(td)) { awake = 1; td->td_flags &= ~TDF_DIDRUN; } else if (TD_IS_RUNNING(td)) { awake = 1; /* Do not clear TDF_DIDRUN */ } else if (td->td_flags & TDF_DIDRUN) { awake = 1; td->td_flags &= ~TDF_DIDRUN; } /* * ts_pctcpu is only for ps and ttyinfo(). */ ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT; /* * If the td_sched has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ts->ts_cpticks != 0) { #if (FSHIFT >= CCPU_SHIFT) ts->ts_pctcpu += (realstathz == 100) ? ((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ts->ts_pctcpu += ((FSCALE - ccpu) * (ts->ts_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ts->ts_cpticks = 0; } /* * If there are ANY running threads in this process, * then don't count it as sleeping. * XXX: this is broken. */ if (awake) { if (ts->ts_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(td); } ts->ts_slptime = 0; } else ts->ts_slptime++; if (ts->ts_slptime > 1) { thread_unlock(td); continue; } ts->ts_estcpu = decay_cpu(loadfac, ts->ts_estcpu); resetpriority(td); resetpriority_thread(td); thread_unlock(td); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } /* * Main loop for a kthread that executes schedcpu once a second. */ static void schedcpu_thread(void) { for (;;) { schedcpu(); pause("-", hz); } } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max ts_estcpu of 255, sleeping for at * least six times the loadfactor will decay ts_estcpu to zero. */ static void updatepri(struct thread *td) { struct td_sched *ts; fixpt_t loadfac; unsigned int newcpu; ts = td_get_sched(td); loadfac = loadfactor(averunnable.ldavg[0]); if (ts->ts_slptime > 5 * loadfac) ts->ts_estcpu = 0; else { newcpu = ts->ts_estcpu; ts->ts_slptime--; /* was incremented in schedcpu() */ while (newcpu && --ts->ts_slptime) newcpu = decay_cpu(loadfac, newcpu); ts->ts_estcpu = newcpu; } } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ static void resetpriority(struct thread *td) { u_int newpriority; if (td->td_pri_class != PRI_TIMESHARE) return; newpriority = PUSER + td_get_sched(td)->ts_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); sched_user_prio(td, newpriority); } /* * Update the thread's priority when the associated process's user * priority changes. */ static void resetpriority_thread(struct thread *td) { /* Only change threads with a time sharing user priority. */ if (td->td_priority < PRI_MIN_TIMESHARE || td->td_priority > PRI_MAX_TIMESHARE) return; /* XXX the whole needresched thing is broken, but not silly. */ maybe_resched(td); sched_prio(td, td->td_user_pri); } /* ARGSUSED */ static void sched_setup(void *dummy) { setup_runqs(); /* Account for thread0. */ sched_load_add(); } /* * This routine determines time constants after stathz and hz are setup. */ static void sched_initticks(void *dummy) { realstathz = stathz ? stathz : hz; sched_slice = realstathz / 10; /* ~100ms */ hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); } /* External interfaces start here */ /* * Very early in the boot some setup of scheduler-specific * parts of proc0 and of some scheduler resources needs to be done. * Called from: * proc0_init() */ void schedinit(void) { /* * Set up the scheduler specific parts of thread0. */ thread0.td_lock = &sched_lock; td_get_sched(&thread0)->ts_slice = sched_slice; mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN); } +void +schedinit_ap(void) +{ + + /* Nothing needed. */ +} + int sched_runnable(void) { #ifdef SMP return runq_check(&runq) + runq_check(&runq_pcpu[PCPU_GET(cpuid)]); #else return runq_check(&runq); #endif } int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * We adjust the priority of the current process. The priority of a * process gets worse as it accumulates CPU time. The cpu usage * estimator (ts_estcpu) is increased here. resetpriority() will * compute a different priority each time ts_estcpu increases by * INVERSE_ESTCPU_WEIGHT (until PRI_MAX_TIMESHARE is reached). The * cpu usage estimator ramps up quite quickly when the process is * running (linearly), and decays away exponentially, at a rate which * is proportionally slower when the system is busy. The basic * principle is that the system will 90% forget that the process used * a lot of CPU time in 5 * loadav seconds. This causes the system to * favor processes which haven't run much recently, and to round-robin * among other processes. */ static void sched_clock_tick(struct thread *td) { struct pcpuidlestat *stat; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); ts->ts_cpticks++; ts->ts_estcpu = ESTCPULIM(ts->ts_estcpu + 1); if ((ts->ts_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(td); resetpriority_thread(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) { ts->ts_slice = sched_slice; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } stat = DPCPU_PTR(idlestat); stat->oldidlecalls = stat->idlecalls; stat->idlecalls = 0; } void sched_clock(struct thread *td, int cnt) { for ( ; cnt > 0; cnt--) sched_clock_tick(td); } /* * Charge child's scheduling CPU usage to parent. */ void sched_exit(struct proc *p, struct thread *td) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit", "prio:%d", td->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); } void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit", "prio:%d", child->td_priority); thread_lock(td); td_get_sched(td)->ts_estcpu = ESTCPULIM(td_get_sched(td)->ts_estcpu + td_get_sched(child)->ts_estcpu); thread_unlock(td); thread_lock(child); if ((child->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); thread_unlock(child); } void sched_fork(struct thread *td, struct thread *childtd) { sched_fork_thread(td, childtd); } void sched_fork_thread(struct thread *td, struct thread *childtd) { struct td_sched *ts, *tsc; childtd->td_oncpu = NOCPU; childtd->td_lastcpu = NOCPU; childtd->td_lock = &sched_lock; childtd->td_cpuset = cpuset_ref(td->td_cpuset); childtd->td_domain.dr_policy = td->td_cpuset->cs_domain; childtd->td_priority = childtd->td_base_pri; ts = td_get_sched(childtd); bzero(ts, sizeof(*ts)); tsc = td_get_sched(td); ts->ts_estcpu = tsc->ts_estcpu; ts->ts_flags |= (tsc->ts_flags & TSF_AFFINITY); ts->ts_slice = 1; } void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); resetpriority(td); resetpriority_thread(td); thread_unlock(td); } } void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_pri_class = class; } /* * Adjust the priority of a thread. */ static void sched_priority(struct thread *td, u_char prio) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio > td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; td->td_priority = prio; if (TD_ON_RUNQ(td) && td->td_rqindex != (prio / RQ_PPQ)) { sched_rem(td); sched_add(td, SRQ_BORING | SRQ_HOLDTD); } } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regulary priority is less * important than prio the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_prio(td, base_pri); } else sched_lend_prio(td, prio); } void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't ever * lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } void sched_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } /* * Like the above but first check if there is anything to do. */ void sched_lend_user_prio_cond(struct thread *td, u_char prio) { if (td->td_lend_user_pri != prio) goto lend; if (td->td_user_pri != min(prio, td->td_base_user_pri)) goto lend; if (td->td_priority != td->td_user_pri) goto lend; return; lend: thread_lock(td); sched_lend_user_prio(td, prio); thread_unlock(td); } void sched_sleep(struct thread *td, int pri) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; td_get_sched(td)->ts_slptime = 0; if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, pri); if (TD_IS_SUSPENDED(td) || pri >= PSOCK) td->td_flags |= TDF_CANSWAP; } void sched_switch(struct thread *td, int flags) { struct thread *newtd; struct mtx *tmtx; struct td_sched *ts; struct proc *p; int preempted; tmtx = &sched_lock; ts = td_get_sched(td); p = td->td_proc; THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lastcpu = td->td_oncpu; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND); td->td_owepreempt = 0; td->td_oncpu = NOCPU; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. We never put the idle * threads on the run queue, however. */ if (td->td_flags & TDF_IDLETD) { TD_SET_CAN_RUN(td); #ifdef SMP CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask); #endif } else { if (TD_IS_RUNNING(td)) { /* Put us back on the run queue. */ sched_add(td, preempted ? SRQ_HOLDTD|SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_HOLDTD|SRQ_OURSELF|SRQ_YIELDING); } } /* * Switch to the sched lock to fix things up and pick * a new thread. Block the td_lock in order to avoid * breaking the critical path. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); tmtx = thread_lock_block(td); mtx_unlock_spin(tmtx); } if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); newtd = choosethread(); MPASS(newtd->td_lock == &sched_lock); #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); /* I feel sleepy */ lock_profile_release_lock(&sched_lock.lock_object, true); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif cpu_switch(td, newtd, tmtx); lock_profile_obtain_lock_success(&sched_lock.lock_object, true, 0, 0, __FILE__, __LINE__); /* * Where am I? What year is it? * We are in the same thread that went to sleep above, * but any amount of time may have passed. All our context * will still be available as will local variables. * PCPU values however may have changed as we may have * changed CPU so don't trust cached values of them. * New threads will go to fork_exit() instead of here * so if you change things here you may need to change * things there too. * * If the thread above was exiting it will never wake * up again here, so either it has saved everything it * needed to, or the thread_wait() or wait() will * need to reap it. */ SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { td->td_lock = &sched_lock; SDT_PROBE0(sched, , , remain__cpu); } KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); #ifdef SMP if (td->td_flags & TDF_IDLETD) CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask); #endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); spinlock_enter(); mtx_unlock_spin(&sched_lock); } void sched_wakeup(struct thread *td, int srqflags) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; if (ts->ts_slptime > 1) { updatepri(td); resetpriority(td); } td->td_slptick = 0; ts->ts_slptime = 0; ts->ts_slice = sched_slice; sched_add(td, srqflags); } #ifdef SMP static int forward_wakeup(int cpunum) { struct pcpu *pc; cpuset_t dontuse, map, map2; u_int id, me; int iscpuset; mtx_assert(&sched_lock, MA_OWNED); CTR0(KTR_RUNQ, "forward_wakeup()"); if ((!forward_wakeup_enabled) || (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0)) return (0); if (!smp_started || KERNEL_PANICKED()) return (0); forward_wakeups_requested++; /* * Check the idle mask we received against what we calculated * before in the old version. */ me = PCPU_GET(cpuid); /* Don't bother if we should be doing it ourself. */ if (CPU_ISSET(me, &idle_cpus_mask) && (cpunum == NOCPU || me == cpunum)) return (0); CPU_SETOF(me, &dontuse); CPU_OR(&dontuse, &dontuse, &stopped_cpus); CPU_OR(&dontuse, &dontuse, &hlt_cpus_mask); CPU_ZERO(&map2); if (forward_wakeup_use_loop) { STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { id = pc->pc_cpuid; if (!CPU_ISSET(id, &dontuse) && pc->pc_curthread == pc->pc_idlethread) { CPU_SET(id, &map2); } } } if (forward_wakeup_use_mask) { map = idle_cpus_mask; CPU_ANDNOT(&map, &map, &dontuse); /* If they are both on, compare and use loop if different. */ if (forward_wakeup_use_loop) { if (CPU_CMP(&map, &map2)) { printf("map != map2, loop method preferred\n"); map = map2; } } } else { map = map2; } /* If we only allow a specific CPU, then mask off all the others. */ if (cpunum != NOCPU) { KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum.")); iscpuset = CPU_ISSET(cpunum, &map); if (iscpuset == 0) CPU_ZERO(&map); else CPU_SETOF(cpunum, &map); } if (!CPU_EMPTY(&map)) { forward_wakeups_delivered++; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { id = pc->pc_cpuid; if (!CPU_ISSET(id, &map)) continue; if (cpu_idle_wakeup(pc->pc_cpuid)) CPU_CLR(id, &map); } if (!CPU_EMPTY(&map)) ipi_selected(map, IPI_AST); return (1); } if (cpunum == NOCPU) printf("forward_wakeup: Idle processor not found\n"); return (0); } static void kick_other_cpu(int pri, int cpuid) { struct pcpu *pcpu; int cpri; pcpu = pcpu_find(cpuid); if (CPU_ISSET(cpuid, &idle_cpus_mask)) { forward_wakeups_delivered++; if (!cpu_idle_wakeup(cpuid)) ipi_cpu(cpuid, IPI_AST); return; } cpri = pcpu->pc_curthread->td_priority; if (pri >= cpri) return; #if defined(IPI_PREEMPTION) && defined(PREEMPTION) #if !defined(FULL_PREEMPTION) if (pri <= PRI_MAX_ITHD) #endif /* ! FULL_PREEMPTION */ { ipi_cpu(cpuid, IPI_PREEMPT); return; } #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */ pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED; ipi_cpu(cpuid, IPI_AST); return; } #endif /* SMP */ #ifdef SMP static int sched_pickcpu(struct thread *td) { int best, cpu; mtx_assert(&sched_lock, MA_OWNED); if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu)) best = td->td_lastcpu; else best = NOCPU; CPU_FOREACH(cpu) { if (!THREAD_CAN_SCHED(td, cpu)) continue; if (best == NOCPU) best = cpu; else if (runq_length[cpu] < runq_length[best]) best = cpu; } KASSERT(best != NOCPU, ("no valid CPUs")); return (best); } #endif void sched_add(struct thread *td, int flags) #ifdef SMP { cpuset_t tidlemsk; struct td_sched *ts; u_int cpu, cpuid; int forwarded = 0; int single_cpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); if ((flags & SRQ_HOLD) != 0) td->td_lock = &sched_lock; else thread_lock_set(td, &sched_lock); } TD_SET_RUNQ(td); /* * If SMP is started and the thread is pinned or otherwise limited to * a specific set of CPUs, queue the thread to a per-CPU run queue. * Otherwise, queue the thread to the global run queue. * * If SMP has not yet been started we must use the global run queue * as per-CPU state may not be initialized yet and we may crash if we * try to access the per-CPU run queues. */ if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND || ts->ts_flags & TSF_AFFINITY)) { if (td->td_pinned != 0) cpu = td->td_lastcpu; else if (td->td_flags & TDF_BOUND) { /* Find CPU from bound runq. */ KASSERT(SKE_RUNQ_PCPU(ts), ("sched_add: bound td_sched not on cpu runq")); cpu = ts->ts_runq - &runq_pcpu[0]; } else /* Find a valid CPU for our cpuset */ cpu = sched_pickcpu(td); ts->ts_runq = &runq_pcpu[cpu]; single_cpu = 1; CTR3(KTR_RUNQ, "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu); } else { CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts, td); cpu = NOCPU; ts->ts_runq = &runq; } if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_add(); runq_add(ts->ts_runq, td, flags); if (cpu != NOCPU) runq_length[cpu]++; cpuid = PCPU_GET(cpuid); if (single_cpu && cpu != cpuid) { kick_other_cpu(td->td_priority, cpu); } else { if (!single_cpu) { tidlemsk = idle_cpus_mask; CPU_ANDNOT(&tidlemsk, &tidlemsk, &hlt_cpus_mask); CPU_CLR(cpuid, &tidlemsk); if (!CPU_ISSET(cpuid, &idle_cpus_mask) && ((flags & SRQ_INTR) == 0) && !CPU_EMPTY(&tidlemsk)) forwarded = forward_wakeup(cpu); } if (!forwarded) { if (!maybe_preempt(td)) maybe_resched(td); } } if ((flags & SRQ_HOLDTD) == 0) thread_unlock(td); } #else /* SMP */ { struct td_sched *ts; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != &sched_lock) { mtx_lock_spin(&sched_lock); if ((flags & SRQ_HOLD) != 0) td->td_lock = &sched_lock; else thread_lock_set(td, &sched_lock); } TD_SET_RUNQ(td); CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td); ts->ts_runq = &runq; if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_add(); runq_add(ts->ts_runq, td, flags); if (!maybe_preempt(td)) maybe_resched(td); if ((flags & SRQ_HOLDTD) == 0) thread_unlock(td); } #endif /* SMP */ void sched_rem(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); KASSERT(td->td_flags & TDF_INMEM, ("sched_rem: thread swapped out")); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); mtx_assert(&sched_lock, MA_OWNED); KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); if ((td->td_flags & TDF_NOLOAD) == 0) sched_load_rem(); #ifdef SMP if (ts->ts_runq != &runq) runq_length[ts->ts_runq - runq_pcpu]--; #endif runq_remove(ts->ts_runq, td); TD_SET_CAN_RUN(td); } /* * Select threads to run. Note that running threads still consume a * slot. */ struct thread * sched_choose(void) { struct thread *td; struct runq *rq; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP struct thread *tdcpu; rq = &runq; td = runq_choose_fuzz(&runq, runq_fuzz); tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]); if (td == NULL || (tdcpu != NULL && tdcpu->td_priority < td->td_priority)) { CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu, PCPU_GET(cpuid)); td = tdcpu; rq = &runq_pcpu[PCPU_GET(cpuid)]; } else { CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td); } #else rq = &runq; td = runq_choose(&runq); #endif if (td) { #ifdef SMP if (td == tdcpu) runq_length[PCPU_GET(cpuid)]--; #endif runq_remove(rq, td); td->td_flags |= TDF_DIDRUN; KASSERT(td->td_flags & TDF_INMEM, ("sched_choose: thread swapped out")); return (td); } return (PCPU_GET(idlethread)); } void sched_preempt(struct thread *td) { SDT_PROBE2(sched, , , surrender, td, td->td_proc); if (td->td_critnest > 1) { td->td_owepreempt = 1; } else { thread_lock(td); mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT); } } void sched_userret_slowpath(struct thread *td) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; thread_unlock(td); } void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); td->td_flags |= TDF_BOUND; #ifdef SMP ts->ts_runq = &runq_pcpu[cpu]; if (PCPU_GET(cpuid) == cpu) return; mi_switch(SW_VOL); thread_lock(td); #endif } void sched_unbind(struct thread* td) { THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); td->td_flags &= ~TDF_BOUND; } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td->td_flags & TDF_BOUND); } void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH); } int sched_load(void) { return (sched_tdcnt); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } fixpt_t sched_pctcpu(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); return (ts->ts_pctcpu); } #ifdef RACCT /* * Calculates the contribution to the thread cpu usage for the latest * (unfinished) second. */ fixpt_t sched_pctcpu_delta(struct thread *td) { struct td_sched *ts; fixpt_t delta; int realstathz; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); delta = 0; realstathz = stathz ? stathz : hz; if (ts->ts_cpticks != 0) { #if (FSHIFT >= CCPU_SHIFT) delta = (realstathz == 100) ? ((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ts->ts_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else delta = ((FSCALE - ccpu) * (ts->ts_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif } return (delta); } #endif u_int sched_estcpu(struct thread *td) { return (td_get_sched(td)->ts_estcpu); } /* * The actual idle process. */ void sched_idletd(void *dummy) { struct pcpuidlestat *stat; THREAD_NO_SLEEPING(); stat = DPCPU_PTR(idlestat); for (;;) { mtx_assert(&Giant, MA_NOTOWNED); while (sched_runnable() == 0) { cpu_idle(stat->idlecalls + stat->oldidlecalls > 64); stat->idlecalls++; } mtx_lock_spin(&sched_lock); mi_switch(SW_VOL | SWT_IDLE); } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { /* * Correct spinlock nesting. The idle thread context that we are * borrowing was created so that it would start out with a single * spin lock (sched_lock) held in fork_trampoline(). Since we've * explicitly acquired locks in this function, the nesting count * is now 2 rather than 1. Since we are nested, calling * spinlock_exit() will simply adjust the counts without allowing * spin lock using code to interrupt us. */ if (td == NULL) { mtx_lock_spin(&sched_lock); spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); } else { lock_profile_release_lock(&sched_lock.lock_object, true); MPASS(td->td_lock == &sched_lock); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; } mtx_assert(&sched_lock, MA_OWNED); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count")); cpu_throw(td, choosethread()); /* doesn't return */ } void sched_fork_exit(struct thread *td) { /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with sched_lock held but not recursed. */ td->td_oncpu = PCPU_GET(cpuid); sched_lock.mtx_lock = (uintptr_t)td; lock_profile_obtain_lock_success(&sched_lock.lock_object, true, 0, 0, __FILE__, __LINE__); THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; int cpu; THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Set the TSF_AFFINITY flag if there is at least one CPU this * thread can't run on. */ ts = td_get_sched(td); ts->ts_flags &= ~TSF_AFFINITY; CPU_FOREACH(cpu) { if (!THREAD_CAN_SCHED(td, cpu)) { ts->ts_flags |= TSF_AFFINITY; break; } } /* * If this thread can run on all CPUs, nothing else to do. */ if (!(ts->ts_flags & TSF_AFFINITY)) return; /* Pinned threads and bound threads should be left alone. */ if (td->td_pinned != 0 || td->td_flags & TDF_BOUND) return; switch (td->td_state) { case TDS_RUNQ: /* * If we are on a per-CPU runqueue that is in the set, * then nothing needs to be done. */ if (ts->ts_runq != &runq && THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu)) return; /* Put this thread on a valid per-CPU runqueue. */ sched_rem(td); sched_add(td, SRQ_HOLDTD | SRQ_BORING); break; case TDS_RUNNING: /* * See if our current CPU is in the set. If not, force a * context switch. */ if (THREAD_CAN_SCHED(td, td->td_oncpu)) return; td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(cpu, IPI_AST); break; default: break; } #endif } diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index c9498cac7912..1f859b286843 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1,3212 +1,3227 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2007, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements the ULE scheduler. ULE supports independent CPU * run queues and fine grain locking. It has superior interactive * performance under load even on uni-processor systems. * * etymology: * ULE is the last three letters in schedule. It owes its name to a * generic user created for a scheduling system by Paul Mikesell at * Isilon Systems and a general lack of creativity on the part of the author. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int __read_mostly dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif #include #include #define KTR_ULE 0 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) #define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU))) #define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load")) /* * Thread scheduler specific section. All fields are protected * by the thread lock. */ struct td_sched { struct runq *ts_runq; /* Run-queue we're queued on. */ short ts_flags; /* TSF_* flags. */ int ts_cpu; /* CPU that we have affinity for. */ int ts_rltick; /* Real last tick, for affinity. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. slept */ u_int ts_runtime; /* Number of ticks we were running */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); /* * Priority ranges used for interactive and non-interactive timeshare * threads. The timeshare priorities are split up into four ranges. * The first range handles interactive threads. The last three ranges * (NHALF, x, and NHALF) handle non-interactive threads with the outer * ranges supporting nice values. */ #define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2) #define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE) #define PRI_MIN_INTERACT PRI_MIN_TIMESHARE #define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1) #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) #define PRI_MAX_BATCH PRI_MAX_TIMESHARE /* * Cpu percentage computation macros and defines. * * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. * SCHED_TICK_MAX: Maximum number of ticks before scaling back. * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. */ #define SCHED_TICK_SECS 10 #define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) #define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) #define SCHED_TICK_SHIFT 10 #define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) #define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) /* * These macros determine priorities for non-interactive threads. They are * assigned a priority based on their recent cpu utilization as expressed * by the ratio of ticks to the tick total. NHALF priorities at the start * and end of the MIN to MAX timeshare range are only reachable with negative * or positive nice respectively. * * PRI_RANGE: Priority range for utilization dependent priorities. * PRI_NRESV: Number of nice values. * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. * PRI_NICE: Determines the part of the priority inherited from nice. */ #define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) #define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF) #define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF) #define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) #define SCHED_PRI_TICKS(ts) \ (SCHED_TICK_HZ((ts)) / \ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) #define SCHED_PRI_NICE(nice) (nice) /* * These determine the interactivity of a process. Interactivity differs from * cpu utilization in that it expresses the voluntary time slept vs time ran * while cpu utilization includes all time not running. This more accurately * models the intent of the thread. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters determine the slice behavior for batch work. */ #define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */ #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ /* Flags kept in td_flags. */ #define TDF_PICKCPU TDF_SCHED0 /* Thread should pick new CPU. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* * tickincr: Converts a stathz tick into a hz domain scaled by * the shift factor. Without the shift the error rate * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static u_int __read_mostly sched_interact = SCHED_INTERACT_THRESH; static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT; static int __read_mostly realstathz = 127; /* reset during boot. */ static int __read_mostly sched_slice = 10; /* reset during boot. */ static int __read_mostly sched_slice_min = 1; /* reset during boot. */ #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int __read_mostly preempt_thresh = PRI_MAX_IDLE; #else static int __read_mostly preempt_thresh = PRI_MIN_KERN; #endif #else static int __read_mostly preempt_thresh = 0; #endif static int __read_mostly static_boost = PRI_MIN_BATCH; static int __read_mostly sched_idlespins = 10000; static int __read_mostly sched_idlespinthresh = -1; /* * tdq - per processor runqs and statistics. All fields are protected by the * tdq_lock. The load and lowpri may be accessed without to avoid excess * locking in sched_pickcpu(); */ struct tdq { /* * Ordered to improve efficiency of cpu_search() and switch(). * tdq_lock is padded to avoid false sharing with tdq_load and * tdq_cpu_idle. */ struct mtx_padalign tdq_lock; /* run queue lock. */ struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ volatile int tdq_load; /* Aggregate load. */ volatile int tdq_cpu_idle; /* cpu_idle() is active. */ int tdq_sysload; /* For loadavg, !ITHD load. */ volatile int tdq_transferable; /* Transferable thread count. */ volatile short tdq_switchcnt; /* Switches this tick. */ volatile short tdq_oldswitchcnt; /* Switches last tick. */ u_char tdq_lowpri; /* Lowest priority thread. */ u_char tdq_owepreempt; /* Remote preemption pending. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ int tdq_id; /* cpuid. */ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ char tdq_name[TDQ_NAME_LEN]; #ifdef KTR char tdq_loadname[TDQ_LOADNAME_LEN]; #endif } __aligned(64); /* Idle thread states and config. */ #define TDQ_RUNNING 1 #define TDQ_IDLE 2 #ifdef SMP struct cpu_group __read_mostly *cpu_top; /* CPU topology */ #define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) #define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). */ static int __read_mostly affinity; static int __read_mostly steal_idle = 1; static int __read_mostly steal_thresh = 2; static int __read_mostly always_steal = 0; static int __read_mostly trysteal_limit = 2; /* * One thread queue per processor. */ static struct tdq __read_mostly *balance_tdq; static int balance_ticks; DPCPU_DEFINE_STATIC(struct tdq, tdq); DPCPU_DEFINE_STATIC(uint32_t, randomval); #define TDQ_SELF() ((struct tdq *)PCPU_GET(sched)) #define TDQ_CPU(x) (DPCPU_ID_PTR((x), tdq)) #define TDQ_ID(x) ((x)->tdq_id) #else /* !SMP */ static struct tdq tdq_cpu; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) #define TDQ_CPU(x) (&tdq_cpu) #endif #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_TRYLOCK(t) mtx_trylock_spin(TDQ_LOCKPTR((t))) #define TDQ_TRYLOCK_FLAGS(t, f) mtx_trylock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *, int); /* Operations on per processor queues */ static struct thread *tdq_choose(struct tdq *); static void tdq_setup(struct tdq *, int i); static void tdq_load_add(struct tdq *, struct thread *); static void tdq_load_rem(struct tdq *, struct thread *); static __inline void tdq_runq_add(struct tdq *, struct thread *, int); static __inline void tdq_runq_rem(struct tdq *, struct thread *); static inline int sched_shouldpreempt(int, int, int); void tdq_print(int cpu); static void runq_print(struct runq *rq); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP static struct thread *tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct tdq *, struct thread *); static struct thread *tdq_steal(struct tdq *, int); static struct thread *runq_steal(struct runq *, int); static int sched_pickcpu(struct thread *, int); static void sched_balance(void); static int sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct thread *, int, int); static inline void thread_unblock_switch(struct thread *, struct mtx *); static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent); #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); /* * Print the threads waiting on a run-queue. */ static void runq_print(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; int j; int i; for (i = 0; i < RQB_LEN; i++) { printf("\t\trunq bits %d 0x%zx\n", i, rq->rq_status.rqb_bits[i]); for (j = 0; j < RQB_BPW; j++) if (rq->rq_status.rqb_bits[i] & (1ul << j)) { pri = j + (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(td, rqh, td_runq) { printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", td, td->td_name, td->td_priority, td->td_rqindex, pri); } } } } /* * Print the status of a per-cpu thread queue. Should be a ddb show cmd. */ void tdq_print(int cpu) { struct tdq *tdq; tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); printf("\ttimeshare runq:\n"); runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); } static inline int sched_shouldpreempt(int pri, int cpri, int remote) { /* * If the new priority is not better than the current priority there is * nothing to do. */ if (pri >= cpri) return (0); /* * Always preempt idle. */ if (cpri >= PRI_MIN_IDLE) return (1); /* * If preemption is disabled don't preempt others. */ if (preempt_thresh == 0) return (0); /* * Preempt if we exceed the threshold. */ if (pri <= preempt_thresh) return (1); /* * If we're interactive or better and there is non-interactive * or worse running preempt only remote processors. */ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) return (1); return (0); } /* * Add a thread to the actual run-queue. Keeps transferable counts up to * date with what is actually on the run-queue. Selects the correct * queue position for timeshare threads. */ static __inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; u_char pri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td_get_sched(td); TD_SET_RUNQ(td); if (THREAD_CAN_MIGRATE(td)) { tdq->tdq_transferable++; ts->ts_flags |= TSF_XFERABLE; } if (pri < PRI_MIN_BATCH) { ts->ts_runq = &tdq->tdq_realtime; } else if (pri <= PRI_MAX_BATCH) { ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, ("Invalid priority %d on timeshare runq", pri)); /* * This queue contains only priorities between MIN and MAX * realtime. Use the whole queue to represent these values. */ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE; pri = (pri + tdq->tdq_idx) % RQ_NQS; /* * This effectively shortens the queue by one so we * can have a one slot difference between idx and * ridx while we wait for threads to drain. */ if (tdq->tdq_ridx != tdq->tdq_idx && pri == tdq->tdq_ridx) pri = (unsigned char)(pri - 1) % RQ_NQS; } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, td, pri, flags); return; } else ts->ts_runq = &tdq->tdq_idle; runq_add(ts->ts_runq, td, flags); } /* * Remove a thread from a run-queue. This typically happens when a thread * is selected to run. Running threads are not on the queue and the * transferable count does not reflect them. */ static __inline void tdq_runq_rem(struct tdq *tdq, struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", td)); if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, td, NULL); } else runq_remove(ts->ts_runq, td); } /* * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load * for this thread to the referenced thread queue. */ static void tdq_load_add(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); tdq->tdq_load++; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload++; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Remove the load from a thread that is transitioning to a sleep state or * exiting. */ static void tdq_load_rem(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload--; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Bound timeshare latency by decreasing slice size as load increases. We * consider the maximum latency as the sum of the threads waiting to run * aside from curthread and target no more than sched_slice latency but * no less than sched_slice_min runtime. */ static inline int tdq_slice(struct tdq *tdq) { int load; /* * It is safe to use sys_load here because this is called from * contexts where timeshare threads are running and so there * cannot be higher priority load in the system. */ load = tdq->tdq_sysload - 1; if (load >= SCHED_SLICE_MIN_DIVISOR) return (sched_slice_min); if (load <= 1) return (sched_slice); return (sched_slice / load); } /* * Set lowpri to its exact value by searching the run-queue and * evaluating curthread. curthread may be passed as an optimization. */ static void tdq_setlowpri(struct tdq *tdq, struct thread *ctd) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (ctd == NULL) ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; td = tdq_choose(tdq); if (td == NULL || td->td_priority > ctd->td_priority) tdq->tdq_lowpri = ctd->td_priority; else tdq->tdq_lowpri = td->td_priority; } #ifdef SMP /* * We need some randomness. Implement a classic Linear Congruential * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits * of the random state (in the low bits of our answer) to keep * the maximum randomness. */ static uint32_t sched_random(void) { uint32_t *rndptr; rndptr = DPCPU_PTR(randomval); *rndptr = *rndptr * 69069 + 5; return (*rndptr >> 16); } struct cpu_search { cpuset_t *cs_mask; /* The mask of allowed CPUs to choose from. */ int cs_prefer; /* Prefer this CPU and groups including it. */ int cs_running; /* The thread is now running at cs_prefer. */ int cs_pri; /* Min priority for low. */ int cs_load; /* Max load for low, min load for high. */ int cs_trans; /* Min transferable load for high. */ }; struct cpu_search_res { int csr_cpu; /* The best CPU found. */ int csr_load; /* The load of cs_cpu. */ }; /* * Search the tree of cpu_groups for the lowest or highest loaded CPU. * These routines actually compare the load on all paths through the tree * and find the least loaded cpu on the least loaded path, which may differ * from the least loaded cpu in the system. This balances work among caches * and buses. */ static int cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, struct cpu_search_res *r) { struct cpu_search_res lr; struct tdq *tdq; int c, bload, l, load, p, total; total = 0; bload = INT_MAX; r->csr_cpu = -1; /* Loop through children CPU groups if there are any. */ if (cg->cg_children > 0) { for (c = cg->cg_children - 1; c >= 0; c--) { load = cpu_search_lowest(&cg->cg_child[c], s, &lr); total += load; /* * When balancing do not prefer SMT groups with load >1. * It allows round-robin between SMT groups with equal * load within parent group for more fair scheduling. */ if (__predict_false(s->cs_running) && (cg->cg_child[c].cg_flags & CG_FLAG_THREAD) && load >= 128 && (load & 128) != 0) load += 128; if (lr.csr_cpu >= 0 && (load < bload || (load == bload && lr.csr_load < r->csr_load))) { bload = load; r->csr_cpu = lr.csr_cpu; r->csr_load = lr.csr_load; } } return (total); } /* Loop through children CPUs otherwise. */ for (c = cg->cg_last; c >= cg->cg_first; c--) { if (!CPU_ISSET(c, &cg->cg_mask)) continue; tdq = TDQ_CPU(c); l = tdq->tdq_load; if (c == s->cs_prefer) { if (__predict_false(s->cs_running)) l--; p = 128; } else p = 0; load = l * 256; total += load - p; /* * Check this CPU is acceptable. * If the threads is already on the CPU, don't look on the TDQ * priority, since it can be the priority of the thread itself. */ if (l > s->cs_load || (tdq->tdq_lowpri <= s->cs_pri && (!s->cs_running || c != s->cs_prefer)) || !CPU_ISSET(c, s->cs_mask)) continue; /* * When balancing do not prefer CPUs with load > 1. * It allows round-robin between CPUs with equal load * within the CPU group for more fair scheduling. */ if (__predict_false(s->cs_running) && l > 0) p = 0; load -= sched_random() % 128; if (bload > load - p) { bload = load - p; r->csr_cpu = c; r->csr_load = load; } } return (total); } static int cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s, struct cpu_search_res *r) { struct cpu_search_res lr; struct tdq *tdq; int c, bload, l, load, total; total = 0; bload = INT_MIN; r->csr_cpu = -1; /* Loop through children CPU groups if there are any. */ if (cg->cg_children > 0) { for (c = cg->cg_children - 1; c >= 0; c--) { load = cpu_search_highest(&cg->cg_child[c], s, &lr); total += load; if (lr.csr_cpu >= 0 && (load > bload || (load == bload && lr.csr_load > r->csr_load))) { bload = load; r->csr_cpu = lr.csr_cpu; r->csr_load = lr.csr_load; } } return (total); } /* Loop through children CPUs otherwise. */ for (c = cg->cg_last; c >= cg->cg_first; c--) { if (!CPU_ISSET(c, &cg->cg_mask)) continue; tdq = TDQ_CPU(c); l = tdq->tdq_load; load = l * 256; total += load; /* * Check this CPU is acceptable. */ if (l < s->cs_load || (tdq->tdq_transferable < s->cs_trans) || !CPU_ISSET(c, s->cs_mask)) continue; load -= sched_random() % 256; if (load > bload) { bload = load; r->csr_cpu = c; } } r->csr_load = bload; return (total); } /* * Find the cpu with the least load via the least loaded path that has a * lowpri greater than pri pri. A pri of -1 indicates any priority is * acceptable. */ static inline int sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload, int prefer, int running) { struct cpu_search s; struct cpu_search_res r; s.cs_prefer = prefer; s.cs_running = running; s.cs_mask = mask; s.cs_pri = pri; s.cs_load = maxload; cpu_search_lowest(cg, &s, &r); return (r.csr_cpu); } /* * Find the cpu with the highest load via the highest loaded path. */ static inline int sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload, int mintrans) { struct cpu_search s; struct cpu_search_res r; s.cs_mask = mask; s.cs_load = minload; s.cs_trans = mintrans; cpu_search_highest(cg, &s, &r); return (r.csr_cpu); } static void sched_balance_group(struct cpu_group *cg) { struct tdq *tdq; struct thread *td; cpuset_t hmask, lmask; int high, low, anylow; CPU_FILL(&hmask); for (;;) { high = sched_highest(cg, &hmask, 1, 0); /* Stop if there is no more CPU with transferrable threads. */ if (high == -1) break; CPU_CLR(high, &hmask); CPU_COPY(&hmask, &lmask); /* Stop if there is no more CPU left for low. */ if (CPU_EMPTY(&lmask)) break; tdq = TDQ_CPU(high); if (tdq->tdq_load == 1) { /* * There is only one running thread. We can't move * it from here, so tell it to pick new CPU by itself. */ TDQ_LOCK(tdq); td = pcpu_find(high)->pc_curthread; if ((td->td_flags & TDF_IDLETD) == 0 && THREAD_CAN_MIGRATE(td)) { td->td_flags |= TDF_NEEDRESCHED | TDF_PICKCPU; if (high != curcpu) ipi_cpu(high, IPI_AST); } TDQ_UNLOCK(tdq); break; } anylow = 1; nextlow: if (tdq->tdq_transferable == 0) continue; low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high, 1); /* Stop if we looked well and found no less loaded CPU. */ if (anylow && low == -1) break; /* Go to next high if we found no less loaded CPU. */ if (low == -1) continue; /* Transfer thread from high to low. */ if (sched_balance_pair(tdq, TDQ_CPU(low))) { /* CPU that got thread can no longer be a donor. */ CPU_CLR(low, &hmask); } else { /* * If failed, then there is no threads on high * that can run on this low. Drop low from low * mask and look for different one. */ CPU_CLR(low, &lmask); anylow = 0; goto nextlow; } } } static void sched_balance(void) { struct tdq *tdq; balance_ticks = max(balance_interval / 2, 1) + (sched_random() % balance_interval); tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* * Lock two thread queues using their address to maintain lock order. */ static void tdq_lock_pair(struct tdq *one, struct tdq *two) { if (one < two) { TDQ_LOCK(one); TDQ_LOCK_FLAGS(two, MTX_DUPOK); } else { TDQ_LOCK(two); TDQ_LOCK_FLAGS(one, MTX_DUPOK); } } /* * Unlock two thread queues. Order is not important here. */ static void tdq_unlock_pair(struct tdq *one, struct tdq *two) { TDQ_UNLOCK(one); TDQ_UNLOCK(two); } /* * Transfer load between two imbalanced thread queues. */ static int sched_balance_pair(struct tdq *high, struct tdq *low) { struct thread *td; int cpu; tdq_lock_pair(high, low); td = NULL; /* * Transfer a thread from high to low. */ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load && (td = tdq_move(high, low)) != NULL) { /* * In case the target isn't the current cpu notify it of the * new load, possibly sending an IPI to force it to reschedule. */ cpu = TDQ_ID(low); if (cpu != PCPU_GET(cpuid)) tdq_notify(low, td); } tdq_unlock_pair(high, low); return (td != NULL); } /* * Move a thread from one thread queue to another. */ static struct thread * tdq_move(struct tdq *from, struct tdq *to) { struct thread *td; struct tdq *tdq; int cpu; TDQ_LOCK_ASSERT(from, MA_OWNED); TDQ_LOCK_ASSERT(to, MA_OWNED); tdq = from; cpu = TDQ_ID(to); td = tdq_steal(tdq, cpu); if (td == NULL) return (NULL); /* * Although the run queue is locked the thread may be * blocked. We can not set the lock until it is unblocked. */ thread_lock_block_wait(td); sched_rem(td); THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(from)); td->td_lock = TDQ_LOCKPTR(to); td_get_sched(td)->ts_cpu = cpu; tdq_add(to, td, SRQ_YIELDING); return (td); } /* * This tdq has idled. Try to steal a thread from another cpu and switch * to it. */ static int tdq_idled(struct tdq *tdq) { struct cpu_group *cg, *parent; struct tdq *steal; cpuset_t mask; int cpu, switchcnt, goup; if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL) return (1); CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); restart: switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; for (cg = tdq->tdq_cg, goup = 0; ; ) { cpu = sched_highest(cg, &mask, steal_thresh, 1); /* * We were assigned a thread but not preempted. Returning * 0 here will cause our caller to switch to it. */ if (tdq->tdq_load) return (0); /* * We found no CPU to steal from in this group. Escalate to * the parent and repeat. But if parent has only two children * groups we can avoid searching this group again by searching * the other one specifically and then escalating two levels. */ if (cpu == -1) { if (goup) { cg = cg->cg_parent; goup = 0; } parent = cg->cg_parent; if (parent == NULL) return (1); if (parent->cg_children == 2) { if (cg == &parent->cg_child[0]) cg = &parent->cg_child[1]; else cg = &parent->cg_child[0]; goup = 1; } else cg = parent; continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. * * Testing this ahead of tdq_lock_pair() only catches * this situation about 20% of the time on an 8 core * 16 thread Ryzen 7, but it still helps performance. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0) goto restart; /* * Try to lock both queues. If we are assigned a thread while * waited for the lock, switch to it now instead of stealing. * If we can't get the lock, then somebody likely got there * first so continue searching. */ TDQ_LOCK(tdq); if (tdq->tdq_load > 0) { mi_switch(SW_VOL | SWT_IDLE); return (0); } if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) { TDQ_UNLOCK(tdq); CPU_CLR(cpu, &mask); continue; } /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread, or * we were preempted and the CPU loading info may be out * of date. The latter is rare. In either case restart * the search. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0 || switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) { tdq_unlock_pair(tdq, steal); goto restart; } /* * Steal the thread and switch to it. */ if (tdq_move(steal, tdq) != NULL) break; /* * We failed to acquire a thread even though it looked * like one was available. This could be due to affinity * restrictions or for other reasons. Loop again after * removing this CPU from the set. The restart logic * above does not restore this CPU to the set due to the * likelyhood of failing here again. */ CPU_CLR(cpu, &mask); tdq_unlock_pair(tdq, steal); } TDQ_UNLOCK(steal); mi_switch(SW_VOL | SWT_IDLE); return (0); } /* * Notify a remote cpu of new work. Sends an IPI if criteria are met. */ static void tdq_notify(struct tdq *tdq, struct thread *td) { struct thread *ctd; int pri; int cpu; if (tdq->tdq_owepreempt) return; cpu = td_get_sched(td)->ts_cpu; pri = td->td_priority; ctd = pcpu_find(cpu)->pc_curthread; if (!sched_shouldpreempt(pri, ctd->td_priority, 1)) return; /* * Make sure that our caller's earlier update to tdq_load is * globally visible before we read tdq_cpu_idle. Idle thread * accesses both of them without locks, and the order is important. */ atomic_thread_fence_seq_cst(); if (TD_IS_IDLETHREAD(ctd)) { /* * If the MD code has an idle wakeup routine try that before * falling back to IPI. */ if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu)) return; } /* * The run queues have been updated, so any switch on the remote CPU * will satisfy the preemption request. */ tdq->tdq_owepreempt = 1; ipi_cpu(cpu, IPI_PREEMPT); } /* * Steals load from a timeshare queue. Honors the rotating queue head * index. */ static struct thread * runq_steal_from(struct runq *rq, int cpu, u_char start) { struct rqbits *rqb; struct rqhead *rqh; struct thread *td, *first; int bit; int i; rqb = &rq->rq_status; bit = start & (RQB_BPW -1); first = NULL; again: for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { if (rqb->rqb_bits[i] == 0) continue; if (bit == 0) bit = RQB_FFS(rqb->rqb_bits[i]); for (; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[i] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) { if (first) { if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } else first = td; } } } if (start != 0) { start = 0; goto again; } if (first && THREAD_CAN_MIGRATE(first) && THREAD_CAN_SCHED(first, cpu)) return (first); return (NULL); } /* * Steals load from a standard linear queue. */ static struct thread * runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; struct thread *td; int word; int bit; rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) continue; for (bit = 0; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } } return (NULL); } /* * Attempt to steal a thread in priority order from a thread queue. */ static struct thread * tdq_steal(struct tdq *tdq, int cpu) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (td); if ((td = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) != NULL) return (td); return (runq_steal(&tdq->tdq_idle, cpu)); } /* * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the * current lock and returns with the assigned queue locked. */ static inline struct tdq * sched_setcpu(struct thread *td, int cpu, int flags) { struct tdq *tdq; struct mtx *mtx; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_CPU(cpu); td_get_sched(td)->ts_cpu = cpu; /* * If the lock matches just return the queue. */ if (td->td_lock == TDQ_LOCKPTR(tdq)) { KASSERT((flags & SRQ_HOLD) == 0, ("sched_setcpu: Invalid lock for SRQ_HOLD")); return (tdq); } /* * The hard case, migration, we need to block the thread first to * prevent order reversals with other cpus locks. */ spinlock_enter(); mtx = thread_lock_block(td); if ((flags & SRQ_HOLD) == 0) mtx_unlock_spin(mtx); TDQ_LOCK(tdq); thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); spinlock_exit(); return (tdq); } SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); static int sched_pickcpu(struct thread *td, int flags) { struct cpu_group *cg, *ccg; struct td_sched *ts; struct tdq *tdq; cpuset_t *mask; int cpu, pri, r, self, intr; self = PCPU_GET(cpuid); ts = td_get_sched(td); KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on " "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name)); if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). */ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); /* * Prefer to run interrupt threads on the processors that generate * the interrupt. */ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level) { tdq = TDQ_SELF(); if (tdq->tdq_lowpri >= PRI_MIN_IDLE) { SCHED_STAT_INC(pickcpu_idle_affinity); return (self); } ts->ts_cpu = self; intr = 1; cg = tdq->tdq_cg; goto llc; } else { intr = 0; tdq = TDQ_CPU(ts->ts_cpu); cg = tdq->tdq_cg; } /* * If the thread can run on the last cpu and the affinity has not * expired and it is idle, run it there. */ if (THREAD_CAN_SCHED(td, ts->ts_cpu) && tdq->tdq_lowpri >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { if (cg->cg_flags & CG_FLAG_THREAD) { /* Check all SMT threads for being idle. */ for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) { if (CPU_ISSET(cpu, &cg->cg_mask) && TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) break; } if (cpu > cg->cg_last) { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } else { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } llc: /* * Search for the last level cache CPU group in the tree. * Skip SMT, identical groups and caches with expired affinity. * Interrupt threads affinity is explicit and never expires. */ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { if (cg->cg_flags & CG_FLAG_THREAD) continue; if (cg->cg_children == 1 || cg->cg_count == 1) continue; if (cg->cg_level == CG_SHARE_NONE || (!intr && !SCHED_AFFINITY(ts, cg->cg_level))) continue; ccg = cg; } /* Found LLC shared by all CPUs, so do a global search. */ if (ccg == cpu_top) ccg = NULL; cpu = -1; mask = &td->td_cpuset->cs_mask; pri = td->td_priority; r = TD_IS_RUNNING(td); /* * Try hard to keep interrupts within found LLC. Search the LLC for * the least loaded CPU we can run now. For NUMA systems it should * be within target domain, and it also reduces scheduling overhead. */ if (ccg != NULL && intr) { cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_intrbind); } else /* Search the LLC for the least loaded idle CPU we can run now. */ if (ccg != NULL) { cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE), INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_affinity); } /* Search globally for the least loaded CPU we can run now. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } /* Search globally for the least loaded CPU. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu.")); KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); /* * Compare the lowest loaded cpu to current cpu. */ tdq = TDQ_CPU(cpu); if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri && tdq->tdq_lowpri < PRI_MIN_IDLE && TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; } if (cpu != ts->ts_cpu) SCHED_STAT_INC(pickcpu_migration); return (cpu); } #endif /* * Pick the highest priority task we have and return it. */ static struct thread * tdq_choose(struct tdq *tdq) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = runq_choose(&tdq->tdq_realtime); if (td != NULL) return (td); td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_BATCH, ("tdq_choose: Invalid priority on timeshare queue %d", td->td_priority)); return (td); } td = runq_choose(&tdq->tdq_idle); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_IDLE, ("tdq_choose: Invalid priority on idle queue %d", td->td_priority)); return (td); } return (NULL); } /* * Initialize a thread queue. */ static void tdq_setup(struct tdq *tdq, int id) { if (bootverbose) printf("ULE: setup cpu %d\n", id); runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); tdq->tdq_id = id; snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", (int)TDQ_ID(tdq)); mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN); #ifdef KTR snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname), "CPU %d load", (int)TDQ_ID(tdq)); #endif } #ifdef SMP static void sched_setup_smp(void) { struct tdq *tdq; int i; cpu_top = smp_topo(); CPU_FOREACH(i) { tdq = DPCPU_ID_PTR(i, tdq); tdq_setup(tdq, i); tdq->tdq_cg = smp_topo_find(cpu_top, i); if (tdq->tdq_cg == NULL) panic("Can't find cpu group for %d\n", i); DPCPU_ID_SET(i, randomval, i * 69069 + 5); } PCPU_SET(sched, DPCPU_PTR(tdq)); balance_tdq = TDQ_SELF(); } #endif /* * Setup the thread queues and initialize the topology based on MD * information. */ static void sched_setup(void *dummy) { struct tdq *tdq; #ifdef SMP sched_setup_smp(); #else tdq_setup(TDQ_SELF(), 0); #endif tdq = TDQ_SELF(); /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(tdq); tdq_load_add(tdq, &thread0); tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } /* * This routine determines time constants after stathz and hz are setup. */ /* ARGSUSED */ static void sched_initticks(void *dummy) { int incr; realstathz = stathz ? stathz : hz; sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR; sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); /* * tickincr is shifted out by 10 to avoid rounding errors due to * hz not being evenly divisible by stathz on all platforms. */ incr = (hz << SCHED_TICK_SHIFT) / realstathz; /* * This does not work for values of stathz that are more than * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. */ if (incr == 0) incr = 1; tickincr = incr; #ifdef SMP /* * Set the default balance interval now that we know * what realstathz is. */ balance_interval = realstathz; balance_ticks = balance_interval; affinity = SCHED_AFFINITY_DEFAULT; #endif if (sched_idlespinthresh < 0) sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz; } /* * This is the core of the interactivity algorithm. Determines a score based * on past behavior. It is the ratio of sleep time to run time scaled to * a [0, 100] integer. This is the voluntary sleep time of a process, which * differs from the cpu usage because it does not account for time spent * waiting on a run-queue. Would be prettier if we had floating point. * * When a thread's sleep time is greater than its run time the * calculation is: * * scaling factor * interactivity score = --------------------- * sleep time / run time * * * When a thread's run time is greater than its sleep time the * calculation is: * * scaling factor * interactivity score = --------------------- + scaling factor * run time / sleep time */ static int sched_interact_score(struct thread *td) { struct td_sched *ts; int div; ts = td_get_sched(td); /* * The score is only needed if this is likely to be an interactive * task. Don't go through the expense of computing it if there's * no chance. */ if (sched_interact <= SCHED_INTERACT_HALF && ts->ts_runtime >= ts->ts_slptime) return (SCHED_INTERACT_HALF); if (ts->ts_runtime > ts->ts_slptime) { div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); } if (ts->ts_slptime > ts->ts_runtime) { div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); return (ts->ts_runtime / div); } /* runtime == slptime */ if (ts->ts_runtime) return (SCHED_INTERACT_HALF); /* * This can happen if slptime and runtime are 0. */ return (0); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct thread *td) { u_int pri, score; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; /* * If the score is interactive we place the thread in the realtime * queue with a priority that is less than kernel and interrupt * priorities. These threads are not subject to nice restrictions. * * Scores greater than this are placed on the normal timeshare queue * where the priority is partially decided by the most recent cpu * utilization and the rest is decided by nice value. * * The nice value of the process has a linear effect on the calculated * score. Negative nice values make it easier for a thread to be * considered interactive. */ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice); if (score < sched_interact) { pri = PRI_MIN_INTERACT; pri += (PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) * score / sched_interact; KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, ("sched_priority: invalid interactive priority %u score %u", pri, score)); } else { pri = SCHED_PRI_MIN; if (td_get_sched(td)->ts_ticks) pri += min(SCHED_PRI_TICKS(td_get_sched(td)), SCHED_PRI_RANGE - 1); pri += SCHED_PRI_NICE(td->td_proc->p_nice); KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, ("sched_priority: invalid priority %u: nice %d, " "ticks %d ftick %d ltick %d tick pri %d", pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks, td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick, SCHED_PRI_TICKS(td_get_sched(td)))); } sched_user_prio(td, pri); return; } /* * This routine enforces a maximum limit on the amount of scheduling history * kept. It is called after either the slptime or runtime is adjusted. This * function is ugly due to integer math. */ static void sched_interact_update(struct thread *td) { struct td_sched *ts; u_int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* * This only happens from two places: * 1) We have added an unusual amount of run time from fork_exit. * 2) We have added an unusual amount of sleep time from sched_sleep(). */ if (sum > SCHED_SLP_RUN_MAX * 2) { if (ts->ts_runtime > ts->ts_slptime) { ts->ts_runtime = SCHED_SLP_RUN_MAX; ts->ts_slptime = 1; } else { ts->ts_slptime = SCHED_SLP_RUN_MAX; ts->ts_runtime = 1; } return; } /* * If we have exceeded by more than 1/5th then the algorithm below * will not bring us back into range. Dividing by two here forces * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { ts->ts_runtime /= 2; ts->ts_slptime /= 2; return; } ts->ts_runtime = (ts->ts_runtime / 5) * 4; ts->ts_slptime = (ts->ts_slptime / 5) * 4; } /* * Scale back the interactivity history when a child thread is created. The * history is inherited from the parent but the thread may behave totally * differently. For example, a shell spawning a compiler process. We want * to learn that the compiler is behaving badly very quickly. */ static void sched_interact_fork(struct thread *td) { struct td_sched *ts; int ratio; int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; ts->ts_runtime /= ratio; ts->ts_slptime /= ratio; } } /* * Called from proc0_init() to setup the scheduler fields. */ void schedinit(void) { struct td_sched *ts0; /* * Set up the scheduler specific parts of thread0. */ ts0 = td_get_sched(&thread0); ts0->ts_ltick = ticks; ts0->ts_ftick = ticks; ts0->ts_slice = 0; ts0->ts_cpu = curcpu; /* set valid CPU number */ } +/* + * schedinit_ap() is needed prior to calling sched_throw(NULL) to ensure that + * the pcpu requirements are met for any calls in the period between curthread + * initialization and sched_throw(). One can safely add threads to the queue + * before sched_throw(), for instance, as long as the thread lock is setup + * correctly. + * + * TDQ_SELF() relies on the below sched pcpu setting; it may be used only + * after schedinit_ap(). + */ +void +schedinit_ap(void) +{ + +#ifdef SMP + PCPU_SET(sched, DPCPU_PTR(tdq)); +#endif + PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(TDQ_SELF()); +} + /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most sched_slice stathz ticks. */ int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * Update the percent cpu tracking information when it is requested or * the total history exceeds the maximum. We keep a sliding history of * tick counts that slowly decays. This is less precise than the 4BSD * mechanism since it happens with less regular and frequent events. */ static void sched_pctcpu_update(struct td_sched *ts, int run) { int t = ticks; /* * The signed difference may be negative if the thread hasn't run for * over half of the ticks rollover period. */ if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) { ts->ts_ticks = 0; ts->ts_ftick = t - SCHED_TICK_TARG; } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) { ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) * (ts->ts_ltick - (t - SCHED_TICK_TARG)); ts->ts_ftick = t - SCHED_TICK_TARG; } if (run) ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT; ts->ts_ltick = t; } /* * Adjust the priority of a thread. Move it to the appropriate run-queue * if necessary. This is the back-end for several priority related * functions. */ static void sched_thread_priority(struct thread *td, u_char prio) { struct td_sched *ts; struct tdq *tdq; int oldpri; KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio < td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. This could be optimized to not re-add in some * cases. */ if (TD_ON_RUNQ(td) && prio < td->td_priority) { sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING | SRQ_HOLDTD); return; } /* * If the thread is currently running we may have to adjust the lowpri * information so other cpus are aware of our current priority. */ if (TD_IS_RUNNING(td)) { tdq = TDQ_CPU(ts->ts_cpu); oldpri = td->td_priority; td->td_priority = prio; if (prio < tdq->tdq_lowpri) tdq->tdq_lowpri = prio; else if (tdq->tdq_lowpri == oldpri) tdq_setlowpri(tdq, td); return; } td->td_priority = prio; } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_thread_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regular priority is less * important than prio, the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_thread_priority(td, base_pri); } else sched_lend_prio(td, prio); } /* * Standard entry for setting the priority to an absolute value. */ void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't * ever lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_thread_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } /* * Set the base user priority, does not effect current running priority. */ void sched_user_prio(struct thread *td, u_char prio) { td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } /* * Like the above but first check if there is anything to do. */ void sched_lend_user_prio_cond(struct thread *td, u_char prio) { if (td->td_lend_user_pri != prio) goto lend; if (td->td_user_pri != min(prio, td->td_base_user_pri)) goto lend; if (td->td_priority != td->td_user_pri) goto lend; return; lend: thread_lock(td); sched_lend_user_prio(td, prio); thread_unlock(td); } #ifdef SMP /* * This tdq is about to idle. Try to steal a thread from another CPU before * choosing the idle thread. */ static void tdq_trysteal(struct tdq *tdq) { struct cpu_group *cg, *parent; struct tdq *steal; cpuset_t mask; int cpu, i, goup; if (smp_started == 0 || steal_idle == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL) return; CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); /* We don't want to be preempted while we're iterating. */ spinlock_enter(); TDQ_UNLOCK(tdq); for (i = 1, cg = tdq->tdq_cg, goup = 0; ; ) { cpu = sched_highest(cg, &mask, steal_thresh, 1); /* * If a thread was added while interrupts were disabled don't * steal one here. */ if (tdq->tdq_load > 0) { TDQ_LOCK(tdq); break; } /* * We found no CPU to steal from in this group. Escalate to * the parent and repeat. But if parent has only two children * groups we can avoid searching this group again by searching * the other one specifically and then escalating two levels. */ if (cpu == -1) { if (goup) { cg = cg->cg_parent; goup = 0; } if (++i > trysteal_limit) { TDQ_LOCK(tdq); break; } parent = cg->cg_parent; if (parent == NULL) { TDQ_LOCK(tdq); break; } if (parent->cg_children == 2) { if (cg == &parent->cg_child[0]) cg = &parent->cg_child[1]; else cg = &parent->cg_child[0]; goup = 1; } else cg = parent; continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. * At this point unconditionally exit the loop to bound * the time spent in the critcal section. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0) continue; /* * Try to lock both queues. If we are assigned a thread while * waited for the lock, switch to it now instead of stealing. * If we can't get the lock, then somebody likely got there * first. */ TDQ_LOCK(tdq); if (tdq->tdq_load > 0) break; if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) break; /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0) { TDQ_UNLOCK(steal); break; } /* * If we fail to acquire one due to affinity restrictions, * bail out and let the idle thread to a more complete search * outside of a critical section. */ if (tdq_move(steal, tdq) == NULL) { TDQ_UNLOCK(steal); break; } TDQ_UNLOCK(steal); break; } spinlock_exit(); } #endif /* * Handle migration from sched_switch(). This happens only for * cpu binding. */ static struct mtx * sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) { struct tdq *tdn; KASSERT(THREAD_CAN_MIGRATE(td) || (td_get_sched(td)->ts_flags & TSF_BOUND) != 0, ("Thread %p shouldn't migrate", td)); KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: " "thread %s queued on absent CPU %d.", td->td_name, td_get_sched(td)->ts_cpu)); tdn = TDQ_CPU(td_get_sched(td)->ts_cpu); #ifdef SMP tdq_load_rem(tdq, td); /* * Do the lock dance required to avoid LOR. We have an * extra spinlock nesting from sched_switch() which will * prevent preemption while we're holding neither run-queue lock. */ TDQ_UNLOCK(tdq); TDQ_LOCK(tdn); tdq_add(tdn, td, flags); tdq_notify(tdn, td); TDQ_UNLOCK(tdn); TDQ_LOCK(tdq); #endif return (TDQ_LOCKPTR(tdn)); } /* * thread_lock_unblock() that does not assume td_lock is blocked. */ static inline void thread_unblock_switch(struct thread *td, struct mtx *mtx) { atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, (uintptr_t)mtx); } /* * Switch threads. This function has to handle threads coming in while * blocked for some reason, running, or idle. It also must deal with * migrating a thread from one queue to another as running threads may * be assigned elsewhere via binding. */ void sched_switch(struct thread *td, int flags) { struct thread *newtd; struct tdq *tdq; struct td_sched *ts; struct mtx *mtx; int srqflag; int cpuid, preempted; #ifdef SMP int pickcpu; #endif THREAD_LOCK_ASSERT(td, MA_OWNED); cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); ts = td_get_sched(td); sched_pctcpu_update(ts, 1); #ifdef SMP pickcpu = (td->td_flags & TDF_PICKCPU) != 0; if (pickcpu) ts->ts_rltick = ticks - affinity * MAX_CACHE_LEVELS; else ts->ts_rltick = ticks; #endif td->td_lastcpu = td->td_oncpu; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_NEEDRESCHED | TDF_PICKCPU | TDF_SLICEEND); td->td_owepreempt = 0; tdq->tdq_owepreempt = 0; if (!TD_IS_IDLETHREAD(td)) tdq->tdq_switchcnt++; /* * Always block the thread lock so we can drop the tdq lock early. */ mtx = thread_lock_block(td); spinlock_enter(); if (TD_IS_IDLETHREAD(td)) { MPASS(mtx == TDQ_LOCKPTR(tdq)); TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(mtx == TDQ_LOCKPTR(tdq)); srqflag = preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; #ifdef SMP if (THREAD_CAN_MIGRATE(td) && (!THREAD_CAN_SCHED(td, ts->ts_cpu) || pickcpu)) ts->ts_cpu = sched_pickcpu(td, 0); #endif if (ts->ts_cpu == cpuid) tdq_runq_add(tdq, td, srqflag); else mtx = sched_switch_migrate(tdq, td, srqflag); } else { /* This thread must be going to sleep. */ if (mtx != TDQ_LOCKPTR(tdq)) { mtx_unlock_spin(mtx); TDQ_LOCK(tdq); } tdq_load_rem(tdq, td); #ifdef SMP if (tdq->tdq_load == 0) tdq_trysteal(tdq); #endif } #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif /* * We enter here with the thread blocked and assigned to the * appropriate cpu run-queue or sleep-queue and with the current * thread-queue locked. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); newtd = choosethread(); sched_pctcpu_update(td_get_sched(newtd), 0); TDQ_UNLOCK(tdq); /* * Call the MD code to switch contexts if necessary. */ if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif td->td_oncpu = NOCPU; cpu_switch(td, newtd, mtx); cpuid = td->td_oncpu = PCPU_GET(cpuid); SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { thread_unblock_switch(td, mtx); SDT_PROBE0(sched, , , remain__cpu); } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); } /* * Adjust thread priorities as a result of a nice request. */ void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); thread_unlock(td); } } /* * Record the sleep time for the interactivity scorer. */ void sched_sleep(struct thread *td, int prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; if (TD_IS_SUSPENDED(td) || prio >= PSOCK) td->td_flags |= TDF_CANSWAP; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; if (static_boost == 1 && prio) sched_prio(td, prio); else if (static_boost && td->td_priority > static_boost) sched_prio(td, static_boost); } /* * Schedule a thread to resume execution and record how long it voluntarily * slept. We also update the pctcpu, interactivity, and priority. * * Requires the thread lock on entry, drops on exit. */ void sched_wakeup(struct thread *td, int srqflags) { struct td_sched *ts; int slptick; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; /* * If we slept for more than a tick update our interactivity and * priority. */ slptick = td->td_slptick; td->td_slptick = 0; if (slptick && slptick != ticks) { ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; sched_interact_update(td); sched_pctcpu_update(ts, 0); } /* * Reset the slice value since we slept and advanced the round-robin. */ ts->ts_slice = 0; sched_add(td, SRQ_BORING | srqflags); } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct thread *td, struct thread *child) { THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(td_get_sched(td), 1); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. */ sched_interact_fork(child); sched_priority(child); td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Fork a new thread, may be within the same process. */ void sched_fork_thread(struct thread *td, struct thread *child) { struct td_sched *ts; struct td_sched *ts2; struct tdq *tdq; tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Initialize child. */ ts = td_get_sched(td); ts2 = td_get_sched(child); child->td_oncpu = NOCPU; child->td_lastcpu = NOCPU; child->td_lock = TDQ_LOCKPTR(tdq); child->td_cpuset = cpuset_ref(td->td_cpuset); child->td_domain.dr_policy = td->td_cpuset->cs_domain; ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* * Grab our parents cpu estimation information. */ ts2->ts_ticks = ts->ts_ticks; ts2->ts_ltick = ts->ts_ltick; ts2->ts_ftick = ts->ts_ftick; /* * Do not inherit any borrowed priority from the parent. */ child->td_priority = child->td_base_pri; /* * And update interactivity score. */ ts2->ts_slptime = ts->ts_slptime; ts2->ts_runtime = ts->ts_runtime; /* Attempt to quickly learn interactivity. */ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min; #ifdef KTR bzero(ts2->ts_name, sizeof(ts2->ts_name)); #endif } /* * Adjust the priority class of a thread. */ void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; td->td_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct thread *child) { struct thread *td; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit", "prio:%d", child->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } /* * Penalize another thread for the time spent on this one. This helps to * worsen the priority and interactivity of processes which schedule batch * jobs such as make. This has little effect on the make process itself but * causes new processes spawned by it to receive worse scores immediately. */ void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit", "prio:%d", child->td_priority); /* * Give the child's runtime to the parent without returning the * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. */ thread_lock(td); td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime; sched_interact_update(td); sched_priority(td); thread_unlock(td); } void sched_preempt(struct thread *td) { struct tdq *tdq; int flags; SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (td->td_priority > tdq->tdq_lowpri) { if (td->td_critnest == 1) { flags = SW_INVOL | SW_PREEMPT; flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE : SWT_REMOTEPREEMPT; mi_switch(flags); /* Switch dropped thread lock. */ return; } td->td_owepreempt = 1; } else { tdq->tdq_owepreempt = 0; } thread_unlock(td); } /* * Fix priorities on return to user-space. Priorities may be elevated due * to static priorities in msleep() or similar. */ void sched_userret_slowpath(struct thread *td) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } /* * Handle a stathz tick. This is really only relevant for timeshare * threads. */ void sched_clock(struct thread *td, int cnt) { struct tdq *tdq; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); #ifdef SMP /* * We run the long term load balancer infrequently on the first cpu. */ if (balance_tdq == tdq && smp_started != 0 && rebalance != 0 && balance_ticks != 0) { balance_ticks -= cnt; if (balance_ticks <= 0) sched_balance(); } #endif /* * Save the old switch count so we have a record of the last ticks * activity. Initialize the new switch count based on our load. * If there is some activity seed it to reflect that. */ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; tdq->tdq_switchcnt = tdq->tdq_load; /* * Advance the insert index once for each tick to ensure that all * threads get a chance to run. */ if (tdq->tdq_idx == tdq->tdq_ridx) { tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) tdq->tdq_ridx = tdq->tdq_idx; } ts = td_get_sched(td); sched_pctcpu_update(ts, 1); if ((td->td_pri_class & PRI_FIFO_BIT) || TD_IS_IDLETHREAD(td)) return; if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { /* * We used a tick; charge it to the thread so * that we can compute our interactivity. */ td_get_sched(td)->ts_runtime += tickincr * cnt; sched_interact_update(td); sched_priority(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ ts->ts_slice += cnt; if (ts->ts_slice >= tdq_slice(tdq)) { ts->ts_slice = 0; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } } u_int sched_estcpu(struct thread *td __unused) { return (0); } /* * Return whether the current CPU has runnable tasks. Used for in-kernel * cooperative idle threads. */ int sched_runnable(void) { struct tdq *tdq; int load; load = 1; tdq = TDQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (tdq->tdq_load > 0) goto out; } else if (tdq->tdq_load - 1 > 0) goto out; load = 0; out: return (load); } /* * Choose the highest priority thread to run. The thread is removed from * the run-queue while running however the load remains. For SMP we set * the tdq in the global idle bitmask if it idles here. */ struct thread * sched_choose(void) { struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = tdq_choose(tdq); if (td) { tdq_runq_rem(tdq, td); tdq->tdq_lowpri = td->td_priority; return (td); } tdq->tdq_lowpri = PRI_MAX_IDLE; return (PCPU_GET(idlethread)); } /* * Set owepreempt if necessary. Preemption never happens directly in ULE, * we always request it once we exit a critical section. */ static inline void sched_setpreempt(struct thread *td) { struct thread *ctd; int cpri; int pri; THREAD_LOCK_ASSERT(curthread, MA_OWNED); ctd = curthread; pri = td->td_priority; cpri = ctd->td_priority; if (pri < cpri) ctd->td_flags |= TDF_NEEDRESCHED; if (KERNEL_PANICKED() || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; if (!sched_shouldpreempt(pri, cpri, 0)) return; ctd->td_owepreempt = 1; } /* * Add a thread to a thread queue. Select the appropriate runq and add the * thread to it. This is the internal function called when the tdq is * predetermined. */ void tdq_add(struct tdq *tdq, struct thread *td, int flags) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); if (td->td_priority < tdq->tdq_lowpri) tdq->tdq_lowpri = td->td_priority; tdq_runq_add(tdq, td, flags); tdq_load_add(tdq, td); } /* * Select the target thread queue and add a thread to it. Request * preemption or IPI a remote processor if required. * * Requires the thread lock on entry, drops on exit. */ void sched_add(struct thread *td, int flags) { struct tdq *tdq; #ifdef SMP int cpu; #endif KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Recalculate the priority before we select the target cpu or * run-queue. */ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_priority(td); #ifdef SMP /* * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ cpu = sched_pickcpu(td, flags); tdq = sched_setcpu(td, cpu, flags); tdq_add(tdq, td, flags); if (cpu != PCPU_GET(cpuid)) tdq_notify(tdq, td); else if (!(flags & SRQ_YIELDING)) sched_setpreempt(td); #else tdq = TDQ_SELF(); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != TDQ_LOCKPTR(tdq)) { TDQ_LOCK(tdq); if ((flags & SRQ_HOLD) != 0) td->td_lock = TDQ_LOCKPTR(tdq); else thread_lock_set(td, TDQ_LOCKPTR(tdq)); } tdq_add(tdq, td, flags); if (!(flags & SRQ_YIELDING)) sched_setpreempt(td); #endif if (!(flags & SRQ_HOLDTD)) thread_unlock(td); } /* * Remove a thread from a run-queue without running it. This is used * when we're stealing a thread from a remote queue. Otherwise all threads * exit by calling sched_exit_thread() and sched_throw() themselves. */ void sched_rem(struct thread *td) { struct tdq *tdq; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); TDQ_LOCK_ASSERT(tdq, MA_OWNED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, td); tdq_load_rem(tdq, td); TD_SET_CAN_RUN(td); if (td->td_priority == tdq->tdq_lowpri) tdq_setlowpri(tdq, NULL); } /* * Fetch cpu utilization information. Updates on demand. */ fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct td_sched *ts; pctcpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(ts, TD_IS_RUNNING(td)); if (ts->ts_ticks) { int rtick; /* How many rtick per second ? */ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } return (pctcpu); } /* * Enforce affinity settings for a thread. Called after adjustments to * cpumask. */ void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); if (THREAD_CAN_SCHED(td, ts->ts_cpu)) return; if (TD_ON_RUNQ(td)) { sched_rem(td); sched_add(td, SRQ_BORING | SRQ_HOLDTD); return; } if (!TD_IS_RUNNING(td)) return; /* * Force a switch before returning to userspace. If the * target thread is not running locally send an ipi to force * the issue. */ td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(ts->ts_cpu, IPI_PREEMPT); #endif } /* * Bind a thread to a target cpu. */ void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); if (ts->ts_flags & TSF_BOUND) sched_unbind(td); KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); ts->ts_flags |= TSF_BOUND; sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL); thread_lock(td); } /* * Release a bound thread. */ void sched_unbind(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); ts = td_get_sched(td); if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; sched_unpin(); } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td_get_sched(td)->ts_flags & TSF_BOUND); } /* * Basic yield call. */ void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH); } /* * Return the total system load. */ int sched_load(void) { #ifdef SMP int total; int i; total = 0; CPU_FOREACH(i) total += TDQ_CPU(i)->tdq_sysload; return (total); #else return (TDQ_SELF()->tdq_sysload); #endif } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } #ifdef SMP #define TDQ_IDLESPIN(tdq) \ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0) #else #define TDQ_IDLESPIN(tdq) 1 #endif /* * The actual idle process. */ void sched_idletd(void *dummy) { struct thread *td; struct tdq *tdq; int oldswitchcnt, switchcnt; int i; mtx_assert(&Giant, MA_NOTOWNED); td = curthread; tdq = TDQ_SELF(); THREAD_NO_SLEEPING(); oldswitchcnt = -1; for (;;) { if (tdq->tdq_load) { thread_lock(td); mi_switch(SW_VOL | SWT_IDLE); } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #ifdef SMP if (always_steal || switchcnt != oldswitchcnt) { oldswitchcnt = switchcnt; if (tdq_idled(tdq) == 0) continue; } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #else oldswitchcnt = switchcnt; #endif /* * If we're switching very frequently, spin while checking * for load rather than entering a low power state that * may require an IPI. However, don't do any busy * loops while on SMT machines as this simply steals * cycles from cores doing useful work. */ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { for (i = 0; i < sched_idlespins; i++) { if (tdq->tdq_load) break; cpu_spinwait(); } } /* If there was context switch during spin, restart it. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt) continue; /* Run main MD idle handler. */ tdq->tdq_cpu_idle = 1; /* * Make sure that tdq_cpu_idle update is globally visible * before cpu_idle() read tdq_load. The order is important * to avoid race with tdq_notify. */ atomic_thread_fence_seq_cst(); /* * Checking for again after the fence picks up assigned * threads often enough to make it worthwhile to do so in * order to avoid calling cpu_idle(). */ if (tdq->tdq_load != 0) { tdq->tdq_cpu_idle = 0; continue; } cpu_idle(switchcnt * 4 > sched_idlespinthresh); tdq->tdq_cpu_idle = 0; /* * Account thread-less hardware interrupts and * other wakeup reasons equal to context switches. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (switchcnt != oldswitchcnt) continue; tdq->tdq_switchcnt++; oldswitchcnt++; } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { struct thread *newtd; struct tdq *tdq; + tdq = TDQ_SELF(); if (__predict_false(td == NULL)) { -#ifdef SMP - PCPU_SET(sched, DPCPU_PTR(tdq)); -#endif - /* Correct spinlock nesting and acquire the correct lock. */ - tdq = TDQ_SELF(); TDQ_LOCK(tdq); + /* Correct spinlock nesting. */ spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); - PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(tdq); } else { - tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq)); tdq_load_rem(tdq, td); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; thread_lock_block(td); } newtd = choosethread(); spinlock_enter(); TDQ_UNLOCK(tdq); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); /* doesn't return */ if (__predict_false(td == NULL)) cpu_throw(td, newtd); /* doesn't return */ else cpu_switch(td, newtd, TDQ_LOCKPTR(tdq)); } /* * This is called from fork_exit(). Just acquire the correct locks and * let fork do the rest of the work. */ void sched_fork_exit(struct thread *td) { struct tdq *tdq; int cpuid; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with the scheduler lock held. */ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); TDQ_LOCK(tdq); spinlock_exit(); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } /* * Create on first use to catch odd startup conditions. */ char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif #ifdef SMP /* * Build the CPU topology dump string. Is recursively called to collect * the topology tree. */ static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent) { char cpusetbuf[CPUSETBUFSIZ]; int i, first; sbuf_printf(sb, "%*s\n", indent, "", 1 + indent / 2, cg->cg_level); sbuf_printf(sb, "%*s ", indent, "", cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask)); first = TRUE; for (i = cg->cg_first; i <= cg->cg_last; i++) { if (CPU_ISSET(i, &cg->cg_mask)) { if (!first) sbuf_printf(sb, ", "); else first = FALSE; sbuf_printf(sb, "%d", i); } } sbuf_printf(sb, "\n"); if (cg->cg_flags != 0) { sbuf_printf(sb, "%*s ", indent, ""); if ((cg->cg_flags & CG_FLAG_HTT) != 0) sbuf_printf(sb, "HTT group"); if ((cg->cg_flags & CG_FLAG_THREAD) != 0) sbuf_printf(sb, "THREAD group"); if ((cg->cg_flags & CG_FLAG_SMT) != 0) sbuf_printf(sb, "SMT group"); if ((cg->cg_flags & CG_FLAG_NODE) != 0) sbuf_printf(sb, "NUMA node"); sbuf_printf(sb, "\n"); } if (cg->cg_children > 0) { sbuf_printf(sb, "%*s \n", indent, ""); for (i = 0; i < cg->cg_children; i++) sysctl_kern_sched_topology_spec_internal(sb, &cg->cg_child[i], indent+2); sbuf_printf(sb, "%*s \n", indent, ""); } sbuf_printf(sb, "%*s\n", indent, ""); return (0); } /* * Sysctl handler for retrieving topology dump. It's a wrapper for * the recursive sysctl_kern_smp_topology_spec_internal(). */ static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS) { struct sbuf *topo; int err; KASSERT(cpu_top != NULL, ("cpu_top isn't initialized")); topo = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (topo == NULL) return (ENOMEM); sbuf_printf(topo, "\n"); err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1); sbuf_printf(topo, "\n"); if (err == 0) { err = sbuf_finish(topo); } sbuf_delete(topo); return (err); } #endif static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); SYSCTL_UINT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0, "Maximal (lowest) priority for preemption"); SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0, "Assign static kernel priorities to sleeping threads"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0, "Number of times idle thread will spin waiting for new work"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh, 0, "Threshold before we will permit idle thread spinning"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, &balance_interval, 0, "Average period in stathz ticks to run the long-term balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, "Attempts to steal work from other cores before idling"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, "Minimum load on remote CPU before we'll steal"); SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit, 0, "Topological distance limit for stealing threads in sched_switch()"); SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0, "Always run the stealer from the idle thread"); SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", "XML dump of detected CPU topology"); #endif /* ps compat. All cpu percentages from ULE are weighted. */ static int ccpu = 0; SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "Decay factor used for updating %CPU in 4BSD scheduler"); diff --git a/sys/mips/mips/mp_machdep.c b/sys/mips/mips/mp_machdep.c index 1a5a023db381..dc089db1d189 100644 --- a/sys/mips/mips/mp_machdep.c +++ b/sys/mips/mips/mp_machdep.c @@ -1,374 +1,375 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 Neelkanth Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct pcb stoppcbs[MAXCPU]; static void *dpcpu; static struct mtx ap_boot_mtx; static volatile int aps_ready; static volatile int mp_naps; static void ipi_send(struct pcpu *pc, int ipi) { CTR3(KTR_SMP, "%s: cpu=%d, ipi=%x", __func__, pc->pc_cpuid, ipi); atomic_set_32(&pc->pc_pending_ipis, ipi); platform_ipi_send(pc->pc_cpuid); CTR1(KTR_SMP, "%s: sent", __func__); } void ipi_all_but_self(int ipi) { cpuset_t other_cpus; other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); ipi_selected(other_cpus, ipi); } /* Send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, int ipi) { struct pcpu *pc; STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (CPU_ISSET(pc->pc_cpuid, &cpus)) { CTR3(KTR_SMP, "%s: pc: %p, ipi: %x\n", __func__, pc, ipi); ipi_send(pc, ipi); } } } /* Send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { CTR3(KTR_SMP, "%s: cpu: %d, ipi: %x\n", __func__, cpu, ipi); ipi_send(cpuid_to_pcpu[cpu], ipi); } /* * Handle an IPI sent to this processor. */ static int mips_ipi_handler(void *arg) { u_int cpu, ipi, ipi_bitmap; int bit; cpu = PCPU_GET(cpuid); platform_ipi_clear(); /* quiesce the pending ipi interrupt */ ipi_bitmap = atomic_readandclear_int(PCPU_PTR(pending_ipis)); if (ipi_bitmap == 0) return (FILTER_STRAY); CTR1(KTR_SMP, "smp_handle_ipi(), ipi_bitmap=%x", ipi_bitmap); while ((bit = ffs(ipi_bitmap))) { bit = bit - 1; ipi = 1 << bit; ipi_bitmap &= ~ipi; switch (ipi) { case IPI_RENDEZVOUS: CTR0(KTR_SMP, "IPI_RENDEZVOUS"); smp_rendezvous_action(); break; case IPI_AST: CTR0(KTR_SMP, "IPI_AST"); break; case IPI_STOP: /* * IPI_STOP_HARD is mapped to IPI_STOP so it is not * necessary to add it in the switch. */ CTR0(KTR_SMP, "IPI_STOP or IPI_STOP_HARD"); savectx(&stoppcbs[cpu]); tlb_save(); /* Indicate we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) cpu_spinwait(); CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); CTR0(KTR_SMP, "IPI_STOP (restart)"); break; case IPI_PREEMPT: CTR1(KTR_SMP, "%s: IPI_PREEMPT", __func__); sched_preempt(curthread); break; case IPI_HARDCLOCK: CTR1(KTR_SMP, "%s: IPI_HARDCLOCK", __func__); hardclockintr(); break; default: panic("Unknown IPI 0x%0x on cpu %d", ipi, curcpu); } } return (FILTER_HANDLED); } static int start_ap(int cpuid) { int cpus, ms; cpus = mp_naps; dpcpu = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO); mips_sync(); if (platform_start_ap(cpuid) != 0) return (-1); /* could not start AP */ for (ms = 0; ms < 5000; ++ms) { if (mp_naps > cpus) return (0); /* success */ else DELAY(1000); } return (-2); /* timeout initializing AP */ } void cpu_mp_setmaxid(void) { cpuset_t cpumask; int cpu, last; platform_cpu_mask(&cpumask); mp_ncpus = 0; last = 1; while ((cpu = CPU_FFS(&cpumask)) != 0) { last = cpu; cpu--; CPU_CLR(cpu, &cpumask); mp_ncpus++; } if (mp_ncpus <= 0) mp_ncpus = 1; mp_maxid = min(last, MAXCPU) - 1; } void cpu_mp_announce(void) { /* NOTHING */ } struct cpu_group * cpu_topo(void) { return (platform_smp_topo()); } int cpu_mp_probe(void) { return (mp_ncpus > 1); } void cpu_mp_start(void) { int error, cpuid; cpuset_t cpumask; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); CPU_ZERO(&all_cpus); platform_cpu_mask(&cpumask); while (!CPU_EMPTY(&cpumask)) { cpuid = CPU_FFS(&cpumask) - 1; CPU_CLR(cpuid, &cpumask); if (cpuid >= MAXCPU) { printf("cpu_mp_start: ignoring AP #%d.\n", cpuid); continue; } if (cpuid != platform_processor_id()) { if ((error = start_ap(cpuid)) != 0) { printf("AP #%d failed to start: %d\n", cpuid, error); continue; } if (bootverbose) printf("AP #%d started!\n", cpuid); } CPU_SET(cpuid, &all_cpus); } } void smp_init_secondary(u_int32_t cpuid) { /* TLB */ mips_wr_wired(0); tlb_invalidate_all(); mips_wr_wired(VMWIRED_ENTRIES); /* * We assume that the L1 cache on the APs is identical to the one * on the BSP. */ mips_dcache_wbinv_all(); mips_icache_sync_all(); mips_sync(); mips_wr_entryhi(0); pcpu_init(PCPU_ADDR(cpuid), cpuid, sizeof(struct pcpu)); dpcpu_init(dpcpu, cpuid); /* The AP has initialized successfully - allow the BSP to proceed */ ++mp_naps; /* Spin until the BSP is ready to release the APs */ while (!aps_ready) ; #ifdef PLATFORM_INIT_SECONDARY platform_init_secondary(cpuid); #endif /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); + schedinit_ap(); mtx_lock_spin(&ap_boot_mtx); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d launched", PCPU_GET(cpuid)); if (bootverbose) printf("SMP: AP CPU #%d launched.\n", PCPU_GET(cpuid)); if (smp_cpus == mp_ncpus) { atomic_store_rel_int(&smp_started, 1); } mtx_unlock_spin(&ap_boot_mtx); while (smp_started == 0) ; /* nothing */ /* Start per-CPU event timers. */ cpu_initclocks_ap(); /* enter the scheduler */ sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } static void release_aps(void *dummy __unused) { int ipi_irq; if (mp_ncpus == 1) return; #ifdef PLATFORM_INIT_SECONDARY platform_init_secondary(0); #endif /* * IPI handler */ ipi_irq = platform_ipi_hardintr_num(); if (ipi_irq != -1) { cpu_establish_hardintr("ipi", mips_ipi_handler, NULL, NULL, ipi_irq, INTR_TYPE_MISC | INTR_EXCL, NULL); } else { ipi_irq = platform_ipi_softintr_num(); cpu_establish_softintr("ipi", mips_ipi_handler, NULL, NULL, ipi_irq, INTR_TYPE_MISC | INTR_EXCL, NULL); } atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ; /* nothing */ } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); diff --git a/sys/powerpc/aim/mp_cpudep.c b/sys/powerpc/aim/mp_cpudep.c index 33aae520c4b2..a73246487683 100644 --- a/sys/powerpc/aim/mp_cpudep.c +++ b/sys/powerpc/aim/mp_cpudep.c @@ -1,424 +1,426 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include void *ap_pcpu; static register_t bsp_state[8] __aligned(8); static void cpudep_save_config(void *dummy); SYSINIT(cpu_save_config, SI_SUB_CPU, SI_ORDER_ANY, cpudep_save_config, NULL); void cpudep_ap_early_bootstrap(void) { #ifndef __powerpc64__ register_t reg; #endif switch (mfpvr() >> 16) { case IBM970: case IBM970FX: case IBM970MP: /* Set HIOR to 0 */ __asm __volatile("mtspr 311,%0" :: "r"(0)); powerpc_sync(); /* Restore HID4 and HID5, which are necessary for the MMU */ #ifdef __powerpc64__ mtspr(SPR_HID4, bsp_state[2]); powerpc_sync(); isync(); mtspr(SPR_HID5, bsp_state[3]); powerpc_sync(); isync(); #else __asm __volatile("ld %0, 16(%2); sync; isync; \ mtspr %1, %0; sync; isync;" : "=r"(reg) : "K"(SPR_HID4), "b"(bsp_state)); __asm __volatile("ld %0, 24(%2); sync; isync; \ mtspr %1, %0; sync; isync;" : "=r"(reg) : "K"(SPR_HID5), "b"(bsp_state)); #endif powerpc_sync(); break; case IBMPOWER8: case IBMPOWER8E: case IBMPOWER8NVL: case IBMPOWER9: #ifdef __powerpc64__ if (mfmsr() & PSL_HV) { isync(); /* * Direct interrupts to SRR instead of HSRR and * reset LPCR otherwise */ mtspr(SPR_LPID, 0); isync(); mtspr(SPR_LPCR, lpcr); isync(); /* * Nuke FSCR, to be managed on a per-process basis * later. */ mtspr(SPR_FSCR, 0); } #endif break; } __asm __volatile("mtsprg 0, %0" :: "r"(ap_pcpu)); powerpc_sync(); } uintptr_t cpudep_ap_bootstrap(void) { register_t msr, sp; msr = psl_kernset & ~PSL_EE; mtmsr(msr); pcpup->pc_curthread = pcpup->pc_idlethread; #ifdef __powerpc64__ __asm __volatile("mr 13,%0" :: "r"(pcpup->pc_curthread)); #else __asm __volatile("mr 2,%0" :: "r"(pcpup->pc_curthread)); #endif pcpup->pc_curpcb = pcpup->pc_curthread->td_pcb; sp = pcpup->pc_curpcb->pcb_sp; + schedinit_ap(); return (sp); } static register_t mpc74xx_l2_enable(register_t l2cr_config) { register_t ccr, bit; uint16_t vers; vers = mfpvr() >> 16; switch (vers) { case MPC7400: case MPC7410: bit = L2CR_L2IP; break; default: bit = L2CR_L2I; break; } ccr = mfspr(SPR_L2CR); if (ccr & L2CR_L2E) return (ccr); /* Configure L2 cache. */ ccr = l2cr_config & ~L2CR_L2E; mtspr(SPR_L2CR, ccr | L2CR_L2I); do { ccr = mfspr(SPR_L2CR); } while (ccr & bit); powerpc_sync(); mtspr(SPR_L2CR, l2cr_config); powerpc_sync(); return (l2cr_config); } static register_t mpc745x_l3_enable(register_t l3cr_config) { register_t ccr; ccr = mfspr(SPR_L3CR); if (ccr & L3CR_L3E) return (ccr); /* Configure L3 cache. */ ccr = l3cr_config & ~(L3CR_L3E | L3CR_L3I | L3CR_L3PE | L3CR_L3CLKEN); mtspr(SPR_L3CR, ccr); ccr |= 0x4000000; /* Magic, but documented. */ mtspr(SPR_L3CR, ccr); ccr |= L3CR_L3CLKEN; mtspr(SPR_L3CR, ccr); mtspr(SPR_L3CR, ccr | L3CR_L3I); while (mfspr(SPR_L3CR) & L3CR_L3I) ; mtspr(SPR_L3CR, ccr & ~L3CR_L3CLKEN); powerpc_sync(); DELAY(100); mtspr(SPR_L3CR, ccr); powerpc_sync(); DELAY(100); ccr |= L3CR_L3E; mtspr(SPR_L3CR, ccr); powerpc_sync(); return(ccr); } static register_t mpc74xx_l1d_enable(void) { register_t hid; hid = mfspr(SPR_HID0); if (hid & HID0_DCE) return (hid); /* Enable L1 D-cache */ hid |= HID0_DCE; powerpc_sync(); mtspr(SPR_HID0, hid | HID0_DCFI); powerpc_sync(); return (hid); } static register_t mpc74xx_l1i_enable(void) { register_t hid; hid = mfspr(SPR_HID0); if (hid & HID0_ICE) return (hid); /* Enable L1 I-cache */ hid |= HID0_ICE; isync(); mtspr(SPR_HID0, hid | HID0_ICFI); isync(); return (hid); } static void cpudep_save_config(void *dummy) { uint16_t vers; vers = mfpvr() >> 16; switch(vers) { case IBM970: case IBM970FX: case IBM970MP: #ifdef __powerpc64__ bsp_state[0] = mfspr(SPR_HID0); bsp_state[1] = mfspr(SPR_HID1); bsp_state[2] = mfspr(SPR_HID4); bsp_state[3] = mfspr(SPR_HID5); #else __asm __volatile ("mfspr %0,%2; mr %1,%0; srdi %0,%0,32" : "=r" (bsp_state[0]),"=r" (bsp_state[1]) : "K" (SPR_HID0)); __asm __volatile ("mfspr %0,%2; mr %1,%0; srdi %0,%0,32" : "=r" (bsp_state[2]),"=r" (bsp_state[3]) : "K" (SPR_HID1)); __asm __volatile ("mfspr %0,%2; mr %1,%0; srdi %0,%0,32" : "=r" (bsp_state[4]),"=r" (bsp_state[5]) : "K" (SPR_HID4)); __asm __volatile ("mfspr %0,%2; mr %1,%0; srdi %0,%0,32" : "=r" (bsp_state[6]),"=r" (bsp_state[7]) : "K" (SPR_HID5)); #endif powerpc_sync(); break; case IBMCELLBE: #ifdef NOTYET /* Causes problems if in instruction stream on 970 */ if (mfmsr() & PSL_HV) { bsp_state[0] = mfspr(SPR_HID0); bsp_state[1] = mfspr(SPR_HID1); bsp_state[2] = mfspr(SPR_HID4); bsp_state[3] = mfspr(SPR_HID6); bsp_state[4] = mfspr(SPR_CELL_TSCR); } #endif bsp_state[5] = mfspr(SPR_CELL_TSRL); break; case MPC7450: case MPC7455: case MPC7457: /* Only MPC745x CPUs have an L3 cache. */ bsp_state[3] = mfspr(SPR_L3CR); /* Fallthrough */ case MPC7400: case MPC7410: case MPC7447A: case MPC7448: bsp_state[2] = mfspr(SPR_L2CR); bsp_state[1] = mfspr(SPR_HID1); bsp_state[0] = mfspr(SPR_HID0); break; } } void cpudep_ap_setup() { register_t reg; uint16_t vers; vers = mfpvr() >> 16; switch(vers) { case IBM970: case IBM970FX: case IBM970MP: /* * The 970 has strange rules about how to update HID registers. * See Table 2-3, 970MP manual * * Note: HID4 and HID5 restored already in * cpudep_ap_early_bootstrap() */ __asm __volatile("mtasr %0; sync" :: "r"(0)); #ifdef __powerpc64__ __asm __volatile(" \ sync; isync; \ mtspr %1, %0; \ mfspr %0, %1; mfspr %0, %1; mfspr %0, %1; \ mfspr %0, %1; mfspr %0, %1; mfspr %0, %1; \ sync; isync" :: "r"(bsp_state[0]), "K"(SPR_HID0)); __asm __volatile("sync; isync; \ mtspr %1, %0; mtspr %1, %0; sync; isync" :: "r"(bsp_state[1]), "K"(SPR_HID1)); #else __asm __volatile(" \ ld %0,0(%2); \ sync; isync; \ mtspr %1, %0; \ mfspr %0, %1; mfspr %0, %1; mfspr %0, %1; \ mfspr %0, %1; mfspr %0, %1; mfspr %0, %1; \ sync; isync" : "=r"(reg) : "K"(SPR_HID0), "b"(bsp_state)); __asm __volatile("ld %0, 8(%2); sync; isync; \ mtspr %1, %0; mtspr %1, %0; sync; isync" : "=r"(reg) : "K"(SPR_HID1), "b"(bsp_state)); #endif powerpc_sync(); break; case IBMCELLBE: #ifdef NOTYET /* Causes problems if in instruction stream on 970 */ if (mfmsr() & PSL_HV) { mtspr(SPR_HID0, bsp_state[0]); mtspr(SPR_HID1, bsp_state[1]); mtspr(SPR_HID4, bsp_state[2]); mtspr(SPR_HID6, bsp_state[3]); mtspr(SPR_CELL_TSCR, bsp_state[4]); } #endif mtspr(SPR_CELL_TSRL, bsp_state[5]); break; case MPC7400: case MPC7410: case MPC7447A: case MPC7448: case MPC7450: case MPC7455: case MPC7457: /* XXX: Program the CPU ID into PIR */ __asm __volatile("mtspr 1023,%0" :: "r"(PCPU_GET(cpuid))); powerpc_sync(); isync(); mtspr(SPR_HID0, bsp_state[0]); isync(); mtspr(SPR_HID1, bsp_state[1]); isync(); /* Now enable the L3 cache. */ switch (vers) { case MPC7450: case MPC7455: case MPC7457: /* Only MPC745x CPUs have an L3 cache. */ reg = mpc745x_l3_enable(bsp_state[3]); default: break; } reg = mpc74xx_l2_enable(bsp_state[2]); reg = mpc74xx_l1d_enable(); reg = mpc74xx_l1i_enable(); break; case IBMPOWER7: case IBMPOWER7PLUS: case IBMPOWER8: case IBMPOWER8E: case IBMPOWER8NVL: case IBMPOWER9: #ifdef __powerpc64__ if (mfmsr() & PSL_HV) { mtspr(SPR_LPCR, mfspr(SPR_LPCR) | lpcr | LPCR_PECE_WAKESET); isync(); } #endif break; default: #ifdef __powerpc64__ if (!(mfmsr() & PSL_HV)) /* Rely on HV to have set things up */ break; #endif printf("WARNING: Unknown CPU type. Cache performace may be " "suboptimal.\n"); break; } } diff --git a/sys/powerpc/booke/mp_cpudep.c b/sys/powerpc/booke/mp_cpudep.c index c07e8c06ce05..709578d8e1b4 100644 --- a/sys/powerpc/booke/mp_cpudep.c +++ b/sys/powerpc/booke/mp_cpudep.c @@ -1,97 +1,99 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008-2009 Semihalf, Rafal Jaworowski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include +#include #include #include #include #include #include extern void dcache_enable(void); extern void dcache_inval(void); extern void icache_enable(void); extern void icache_inval(void); volatile void *ap_pcpu; uintptr_t cpudep_ap_bootstrap() { uint32_t msr, csr; uintptr_t sp; /* Enable L1 caches */ csr = mfspr(SPR_L1CSR0); if ((csr & L1CSR0_DCE) == 0) { dcache_inval(); dcache_enable(); } csr = mfspr(SPR_L1CSR1); if ((csr & L1CSR1_ICE) == 0) { icache_inval(); icache_enable(); } /* Set MSR */ #ifdef __powerpc64__ msr = PSL_CM | PSL_ME; #else msr = PSL_ME; #endif mtmsr(msr); /* Assign pcpu fields, return ptr to this AP's idle thread kstack */ pcpup->pc_curthread = pcpup->pc_idlethread; #ifdef __powerpc64__ __asm __volatile("mr 13,%0" :: "r"(pcpup->pc_curthread)); #else __asm __volatile("mr 2,%0" :: "r"(pcpup->pc_curthread)); #endif pcpup->pc_curpcb = pcpup->pc_curthread->td_pcb; sp = pcpup->pc_curpcb->pcb_sp; + schedinit_ap(); /* XXX shouldn't the pcb_sp be checked/forced for alignment here?? */ return (sp); } void cpudep_ap_setup() { } diff --git a/sys/riscv/riscv/mp_machdep.c b/sys/riscv/riscv/mp_machdep.c index cdcc86e715dd..6a66b7eb80f7 100644 --- a/sys/riscv/riscv/mp_machdep.c +++ b/sys/riscv/riscv/mp_machdep.c @@ -1,554 +1,555 @@ /*- * Copyright (c) 2015 The FreeBSD Foundation * Copyright (c) 2016 Ruslan Bukin * All rights reserved. * * Portions of this software were developed by Andrew Turner under * sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by SRI International and the * University of Cambridge Computer Laboratory under DARPA/AFRL contract * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. * * Portions of this software were developed by the University of Cambridge * Computer Laboratory as part of the CTSRD Project, with support from the * UK Higher Education Innovation Fund (HEIF). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "opt_kstack_pages.h" #include "opt_platform.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef FDT #include #include #endif boolean_t ofw_cpu_reg(phandle_t node, u_int, cell_t *); uint32_t __riscv_boot_ap[MAXCPU]; static enum { CPUS_UNKNOWN, #ifdef FDT CPUS_FDT, #endif } cpu_enum_method; static device_identify_t riscv64_cpu_identify; static device_probe_t riscv64_cpu_probe; static device_attach_t riscv64_cpu_attach; static int ipi_handler(void *); struct pcb stoppcbs[MAXCPU]; extern uint32_t boot_hart; extern cpuset_t all_harts; #ifdef INVARIANTS static uint32_t cpu_reg[MAXCPU][2]; #endif static device_t cpu_list[MAXCPU]; void mpentry(u_long hartid); void init_secondary(uint64_t); static struct mtx ap_boot_mtx; /* Stacks for AP initialization, discarded once idle threads are started. */ void *bootstack; static void *bootstacks[MAXCPU]; /* Count of started APs, used to synchronize access to bootstack. */ static volatile int aps_started; /* Set to 1 once we're ready to let the APs out of the pen. */ static volatile int aps_ready; /* Temporary variables for init_secondary() */ void *dpcpu[MAXCPU - 1]; static device_method_t riscv64_cpu_methods[] = { /* Device interface */ DEVMETHOD(device_identify, riscv64_cpu_identify), DEVMETHOD(device_probe, riscv64_cpu_probe), DEVMETHOD(device_attach, riscv64_cpu_attach), DEVMETHOD_END }; static devclass_t riscv64_cpu_devclass; static driver_t riscv64_cpu_driver = { "riscv64_cpu", riscv64_cpu_methods, 0 }; DRIVER_MODULE(riscv64_cpu, cpu, riscv64_cpu_driver, riscv64_cpu_devclass, 0, 0); static void riscv64_cpu_identify(driver_t *driver, device_t parent) { if (device_find_child(parent, "riscv64_cpu", -1) != NULL) return; if (BUS_ADD_CHILD(parent, 0, "riscv64_cpu", -1) == NULL) device_printf(parent, "add child failed\n"); } static int riscv64_cpu_probe(device_t dev) { u_int cpuid; cpuid = device_get_unit(dev); if (cpuid >= MAXCPU || cpuid > mp_maxid) return (EINVAL); device_quiet(dev); return (0); } static int riscv64_cpu_attach(device_t dev) { const uint32_t *reg; size_t reg_size; u_int cpuid; int i; cpuid = device_get_unit(dev); if (cpuid >= MAXCPU || cpuid > mp_maxid) return (EINVAL); KASSERT(cpu_list[cpuid] == NULL, ("Already have cpu %u", cpuid)); reg = cpu_get_cpuid(dev, ®_size); if (reg == NULL) return (EINVAL); if (bootverbose) { device_printf(dev, "register <"); for (i = 0; i < reg_size; i++) printf("%s%x", (i == 0) ? "" : " ", reg[i]); printf(">\n"); } /* Set the device to start it later */ cpu_list[cpuid] = dev; return (0); } static void release_aps(void *dummy __unused) { cpuset_t mask; int i; if (mp_ncpus == 1) return; /* Setup the IPI handler */ riscv_setup_ipihandler(ipi_handler); atomic_store_rel_int(&aps_ready, 1); /* Wake up the other CPUs */ mask = all_harts; CPU_CLR(boot_hart, &mask); printf("Release APs\n"); sbi_send_ipi(mask.__bits); for (i = 0; i < 2000; i++) { if (smp_started) return; DELAY(1000); } printf("APs not started\n"); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); void init_secondary(uint64_t hart) { struct pcpu *pcpup; u_int cpuid; /* Renumber this cpu */ cpuid = hart; if (cpuid < boot_hart) cpuid += mp_maxid + 1; cpuid -= boot_hart; /* Setup the pcpu pointer */ pcpup = &__pcpu[cpuid]; __asm __volatile("mv tp, %0" :: "r"(pcpup)); /* Workaround: make sure wfi doesn't halt the hart */ csr_set(sie, SIE_SSIE); csr_set(sip, SIE_SSIE); /* Signal the BSP and spin until it has released all APs. */ atomic_add_int(&aps_started, 1); while (!atomic_load_int(&aps_ready)) __asm __volatile("wfi"); /* Initialize curthread */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); pcpup->pc_curthread = pcpup->pc_idlethread; + schedinit_ap(); /* * Identify current CPU. This is necessary to setup * affinity registers and to provide support for * runtime chip identification. */ identify_cpu(); /* Enable software interrupts */ riscv_unmask_ipi(); #ifndef EARLY_AP_STARTUP /* Start per-CPU event timers. */ cpu_initclocks_ap(); #endif /* Enable external (PLIC) interrupts */ csr_set(sie, SIE_SEIE); /* Activate this hart in the kernel pmap. */ CPU_SET_ATOMIC(hart, &kernel_pmap->pm_active); /* Activate process 0's pmap. */ pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); mtx_lock_spin(&ap_boot_mtx); atomic_add_rel_32(&smp_cpus, 1); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } mtx_unlock_spin(&ap_boot_mtx); /* * Assert that smp_after_idle_runnable condition is reasonable. */ MPASS(PCPU_GET(curpcb) == NULL); /* Enter the scheduler */ sched_throw(NULL); panic("scheduler returned us to init_secondary"); /* NOTREACHED */ } static void smp_after_idle_runnable(void *arg __unused) { struct pcpu *pc; int cpu; for (cpu = 1; cpu <= mp_maxid; cpu++) { if (bootstacks[cpu] != NULL) { pc = pcpu_find(cpu); while (atomic_load_ptr(&pc->pc_curpcb) == NULL) cpu_spinwait(); kmem_free((vm_offset_t)bootstacks[cpu], PAGE_SIZE); } } } SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, smp_after_idle_runnable, NULL); static int ipi_handler(void *arg) { u_int ipi_bitmap; u_int cpu, ipi; int bit; csr_clear(sip, SIP_SSIP); cpu = PCPU_GET(cpuid); mb(); ipi_bitmap = atomic_readandclear_int(PCPU_PTR(pending_ipis)); if (ipi_bitmap == 0) return (FILTER_HANDLED); while ((bit = ffs(ipi_bitmap))) { bit = (bit - 1); ipi = (1 << bit); ipi_bitmap &= ~ipi; mb(); switch (ipi) { case IPI_AST: CTR0(KTR_SMP, "IPI_AST"); break; case IPI_PREEMPT: CTR1(KTR_SMP, "%s: IPI_PREEMPT", __func__); sched_preempt(curthread); break; case IPI_RENDEZVOUS: CTR0(KTR_SMP, "IPI_RENDEZVOUS"); smp_rendezvous_action(); break; case IPI_STOP: case IPI_STOP_HARD: CTR0(KTR_SMP, (ipi == IPI_STOP) ? "IPI_STOP" : "IPI_STOP_HARD"); savectx(&stoppcbs[cpu]); /* Indicate we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) cpu_spinwait(); CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); CTR0(KTR_SMP, "IPI_STOP (restart)"); /* * The kernel debugger might have set a breakpoint, * so flush the instruction cache. */ fence_i(); break; case IPI_HARDCLOCK: CTR1(KTR_SMP, "%s: IPI_HARDCLOCK", __func__); hardclockintr(); break; default: panic("Unknown IPI %#0x on cpu %d", ipi, curcpu); } } return (FILTER_HANDLED); } struct cpu_group * cpu_topo(void) { return (smp_topo_none()); } /* Determine if we running MP machine */ int cpu_mp_probe(void) { return (mp_ncpus > 1); } #ifdef FDT static boolean_t cpu_init_fdt(u_int id, phandle_t node, u_int addr_size, pcell_t *reg) { struct pcpu *pcpup; vm_paddr_t start_addr; uint64_t hart; u_int cpuid; int naps; int error; /* Check if this hart supports MMU. */ if (OF_getproplen(node, "mmu-type") < 0) return (0); KASSERT(id < MAXCPU, ("Too many CPUs")); KASSERT(addr_size == 1 || addr_size == 2, ("Invalid register size")); #ifdef INVARIANTS cpu_reg[id][0] = reg[0]; if (addr_size == 2) cpu_reg[id][1] = reg[1]; #endif hart = reg[0]; if (addr_size == 2) { hart <<= 32; hart |= reg[1]; } KASSERT(hart < MAXCPU, ("Too many harts.")); /* We are already running on this cpu */ if (hart == boot_hart) return (1); /* * Rotate the CPU IDs to put the boot CPU as CPU 0. * We keep the other CPUs ordered. */ cpuid = hart; if (cpuid < boot_hart) cpuid += mp_maxid + 1; cpuid -= boot_hart; /* Check if we are able to start this cpu */ if (cpuid > mp_maxid) return (0); /* * Depending on the SBI implementation, APs are waiting either in * locore.S or to be activated explicitly, via SBI call. */ if (sbi_probe_extension(SBI_EXT_ID_HSM) != 0) { start_addr = pmap_kextract((vm_offset_t)mpentry); error = sbi_hsm_hart_start(hart, start_addr, 0); if (error != 0) { mp_ncpus--; /* Send a warning to the user and continue. */ printf("AP %u (hart %lu) failed to start, error %d\n", cpuid, hart, error); return (0); } } pcpup = &__pcpu[cpuid]; pcpu_init(pcpup, cpuid, sizeof(struct pcpu)); pcpup->pc_hart = hart; dpcpu[cpuid - 1] = (void *)kmem_malloc(DPCPU_SIZE, M_WAITOK | M_ZERO); dpcpu_init(dpcpu[cpuid - 1], cpuid); bootstacks[cpuid] = (void *)kmem_malloc(PAGE_SIZE, M_WAITOK | M_ZERO); naps = atomic_load_int(&aps_started); bootstack = (char *)bootstacks[cpuid] + PAGE_SIZE; printf("Starting CPU %u (hart %lx)\n", cpuid, hart); atomic_store_32(&__riscv_boot_ap[hart], 1); /* Wait for the AP to switch to its boot stack. */ while (atomic_load_int(&aps_started) < naps + 1) cpu_spinwait(); CPU_SET(cpuid, &all_cpus); CPU_SET(hart, &all_harts); return (1); } #endif /* Initialize and fire up non-boot processors */ void cpu_mp_start(void) { mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); CPU_SET(0, &all_cpus); CPU_SET(boot_hart, &all_harts); switch(cpu_enum_method) { #ifdef FDT case CPUS_FDT: ofw_cpu_early_foreach(cpu_init_fdt, true); break; #endif case CPUS_UNKNOWN: break; } } /* Introduce rest of cores to the world */ void cpu_mp_announce(void) { } static boolean_t cpu_check_mmu(u_int id, phandle_t node, u_int addr_size, pcell_t *reg) { /* Check if this hart supports MMU. */ if (OF_getproplen(node, "mmu-type") < 0) return (0); return (1); } void cpu_mp_setmaxid(void) { #ifdef FDT int cores; cores = ofw_cpu_early_foreach(cpu_check_mmu, true); if (cores > 0) { cores = MIN(cores, MAXCPU); if (bootverbose) printf("Found %d CPUs in the device tree\n", cores); mp_ncpus = cores; mp_maxid = cores - 1; cpu_enum_method = CPUS_FDT; return; } #endif if (bootverbose) printf("No CPU data, limiting to 1 core\n"); mp_ncpus = 1; mp_maxid = 0; } diff --git a/sys/sys/sched.h b/sys/sys/sched.h index 64651ffa9c90..8041a2bc12d4 100644 --- a/sys/sys/sched.h +++ b/sys/sys/sched.h @@ -1,269 +1,274 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND BSD-2-Clause-FreeBSD) * * Copyright (c) 1996, 1997 * HD Associates, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by HD Associates, Inc * and Jukka Antero Ukkonen. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 2002-2008, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SCHED_H_ #define _SCHED_H_ #ifdef _KERNEL /* * General scheduling info. * * sched_load: * Total runnable non-ithread threads in the system. * * sched_runnable: * Runnable threads for this processor. */ int sched_load(void); int sched_rr_interval(void); int sched_runnable(void); /* * Proc related scheduling hooks. */ void sched_exit(struct proc *p, struct thread *childtd); void sched_fork(struct thread *td, struct thread *childtd); void sched_fork_exit(struct thread *td); void sched_class(struct thread *td, int class); void sched_nice(struct proc *p, int nice); /* * Threads are switched in and out, block on resources, have temporary * priorities inherited from their procs, and use up cpu time. */ void sched_exit_thread(struct thread *td, struct thread *child); u_int sched_estcpu(struct thread *td); void sched_fork_thread(struct thread *td, struct thread *child); void sched_lend_prio(struct thread *td, u_char prio); void sched_lend_user_prio(struct thread *td, u_char pri); void sched_lend_user_prio_cond(struct thread *td, u_char pri); fixpt_t sched_pctcpu(struct thread *td); void sched_prio(struct thread *td, u_char prio); void sched_sleep(struct thread *td, int prio); void sched_switch(struct thread *td, int flags); void sched_throw(struct thread *td); void sched_unlend_prio(struct thread *td, u_char prio); void sched_user_prio(struct thread *td, u_char prio); void sched_userret_slowpath(struct thread *td); #ifdef RACCT #ifdef SCHED_4BSD fixpt_t sched_pctcpu_delta(struct thread *td); #endif #endif static inline void sched_userret(struct thread *td) { /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. */ KASSERT((td->td_flags & TDF_BORROWING) == 0, ("thread with borrowed priority returning to userland")); if (__predict_false(td->td_priority != td->td_user_pri)) sched_userret_slowpath(td); } /* * Threads are moved on and off of run queues */ void sched_add(struct thread *td, int flags); struct thread *sched_choose(void); void sched_clock(struct thread *td, int cnt); void sched_idletd(void *); void sched_preempt(struct thread *td); void sched_relinquish(struct thread *td); void sched_rem(struct thread *td); void sched_wakeup(struct thread *td, int srqflags); /* * Binding makes cpu affinity permanent while pinning is used to temporarily * hold a thread on a particular CPU. */ void sched_bind(struct thread *td, int cpu); static __inline void sched_pin(void); void sched_unbind(struct thread *td); static __inline void sched_unpin(void); int sched_is_bound(struct thread *td); void sched_affinity(struct thread *td); /* * These procedures tell the process data structure allocation code how * many bytes to actually allocate. */ int sched_sizeof_proc(void); int sched_sizeof_thread(void); /* * This routine provides a consistent thread name for use with KTR graphing * functions. */ char *sched_tdname(struct thread *td); #ifdef KTR void sched_clear_tdname(struct thread *td); #endif static __inline void sched_pin(void) { curthread->td_pinned++; atomic_interrupt_fence(); } static __inline void sched_unpin(void) { atomic_interrupt_fence(); curthread->td_pinned--; } /* sched_add arguments (formerly setrunqueue) */ #define SRQ_BORING 0x0000 /* No special circumstances. */ #define SRQ_YIELDING 0x0001 /* We are yielding (from mi_switch). */ #define SRQ_OURSELF 0x0002 /* It is ourself (from mi_switch). */ #define SRQ_INTR 0x0004 /* It is probably urgent. */ #define SRQ_PREEMPTED 0x0008 /* has been preempted.. be kind */ #define SRQ_BORROWING 0x0010 /* Priority updated due to prio_lend */ #define SRQ_HOLD 0x0020 /* Return holding original td lock */ #define SRQ_HOLDTD 0x0040 /* Return holding td lock */ /* Scheduler stats. */ #ifdef SCHED_STATS DPCPU_DECLARE(long, sched_switch_stats[SWT_COUNT]); #define SCHED_STAT_DEFINE_VAR(name, ptr, descr) \ static void name ## _add_proc(void *dummy __unused) \ { \ \ SYSCTL_ADD_PROC(NULL, \ SYSCTL_STATIC_CHILDREN(_kern_sched_stats), OID_AUTO, \ #name, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE, \ ptr, 0, sysctl_dpcpu_long, "LU", descr); \ } \ SYSINIT(name, SI_SUB_LAST, SI_ORDER_MIDDLE, name ## _add_proc, NULL); #define SCHED_STAT_DEFINE(name, descr) \ DPCPU_DEFINE(unsigned long, name); \ SCHED_STAT_DEFINE_VAR(name, &DPCPU_NAME(name), descr) /* * Sched stats are always incremented in critical sections so no atomic * is necesssary to increment them. */ #define SCHED_STAT_INC(var) DPCPU_GET(var)++; #else #define SCHED_STAT_DEFINE_VAR(name, descr, ptr) #define SCHED_STAT_DEFINE(name, descr) #define SCHED_STAT_INC(var) (void)0 #endif /* * Fixup scheduler state for proc0 and thread0 */ void schedinit(void); + +/* + * Fixup scheduler state for secondary APs + */ +void schedinit_ap(void); #endif /* _KERNEL */ /* POSIX 1003.1b Process Scheduling */ /* * POSIX scheduling policies */ #define SCHED_FIFO 1 #define SCHED_OTHER 2 #define SCHED_RR 3 struct sched_param { int sched_priority; }; /* * POSIX scheduling declarations for userland. */ #ifndef _KERNEL #include #include #include #ifndef _PID_T_DECLARED typedef __pid_t pid_t; #define _PID_T_DECLARED #endif __BEGIN_DECLS int sched_get_priority_max(int); int sched_get_priority_min(int); int sched_getparam(pid_t, struct sched_param *); int sched_getscheduler(pid_t); int sched_rr_get_interval(pid_t, struct timespec *); int sched_setparam(pid_t, const struct sched_param *); int sched_setscheduler(pid_t, int, const struct sched_param *); int sched_yield(void); __END_DECLS #endif #endif /* !_SCHED_H_ */ diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c index 7724370c2c93..64cf276614dc 100644 --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1,1693 +1,1694 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #ifdef __i386__ #include "opt_apic.h" #endif #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_gdb.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #include "opt_stack.h" #include #include #include #include #include /* cngetc() */ #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEV_ACPI #include #include #endif static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct susppcb **susppcbs; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* Default cpu_ops implementation. */ struct cpu_ops cpu_ops; /* * Local data and functions. */ static volatile cpuset_t ipi_stop_nmi_pending; volatile cpuset_t resuming_cpus; volatile cpuset_t toresume_cpus; /* used to hold the AP's until we are ready to release them */ struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info *cpu_info; int *apic_cpuids; int cpu_apic_ids[MAXCPU]; _Static_assert(MAXCPU <= MAX_APIC_ID, "MAXCPU cannot be larger that MAX_APIC_ID"); _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID, "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID"); static void release_aps(void *dummy); static void cpustop_handler_post(u_int cpu); static int hyperthreading_allowed = 1; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); static int hyperthreading_intr_allowed = 0; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN, &hyperthreading_intr_allowed, 0, "Allow interrupts on HTT logical CPUs"); static struct topo_node topo_root; static int pkg_id_shift; static int node_id_shift; static int core_id_shift; static int disabled_cpus; struct cache_info { int id_shift; int present; } static caches[MAX_CACHE_LEVELS]; static bool stop_mwait = false; SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0, "Use MONITOR/MWAIT when stopping CPU, if available"); void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } /* * Add a cache level to the cache topology description. */ static int add_deterministic_cache(int type, int level, int share_count) { if (type == 0) return (0); if (type > 3) { printf("unexpected cache type %d\n", type); return (1); } if (type == 2) /* ignore instruction cache */ return (1); if (level == 0 || level > MAX_CACHE_LEVELS) { printf("unexpected cache level %d\n", type); return (1); } if (caches[level - 1].present) { printf("WARNING: multiple entries for L%u data cache\n", level); printf("%u => %u\n", caches[level - 1].id_shift, mask_width(share_count)); } caches[level - 1].id_shift = mask_width(share_count); caches[level - 1].present = 1; if (caches[level - 1].id_shift > pkg_id_shift) { printf("WARNING: L%u data cache covers more " "APIC IDs than a package (%u > %u)\n", level, caches[level - 1].id_shift, pkg_id_shift); caches[level - 1].id_shift = pkg_id_shift; } if (caches[level - 1].id_shift < core_id_shift) { printf("WARNING: L%u data cache covers fewer " "APIC IDs than a core (%u < %u)\n", level, caches[level - 1].id_shift, core_id_shift); caches[level - 1].id_shift = core_id_shift; } return (1); } /* * Determine topology of processing units and caches for AMD CPUs. * See: * - AMD CPUID Specification (Publication # 25481) * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) * - BKDG For AMD Family 10h Processors (Publication # 31116) * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) */ static void topo_probe_amd(void) { u_int p[4]; uint64_t v; int level; int nodes_per_socket; int share_count; int type; int i; /* No multi-core capability. */ if ((amd_feature2 & AMDID2_CMP) == 0) return; /* For families 10h and newer. */ pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> AMDID_COREID_SIZE_SHIFT; /* For 0Fh family. */ if (pkg_id_shift == 0) pkg_id_shift = mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); /* * Families prior to 16h define the following value as * cores per compute unit and we don't really care about the AMD * compute units at the moment. Perhaps we should treat them as * cores and cores within the compute units as hardware threads, * but that's up for debate. * Later families define the value as threads per compute unit, * so we are following AMD's nomenclature here. */ if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && CPUID_TO_FAMILY(cpu_id) >= 0x16) { cpuid_count(0x8000001e, 0, p); share_count = ((p[1] >> 8) & 0xff) + 1; core_id_shift = mask_width(share_count); /* * For Zen (17h), gather Nodes per Processor. Each node is a * Zeppelin die; TR and EPYC CPUs will have multiple dies per * package. Communication latency between dies is higher than * within them. */ nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); } if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { for (i = 0; ; i++) { cpuid_count(0x8000001d, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } else { if (cpu_exthigh >= 0x80000005) { cpuid_count(0x80000005, 0, p); if (((p[2] >> 24) & 0xff) != 0) { caches[0].id_shift = 0; caches[0].present = 1; } } if (cpu_exthigh >= 0x80000006) { cpuid_count(0x80000006, 0, p); if (((p[2] >> 16) & 0xffff) != 0) { caches[1].id_shift = 0; caches[1].present = 1; } if (((p[3] >> 18) & 0x3fff) != 0) { nodes_per_socket = 1; if ((amd_feature2 & AMDID2_NODE_ID) != 0) { /* * Handle multi-node processors that * have multiple chips, each with its * own L3 cache, on the same die. */ v = rdmsr(0xc001100c); nodes_per_socket = 1 + ((v >> 3) & 0x7); } caches[2].id_shift = pkg_id_shift - mask_width(nodes_per_socket); caches[2].present = 1; } } } } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 1 and Leaf 4, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0x4(void) { u_int p[4]; int max_cores; int max_logical; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; core_id_shift = mask_width(max_logical/max_cores); KASSERT(core_id_shift >= 0, ("intel topo: max_cores > max_logical\n")); pkg_id_shift = core_id_shift + mask_width(max_cores); } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 1Fh or 0Bh, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0xb(void) { u_int leaf; u_int p[4] = { 0 }; int bits; int type; int i; /* Prefer leaf 1Fh (V2 Extended Topology Enumeration). */ if (cpu_high >= 0x1f) { leaf = 0x1f; cpuid_count(leaf, 0, p); } /* Fall back to leaf 0Bh (Extended Topology Enumeration). */ if (p[1] == 0) { leaf = 0x0b; cpuid_count(leaf, 0, p); } /* Fall back to leaf 04h (Deterministic Cache Parameters). */ if (p[1] == 0) { topo_probe_intel_0x4(); return; } /* We only support three levels for now. */ for (i = 0; ; i++) { cpuid_count(leaf, i, p); bits = p[0] & 0x1f; type = (p[2] >> 8) & 0xff; if (type == 0) break; if (type == CPUID_TYPE_SMT) core_id_shift = bits; else if (type == CPUID_TYPE_CORE) pkg_id_shift = bits; else if (bootverbose) printf("Topology level type %d shift: %d\n", type, bits); } if (pkg_id_shift < core_id_shift) { printf("WARNING: core covers more APIC IDs than a package\n"); core_id_shift = pkg_id_shift; } } /* * Determine topology of caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 Architectures Software Developer’s Manual * Volume 2A: Instruction Set Reference, A-M, * CPUID instruction */ static void topo_probe_intel_caches(void) { u_int p[4]; int level; int share_count; int type; int i; if (cpu_high < 0x4) { /* * Available cache level and sizes can be determined * via CPUID leaf 2, but that requires a huge table of hardcoded * values, so for now just assume L1 and L2 caches potentially * shared only by HTT processing units, if HTT is present. */ caches[0].id_shift = pkg_id_shift; caches[0].present = 1; caches[1].id_shift = pkg_id_shift; caches[1].present = 1; return; } for (i = 0; ; i++) { cpuid_count(0x4, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } /* * Determine topology of processing units and caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration */ static void topo_probe_intel(void) { /* * Note that 0x1 <= cpu_high < 4 case should be * compatible with topo_probe_intel_0x4() logic when * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) * or it should trigger the fallback otherwise. */ if (cpu_high >= 0xb) topo_probe_intel_0xb(); else if (cpu_high >= 0x1) topo_probe_intel_0x4(); topo_probe_intel_caches(); } /* * Topology information is queried only on BSP, on which this * code runs and for which it can query CPUID information. * Then topology is extrapolated on all packages using an * assumption that APIC ID to hardware component ID mapping is * homogenious. * That doesn't necesserily imply that the topology is uniform. */ void topo_probe(void) { static int cpu_topo_probed = 0; struct x86_topo_layer { int type; int subtype; int id_shift; } topo_layers[MAX_CACHE_LEVELS + 5]; struct topo_node *parent; struct topo_node *node; int layer; int nlayers; int node_id; int i; #if defined(DEV_ACPI) && MAXMEMDOM > 1 int d, domain; #endif if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); if (mp_ncpus <= 1) ; /* nothing */ else if (cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_HYGON) topo_probe_amd(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) topo_probe_intel(); KASSERT(pkg_id_shift >= core_id_shift, ("bug in APIC topology discovery")); nlayers = 0; bzero(topo_layers, sizeof(topo_layers)); topo_layers[nlayers].type = TOPO_TYPE_PKG; topo_layers[nlayers].id_shift = pkg_id_shift; if (bootverbose) printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; if (pkg_id_shift > node_id_shift && node_id_shift != 0) { topo_layers[nlayers].type = TOPO_TYPE_GROUP; topo_layers[nlayers].id_shift = node_id_shift; if (bootverbose) printf("Node ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } /* * Consider all caches to be within a package/chip * and "in front" of all sub-components like * cores and hardware threads. */ for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { if (caches[i].present) { if (node_id_shift != 0) KASSERT(caches[i].id_shift <= node_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift <= pkg_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift >= core_id_shift, ("bug in APIC topology discovery")); topo_layers[nlayers].type = TOPO_TYPE_CACHE; topo_layers[nlayers].subtype = i + 1; topo_layers[nlayers].id_shift = caches[i].id_shift; if (bootverbose) printf("L%u cache ID shift: %u\n", topo_layers[nlayers].subtype, topo_layers[nlayers].id_shift); nlayers++; } } if (pkg_id_shift > core_id_shift) { topo_layers[nlayers].type = TOPO_TYPE_CORE; topo_layers[nlayers].id_shift = core_id_shift; if (bootverbose) printf("Core ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } topo_layers[nlayers].type = TOPO_TYPE_PU; topo_layers[nlayers].id_shift = 0; nlayers++; #if defined(DEV_ACPI) && MAXMEMDOM > 1 if (vm_ndomains > 1) { for (layer = 0; layer < nlayers; ++layer) { for (i = 0; i <= max_apic_id; ++i) { if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0) domain = -1; if (!cpu_info[i].cpu_present) continue; d = acpi_pxm_get_cpu_locality(i); if (domain >= 0 && domain != d) break; domain = d; } if (i > max_apic_id) break; } KASSERT(layer < nlayers, ("NUMA domain smaller than PU")); memmove(&topo_layers[layer+1], &topo_layers[layer], sizeof(*topo_layers) * (nlayers - layer)); topo_layers[layer].type = TOPO_TYPE_NODE; topo_layers[layer].subtype = CG_SHARE_NONE; nlayers++; } #endif topo_init_root(&topo_root); for (i = 0; i <= max_apic_id; ++i) { if (!cpu_info[i].cpu_present) continue; parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { #if defined(DEV_ACPI) && MAXMEMDOM > 1 if (topo_layers[layer].type == TOPO_TYPE_NODE) { node_id = acpi_pxm_get_cpu_locality(i); } else #endif node_id = i >> topo_layers[layer].id_shift; parent = topo_add_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); } } parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { #if defined(DEV_ACPI) && MAXMEMDOM > 1 if (topo_layers[layer].type == TOPO_TYPE_NODE) node_id = acpi_pxm_get_cpu_locality(boot_cpu_id); else #endif node_id = boot_cpu_id >> topo_layers[layer].id_shift; node = topo_find_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); topo_promote_child(node); parent = node; } cpu_topo_probed = 1; } /* * Assign logical CPU IDs to local APICs. */ void assign_cpu_ids(void) { struct topo_node *node; u_int smt_mask; int nhyper; smt_mask = (1u << core_id_shift) - 1; /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. */ mp_ncpus = 0; nhyper = 0; TOPO_FOREACH(node, &topo_root) { if (node->type != TOPO_TYPE_PU) continue; if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) cpu_info[node->hwid].cpu_hyperthread = 1; if (resource_disabled("lapic", node->hwid)) { if (node->hwid != boot_cpu_id) cpu_info[node->hwid].cpu_disabled = 1; else printf("Cannot disable BSP, APIC ID = %d\n", node->hwid); } if (!hyperthreading_allowed && cpu_info[node->hwid].cpu_hyperthread) cpu_info[node->hwid].cpu_disabled = 1; if (mp_ncpus >= MAXCPU) cpu_info[node->hwid].cpu_disabled = 1; if (cpu_info[node->hwid].cpu_disabled) { disabled_cpus++; continue; } if (cpu_info[node->hwid].cpu_hyperthread) nhyper++; cpu_apic_ids[mp_ncpus] = node->hwid; apic_cpuids[node->hwid] = mp_ncpus; topo_set_pu_id(node, mp_ncpus); mp_ncpus++; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); mp_ncores = mp_ncpus - nhyper; smp_threads_per_core = mp_ncpus / mp_ncores; } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { struct topo_node *node; const char *hyperthread; struct topo_analysis topology; printf("FreeBSD/SMP: "); if (topo_analyze(&topo_root, 1, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); if (disabled_cpus) { printf("FreeBSD/SMP Online: "); if (topo_analyze(&topo_root, 0, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); } if (!bootverbose) return; TOPO_FOREACH(node, &topo_root) { switch (node->type) { case TOPO_TYPE_PKG: printf("Package HW ID = %u\n", node->hwid); break; case TOPO_TYPE_CORE: printf("\tCore HW ID = %u\n", node->hwid); break; case TOPO_TYPE_PU: if (cpu_info[node->hwid].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; if (node->subtype == 0) printf("\t\tCPU (AP%s): APIC ID: %u" "(disabled)\n", hyperthread, node->hwid); else if (node->id == 0) printf("\t\tCPU0 (BSP): APIC ID: %u\n", node->hwid); else printf("\t\tCPU%u (AP%s): APIC ID: %u\n", node->id, hyperthread, node->hwid); break; default: /* ignored */ break; } } } /* * Add a scheduling group, a group of logical processors sharing * a particular cache (and, thus having an affinity), to the scheduling * topology. * This function recursively works on lower level caches. */ static void x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) { struct topo_node *node; int nchildren; int ncores; int i; KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE || root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP, ("x86topo_add_sched_group: bad type: %u", root->type)); CPU_COPY(&root->cpuset, &cg_root->cg_mask); cg_root->cg_count = root->cpu_count; if (root->type == TOPO_TYPE_CACHE) cg_root->cg_level = root->subtype; else cg_root->cg_level = CG_SHARE_NONE; if (root->type == TOPO_TYPE_NODE) cg_root->cg_flags = CG_FLAG_NODE; else cg_root->cg_flags = 0; /* * Check how many core nodes we have under the given root node. * If we have multiple logical processors, but not multiple * cores, then those processors must be hardware threads. */ ncores = 0; node = root; while (node != NULL) { if (node->type != TOPO_TYPE_CORE) { node = topo_next_node(root, node); continue; } ncores++; node = topo_next_nonchild_node(root, node); } if (cg_root->cg_level != CG_SHARE_NONE && root->cpu_count > 1 && ncores < 2) cg_root->cg_flags |= CG_FLAG_SMT; /* * Find out how many cache nodes we have under the given root node. * We ignore cache nodes that cover all the same processors as the * root node. Also, we do not descend below found cache nodes. * That is, we count top-level "non-redundant" caches under the root * node. */ nchildren = 0; node = root; while (node != NULL) { if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) { if (node->type == TOPO_TYPE_CACHE && cg_root->cg_level < node->subtype) cg_root->cg_level = node->subtype; if (node->type == TOPO_TYPE_NODE) cg_root->cg_flags |= CG_FLAG_NODE; node = topo_next_node(root, node); continue; } if (node->type != TOPO_TYPE_GROUP && node->type != TOPO_TYPE_NODE && node->type != TOPO_TYPE_CACHE) { node = topo_next_node(root, node); continue; } nchildren++; node = topo_next_nonchild_node(root, node); } /* * We are not interested in nodes including only one CPU each. */ if (nchildren == root->cpu_count) return; cg_root->cg_child = smp_topo_alloc(nchildren); cg_root->cg_children = nchildren; /* * Now find again the same cache nodes as above and recursively * build scheduling topologies for them. */ node = root; i = 0; while (node != NULL) { if ((node->type != TOPO_TYPE_GROUP && node->type != TOPO_TYPE_NODE && node->type != TOPO_TYPE_CACHE) || CPU_CMP(&node->cpuset, &root->cpuset) == 0) { node = topo_next_node(root, node); continue; } cg_root->cg_child[i].cg_parent = cg_root; x86topo_add_sched_group(node, &cg_root->cg_child[i]); i++; node = topo_next_nonchild_node(root, node); } } /* * Build the MI scheduling topology from the discovered hardware topology. */ struct cpu_group * cpu_topo(void) { struct cpu_group *cg_root; if (mp_ncpus <= 1) return (smp_topo_none()); cg_root = smp_topo_alloc(1); x86topo_add_sched_group(&topo_root, cg_root); return (cg_root); } static void cpu_alloc(void *dummy __unused) { /* * Dynamically allocate the arrays that depend on the * maximum APIC ID. */ cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); } SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); /* * Add a logical CPU to the topology. */ void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > max_apic_id) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %u claims to be BSP, but CPU %u already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (bootverbose) printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). * If there were no calls to cpu_add() assume this is a UP system. */ if (mp_ncpus == 0) mp_ncpus = 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); return (mp_ncpus > 1); } /* * AP CPU's call this to initialize themselves. */ void init_secondary_tail(void) { u_int cpuid; pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); /* * On real hardware, switch to x2apic mode if possible. Do it * after aps_ready was signalled, to avoid manipulating the * mode while BSP might still want to send some IPI to us * (second startup IPI is ignored on modern hardware etc). */ lapic_xapic_mode(); /* Initialize the PAT MSR. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX */ initializecpu(); /* set up FPU state on the AP */ #ifdef __amd64__ fpuinit(); #else npxinit(false); #endif if (cpu_ops.cpu_init) cpu_ops.cpu_init(); /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); + schedinit_ap(); mtx_lock_spin(&ap_boot_mtx); mca_init(); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); if (bootverbose) printf("SMP: AP CPU #%d Launched!\n", cpuid); else printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "", cpuid, smp_cpus == mp_ncpus ? "\n" : " "); /* Determine if we are a logical CPU. */ if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } #ifdef __amd64__ if (pmap_pcid_enabled) load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); #endif mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (atomic_load_acq_int(&smp_started) == 0) ia32_pause(); #ifndef EARLY_AP_STARTUP /* Start per-CPU event timers. */ cpu_initclocks_ap(); #endif kcsan_cpu_init(cpuid); /* * Assert that smp_after_idle_runnable condition is reasonable. */ MPASS(PCPU_GET(curpcb) == NULL); sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } static void smp_after_idle_runnable(void *arg __unused) { struct pcpu *pc; int cpu; for (cpu = 1; cpu < mp_ncpus; cpu++) { pc = pcpu_find(cpu); while (atomic_load_ptr(&pc->pc_curpcb) == NULL) cpu_spinwait(); kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages * PAGE_SIZE); } } SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, smp_after_idle_runnable, NULL); /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (cpu_info[apic_id].cpu_hyperthread && !hyperthreading_intr_allowed) continue; intr_add_cpu(i); } } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Init and startup IPI. */ void ipi_startup(int apic_id, int vector) { /* * This attempts to follow the algorithm described in the * Intel Multiprocessor Specification v1.4 in section B.4. * For each IPI, we allow the local APIC ~20us to deliver the * IPI. If that times out, we panic. */ /* * first we do an INIT IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); lapic_ipi_wait(100); /* Explicitly deassert the INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); DELAY(10000); /* wait ~10mS */ /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver first STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver second STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ } static bool ipi_bitmap_set(int cpu, u_int ipi) { u_int bitmap, old, new; u_int *cpu_bitmap; bitmap = 1 << ipi; cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap; old = *cpu_bitmap; for (;;) { if ((old & bitmap) != 0) break; new = old | bitmap; if (atomic_fcmpset_int(cpu_bitmap, &old, new)) break; } return (old != 0); } /* * Send an IPI to specified CPU handling the bitmap logic. */ static void ipi_send_cpu(int cpu, u_int ipi) { KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { if (ipi_bitmap_set(cpu, ipi)) return; ipi = IPI_BITMAP_VECTOR; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; kasan_mark(&frame, sizeof(frame), sizeof(frame), 0); td = curthread; ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]-> pc_ipi_bitmap); /* * sched_preempt() must be called to clear the pending preempt * IPI to enable delivery of further preempts. However, the * critical section will cause extra scheduler lock thrashing * when used unconditionally. Only critical_enter() if * hardclock must also run, which requires the section entry. */ if (ipi_bitmap & (1 << IPI_HARDCLOCK)) critical_enter(); td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; #if defined(STACK) || defined(DDB) if (ipi_bitmap & (1 << IPI_TRACE)) stack_capture_intr(); #endif if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; if (ipi_bitmap & (1 << IPI_HARDCLOCK)) critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); CPU_FOREACH_ISSET(cpu, &cpus) { CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; int cpu, c; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); } CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); if (IPI_IS_BITMAPED(ipi)) { cpu = PCPU_GET(cpuid); CPU_FOREACH(c) { if (c != cpu) ipi_bitmap_set(c, ipi); } ipi = IPI_BITMAP_VECTOR; } lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } void ipi_self_from_nmi(u_int vector) { lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF); /* Wait for IPI to finish. */ if (!lapic_ipi_wait(50000)) { if (KERNEL_PANICKED()) return; else panic("APIC: IPI is stuck"); } } int ipi_nmi_handler(void) { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); cpustop_handler(); return (0); } int nmi_kdb_lock; void nmi_call_kdb_smp(u_int type, struct trapframe *frame) { int cpu; bool call_post; cpu = PCPU_GET(cpuid); if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { nmi_call_kdb(cpu, type, frame); call_post = false; } else { savectx(&stoppcbs[cpu]); CPU_SET_ATOMIC(cpu, &stopped_cpus); while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) ia32_pause(); call_post = true; } atomic_store_rel_int(&nmi_kdb_lock, 0); if (call_post) cpustop_handler_post(cpu); } /* * Handle an IPI_STOP by saving our current context and spinning (or mwaiting, * if available) until we are resumed. */ void cpustop_handler(void) { struct monitorbuf *mb; u_int cpu; bool use_mwait; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 && !mwait_cpustop_broken); if (use_mwait) { mb = PCPU_PTR(monitorbuf); atomic_store_int(&mb->stop_state, MONITOR_STOPSTATE_STOPPED); } /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) { if (use_mwait) { cpu_monitor(mb, 0, 0); if (atomic_load_int(&mb->stop_state) == MONITOR_STOPSTATE_STOPPED) cpu_mwait(0, MWAIT_C1); continue; } ia32_pause(); /* * Halt non-BSP CPUs on panic -- we're never going to need them * again, and might as well save power / release resources * (e.g., overprovisioned VM infrastructure). */ while (__predict_false(!IS_BSP() && KERNEL_PANICKED())) halt(); } cpustop_handler_post(cpu); } static void cpustop_handler_post(u_int cpu) { CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); /* * We don't broadcast TLB invalidations to other CPUs when they are * stopped. Hence, we clear the TLB before resuming. */ invltlb_glob(); #if defined(__amd64__) && (defined(DDB) || defined(GDB)) amd64_db_resume_dbreg(); #endif if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { u_int cpu; mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { #ifdef __amd64__ fpususpend(susppcbs[cpu]->sp_fpususpend); #else npxsuspend(susppcbs[cpu]->sp_fpususpend); #endif /* * suspended_cpus is cleared shortly after each AP is restarted * by a Startup IPI, so that the BSP can proceed to restarting * the next AP. * * resuming_cpus gets cleared when the AP completes * initialization after having been released by the BSP. * resuming_cpus is probably not the best name for the * variable, because it is actually a set of processors that * haven't resumed yet and haven't necessarily started resuming. * * Note that suspended_cpus is meaningful only for ACPI suspend * as it's not really used for Xen suspend since the APs are * automatically restored to the running state and the correct * context. For the same reason resumectx is never called in * that case. */ CPU_SET_ATOMIC(cpu, &suspended_cpus); CPU_SET_ATOMIC(cpu, &resuming_cpus); /* * Invalidate the cache after setting the global status bits. * The last AP to set its bit may end up being an Owner of the * corresponding cache line in MOESI protocol. The AP may be * stopped before the cache line is written to the main memory. */ wbinvd(); } else { #ifdef __amd64__ fpuresume(susppcbs[cpu]->sp_fpususpend); #else npxresume(susppcbs[cpu]->sp_fpususpend); #endif pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); /* Indicate that we have restarted and restored the context. */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); } /* Wait for resume directive */ while (!CPU_ISSET(cpu, &toresume_cpus)) ia32_pause(); /* Re-apply microcode updates. */ ucode_reload(); #ifdef __i386__ /* Finish removing the identity mapping of low memory for this AP. */ invltlb_glob(); #endif if (cpu_ops.cpu_resume) cpu_ops.cpu_resume(); #ifdef __amd64__ if (vmm_resume_p) vmm_resume_p(); #endif /* Resume MCA and local APIC */ lapic_xapic_mode(); mca_resume(); lapic_setup(0); /* Indicate that we are resumed */ CPU_CLR_ATOMIC(cpu, &resuming_cpus); CPU_CLR_ATOMIC(cpu, &suspended_cpus); CPU_CLR_ATOMIC(cpu, &toresume_cpus); } /* * Handle an IPI_SWI by waking delayed SWI thread. */ void ipi_swi_handler(struct trapframe frame) { intr_event_handle(clk_intr_event, &frame); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. */ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); intrcnt_add(buf, &ipi_invlcache_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif