diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index d1e4010d2ebe..49199b6f319f 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1,3209 +1,3211 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002-2007, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This file implements the ULE scheduler. ULE supports independent CPU * run queues and fine grain locking. It has superior interactive * performance under load even on uni-processor systems. * * etymology: * ULE is the last three letters in schedule. It owes its name to a * generic user created for a scheduling system by Paul Mikesell at * Isilon Systems and a general lack of creativity on the part of the author. */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifdef KDTRACE_HOOKS #include int __read_mostly dtrace_vtime_active; dtrace_vtime_switch_func_t dtrace_vtime_switch_func; #endif #include #include #define KTR_ULE 0 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) #define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU))) #define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load")) /* * Thread scheduler specific section. All fields are protected * by the thread lock. */ struct td_sched { struct runq *ts_runq; /* Run-queue we're queued on. */ short ts_flags; /* TSF_* flags. */ int ts_cpu; /* CPU that we have affinity for. */ int ts_rltick; /* Real last tick, for affinity. */ int ts_slice; /* Ticks of slice remaining. */ u_int ts_slptime; /* Number of ticks we vol. slept */ u_int ts_runtime; /* Number of ticks we were running */ int ts_ltick; /* Last tick that we were running on */ int ts_ftick; /* First tick that we were running on */ int ts_ticks; /* Tick count */ #ifdef KTR char ts_name[TS_NAME_LEN]; #endif }; /* flags kept in ts_flags */ #define TSF_BOUND 0x0001 /* Thread can not migrate. */ #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #define THREAD_CAN_SCHED(td, cpu) \ CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= sizeof(struct thread0_storage), "increase struct thread0_storage.t0st_sched size"); /* * Priority ranges used for interactive and non-interactive timeshare * threads. The timeshare priorities are split up into four ranges. * The first range handles interactive threads. The last three ranges * (NHALF, x, and NHALF) handle non-interactive threads with the outer * ranges supporting nice values. */ #define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2) #define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE) #define PRI_MIN_INTERACT PRI_MIN_TIMESHARE #define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1) #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) #define PRI_MAX_BATCH PRI_MAX_TIMESHARE /* * Cpu percentage computation macros and defines. * * SCHED_TICK_SECS: Number of seconds to average the cpu usage across. * SCHED_TICK_TARG: Number of hz ticks to average the cpu usage across. * SCHED_TICK_MAX: Maximum number of ticks before scaling back. * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. * SCHED_TICK_HZ: Compute the number of hz ticks for a given ticks count. * SCHED_TICK_TOTAL: Gives the amount of time we've been recording ticks. */ #define SCHED_TICK_SECS 10 #define SCHED_TICK_TARG (hz * SCHED_TICK_SECS) #define SCHED_TICK_MAX (SCHED_TICK_TARG + hz) #define SCHED_TICK_SHIFT 10 #define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT) #define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz)) /* * These macros determine priorities for non-interactive threads. They are * assigned a priority based on their recent cpu utilization as expressed * by the ratio of ticks to the tick total. NHALF priorities at the start * and end of the MIN to MAX timeshare range are only reachable with negative * or positive nice respectively. * * PRI_RANGE: Priority range for utilization dependent priorities. * PRI_NRESV: Number of nice values. * PRI_TICKS: Compute a priority in PRI_RANGE from the ticks count and total. * PRI_NICE: Determines the part of the priority inherited from nice. */ #define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN) #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) #define SCHED_PRI_MIN (PRI_MIN_BATCH + SCHED_PRI_NHALF) #define SCHED_PRI_MAX (PRI_MAX_BATCH - SCHED_PRI_NHALF) #define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN + 1) #define SCHED_PRI_TICKS(ts) \ (SCHED_TICK_HZ((ts)) / \ (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE)) #define SCHED_PRI_NICE(nice) (nice) /* * These determine the interactivity of a process. Interactivity differs from * cpu utilization in that it expresses the voluntary time slept vs time ran * while cpu utilization includes all time not running. This more accurately * models the intent of the thread. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters determine the slice behavior for batch work. */ #define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */ #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ /* Flags kept in td_flags. */ #define TDF_PICKCPU TDF_SCHED0 /* Thread should pick new CPU. */ #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ /* * tickincr: Converts a stathz tick into a hz domain scaled by * the shift factor. Without the shift the error rate * due to rounding would be unacceptably high. * realstathz: stathz is sometimes 0 and run off of hz. * sched_slice: Runtime of each thread before rescheduling. * preempt_thresh: Priority threshold for preemption and remote IPIs. */ static int __read_mostly sched_interact = SCHED_INTERACT_THRESH; static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT; static int __read_mostly realstathz = 127; /* reset during boot. */ static int __read_mostly sched_slice = 10; /* reset during boot. */ static int __read_mostly sched_slice_min = 1; /* reset during boot. */ #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int __read_mostly preempt_thresh = PRI_MAX_IDLE; #else static int __read_mostly preempt_thresh = PRI_MIN_KERN; #endif #else static int __read_mostly preempt_thresh = 0; #endif static int __read_mostly static_boost = PRI_MIN_BATCH; static int __read_mostly sched_idlespins = 10000; static int __read_mostly sched_idlespinthresh = -1; /* * tdq - per processor runqs and statistics. All fields are protected by the * tdq_lock. The load and lowpri may be accessed without to avoid excess * locking in sched_pickcpu(); */ struct tdq { /* * Ordered to improve efficiency of cpu_search() and switch(). * tdq_lock is padded to avoid false sharing with tdq_load and * tdq_cpu_idle. */ struct mtx_padalign tdq_lock; /* run queue lock. */ struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ volatile int tdq_load; /* Aggregate load. */ volatile int tdq_cpu_idle; /* cpu_idle() is active. */ int tdq_sysload; /* For loadavg, !ITHD load. */ volatile int tdq_transferable; /* Transferable thread count. */ volatile short tdq_switchcnt; /* Switches this tick. */ volatile short tdq_oldswitchcnt; /* Switches last tick. */ u_char tdq_lowpri; /* Lowest priority thread. */ u_char tdq_owepreempt; /* Remote preemption pending. */ u_char tdq_idx; /* Current insert index. */ u_char tdq_ridx; /* Current removal index. */ int tdq_id; /* cpuid. */ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ char tdq_name[TDQ_NAME_LEN]; #ifdef KTR char tdq_loadname[TDQ_LOADNAME_LEN]; #endif } __aligned(64); /* Idle thread states and config. */ #define TDQ_RUNNING 1 #define TDQ_IDLE 2 #ifdef SMP struct cpu_group __read_mostly *cpu_top; /* CPU topology */ #define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) #define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). */ static int __read_mostly affinity; static int __read_mostly steal_idle = 1; static int __read_mostly steal_thresh = 2; static int __read_mostly always_steal = 0; static int __read_mostly trysteal_limit = 2; /* * One thread queue per processor. */ static struct tdq __read_mostly *balance_tdq; static int balance_ticks; DPCPU_DEFINE_STATIC(struct tdq, tdq); DPCPU_DEFINE_STATIC(uint32_t, randomval); #define TDQ_SELF() ((struct tdq *)PCPU_GET(sched)) #define TDQ_CPU(x) (DPCPU_ID_PTR((x), tdq)) #define TDQ_ID(x) ((x)->tdq_id) #else /* !SMP */ static struct tdq tdq_cpu; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) #define TDQ_CPU(x) (&tdq_cpu) #endif #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_TRYLOCK(t) mtx_trylock_spin(TDQ_LOCKPTR((t))) #define TDQ_TRYLOCK_FLAGS(t, f) mtx_trylock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); static int sched_interact_score(struct thread *); static void sched_interact_update(struct thread *); static void sched_interact_fork(struct thread *); static void sched_pctcpu_update(struct td_sched *, int); /* Operations on per processor queues */ static struct thread *tdq_choose(struct tdq *); static void tdq_setup(struct tdq *, int i); static void tdq_load_add(struct tdq *, struct thread *); static void tdq_load_rem(struct tdq *, struct thread *); static __inline void tdq_runq_add(struct tdq *, struct thread *, int); static __inline void tdq_runq_rem(struct tdq *, struct thread *); static inline int sched_shouldpreempt(int, int, int); void tdq_print(int cpu); static void runq_print(struct runq *rq); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP static struct thread *tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); static void tdq_notify(struct tdq *, struct thread *); static struct thread *tdq_steal(struct tdq *, int); static struct thread *runq_steal(struct runq *, int); static int sched_pickcpu(struct thread *, int); static void sched_balance(void); static int sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct thread *, int, int); static inline void thread_unblock_switch(struct thread *, struct mtx *); static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent); #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); static void sched_initticks(void *dummy); SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL); SDT_PROVIDER_DEFINE(sched); SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", "struct proc *", "uint8_t"); SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", "struct proc *", "void *"); SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", "struct proc *", "void *", "int"); SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", "struct proc *", "uint8_t", "struct thread *"); SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", "struct proc *"); SDT_PROBE_DEFINE(sched, , , on__cpu); SDT_PROBE_DEFINE(sched, , , remain__cpu); SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", "struct proc *"); /* * Print the threads waiting on a run-queue. */ static void runq_print(struct runq *rq) { struct rqhead *rqh; struct thread *td; int pri; int j; int i; for (i = 0; i < RQB_LEN; i++) { printf("\t\trunq bits %d 0x%zx\n", i, rq->rq_status.rqb_bits[i]); for (j = 0; j < RQB_BPW; j++) if (rq->rq_status.rqb_bits[i] & (1ul << j)) { pri = j + (i << RQB_L2BPW); rqh = &rq->rq_queues[pri]; TAILQ_FOREACH(td, rqh, td_runq) { printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", td, td->td_name, td->td_priority, td->td_rqindex, pri); } } } } /* * Print the status of a per-cpu thread queue. Should be a ddb show cmd. */ void tdq_print(int cpu) { struct tdq *tdq; tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); printf("\trealtime runq:\n"); runq_print(&tdq->tdq_realtime); printf("\ttimeshare runq:\n"); runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); } static inline int sched_shouldpreempt(int pri, int cpri, int remote) { /* * If the new priority is not better than the current priority there is * nothing to do. */ if (pri >= cpri) return (0); /* * Always preempt idle. */ if (cpri >= PRI_MIN_IDLE) return (1); /* * If preemption is disabled don't preempt others. */ if (preempt_thresh == 0) return (0); /* * Preempt if we exceed the threshold. */ if (pri <= preempt_thresh) return (1); /* * If we're interactive or better and there is non-interactive * or worse running preempt only remote processors. */ if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) return (1); return (0); } /* * Add a thread to the actual run-queue. Keeps transferable counts up to * date with what is actually on the run-queue. Selects the correct * queue position for timeshare threads. */ static __inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { struct td_sched *ts; u_char pri; TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); pri = td->td_priority; ts = td_get_sched(td); TD_SET_RUNQ(td); if (THREAD_CAN_MIGRATE(td)) { tdq->tdq_transferable++; ts->ts_flags |= TSF_XFERABLE; } if (pri < PRI_MIN_BATCH) { ts->ts_runq = &tdq->tdq_realtime; } else if (pri <= PRI_MAX_BATCH) { ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_BATCH && pri >= PRI_MIN_BATCH, ("Invalid priority %d on timeshare runq", pri)); /* * This queue contains only priorities between MIN and MAX * realtime. Use the whole queue to represent these values. */ if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) { pri = RQ_NQS * (pri - PRI_MIN_BATCH) / PRI_BATCH_RANGE; pri = (pri + tdq->tdq_idx) % RQ_NQS; /* * This effectively shortens the queue by one so we * can have a one slot difference between idx and * ridx while we wait for threads to drain. */ if (tdq->tdq_ridx != tdq->tdq_idx && pri == tdq->tdq_ridx) pri = (unsigned char)(pri - 1) % RQ_NQS; } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, td, pri, flags); return; } else ts->ts_runq = &tdq->tdq_idle; runq_add(ts->ts_runq, td, flags); } /* * Remove a thread from a run-queue. This typically happens when a thread * is selected to run. Running threads are not on the queue and the * transferable count does not reflect them. */ static __inline void tdq_runq_rem(struct tdq *tdq, struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", td)); if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, td, NULL); } else runq_remove(ts->ts_runq, td); } /* * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load * for this thread to the referenced thread queue. */ static void tdq_load_add(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); tdq->tdq_load++; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload++; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Remove the load from a thread that is transitioning to a sleep state or * exiting. */ static void tdq_load_rem(struct tdq *tdq, struct thread *td) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; if ((td->td_flags & TDF_NOLOAD) == 0) tdq->tdq_sysload--; KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); } /* * Bound timeshare latency by decreasing slice size as load increases. We * consider the maximum latency as the sum of the threads waiting to run * aside from curthread and target no more than sched_slice latency but * no less than sched_slice_min runtime. */ static inline int tdq_slice(struct tdq *tdq) { int load; /* * It is safe to use sys_load here because this is called from * contexts where timeshare threads are running and so there * cannot be higher priority load in the system. */ load = tdq->tdq_sysload - 1; if (load >= SCHED_SLICE_MIN_DIVISOR) return (sched_slice_min); if (load <= 1) return (sched_slice); return (sched_slice / load); } /* * Set lowpri to its exact value by searching the run-queue and * evaluating curthread. curthread may be passed as an optimization. */ static void tdq_setlowpri(struct tdq *tdq, struct thread *ctd) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (ctd == NULL) ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; td = tdq_choose(tdq); if (td == NULL || td->td_priority > ctd->td_priority) tdq->tdq_lowpri = ctd->td_priority; else tdq->tdq_lowpri = td->td_priority; } #ifdef SMP /* * We need some randomness. Implement a classic Linear Congruential * Generator X_{n+1}=(aX_n+c) mod m. These values are optimized for * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits * of the random state (in the low bits of our answer) to keep * the maximum randomness. */ static uint32_t sched_random(void) { uint32_t *rndptr; rndptr = DPCPU_PTR(randomval); *rndptr = *rndptr * 69069 + 5; return (*rndptr >> 16); } struct cpu_search { cpuset_t *cs_mask; /* The mask of allowed CPUs to choose from. */ int cs_prefer; /* Prefer this CPU and groups including it. */ int cs_running; /* The thread is now running at cs_prefer. */ int cs_pri; /* Min priority for low. */ int cs_limit; /* Max load for low, min load for high. */ }; struct cpu_search_res { int cs_cpu; /* The best CPU found. */ int cs_load; /* The load of cs_cpu. */ }; /* * Search the tree of cpu_groups for the lowest or highest loaded CPU. * These routines actually compare the load on all paths through the tree * and find the least loaded cpu on the least loaded path, which may differ * from the least loaded cpu in the system. This balances work among caches * and buses. */ static int cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, struct cpu_search_res *r) { struct cpu_search_res lr; struct tdq *tdq; int c, bload, l, load, p, total; total = 0; bload = INT_MAX; r->cs_cpu = -1; /* Loop through children CPU groups if there are any. */ if (cg->cg_children > 0) { for (c = cg->cg_children - 1; c >= 0; c--) { load = cpu_search_lowest(&cg->cg_child[c], s, &lr); total += load; /* * When balancing do not prefer SMT groups with load >1. * It allows round-robin between SMT groups with equal * load within parent group for more fair scheduling. */ if (__predict_false(s->cs_running) && (cg->cg_child[c].cg_flags & CG_FLAG_THREAD) && load >= 128 && (load & 128) != 0) load += 128; if (lr.cs_cpu >= 0 && (load < bload || (load == bload && lr.cs_load < r->cs_load))) { bload = load; r->cs_cpu = lr.cs_cpu; r->cs_load = lr.cs_load; } } return (total); } /* Loop through children CPUs otherwise. */ for (c = cg->cg_last; c >= cg->cg_first; c--) { if (!CPU_ISSET(c, &cg->cg_mask)) continue; tdq = TDQ_CPU(c); l = tdq->tdq_load; if (c == s->cs_prefer) { if (__predict_false(s->cs_running)) l--; p = 128; } else p = 0; load = l * 256; total += load - p; /* * Check this CPU is acceptable. * If the threads is already on the CPU, don't look on the TDQ * priority, since it can be the priority of the thread itself. */ if (l > s->cs_limit || (tdq->tdq_lowpri <= s->cs_pri && (!s->cs_running || c != s->cs_prefer)) || !CPU_ISSET(c, s->cs_mask)) continue; /* * When balancing do not prefer CPUs with load > 1. * It allows round-robin between CPUs with equal load * within the CPU group for more fair scheduling. */ if (__predict_false(s->cs_running) && l > 0) p = 0; load -= sched_random() % 128; if (bload > load - p) { bload = load - p; r->cs_cpu = c; r->cs_load = load; } } return (total); } static int cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s, struct cpu_search_res *r) { struct cpu_search_res lr; struct tdq *tdq; int c, bload, l, load, total; total = 0; bload = INT_MIN; r->cs_cpu = -1; /* Loop through children CPU groups if there are any. */ if (cg->cg_children > 0) { for (c = cg->cg_children - 1; c >= 0; c--) { load = cpu_search_highest(&cg->cg_child[c], s, &lr); total += load; if (lr.cs_cpu >= 0 && (load > bload || (load == bload && lr.cs_load > r->cs_load))) { bload = load; r->cs_cpu = lr.cs_cpu; r->cs_load = lr.cs_load; } } return (total); } /* Loop through children CPUs otherwise. */ for (c = cg->cg_last; c >= cg->cg_first; c--) { if (!CPU_ISSET(c, &cg->cg_mask)) continue; tdq = TDQ_CPU(c); l = tdq->tdq_load; load = l * 256; total += load; /* * Check this CPU is acceptable. * If requested minimum load is 1, then caller must know how * to handle running threads, not counted in tdq_transferable. */ if (l < s->cs_limit || (tdq->tdq_transferable == 0 && (s->cs_limit > 1 || l > 1)) || !CPU_ISSET(c, s->cs_mask)) continue; load -= sched_random() % 256; if (load > bload) { bload = load; r->cs_cpu = c; } } r->cs_load = bload; return (total); } /* * Find the cpu with the least load via the least loaded path that has a * lowpri greater than pri pri. A pri of -1 indicates any priority is * acceptable. */ static inline int sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload, int prefer, int running) { struct cpu_search s; struct cpu_search_res r; s.cs_prefer = prefer; s.cs_running = running; s.cs_mask = mask; s.cs_pri = pri; s.cs_limit = maxload; cpu_search_lowest(cg, &s, &r); return (r.cs_cpu); } /* * Find the cpu with the highest load via the highest loaded path. */ static inline int sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload) { struct cpu_search s; struct cpu_search_res r; s.cs_mask = mask; s.cs_limit = minload; cpu_search_highest(cg, &s, &r); return (r.cs_cpu); } static void sched_balance_group(struct cpu_group *cg) { struct tdq *tdq; struct thread *td; cpuset_t hmask, lmask; int high, low, anylow; CPU_FILL(&hmask); for (;;) { high = sched_highest(cg, &hmask, 1); /* Stop if there is no more CPU with transferrable threads. */ if (high == -1) break; CPU_CLR(high, &hmask); CPU_COPY(&hmask, &lmask); /* Stop if there is no more CPU left for low. */ if (CPU_EMPTY(&lmask)) break; tdq = TDQ_CPU(high); if (tdq->tdq_load == 1) { /* * There is only one running thread. We can't move * it from here, so tell it to pick new CPU by itself. */ TDQ_LOCK(tdq); td = pcpu_find(high)->pc_curthread; if ((td->td_flags & TDF_IDLETD) == 0 && THREAD_CAN_MIGRATE(td)) { td->td_flags |= TDF_NEEDRESCHED | TDF_PICKCPU; if (high != curcpu) ipi_cpu(high, IPI_AST); } TDQ_UNLOCK(tdq); break; } anylow = 1; nextlow: if (tdq->tdq_transferable == 0) continue; low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high, 1); /* Stop if we looked well and found no less loaded CPU. */ if (anylow && low == -1) break; /* Go to next high if we found no less loaded CPU. */ if (low == -1) continue; /* Transfer thread from high to low. */ if (sched_balance_pair(tdq, TDQ_CPU(low))) { /* CPU that got thread can no longer be a donor. */ CPU_CLR(low, &hmask); } else { /* * If failed, then there is no threads on high * that can run on this low. Drop low from low * mask and look for different one. */ CPU_CLR(low, &lmask); anylow = 0; goto nextlow; } } } static void sched_balance(void) { struct tdq *tdq; balance_ticks = max(balance_interval / 2, 1) + (sched_random() % balance_interval); tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* * Lock two thread queues using their address to maintain lock order. */ static void tdq_lock_pair(struct tdq *one, struct tdq *two) { if (one < two) { TDQ_LOCK(one); TDQ_LOCK_FLAGS(two, MTX_DUPOK); } else { TDQ_LOCK(two); TDQ_LOCK_FLAGS(one, MTX_DUPOK); } } /* * Unlock two thread queues. Order is not important here. */ static void tdq_unlock_pair(struct tdq *one, struct tdq *two) { TDQ_UNLOCK(one); TDQ_UNLOCK(two); } /* * Transfer load between two imbalanced thread queues. */ static int sched_balance_pair(struct tdq *high, struct tdq *low) { struct thread *td; int cpu; tdq_lock_pair(high, low); td = NULL; /* * Transfer a thread from high to low. */ if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load && (td = tdq_move(high, low)) != NULL) { /* * In case the target isn't the current cpu notify it of the * new load, possibly sending an IPI to force it to reschedule. */ cpu = TDQ_ID(low); if (cpu != PCPU_GET(cpuid)) tdq_notify(low, td); } tdq_unlock_pair(high, low); return (td != NULL); } /* * Move a thread from one thread queue to another. */ static struct thread * tdq_move(struct tdq *from, struct tdq *to) { struct thread *td; struct tdq *tdq; int cpu; TDQ_LOCK_ASSERT(from, MA_OWNED); TDQ_LOCK_ASSERT(to, MA_OWNED); tdq = from; cpu = TDQ_ID(to); td = tdq_steal(tdq, cpu); if (td == NULL) return (NULL); /* * Although the run queue is locked the thread may be * blocked. We can not set the lock until it is unblocked. */ thread_lock_block_wait(td); sched_rem(td); THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(from)); td->td_lock = TDQ_LOCKPTR(to); td_get_sched(td)->ts_cpu = cpu; tdq_add(to, td, SRQ_YIELDING); return (td); } /* * This tdq has idled. Try to steal a thread from another cpu and switch * to it. */ static int tdq_idled(struct tdq *tdq) { struct cpu_group *cg, *parent; struct tdq *steal; cpuset_t mask; int cpu, switchcnt, goup; if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL) return (1); CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); restart: switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; for (cg = tdq->tdq_cg, goup = 0; ; ) { cpu = sched_highest(cg, &mask, steal_thresh); /* * We were assigned a thread but not preempted. Returning * 0 here will cause our caller to switch to it. */ if (tdq->tdq_load) return (0); /* * We found no CPU to steal from in this group. Escalate to * the parent and repeat. But if parent has only two children * groups we can avoid searching this group again by searching * the other one specifically and then escalating two levels. */ if (cpu == -1) { if (goup) { cg = cg->cg_parent; goup = 0; } parent = cg->cg_parent; if (parent == NULL) return (1); if (parent->cg_children == 2) { if (cg == &parent->cg_child[0]) cg = &parent->cg_child[1]; else cg = &parent->cg_child[0]; goup = 1; } else cg = parent; continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. * * Testing this ahead of tdq_lock_pair() only catches * this situation about 20% of the time on an 8 core * 16 thread Ryzen 7, but it still helps performance. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0) goto restart; /* * Try to lock both queues. If we are assigned a thread while * waited for the lock, switch to it now instead of stealing. * If we can't get the lock, then somebody likely got there * first so continue searching. */ TDQ_LOCK(tdq); if (tdq->tdq_load > 0) { mi_switch(SW_VOL | SWT_IDLE); return (0); } if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) { TDQ_UNLOCK(tdq); CPU_CLR(cpu, &mask); continue; } /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread, or * we were preempted and the CPU loading info may be out * of date. The latter is rare. In either case restart * the search. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0 || switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) { tdq_unlock_pair(tdq, steal); goto restart; } /* * Steal the thread and switch to it. */ if (tdq_move(steal, tdq) != NULL) break; /* * We failed to acquire a thread even though it looked * like one was available. This could be due to affinity * restrictions or for other reasons. Loop again after * removing this CPU from the set. The restart logic * above does not restore this CPU to the set due to the * likelyhood of failing here again. */ CPU_CLR(cpu, &mask); tdq_unlock_pair(tdq, steal); } TDQ_UNLOCK(steal); mi_switch(SW_VOL | SWT_IDLE); return (0); } /* * Notify a remote cpu of new work. Sends an IPI if criteria are met. */ static void tdq_notify(struct tdq *tdq, struct thread *td) { struct thread *ctd; int pri; int cpu; if (tdq->tdq_owepreempt) return; cpu = td_get_sched(td)->ts_cpu; pri = td->td_priority; ctd = pcpu_find(cpu)->pc_curthread; if (!sched_shouldpreempt(pri, ctd->td_priority, 1)) return; /* * Make sure that our caller's earlier update to tdq_load is * globally visible before we read tdq_cpu_idle. Idle thread * accesses both of them without locks, and the order is important. */ atomic_thread_fence_seq_cst(); if (TD_IS_IDLETHREAD(ctd)) { /* * If the MD code has an idle wakeup routine try that before * falling back to IPI. */ if (!tdq->tdq_cpu_idle || cpu_idle_wakeup(cpu)) return; } /* * The run queues have been updated, so any switch on the remote CPU * will satisfy the preemption request. */ tdq->tdq_owepreempt = 1; ipi_cpu(cpu, IPI_PREEMPT); } /* * Steals load from a timeshare queue. Honors the rotating queue head * index. */ static struct thread * runq_steal_from(struct runq *rq, int cpu, u_char start) { struct rqbits *rqb; struct rqhead *rqh; struct thread *td, *first; int bit; int i; rqb = &rq->rq_status; bit = start & (RQB_BPW -1); first = NULL; again: for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) { if (rqb->rqb_bits[i] == 0) continue; if (bit == 0) bit = RQB_FFS(rqb->rqb_bits[i]); for (; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[i] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (i << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) { if (first) { if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } else first = td; } } } if (start != 0) { start = 0; goto again; } if (first && THREAD_CAN_MIGRATE(first) && THREAD_CAN_SCHED(first, cpu)) return (first); return (NULL); } /* * Steals load from a standard linear queue. */ static struct thread * runq_steal(struct runq *rq, int cpu) { struct rqhead *rqh; struct rqbits *rqb; struct thread *td; int word; int bit; rqb = &rq->rq_status; for (word = 0; word < RQB_LEN; word++) { if (rqb->rqb_bits[word] == 0) continue; for (bit = 0; bit < RQB_BPW; bit++) { if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) continue; rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; TAILQ_FOREACH(td, rqh, td_runq) if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, cpu)) return (td); } } return (NULL); } /* * Attempt to steal a thread in priority order from a thread queue. */ static struct thread * tdq_steal(struct tdq *tdq, int cpu) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); if ((td = runq_steal(&tdq->tdq_realtime, cpu)) != NULL) return (td); if ((td = runq_steal_from(&tdq->tdq_timeshare, cpu, tdq->tdq_ridx)) != NULL) return (td); return (runq_steal(&tdq->tdq_idle, cpu)); } /* * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the * current lock and returns with the assigned queue locked. */ static inline struct tdq * sched_setcpu(struct thread *td, int cpu, int flags) { struct tdq *tdq; struct mtx *mtx; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_CPU(cpu); td_get_sched(td)->ts_cpu = cpu; /* * If the lock matches just return the queue. */ if (td->td_lock == TDQ_LOCKPTR(tdq)) { KASSERT((flags & SRQ_HOLD) == 0, ("sched_setcpu: Invalid lock for SRQ_HOLD")); return (tdq); } /* * The hard case, migration, we need to block the thread first to * prevent order reversals with other cpus locks. */ spinlock_enter(); mtx = thread_lock_block(td); if ((flags & SRQ_HOLD) == 0) mtx_unlock_spin(mtx); TDQ_LOCK(tdq); thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); spinlock_exit(); return (tdq); } SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); static int sched_pickcpu(struct thread *td, int flags) { struct cpu_group *cg, *ccg; struct td_sched *ts; struct tdq *tdq; cpuset_t *mask; int cpu, pri, r, self, intr; self = PCPU_GET(cpuid); ts = td_get_sched(td); KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on " "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name)); if (smp_started == 0) return (self); /* * Don't migrate a running thread from sched_switch(). */ if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) return (ts->ts_cpu); /* * Prefer to run interrupt threads on the processors that generate * the interrupt. */ if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && curthread->td_intr_nesting_level) { tdq = TDQ_SELF(); if (tdq->tdq_lowpri >= PRI_MIN_IDLE) { SCHED_STAT_INC(pickcpu_idle_affinity); return (self); } ts->ts_cpu = self; intr = 1; cg = tdq->tdq_cg; goto llc; } else { intr = 0; tdq = TDQ_CPU(ts->ts_cpu); cg = tdq->tdq_cg; } /* * If the thread can run on the last cpu and the affinity has not * expired and it is idle, run it there. */ if (THREAD_CAN_SCHED(td, ts->ts_cpu) && tdq->tdq_lowpri >= PRI_MIN_IDLE && SCHED_AFFINITY(ts, CG_SHARE_L2)) { if (cg->cg_flags & CG_FLAG_THREAD) { /* Check all SMT threads for being idle. */ for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) { if (CPU_ISSET(cpu, &cg->cg_mask) && TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) break; } if (cpu > cg->cg_last) { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } else { SCHED_STAT_INC(pickcpu_idle_affinity); return (ts->ts_cpu); } } llc: /* * Search for the last level cache CPU group in the tree. * Skip SMT, identical groups and caches with expired affinity. * Interrupt threads affinity is explicit and never expires. */ for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { if (cg->cg_flags & CG_FLAG_THREAD) continue; if (cg->cg_children == 1 || cg->cg_count == 1) continue; if (cg->cg_level == CG_SHARE_NONE || (!intr && !SCHED_AFFINITY(ts, cg->cg_level))) continue; ccg = cg; } /* Found LLC shared by all CPUs, so do a global search. */ if (ccg == cpu_top) ccg = NULL; cpu = -1; mask = &td->td_cpuset->cs_mask; pri = td->td_priority; r = TD_IS_RUNNING(td); /* * Try hard to keep interrupts within found LLC. Search the LLC for * the least loaded CPU we can run now. For NUMA systems it should * be within target domain, and it also reduces scheduling overhead. */ if (ccg != NULL && intr) { cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_intrbind); } else /* Search the LLC for the least loaded idle CPU we can run now. */ if (ccg != NULL) { cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE), INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_affinity); } /* Search globally for the least loaded CPU we can run now. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } /* Search globally for the least loaded CPU. */ if (cpu < 0) { cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r); if (cpu >= 0) SCHED_STAT_INC(pickcpu_lowest); } KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu.")); KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); /* * Compare the lowest loaded cpu to current cpu. */ tdq = TDQ_CPU(cpu); if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri && tdq->tdq_lowpri < PRI_MIN_IDLE && TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) { SCHED_STAT_INC(pickcpu_local); cpu = self; } if (cpu != ts->ts_cpu) SCHED_STAT_INC(pickcpu_migration); return (cpu); } #endif /* * Pick the highest priority task we have and return it. */ static struct thread * tdq_choose(struct tdq *tdq) { struct thread *td; TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = runq_choose(&tdq->tdq_realtime); if (td != NULL) return (td); td = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_BATCH, ("tdq_choose: Invalid priority on timeshare queue %d", td->td_priority)); return (td); } td = runq_choose(&tdq->tdq_idle); if (td != NULL) { KASSERT(td->td_priority >= PRI_MIN_IDLE, ("tdq_choose: Invalid priority on idle queue %d", td->td_priority)); return (td); } return (NULL); } /* * Initialize a thread queue. */ static void tdq_setup(struct tdq *tdq, int id) { if (bootverbose) printf("ULE: setup cpu %d\n", id); runq_init(&tdq->tdq_realtime); runq_init(&tdq->tdq_timeshare); runq_init(&tdq->tdq_idle); tdq->tdq_id = id; snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), "sched lock %d", (int)TDQ_ID(tdq)); mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN); #ifdef KTR snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname), "CPU %d load", (int)TDQ_ID(tdq)); #endif } #ifdef SMP static void sched_setup_smp(void) { struct tdq *tdq; int i; cpu_top = smp_topo(); CPU_FOREACH(i) { tdq = DPCPU_ID_PTR(i, tdq); tdq_setup(tdq, i); tdq->tdq_cg = smp_topo_find(cpu_top, i); if (tdq->tdq_cg == NULL) panic("Can't find cpu group for %d\n", i); DPCPU_ID_SET(i, randomval, i * 69069 + 5); } PCPU_SET(sched, DPCPU_PTR(tdq)); balance_tdq = TDQ_SELF(); } #endif /* * Setup the thread queues and initialize the topology based on MD * information. */ static void sched_setup(void *dummy) { struct tdq *tdq; #ifdef SMP sched_setup_smp(); #else tdq_setup(TDQ_SELF(), 0); #endif tdq = TDQ_SELF(); /* Add thread0's load since it's running. */ TDQ_LOCK(tdq); thread0.td_lock = TDQ_LOCKPTR(tdq); tdq_load_add(tdq, &thread0); tdq->tdq_lowpri = thread0.td_priority; TDQ_UNLOCK(tdq); } /* * This routine determines time constants after stathz and hz are setup. */ /* ARGSUSED */ static void sched_initticks(void *dummy) { int incr; realstathz = stathz ? stathz : hz; sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR; sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); /* * tickincr is shifted out by 10 to avoid rounding errors due to * hz not being evenly divisible by stathz on all platforms. */ incr = (hz << SCHED_TICK_SHIFT) / realstathz; /* * This does not work for values of stathz that are more than * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. */ if (incr == 0) incr = 1; tickincr = incr; #ifdef SMP /* * Set the default balance interval now that we know * what realstathz is. */ balance_interval = realstathz; balance_ticks = balance_interval; affinity = SCHED_AFFINITY_DEFAULT; #endif if (sched_idlespinthresh < 0) sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz; } /* * This is the core of the interactivity algorithm. Determines a score based * on past behavior. It is the ratio of sleep time to run time scaled to * a [0, 100] integer. This is the voluntary sleep time of a process, which * differs from the cpu usage because it does not account for time spent * waiting on a run-queue. Would be prettier if we had floating point. * * When a thread's sleep time is greater than its run time the * calculation is: * * scaling factor * interactivity score = --------------------- * sleep time / run time * * * When a thread's run time is greater than its sleep time the * calculation is: * * scaling factor * interactivity score = 2 * scaling factor - --------------------- * run time / sleep time */ static int sched_interact_score(struct thread *td) { struct td_sched *ts; int div; ts = td_get_sched(td); /* * The score is only needed if this is likely to be an interactive * task. Don't go through the expense of computing it if there's * no chance. */ if (sched_interact <= SCHED_INTERACT_HALF && ts->ts_runtime >= ts->ts_slptime) return (SCHED_INTERACT_HALF); if (ts->ts_runtime > ts->ts_slptime) { div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); } if (ts->ts_slptime > ts->ts_runtime) { div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); return (ts->ts_runtime / div); } /* runtime == slptime */ if (ts->ts_runtime) return (SCHED_INTERACT_HALF); /* * This can happen if slptime and runtime are 0. */ return (0); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct thread *td) { int score; int pri; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; /* * If the score is interactive we place the thread in the realtime * queue with a priority that is less than kernel and interrupt * priorities. These threads are not subject to nice restrictions. * * Scores greater than this are placed on the normal timeshare queue * where the priority is partially decided by the most recent cpu * utilization and the rest is decided by nice value. * * The nice value of the process has a linear effect on the calculated * score. Negative nice values make it easier for a thread to be * considered interactive. */ score = imax(0, sched_interact_score(td) + td->td_proc->p_nice); if (score < sched_interact) { pri = PRI_MIN_INTERACT; pri += ((PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) / sched_interact) * score; KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, ("sched_priority: invalid interactive priority %d score %d", pri, score)); } else { pri = SCHED_PRI_MIN; if (td_get_sched(td)->ts_ticks) pri += min(SCHED_PRI_TICKS(td_get_sched(td)), SCHED_PRI_RANGE - 1); pri += SCHED_PRI_NICE(td->td_proc->p_nice); KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, ("sched_priority: invalid priority %d: nice %d, " "ticks %d ftick %d ltick %d tick pri %d", pri, td->td_proc->p_nice, td_get_sched(td)->ts_ticks, td_get_sched(td)->ts_ftick, td_get_sched(td)->ts_ltick, SCHED_PRI_TICKS(td_get_sched(td)))); } sched_user_prio(td, pri); return; } /* * This routine enforces a maximum limit on the amount of scheduling history * kept. It is called after either the slptime or runtime is adjusted. This * function is ugly due to integer math. */ static void sched_interact_update(struct thread *td) { struct td_sched *ts; u_int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum < SCHED_SLP_RUN_MAX) return; /* * This only happens from two places: * 1) We have added an unusual amount of run time from fork_exit. * 2) We have added an unusual amount of sleep time from sched_sleep(). */ if (sum > SCHED_SLP_RUN_MAX * 2) { if (ts->ts_runtime > ts->ts_slptime) { ts->ts_runtime = SCHED_SLP_RUN_MAX; ts->ts_slptime = 1; } else { ts->ts_slptime = SCHED_SLP_RUN_MAX; ts->ts_runtime = 1; } return; } /* * If we have exceeded by more than 1/5th then the algorithm below * will not bring us back into range. Dividing by two here forces * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] */ if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { ts->ts_runtime /= 2; ts->ts_slptime /= 2; return; } ts->ts_runtime = (ts->ts_runtime / 5) * 4; ts->ts_slptime = (ts->ts_slptime / 5) * 4; } /* * Scale back the interactivity history when a child thread is created. The * history is inherited from the parent but the thread may behave totally * differently. For example, a shell spawning a compiler process. We want * to learn that the compiler is behaving badly very quickly. */ static void sched_interact_fork(struct thread *td) { struct td_sched *ts; int ratio; int sum; ts = td_get_sched(td); sum = ts->ts_runtime + ts->ts_slptime; if (sum > SCHED_SLP_RUN_FORK) { ratio = sum / SCHED_SLP_RUN_FORK; ts->ts_runtime /= ratio; ts->ts_slptime /= ratio; } } /* * Called from proc0_init() to setup the scheduler fields. */ void schedinit(void) { struct td_sched *ts0; /* * Set up the scheduler specific parts of thread0. */ ts0 = td_get_sched(&thread0); ts0->ts_ltick = ticks; ts0->ts_ftick = ticks; ts0->ts_slice = 0; ts0->ts_cpu = curcpu; /* set valid CPU number */ } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most sched_slice stathz ticks. */ int sched_rr_interval(void) { /* Convert sched_slice from stathz to hz. */ return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); } /* * Update the percent cpu tracking information when it is requested or * the total history exceeds the maximum. We keep a sliding history of * tick counts that slowly decays. This is less precise than the 4BSD * mechanism since it happens with less regular and frequent events. */ static void sched_pctcpu_update(struct td_sched *ts, int run) { int t = ticks; /* * The signed difference may be negative if the thread hasn't run for * over half of the ticks rollover period. */ if ((u_int)(t - ts->ts_ltick) >= SCHED_TICK_TARG) { ts->ts_ticks = 0; ts->ts_ftick = t - SCHED_TICK_TARG; } else if (t - ts->ts_ftick >= SCHED_TICK_MAX) { ts->ts_ticks = (ts->ts_ticks / (ts->ts_ltick - ts->ts_ftick)) * (ts->ts_ltick - (t - SCHED_TICK_TARG)); ts->ts_ftick = t - SCHED_TICK_TARG; } if (run) ts->ts_ticks += (t - ts->ts_ltick) << SCHED_TICK_SHIFT; ts->ts_ltick = t; } /* * Adjust the priority of a thread. Move it to the appropriate run-queue * if necessary. This is the back-end for several priority related * functions. */ static void sched_thread_priority(struct thread *td, u_char prio) { struct td_sched *ts; struct tdq *tdq; int oldpri; KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(curthread)); SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); if (td != curthread && prio < td->td_priority) { KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), "lend prio", "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, curthread); } ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; /* * If the priority has been elevated due to priority * propagation, we may have to move ourselves to a new * queue. This could be optimized to not re-add in some * cases. */ if (TD_ON_RUNQ(td) && prio < td->td_priority) { sched_rem(td); td->td_priority = prio; sched_add(td, SRQ_BORROWING | SRQ_HOLDTD); return; } /* * If the thread is currently running we may have to adjust the lowpri * information so other cpus are aware of our current priority. */ if (TD_IS_RUNNING(td)) { tdq = TDQ_CPU(ts->ts_cpu); oldpri = td->td_priority; td->td_priority = prio; if (prio < tdq->tdq_lowpri) tdq->tdq_lowpri = prio; else if (tdq->tdq_lowpri == oldpri) tdq_setlowpri(tdq, td); return; } td->td_priority = prio; } /* * Update a thread's priority when it is lent another thread's * priority. */ void sched_lend_prio(struct thread *td, u_char prio) { td->td_flags |= TDF_BORROWING; sched_thread_priority(td, prio); } /* * Restore a thread's priority when priority propagation is * over. The prio argument is the minimum priority the thread * needs to have to satisfy other possible priority lending * requests. If the thread's regular priority is less * important than prio, the thread will keep a priority boost * of prio. */ void sched_unlend_prio(struct thread *td, u_char prio) { u_char base_pri; if (td->td_base_pri >= PRI_MIN_TIMESHARE && td->td_base_pri <= PRI_MAX_TIMESHARE) base_pri = td->td_user_pri; else base_pri = td->td_base_pri; if (prio >= base_pri) { td->td_flags &= ~TDF_BORROWING; sched_thread_priority(td, base_pri); } else sched_lend_prio(td, prio); } /* * Standard entry for setting the priority to an absolute value. */ void sched_prio(struct thread *td, u_char prio) { u_char oldprio; /* First, update the base priority. */ td->td_base_pri = prio; /* * If the thread is borrowing another thread's priority, don't * ever lower the priority. */ if (td->td_flags & TDF_BORROWING && td->td_priority < prio) return; /* Change the real priority. */ oldprio = td->td_priority; sched_thread_priority(td, prio); /* * If the thread is on a turnstile, then let the turnstile update * its state. */ if (TD_ON_LOCK(td) && oldprio != prio) turnstile_adjust(td, oldprio); } /* * Set the base user priority, does not effect current running priority. */ void sched_user_prio(struct thread *td, u_char prio) { td->td_base_user_pri = prio; if (td->td_lend_user_pri <= prio) return; td->td_user_pri = prio; } void sched_lend_user_prio(struct thread *td, u_char prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_lend_user_pri = prio; td->td_user_pri = min(prio, td->td_base_user_pri); if (td->td_priority > td->td_user_pri) sched_prio(td, td->td_user_pri); else if (td->td_priority != td->td_user_pri) td->td_flags |= TDF_NEEDRESCHED; } /* * Like the above but first check if there is anything to do. */ void sched_lend_user_prio_cond(struct thread *td, u_char prio) { if (td->td_lend_user_pri != prio) goto lend; if (td->td_user_pri != min(prio, td->td_base_user_pri)) goto lend; if (td->td_priority != td->td_user_pri) goto lend; return; lend: thread_lock(td); sched_lend_user_prio(td, prio); thread_unlock(td); } #ifdef SMP /* * This tdq is about to idle. Try to steal a thread from another CPU before * choosing the idle thread. */ static void tdq_trysteal(struct tdq *tdq) { struct cpu_group *cg, *parent; struct tdq *steal; cpuset_t mask; int cpu, i, goup; if (smp_started == 0 || trysteal_limit == 0 || tdq->tdq_cg == NULL) return; CPU_FILL(&mask); CPU_CLR(PCPU_GET(cpuid), &mask); /* We don't want to be preempted while we're iterating. */ spinlock_enter(); TDQ_UNLOCK(tdq); for (i = 1, cg = tdq->tdq_cg, goup = 0; ; ) { cpu = sched_highest(cg, &mask, steal_thresh); /* * If a thread was added while interrupts were disabled don't * steal one here. */ if (tdq->tdq_load > 0) { TDQ_LOCK(tdq); break; } /* * We found no CPU to steal from in this group. Escalate to * the parent and repeat. But if parent has only two children * groups we can avoid searching this group again by searching * the other one specifically and then escalating two levels. */ if (cpu == -1) { if (goup) { cg = cg->cg_parent; goup = 0; } if (++i > trysteal_limit) { TDQ_LOCK(tdq); break; } parent = cg->cg_parent; if (parent == NULL) { TDQ_LOCK(tdq); break; } if (parent->cg_children == 2) { if (cg == &parent->cg_child[0]) cg = &parent->cg_child[1]; else cg = &parent->cg_child[0]; goup = 1; } else cg = parent; continue; } steal = TDQ_CPU(cpu); /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0) continue; /* * Try to lock both queues. If we are assigned a thread while * waited for the lock, switch to it now instead of stealing. * If we can't get the lock, then somebody likely got there * first. At this point unconditonally exit the loop to * bound the time spent in the critcal section. */ TDQ_LOCK(tdq); if (tdq->tdq_load > 0) break; if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) break; /* * The data returned by sched_highest() is stale and * the chosen CPU no longer has an eligible thread. */ if (steal->tdq_load < steal_thresh || steal->tdq_transferable == 0) { TDQ_UNLOCK(steal); break; } /* * If we fail to acquire one due to affinity restrictions, * bail out and let the idle thread to a more complete search * outside of a critical section. */ if (tdq_move(steal, tdq) == NULL) { TDQ_UNLOCK(steal); break; } TDQ_UNLOCK(steal); break; } spinlock_exit(); } #endif /* * Handle migration from sched_switch(). This happens only for * cpu binding. */ static struct mtx * sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) { struct tdq *tdn; KASSERT(THREAD_CAN_MIGRATE(td) || (td_get_sched(td)->ts_flags & TSF_BOUND) != 0, ("Thread %p shouldn't migrate", td)); KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: " "thread %s queued on absent CPU %d.", td->td_name, td_get_sched(td)->ts_cpu)); tdn = TDQ_CPU(td_get_sched(td)->ts_cpu); #ifdef SMP tdq_load_rem(tdq, td); /* * Do the lock dance required to avoid LOR. We have an * extra spinlock nesting from sched_switch() which will * prevent preemption while we're holding neither run-queue lock. */ TDQ_UNLOCK(tdq); TDQ_LOCK(tdn); tdq_add(tdn, td, flags); tdq_notify(tdn, td); TDQ_UNLOCK(tdn); TDQ_LOCK(tdq); #endif return (TDQ_LOCKPTR(tdn)); } /* * thread_lock_unblock() that does not assume td_lock is blocked. */ static inline void thread_unblock_switch(struct thread *td, struct mtx *mtx) { atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, (uintptr_t)mtx); } /* * Switch threads. This function has to handle threads coming in while * blocked for some reason, running, or idle. It also must deal with * migrating a thread from one queue to another as running threads may * be assigned elsewhere via binding. */ void sched_switch(struct thread *td, int flags) { struct thread *newtd; struct tdq *tdq; struct td_sched *ts; struct mtx *mtx; int srqflag; int cpuid, preempted; #ifdef SMP int pickcpu; #endif THREAD_LOCK_ASSERT(td, MA_OWNED); cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); ts = td_get_sched(td); sched_pctcpu_update(ts, 1); #ifdef SMP pickcpu = (td->td_flags & TDF_PICKCPU) != 0; if (pickcpu) ts->ts_rltick = ticks - affinity * MAX_CACHE_LEVELS; else ts->ts_rltick = ticks; #endif td->td_lastcpu = td->td_oncpu; preempted = (td->td_flags & TDF_SLICEEND) == 0 && (flags & SW_PREEMPT) != 0; td->td_flags &= ~(TDF_NEEDRESCHED | TDF_PICKCPU | TDF_SLICEEND); td->td_owepreempt = 0; tdq->tdq_owepreempt = 0; if (!TD_IS_IDLETHREAD(td)) tdq->tdq_switchcnt++; /* * Always block the thread lock so we can drop the tdq lock early. */ mtx = thread_lock_block(td); spinlock_enter(); if (TD_IS_IDLETHREAD(td)) { MPASS(mtx == TDQ_LOCKPTR(tdq)); TD_SET_CAN_RUN(td); } else if (TD_IS_RUNNING(td)) { MPASS(mtx == TDQ_LOCKPTR(tdq)); srqflag = preempted ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING; #ifdef SMP if (THREAD_CAN_MIGRATE(td) && (!THREAD_CAN_SCHED(td, ts->ts_cpu) || pickcpu)) ts->ts_cpu = sched_pickcpu(td, 0); #endif if (ts->ts_cpu == cpuid) tdq_runq_add(tdq, td, srqflag); else mtx = sched_switch_migrate(tdq, td, srqflag); } else { /* This thread must be going to sleep. */ if (mtx != TDQ_LOCKPTR(tdq)) { mtx_unlock_spin(mtx); TDQ_LOCK(tdq); } tdq_load_rem(tdq, td); #ifdef SMP if (tdq->tdq_load == 0) tdq_trysteal(tdq); #endif } #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", "prio:%d", td->td_priority); else KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, "lockname:\"%s\"", td->td_lockname); #endif /* * We enter here with the thread blocked and assigned to the * appropriate cpu run-queue or sleep-queue and with the current * thread-queue locked. */ TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); newtd = choosethread(); sched_pctcpu_update(td_get_sched(newtd), 0); TDQ_UNLOCK(tdq); /* * Call the MD code to switch contexts if necessary. */ if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); #ifdef KDTRACE_HOOKS /* * If DTrace has set the active vtime enum to anything * other than INACTIVE (0), then it should have set the * function to call. */ if (dtrace_vtime_active) (*dtrace_vtime_switch_func)(newtd); #endif td->td_oncpu = NOCPU; cpu_switch(td, newtd, mtx); cpuid = td->td_oncpu = PCPU_GET(cpuid); SDT_PROBE0(sched, , , on__cpu); #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } else { thread_unblock_switch(td, mtx); SDT_PROBE0(sched, , , remain__cpu); } KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); } /* * Adjust thread priorities as a result of a nice request. */ void sched_nice(struct proc *p, int nice) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); sched_priority(td); sched_prio(td, td->td_base_user_pri); thread_unlock(td); } } /* * Record the sleep time for the interactivity scorer. */ void sched_sleep(struct thread *td, int prio) { THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_slptick = ticks; if (TD_IS_SUSPENDED(td) || prio >= PSOCK) td->td_flags |= TDF_CANSWAP; if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) return; if (static_boost == 1 && prio) sched_prio(td, prio); else if (static_boost && td->td_priority > static_boost) sched_prio(td, static_boost); } /* * Schedule a thread to resume execution and record how long it voluntarily * slept. We also update the pctcpu, interactivity, and priority. * * Requires the thread lock on entry, drops on exit. */ void sched_wakeup(struct thread *td, int srqflags) { struct td_sched *ts; int slptick; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); td->td_flags &= ~TDF_CANSWAP; /* * If we slept for more than a tick update our interactivity and * priority. */ slptick = td->td_slptick; td->td_slptick = 0; if (slptick && slptick != ticks) { ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; sched_interact_update(td); sched_pctcpu_update(ts, 0); } /* * Reset the slice value since we slept and advanced the round-robin. */ ts->ts_slice = 0; sched_add(td, SRQ_BORING | srqflags); } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct thread *td, struct thread *child) { THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(td_get_sched(td), 1); sched_fork_thread(td, child); /* * Penalize the parent and child for forking. */ sched_interact_fork(child); sched_priority(child); td_get_sched(td)->ts_runtime += tickincr; sched_interact_update(td); sched_priority(td); } /* * Fork a new thread, may be within the same process. */ void sched_fork_thread(struct thread *td, struct thread *child) { struct td_sched *ts; struct td_sched *ts2; struct tdq *tdq; tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Initialize child. */ ts = td_get_sched(td); ts2 = td_get_sched(child); child->td_oncpu = NOCPU; child->td_lastcpu = NOCPU; child->td_lock = TDQ_LOCKPTR(tdq); child->td_cpuset = cpuset_ref(td->td_cpuset); child->td_domain.dr_policy = td->td_cpuset->cs_domain; ts2->ts_cpu = ts->ts_cpu; ts2->ts_flags = 0; /* * Grab our parents cpu estimation information. */ ts2->ts_ticks = ts->ts_ticks; ts2->ts_ltick = ts->ts_ltick; ts2->ts_ftick = ts->ts_ftick; /* * Do not inherit any borrowed priority from the parent. */ child->td_priority = child->td_base_pri; /* * And update interactivity score. */ ts2->ts_slptime = ts->ts_slptime; ts2->ts_runtime = ts->ts_runtime; /* Attempt to quickly learn interactivity. */ ts2->ts_slice = tdq_slice(tdq) - sched_slice_min; #ifdef KTR bzero(ts2->ts_name, sizeof(ts2->ts_name)); #endif } /* * Adjust the priority class of a thread. */ void sched_class(struct thread *td, int class) { THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_pri_class == class) return; td->td_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct thread *child) { struct thread *td; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit", "prio:%d", child->td_priority); PROC_LOCK_ASSERT(p, MA_OWNED); td = FIRST_THREAD_IN_PROC(p); sched_exit_thread(td, child); } /* * Penalize another thread for the time spent on this one. This helps to * worsen the priority and interactivity of processes which schedule batch * jobs such as make. This has little effect on the make process itself but * causes new processes spawned by it to receive worse scores immediately. */ void sched_exit_thread(struct thread *td, struct thread *child) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit", "prio:%d", child->td_priority); /* * Give the child's runtime to the parent without returning the * sleep time as a penalty to the parent. This causes shells that * launch expensive things to mark their children as expensive. */ thread_lock(td); td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime; sched_interact_update(td); sched_priority(td); thread_unlock(td); } void sched_preempt(struct thread *td) { struct tdq *tdq; int flags; SDT_PROBE2(sched, , , surrender, td, td->td_proc); thread_lock(td); tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); if (td->td_priority > tdq->tdq_lowpri) { if (td->td_critnest == 1) { flags = SW_INVOL | SW_PREEMPT; flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE : SWT_REMOTEPREEMPT; mi_switch(flags); /* Switch dropped thread lock. */ return; } td->td_owepreempt = 1; } else { tdq->tdq_owepreempt = 0; } thread_unlock(td); } /* * Fix priorities on return to user-space. Priorities may be elevated due * to static priorities in msleep() or similar. */ void sched_userret_slowpath(struct thread *td) { thread_lock(td); td->td_priority = td->td_user_pri; td->td_base_pri = td->td_user_pri; tdq_setlowpri(TDQ_SELF(), td); thread_unlock(td); } /* * Handle a stathz tick. This is really only relevant for timeshare * threads. */ void sched_clock(struct thread *td, int cnt) { struct tdq *tdq; struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); tdq = TDQ_SELF(); #ifdef SMP /* * We run the long term load balancer infrequently on the first cpu. */ if (balance_tdq == tdq && smp_started != 0 && rebalance != 0 && balance_ticks != 0) { balance_ticks -= cnt; if (balance_ticks <= 0) sched_balance(); } #endif /* * Save the old switch count so we have a record of the last ticks * activity. Initialize the new switch count based on our load. * If there is some activity seed it to reflect that. */ tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; tdq->tdq_switchcnt = tdq->tdq_load; /* * Advance the insert index once for each tick to ensure that all * threads get a chance to run. */ if (tdq->tdq_idx == tdq->tdq_ridx) { tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS; if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx])) tdq->tdq_ridx = tdq->tdq_idx; } ts = td_get_sched(td); sched_pctcpu_update(ts, 1); if ((td->td_pri_class & PRI_FIFO_BIT) || TD_IS_IDLETHREAD(td)) return; if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { /* * We used a tick; charge it to the thread so * that we can compute our interactivity. */ td_get_sched(td)->ts_runtime += tickincr * cnt; sched_interact_update(td); sched_priority(td); } /* * Force a context switch if the current thread has used up a full * time slice (default is 100ms). */ ts->ts_slice += cnt; if (ts->ts_slice >= tdq_slice(tdq)) { ts->ts_slice = 0; td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND; } } u_int sched_estcpu(struct thread *td __unused) { return (0); } /* * Return whether the current CPU has runnable tasks. Used for in-kernel * cooperative idle threads. */ int sched_runnable(void) { struct tdq *tdq; int load; load = 1; tdq = TDQ_SELF(); if ((curthread->td_flags & TDF_IDLETD) != 0) { if (tdq->tdq_load > 0) goto out; } else if (tdq->tdq_load - 1 > 0) goto out; load = 0; out: return (load); } /* * Choose the highest priority thread to run. The thread is removed from * the run-queue while running however the load remains. For SMP we set * the tdq in the global idle bitmask if it idles here. */ struct thread * sched_choose(void) { struct thread *td; struct tdq *tdq; tdq = TDQ_SELF(); TDQ_LOCK_ASSERT(tdq, MA_OWNED); td = tdq_choose(tdq); if (td) { tdq_runq_rem(tdq, td); tdq->tdq_lowpri = td->td_priority; return (td); } tdq->tdq_lowpri = PRI_MAX_IDLE; return (PCPU_GET(idlethread)); } /* * Set owepreempt if necessary. Preemption never happens directly in ULE, * we always request it once we exit a critical section. */ static inline void sched_setpreempt(struct thread *td) { struct thread *ctd; int cpri; int pri; THREAD_LOCK_ASSERT(curthread, MA_OWNED); ctd = curthread; pri = td->td_priority; cpri = ctd->td_priority; if (pri < cpri) ctd->td_flags |= TDF_NEEDRESCHED; if (KERNEL_PANICKED() || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) return; if (!sched_shouldpreempt(pri, cpri, 0)) return; ctd->td_owepreempt = 1; } /* * Add a thread to a thread queue. Select the appropriate runq and add the * thread to it. This is the internal function called when the tdq is * predetermined. */ void tdq_add(struct tdq *tdq, struct thread *td, int flags) { TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); KASSERT((td->td_inhibitors == 0), ("sched_add: trying to run inhibited thread")); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("sched_add: bad thread state")); KASSERT(td->td_flags & TDF_INMEM, ("sched_add: thread swapped out")); if (td->td_priority < tdq->tdq_lowpri) tdq->tdq_lowpri = td->td_priority; tdq_runq_add(tdq, td, flags); tdq_load_add(tdq, td); } /* * Select the target thread queue and add a thread to it. Request * preemption or IPI a remote processor if required. * * Requires the thread lock on entry, drops on exit. */ void sched_add(struct thread *td, int flags) { struct tdq *tdq; #ifdef SMP int cpu; #endif KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", "prio:%d", td->td_priority, KTR_ATTR_LINKED, sched_tdname(curthread)); KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", KTR_ATTR_LINKED, sched_tdname(td)); SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, flags & SRQ_PREEMPTED); THREAD_LOCK_ASSERT(td, MA_OWNED); /* * Recalculate the priority before we select the target cpu or * run-queue. */ if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_priority(td); #ifdef SMP /* * Pick the destination cpu and if it isn't ours transfer to the * target cpu. */ cpu = sched_pickcpu(td, flags); tdq = sched_setcpu(td, cpu, flags); tdq_add(tdq, td, flags); if (cpu != PCPU_GET(cpuid)) tdq_notify(tdq, td); else if (!(flags & SRQ_YIELDING)) sched_setpreempt(td); #else tdq = TDQ_SELF(); /* * Now that the thread is moving to the run-queue, set the lock * to the scheduler's lock. */ if (td->td_lock != TDQ_LOCKPTR(tdq)) { TDQ_LOCK(tdq); if ((flags & SRQ_HOLD) != 0) td->td_lock = TDQ_LOCKPTR(tdq); else thread_lock_set(td, TDQ_LOCKPTR(tdq)); } tdq_add(tdq, td, flags); if (!(flags & SRQ_YIELDING)) sched_setpreempt(td); #endif if (!(flags & SRQ_HOLDTD)) thread_unlock(td); } /* * Remove a thread from a run-queue without running it. This is used * when we're stealing a thread from a remote queue. Otherwise all threads * exit by calling sched_exit_thread() and sched_throw() themselves. */ void sched_rem(struct thread *td) { struct tdq *tdq; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem", "prio:%d", td->td_priority); SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); TDQ_LOCK_ASSERT(tdq, MA_OWNED); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); KASSERT(TD_ON_RUNQ(td), ("sched_rem: thread not on run queue")); tdq_runq_rem(tdq, td); tdq_load_rem(tdq, td); TD_SET_CAN_RUN(td); if (td->td_priority == tdq->tdq_lowpri) tdq_setlowpri(tdq, NULL); } /* * Fetch cpu utilization information. Updates on demand. */ fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct td_sched *ts; pctcpu = 0; ts = td_get_sched(td); THREAD_LOCK_ASSERT(td, MA_OWNED); sched_pctcpu_update(ts, TD_IS_RUNNING(td)); if (ts->ts_ticks) { int rtick; /* How many rtick per second ? */ rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz); pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT; } return (pctcpu); } /* * Enforce affinity settings for a thread. Called after adjustments to * cpumask. */ void sched_affinity(struct thread *td) { #ifdef SMP struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); ts = td_get_sched(td); if (THREAD_CAN_SCHED(td, ts->ts_cpu)) return; if (TD_ON_RUNQ(td)) { sched_rem(td); sched_add(td, SRQ_BORING | SRQ_HOLDTD); return; } if (!TD_IS_RUNNING(td)) return; /* * Force a switch before returning to userspace. If the * target thread is not running locally send an ipi to force * the issue. */ td->td_flags |= TDF_NEEDRESCHED; if (td != curthread) ipi_cpu(ts->ts_cpu, IPI_PREEMPT); #endif } /* * Bind a thread to a target cpu. */ void sched_bind(struct thread *td, int cpu) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); KASSERT(td == curthread, ("sched_bind: can only bind curthread")); ts = td_get_sched(td); if (ts->ts_flags & TSF_BOUND) sched_unbind(td); KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); ts->ts_flags |= TSF_BOUND; sched_pin(); if (PCPU_GET(cpuid) == cpu) return; ts->ts_cpu = cpu; /* When we return from mi_switch we'll be on the correct cpu. */ mi_switch(SW_VOL); thread_lock(td); } /* * Release a bound thread. */ void sched_unbind(struct thread *td) { struct td_sched *ts; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); ts = td_get_sched(td); if ((ts->ts_flags & TSF_BOUND) == 0) return; ts->ts_flags &= ~TSF_BOUND; sched_unpin(); } int sched_is_bound(struct thread *td) { THREAD_LOCK_ASSERT(td, MA_OWNED); return (td_get_sched(td)->ts_flags & TSF_BOUND); } /* * Basic yield call. */ void sched_relinquish(struct thread *td) { thread_lock(td); mi_switch(SW_VOL | SWT_RELINQUISH); } /* * Return the total system load. */ int sched_load(void) { #ifdef SMP int total; int i; total = 0; CPU_FOREACH(i) total += TDQ_CPU(i)->tdq_sysload; return (total); #else return (TDQ_SELF()->tdq_sysload); #endif } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } #ifdef SMP #define TDQ_IDLESPIN(tdq) \ ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0) #else #define TDQ_IDLESPIN(tdq) 1 #endif /* * The actual idle process. */ void sched_idletd(void *dummy) { struct thread *td; struct tdq *tdq; int oldswitchcnt, switchcnt; int i; mtx_assert(&Giant, MA_NOTOWNED); td = curthread; tdq = TDQ_SELF(); THREAD_NO_SLEEPING(); oldswitchcnt = -1; for (;;) { if (tdq->tdq_load) { thread_lock(td); mi_switch(SW_VOL | SWT_IDLE); } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #ifdef SMP if (always_steal || switchcnt != oldswitchcnt) { oldswitchcnt = switchcnt; if (tdq_idled(tdq) == 0) continue; } switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; #else oldswitchcnt = switchcnt; #endif /* * If we're switching very frequently, spin while checking * for load rather than entering a low power state that * may require an IPI. However, don't do any busy * loops while on SMT machines as this simply steals * cycles from cores doing useful work. */ if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { for (i = 0; i < sched_idlespins; i++) { if (tdq->tdq_load) break; cpu_spinwait(); } } /* If there was context switch during spin, restart it. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt) continue; /* Run main MD idle handler. */ tdq->tdq_cpu_idle = 1; /* * Make sure that tdq_cpu_idle update is globally visible * before cpu_idle() read tdq_load. The order is important * to avoid race with tdq_notify. */ atomic_thread_fence_seq_cst(); /* * Checking for again after the fence picks up assigned * threads often enough to make it worthwhile to do so in * order to avoid calling cpu_idle(). */ if (tdq->tdq_load != 0) { tdq->tdq_cpu_idle = 0; continue; } cpu_idle(switchcnt * 4 > sched_idlespinthresh); tdq->tdq_cpu_idle = 0; /* * Account thread-less hardware interrupts and * other wakeup reasons equal to context switches. */ switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt; if (switchcnt != oldswitchcnt) continue; tdq->tdq_switchcnt++; oldswitchcnt++; } } /* * A CPU is entering for the first time or a thread is exiting. */ void sched_throw(struct thread *td) { struct thread *newtd; struct tdq *tdq; if (__predict_false(td == NULL)) { #ifdef SMP PCPU_SET(sched, DPCPU_PTR(tdq)); #endif /* Correct spinlock nesting and acquire the correct lock. */ tdq = TDQ_SELF(); TDQ_LOCK(tdq); spinlock_exit(); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(tdq); } else { tdq = TDQ_SELF(); THREAD_LOCK_ASSERT(td, MA_OWNED); THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq)); tdq_load_rem(tdq, td); td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; thread_lock_block(td); } newtd = choosethread(); spinlock_enter(); TDQ_UNLOCK(tdq); KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); /* doesn't return */ if (__predict_false(td == NULL)) cpu_throw(td, newtd); /* doesn't return */ else cpu_switch(td, newtd, TDQ_LOCKPTR(tdq)); } /* * This is called from fork_exit(). Just acquire the correct locks and * let fork do the rest of the work. */ void sched_fork_exit(struct thread *td) { struct tdq *tdq; int cpuid; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with the scheduler lock held. */ KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count %d", curthread->td_md.md_spinlock_count)); cpuid = PCPU_GET(cpuid); tdq = TDQ_SELF(); TDQ_LOCK(tdq); spinlock_exit(); MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); td->td_oncpu = cpuid; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", "prio:%d", td->td_priority); SDT_PROBE0(sched, , , on__cpu); } /* * Create on first use to catch odd startup conditons. */ char * sched_tdname(struct thread *td) { #ifdef KTR struct td_sched *ts; ts = td_get_sched(td); if (ts->ts_name[0] == '\0') snprintf(ts->ts_name, sizeof(ts->ts_name), "%s tid %d", td->td_name, td->td_tid); return (ts->ts_name); #else return (td->td_name); #endif } #ifdef KTR void sched_clear_tdname(struct thread *td) { struct td_sched *ts; ts = td_get_sched(td); ts->ts_name[0] = '\0'; } #endif #ifdef SMP /* * Build the CPU topology dump string. Is recursively called to collect * the topology tree. */ static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, int indent) { char cpusetbuf[CPUSETBUFSIZ]; int i, first; sbuf_printf(sb, "%*s\n", indent, "", 1 + indent / 2, cg->cg_level); sbuf_printf(sb, "%*s ", indent, "", cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask)); first = TRUE; for (i = cg->cg_first; i <= cg->cg_last; i++) { if (CPU_ISSET(i, &cg->cg_mask)) { if (!first) sbuf_printf(sb, ", "); else first = FALSE; sbuf_printf(sb, "%d", i); } } sbuf_printf(sb, "\n"); if (cg->cg_flags != 0) { sbuf_printf(sb, "%*s ", indent, ""); if ((cg->cg_flags & CG_FLAG_HTT) != 0) sbuf_printf(sb, "HTT group"); if ((cg->cg_flags & CG_FLAG_THREAD) != 0) sbuf_printf(sb, "THREAD group"); if ((cg->cg_flags & CG_FLAG_SMT) != 0) sbuf_printf(sb, "SMT group"); + if ((cg->cg_flags & CG_FLAG_NODE) != 0) + sbuf_printf(sb, "NUMA node"); sbuf_printf(sb, "\n"); } if (cg->cg_children > 0) { sbuf_printf(sb, "%*s \n", indent, ""); for (i = 0; i < cg->cg_children; i++) sysctl_kern_sched_topology_spec_internal(sb, &cg->cg_child[i], indent+2); sbuf_printf(sb, "%*s \n", indent, ""); } sbuf_printf(sb, "%*s\n", indent, ""); return (0); } /* * Sysctl handler for retrieving topology dump. It's a wrapper for * the recursive sysctl_kern_smp_topology_spec_internal(). */ static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS) { struct sbuf *topo; int err; KASSERT(cpu_top != NULL, ("cpu_top isn't initialized")); topo = sbuf_new_for_sysctl(NULL, NULL, 512, req); if (topo == NULL) return (ENOMEM); sbuf_printf(topo, "\n"); err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1); sbuf_printf(topo, "\n"); if (err == 0) { err = sbuf_finish(topo); } sbuf_delete(topo); return (err); } #endif static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val, period; period = 1000000 / realstathz; new_val = period * sched_slice; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val <= 0) return (EINVAL); sched_slice = imax(1, (new_val + period / 2) / period); sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / realstathz); return (0); } SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Scheduler"); SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, "Scheduler name"); SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_kern_quantum, "I", "Quantum for timeshare threads in microseconds"); SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "Quantum for timeshare threads in stathz ticks"); SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "Interactivity score threshold"); SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 0, "Maximal (lowest) priority for preemption"); SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RW, &static_boost, 0, "Assign static kernel priorities to sleeping threads"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RW, &sched_idlespins, 0, "Number of times idle thread will spin waiting for new work"); SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, &sched_idlespinthresh, 0, "Threshold before we will permit idle thread spinning"); #ifdef SMP SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, "Number of hz ticks to keep thread affinity for"); SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, "Enables the long-term load balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, &balance_interval, 0, "Average period in stathz ticks to run the long-term balancer"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, "Attempts to steal work from other cores before idling"); SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, "Minimum load on remote CPU before we'll steal"); SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RW, &trysteal_limit, 0, "Topological distance limit for stealing threads in sched_switch()"); SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RW, &always_steal, 0, "Always run the stealer from the idle thread"); SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", "XML dump of detected CPU topology"); #endif /* ps compat. All cpu percentages from ULE are weighted. */ static int ccpu = 0; SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "Decay factor used for updating %CPU in 4BSD scheduler"); diff --git a/sys/sys/smp.h b/sys/sys/smp.h index cee1199015a7..1da557212ae2 100644 --- a/sys/sys/smp.h +++ b/sys/sys/smp.h @@ -1,296 +1,297 @@ /*- * SPDX-License-Identifier: Beerware * * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * $FreeBSD$ */ #ifndef _SYS_SMP_H_ #define _SYS_SMP_H_ #ifdef _KERNEL #ifndef LOCORE #include #include /* * Types of nodes in the topological tree. */ typedef enum { /* No node has this type; can be used in topo API calls. */ TOPO_TYPE_DUMMY, /* Processing unit aka computing unit aka logical CPU. */ TOPO_TYPE_PU, /* Physical subdivision of a package. */ TOPO_TYPE_CORE, /* CPU L1/L2/L3 cache. */ TOPO_TYPE_CACHE, /* Package aka chip, equivalent to socket. */ TOPO_TYPE_PKG, /* NUMA node. */ TOPO_TYPE_NODE, /* Other logical or physical grouping of PUs. */ /* E.g. PUs on the same dye, or PUs sharing an FPU. */ TOPO_TYPE_GROUP, /* The whole system. */ TOPO_TYPE_SYSTEM } topo_node_type; /* Hardware indenitifier of a topology component. */ typedef unsigned int hwid_t; /* Logical CPU idenitifier. */ typedef int cpuid_t; /* A node in the topology. */ struct topo_node { struct topo_node *parent; TAILQ_HEAD(topo_children, topo_node) children; TAILQ_ENTRY(topo_node) siblings; cpuset_t cpuset; topo_node_type type; uintptr_t subtype; hwid_t hwid; cpuid_t id; int nchildren; int cpu_count; }; /* * Scheduling topology of a NUMA or SMP system. * * The top level topology is an array of pointers to groups. Each group * contains a bitmask of cpus in its group or subgroups. It may also * contain a pointer to an array of child groups. * * The bitmasks at non leaf groups may be used by consumers who support * a smaller depth than the hardware provides. * * The topology may be omitted by systems where all CPUs are equal. */ struct cpu_group { struct cpu_group *cg_parent; /* Our parent group. */ struct cpu_group *cg_child; /* Optional children groups. */ cpuset_t cg_mask; /* Mask of cpus in this group. */ int32_t cg_count; /* Count of cpus in this group. */ int32_t cg_first; /* First cpu in this group. */ int32_t cg_last; /* Last cpu in this group. */ int16_t cg_children; /* Number of children groups. */ int8_t cg_level; /* Shared cache level. */ int8_t cg_flags; /* Traversal modifiers. */ }; typedef struct cpu_group *cpu_group_t; /* * Defines common resources for CPUs in the group. The highest level * resource should be used when multiple are shared. */ #define CG_SHARE_NONE 0 #define CG_SHARE_L1 1 #define CG_SHARE_L2 2 #define CG_SHARE_L3 3 #define MAX_CACHE_LEVELS CG_SHARE_L3 /* * Behavior modifiers for load balancing and affinity. */ #define CG_FLAG_HTT 0x01 /* Schedule the alternate core last. */ #define CG_FLAG_SMT 0x02 /* New age htt, less crippled. */ #define CG_FLAG_THREAD (CG_FLAG_HTT | CG_FLAG_SMT) /* Any threading. */ +#define CG_FLAG_NODE 0x04 /* NUMA node. */ /* * Convenience routines for building and traversing topologies. */ #ifdef SMP void topo_init_node(struct topo_node *node); void topo_init_root(struct topo_node *root); struct topo_node * topo_add_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype); struct topo_node * topo_find_node_by_hwid(struct topo_node *parent, int hwid, topo_node_type type, uintptr_t subtype); void topo_promote_child(struct topo_node *child); struct topo_node * topo_next_node(struct topo_node *top, struct topo_node *node); struct topo_node * topo_next_nonchild_node(struct topo_node *top, struct topo_node *node); void topo_set_pu_id(struct topo_node *node, cpuid_t id); enum topo_level { TOPO_LEVEL_PKG = 0, /* * Some systems have useful sub-package core organizations. On these, * a package has one or more subgroups. Each subgroup contains one or * more cache groups (cores that share a last level cache). */ TOPO_LEVEL_GROUP, TOPO_LEVEL_CACHEGROUP, TOPO_LEVEL_CORE, TOPO_LEVEL_THREAD, TOPO_LEVEL_COUNT /* Must be last */ }; struct topo_analysis { int entities[TOPO_LEVEL_COUNT]; }; int topo_analyze(struct topo_node *topo_root, int all, struct topo_analysis *results); #define TOPO_FOREACH(i, root) \ for (i = root; i != NULL; i = topo_next_node(root, i)) struct cpu_group *smp_topo(void); struct cpu_group *smp_topo_alloc(u_int count); struct cpu_group *smp_topo_none(void); struct cpu_group *smp_topo_1level(int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_2level(int l2share, int l2count, int l1share, int l1count, int l1flags); struct cpu_group *smp_topo_find(struct cpu_group *top, int cpu); extern void (*cpustop_restartfunc)(void); /* The suspend/resume cpusets are x86 only, but minimize ifdefs. */ extern volatile cpuset_t resuming_cpus; /* woken up cpus in suspend pen */ extern volatile cpuset_t started_cpus; /* cpus to let out of stop pen */ extern volatile cpuset_t stopped_cpus; /* cpus in stop pen */ extern volatile cpuset_t suspended_cpus; /* cpus [near] sleeping in susp pen */ extern volatile cpuset_t toresume_cpus; /* cpus to let out of suspend pen */ extern cpuset_t hlt_cpus_mask; /* XXX 'mask' is detail in old impl */ extern cpuset_t logical_cpus_mask; #endif /* SMP */ extern u_int mp_maxid; extern int mp_maxcpus; extern int mp_ncores; extern int mp_ncpus; extern int smp_cpus; extern volatile int smp_started; extern int smp_threads_per_core; extern cpuset_t all_cpus; extern cpuset_t cpuset_domain[MAXMEMDOM]; /* CPUs in each NUMA domain. */ /* * Macro allowing us to determine whether a CPU is absent at any given * time, thus permitting us to configure sparse maps of cpuid-dependent * (per-CPU) structures. */ #define CPU_ABSENT(x_cpu) (!CPU_ISSET(x_cpu, &all_cpus)) /* * Macros to iterate over non-absent CPUs. CPU_FOREACH() takes an * integer iterator and iterates over the available set of CPUs. * CPU_FIRST() returns the id of the first non-absent CPU. CPU_NEXT() * returns the id of the next non-absent CPU. It will wrap back to * CPU_FIRST() once the end of the list is reached. The iterators are * currently implemented via inline functions. */ #define CPU_FOREACH(i) \ for ((i) = 0; (i) <= mp_maxid; (i)++) \ if (!CPU_ABSENT((i))) static __inline int cpu_first(void) { int i; for (i = 0;; i++) if (!CPU_ABSENT(i)) return (i); } static __inline int cpu_next(int i) { for (;;) { i++; if (i > mp_maxid) i = 0; if (!CPU_ABSENT(i)) return (i); } } #define CPU_FIRST() cpu_first() #define CPU_NEXT(i) cpu_next((i)) #ifdef SMP /* * Machine dependent functions used to initialize MP support. * * The cpu_mp_probe() should check to see if MP support is present and return * zero if it is not or non-zero if it is. If MP support is present, then * cpu_mp_start() will be called so that MP can be enabled. This function * should do things such as startup secondary processors. It should also * setup mp_ncpus, all_cpus, and smp_cpus. It should also ensure that * smp_started is initialized at the appropriate time. * Once cpu_mp_start() returns, machine independent MP startup code will be * executed and a simple message will be output to the console. Finally, * cpu_mp_announce() will be called so that machine dependent messages about * the MP support may be output to the console if desired. * * The cpu_setmaxid() function is called very early during the boot process * so that the MD code may set mp_maxid to provide an upper bound on CPU IDs * that other subsystems may use. If a platform is not able to determine * the exact maximum ID that early, then it may set mp_maxid to MAXCPU - 1. */ struct thread; struct cpu_group *cpu_topo(void); void cpu_mp_announce(void); int cpu_mp_probe(void); void cpu_mp_setmaxid(void); void cpu_mp_start(void); void forward_signal(struct thread *); int restart_cpus(cpuset_t); int stop_cpus(cpuset_t); int stop_cpus_hard(cpuset_t); #if defined(__amd64__) || defined(__i386__) int suspend_cpus(cpuset_t); int resume_cpus(cpuset_t); #endif void smp_rendezvous_action(void); extern struct mtx smp_ipi_mtx; #endif /* SMP */ int quiesce_all_cpus(const char *, int); int quiesce_cpus(cpuset_t, const char *, int); void quiesce_all_critical(void); void cpus_fence_seq_cst(void); void smp_no_rendezvous_barrier(void *); void smp_rendezvous(void (*)(void *), void (*)(void *), void (*)(void *), void *arg); void smp_rendezvous_cpus(cpuset_t, void (*)(void *), void (*)(void *), void (*)(void *), void *arg); struct smp_rendezvous_cpus_retry_arg { cpuset_t cpus; }; void smp_rendezvous_cpus_retry(cpuset_t, void (*)(void *), void (*)(void *), void (*)(void *), void (*)(void *, int), struct smp_rendezvous_cpus_retry_arg *); void smp_rendezvous_cpus_done(struct smp_rendezvous_cpus_retry_arg *); #endif /* !LOCORE */ #endif /* _KERNEL */ #endif /* _SYS_SMP_H_ */ diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c index 5e9a9735b09a..54ca82a2c90d 100644 --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1,1630 +1,1686 @@ /*- * Copyright (c) 1996, by Steve Passe * Copyright (c) 2003, by Peter Wemm * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the developer may NOT be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); +#include "opt_acpi.h" #ifdef __i386__ #include "opt_apic.h" #endif #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_gdb.h" #include "opt_kstack_pages.h" #include "opt_pmap.h" #include "opt_sched.h" #include "opt_smp.h" #include "opt_stack.h" #include #include #include #include #include /* cngetc() */ #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#ifdef DEV_ACPI +#include +#include +#endif + static MALLOC_DEFINE(M_CPUS, "cpus", "CPU items"); /* lock region used by kernel profiling */ int mcount_lock; int mp_naps; /* # of Applications processors */ int boot_cpu_id = -1; /* designated BSP */ /* AP uses this during bootstrap. Do not staticize. */ char *bootSTK; int bootAP; /* Free these after use */ void *bootstacks[MAXCPU]; void *dpcpu; struct pcb stoppcbs[MAXCPU]; struct susppcb **susppcbs; #ifdef COUNT_IPIS /* Interrupt counts. */ static u_long *ipi_preempt_counts[MAXCPU]; static u_long *ipi_ast_counts[MAXCPU]; u_long *ipi_invltlb_counts[MAXCPU]; u_long *ipi_invlrng_counts[MAXCPU]; u_long *ipi_invlpg_counts[MAXCPU]; u_long *ipi_invlcache_counts[MAXCPU]; u_long *ipi_rendezvous_counts[MAXCPU]; static u_long *ipi_hardclock_counts[MAXCPU]; #endif /* Default cpu_ops implementation. */ struct cpu_ops cpu_ops; /* * Local data and functions. */ static volatile cpuset_t ipi_stop_nmi_pending; volatile cpuset_t resuming_cpus; volatile cpuset_t toresume_cpus; /* used to hold the AP's until we are ready to release them */ struct mtx ap_boot_mtx; /* Set to 1 once we're ready to let the APs out of the pen. */ volatile int aps_ready = 0; /* * Store data from cpu_add() until later in the boot when we actually setup * the APs. */ struct cpu_info *cpu_info; int *apic_cpuids; int cpu_apic_ids[MAXCPU]; _Static_assert(MAXCPU <= MAX_APIC_ID, "MAXCPU cannot be larger that MAX_APIC_ID"); _Static_assert(xAPIC_MAX_APIC_ID <= MAX_APIC_ID, "xAPIC_MAX_APIC_ID cannot be larger that MAX_APIC_ID"); static void release_aps(void *dummy); static void cpustop_handler_post(u_int cpu); static int hyperthreading_allowed = 1; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_allowed, CTLFLAG_RDTUN, &hyperthreading_allowed, 0, "Use Intel HTT logical CPUs"); static int hyperthreading_intr_allowed = 0; SYSCTL_INT(_machdep, OID_AUTO, hyperthreading_intr_allowed, CTLFLAG_RDTUN, &hyperthreading_intr_allowed, 0, "Allow interrupts on HTT logical CPUs"); static struct topo_node topo_root; static int pkg_id_shift; static int node_id_shift; static int core_id_shift; static int disabled_cpus; struct cache_info { int id_shift; int present; } static caches[MAX_CACHE_LEVELS]; static bool stop_mwait = false; SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0, "Use MONITOR/MWAIT when stopping CPU, if available"); void mem_range_AP_init(void) { if (mem_range_softc.mr_op && mem_range_softc.mr_op->initAP) mem_range_softc.mr_op->initAP(&mem_range_softc); } /* * Round up to the next power of two, if necessary, and then * take log2. * Returns -1 if argument is zero. */ static __inline int mask_width(u_int x) { return (fls(x << (1 - powerof2(x))) - 1); } /* * Add a cache level to the cache topology description. */ static int add_deterministic_cache(int type, int level, int share_count) { if (type == 0) return (0); if (type > 3) { printf("unexpected cache type %d\n", type); return (1); } if (type == 2) /* ignore instruction cache */ return (1); if (level == 0 || level > MAX_CACHE_LEVELS) { printf("unexpected cache level %d\n", type); return (1); } if (caches[level - 1].present) { printf("WARNING: multiple entries for L%u data cache\n", level); printf("%u => %u\n", caches[level - 1].id_shift, mask_width(share_count)); } caches[level - 1].id_shift = mask_width(share_count); caches[level - 1].present = 1; if (caches[level - 1].id_shift > pkg_id_shift) { printf("WARNING: L%u data cache covers more " "APIC IDs than a package (%u > %u)\n", level, caches[level - 1].id_shift, pkg_id_shift); caches[level - 1].id_shift = pkg_id_shift; } if (caches[level - 1].id_shift < core_id_shift) { printf("WARNING: L%u data cache covers fewer " "APIC IDs than a core (%u < %u)\n", level, caches[level - 1].id_shift, core_id_shift); caches[level - 1].id_shift = core_id_shift; } return (1); } /* * Determine topology of processing units and caches for AMD CPUs. * See: * - AMD CPUID Specification (Publication # 25481) * - BKDG for AMD NPT Family 0Fh Processors (Publication # 32559) * - BKDG For AMD Family 10h Processors (Publication # 31116) * - BKDG For AMD Family 15h Models 00h-0Fh Processors (Publication # 42301) * - BKDG For AMD Family 16h Models 00h-0Fh Processors (Publication # 48751) * - PPR For AMD Family 17h Models 00h-0Fh Processors (Publication # 54945) */ static void topo_probe_amd(void) { u_int p[4]; uint64_t v; int level; int nodes_per_socket; int share_count; int type; int i; /* No multi-core capability. */ if ((amd_feature2 & AMDID2_CMP) == 0) return; /* For families 10h and newer. */ pkg_id_shift = (cpu_procinfo2 & AMDID_COREID_SIZE) >> AMDID_COREID_SIZE_SHIFT; /* For 0Fh family. */ if (pkg_id_shift == 0) pkg_id_shift = mask_width((cpu_procinfo2 & AMDID_CMP_CORES) + 1); /* * Families prior to 16h define the following value as * cores per compute unit and we don't really care about the AMD * compute units at the moment. Perhaps we should treat them as * cores and cores within the compute units as hardware threads, * but that's up for debate. * Later families define the value as threads per compute unit, * so we are following AMD's nomenclature here. */ if ((amd_feature2 & AMDID2_TOPOLOGY) != 0 && CPUID_TO_FAMILY(cpu_id) >= 0x16) { cpuid_count(0x8000001e, 0, p); share_count = ((p[1] >> 8) & 0xff) + 1; core_id_shift = mask_width(share_count); /* * For Zen (17h), gather Nodes per Processor. Each node is a * Zeppelin die; TR and EPYC CPUs will have multiple dies per * package. Communication latency between dies is higher than * within them. */ nodes_per_socket = ((p[2] >> 8) & 0x7) + 1; node_id_shift = pkg_id_shift - mask_width(nodes_per_socket); } if ((amd_feature2 & AMDID2_TOPOLOGY) != 0) { for (i = 0; ; i++) { cpuid_count(0x8000001d, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } else { if (cpu_exthigh >= 0x80000005) { cpuid_count(0x80000005, 0, p); if (((p[2] >> 24) & 0xff) != 0) { caches[0].id_shift = 0; caches[0].present = 1; } } if (cpu_exthigh >= 0x80000006) { cpuid_count(0x80000006, 0, p); if (((p[2] >> 16) & 0xffff) != 0) { caches[1].id_shift = 0; caches[1].present = 1; } if (((p[3] >> 18) & 0x3fff) != 0) { nodes_per_socket = 1; if ((amd_feature2 & AMDID2_NODE_ID) != 0) { /* * Handle multi-node processors that * have multiple chips, each with its * own L3 cache, on the same die. */ v = rdmsr(0xc001100c); nodes_per_socket = 1 + ((v >> 3) & 0x7); } caches[2].id_shift = pkg_id_shift - mask_width(nodes_per_socket); caches[2].present = 1; } } } } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 1 and Leaf 4, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0x4(void) { u_int p[4]; int max_cores; int max_logical; /* Both zero and one here mean one logical processor per package. */ max_logical = (cpu_feature & CPUID_HTT) != 0 ? (cpu_procinfo & CPUID_HTT_CORES) >> 16 : 1; if (max_logical <= 1) return; if (cpu_high >= 0x4) { cpuid_count(0x04, 0, p); max_cores = ((p[0] >> 26) & 0x3f) + 1; } else max_cores = 1; core_id_shift = mask_width(max_logical/max_cores); KASSERT(core_id_shift >= 0, ("intel topo: max_cores > max_logical\n")); pkg_id_shift = core_id_shift + mask_width(max_cores); } /* * Determine topology of processing units for Intel CPUs * using CPUID Leaf 11, if supported. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 ArchitecturesSoftware Developer’s Manual, * Volume 3A: System Programming Guide, PROGRAMMING CONSIDERATIONS * FOR HARDWARE MULTI-THREADING CAPABLE PROCESSORS */ static void topo_probe_intel_0xb(void) { u_int p[4]; int bits; int type; int i; /* Fall back if CPU leaf 11 doesn't really exist. */ cpuid_count(0x0b, 0, p); if (p[1] == 0) { topo_probe_intel_0x4(); return; } /* We only support three levels for now. */ for (i = 0; ; i++) { cpuid_count(0x0b, i, p); bits = p[0] & 0x1f; type = (p[2] >> 8) & 0xff; if (type == 0) break; /* TODO: check for duplicate (re-)assignment */ if (type == CPUID_TYPE_SMT) core_id_shift = bits; else if (type == CPUID_TYPE_CORE) pkg_id_shift = bits; else printf("unknown CPU level type %d\n", type); } if (pkg_id_shift < core_id_shift) { printf("WARNING: core covers more APIC IDs than a package\n"); core_id_shift = pkg_id_shift; } } /* * Determine topology of caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration * - Intel 64 and IA-32 Architectures Software Developer’s Manual * Volume 2A: Instruction Set Reference, A-M, * CPUID instruction */ static void topo_probe_intel_caches(void) { u_int p[4]; int level; int share_count; int type; int i; if (cpu_high < 0x4) { /* * Available cache level and sizes can be determined * via CPUID leaf 2, but that requires a huge table of hardcoded * values, so for now just assume L1 and L2 caches potentially * shared only by HTT processing units, if HTT is present. */ caches[0].id_shift = pkg_id_shift; caches[0].present = 1; caches[1].id_shift = pkg_id_shift; caches[1].present = 1; return; } for (i = 0; ; i++) { cpuid_count(0x4, i, p); type = p[0] & 0x1f; level = (p[0] >> 5) & 0x7; share_count = 1 + ((p[0] >> 14) & 0xfff); if (!add_deterministic_cache(type, level, share_count)) break; } } /* * Determine topology of processing units and caches for Intel CPUs. * See: * - Intel 64 Architecture Processor Topology Enumeration */ static void topo_probe_intel(void) { /* * Note that 0x1 <= cpu_high < 4 case should be * compatible with topo_probe_intel_0x4() logic when * CPUID.1:EBX[23:16] > 0 (cpu_cores will be 1) * or it should trigger the fallback otherwise. */ if (cpu_high >= 0xb) topo_probe_intel_0xb(); else if (cpu_high >= 0x1) topo_probe_intel_0x4(); topo_probe_intel_caches(); } /* * Topology information is queried only on BSP, on which this * code runs and for which it can query CPUID information. * Then topology is extrapolated on all packages using an * assumption that APIC ID to hardware component ID mapping is * homogenious. * That doesn't necesserily imply that the topology is uniform. */ void topo_probe(void) { static int cpu_topo_probed = 0; struct x86_topo_layer { int type; int subtype; int id_shift; - } topo_layers[MAX_CACHE_LEVELS + 4]; + } topo_layers[MAX_CACHE_LEVELS + 5]; struct topo_node *parent; struct topo_node *node; int layer; int nlayers; int node_id; int i; +#if defined(DEV_ACPI) && MAXMEMDOM > 1 + int d, domain; +#endif if (cpu_topo_probed) return; CPU_ZERO(&logical_cpus_mask); if (mp_ncpus <= 1) ; /* nothing */ else if (cpu_vendor_id == CPU_VENDOR_AMD || cpu_vendor_id == CPU_VENDOR_HYGON) topo_probe_amd(); else if (cpu_vendor_id == CPU_VENDOR_INTEL) topo_probe_intel(); KASSERT(pkg_id_shift >= core_id_shift, ("bug in APIC topology discovery")); nlayers = 0; bzero(topo_layers, sizeof(topo_layers)); topo_layers[nlayers].type = TOPO_TYPE_PKG; topo_layers[nlayers].id_shift = pkg_id_shift; if (bootverbose) printf("Package ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; if (pkg_id_shift > node_id_shift && node_id_shift != 0) { topo_layers[nlayers].type = TOPO_TYPE_GROUP; topo_layers[nlayers].id_shift = node_id_shift; if (bootverbose) printf("Node ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } /* * Consider all caches to be within a package/chip * and "in front" of all sub-components like * cores and hardware threads. */ for (i = MAX_CACHE_LEVELS - 1; i >= 0; --i) { if (caches[i].present) { if (node_id_shift != 0) KASSERT(caches[i].id_shift <= node_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift <= pkg_id_shift, ("bug in APIC topology discovery")); KASSERT(caches[i].id_shift >= core_id_shift, ("bug in APIC topology discovery")); topo_layers[nlayers].type = TOPO_TYPE_CACHE; topo_layers[nlayers].subtype = i + 1; topo_layers[nlayers].id_shift = caches[i].id_shift; if (bootverbose) printf("L%u cache ID shift: %u\n", topo_layers[nlayers].subtype, topo_layers[nlayers].id_shift); nlayers++; } } if (pkg_id_shift > core_id_shift) { topo_layers[nlayers].type = TOPO_TYPE_CORE; topo_layers[nlayers].id_shift = core_id_shift; if (bootverbose) printf("Core ID shift: %u\n", topo_layers[nlayers].id_shift); nlayers++; } topo_layers[nlayers].type = TOPO_TYPE_PU; topo_layers[nlayers].id_shift = 0; nlayers++; +#if defined(DEV_ACPI) && MAXMEMDOM > 1 + if (vm_ndomains > 1) { + for (layer = 0; layer < nlayers; ++layer) { + for (i = 0; i <= max_apic_id; ++i) { + if ((i & ((1 << topo_layers[layer].id_shift) - 1)) == 0) + domain = -1; + if (!cpu_info[i].cpu_present) + continue; + d = acpi_pxm_get_cpu_locality(i); + if (domain >= 0 && domain != d) + break; + domain = d; + } + if (i > max_apic_id) + break; + } + KASSERT(layer < nlayers, ("NUMA domain smaller than PU")); + memmove(&topo_layers[layer+1], &topo_layers[layer], + sizeof(*topo_layers) * (nlayers - layer)); + topo_layers[layer].type = TOPO_TYPE_NODE; + topo_layers[layer].subtype = CG_SHARE_NONE; + nlayers++; + } +#endif + topo_init_root(&topo_root); for (i = 0; i <= max_apic_id; ++i) { if (!cpu_info[i].cpu_present) continue; parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { - node_id = i >> topo_layers[layer].id_shift; +#if defined(DEV_ACPI) && MAXMEMDOM > 1 + if (topo_layers[layer].type == TOPO_TYPE_NODE) { + node_id = acpi_pxm_get_cpu_locality(i); + } else +#endif + node_id = i >> topo_layers[layer].id_shift; parent = topo_add_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); } } parent = &topo_root; for (layer = 0; layer < nlayers; ++layer) { - node_id = boot_cpu_id >> topo_layers[layer].id_shift; +#if defined(DEV_ACPI) && MAXMEMDOM > 1 + if (topo_layers[layer].type == TOPO_TYPE_NODE) + node_id = acpi_pxm_get_cpu_locality(boot_cpu_id); + else +#endif + node_id = boot_cpu_id >> topo_layers[layer].id_shift; node = topo_find_node_by_hwid(parent, node_id, topo_layers[layer].type, topo_layers[layer].subtype); topo_promote_child(node); parent = node; } cpu_topo_probed = 1; } /* * Assign logical CPU IDs to local APICs. */ void assign_cpu_ids(void) { struct topo_node *node; u_int smt_mask; int nhyper; smt_mask = (1u << core_id_shift) - 1; /* * Assign CPU IDs to local APIC IDs and disable any CPUs * beyond MAXCPU. CPU 0 is always assigned to the BSP. */ mp_ncpus = 0; nhyper = 0; TOPO_FOREACH(node, &topo_root) { if (node->type != TOPO_TYPE_PU) continue; if ((node->hwid & smt_mask) != (boot_cpu_id & smt_mask)) cpu_info[node->hwid].cpu_hyperthread = 1; if (resource_disabled("lapic", node->hwid)) { if (node->hwid != boot_cpu_id) cpu_info[node->hwid].cpu_disabled = 1; else printf("Cannot disable BSP, APIC ID = %d\n", node->hwid); } if (!hyperthreading_allowed && cpu_info[node->hwid].cpu_hyperthread) cpu_info[node->hwid].cpu_disabled = 1; if (mp_ncpus >= MAXCPU) cpu_info[node->hwid].cpu_disabled = 1; if (cpu_info[node->hwid].cpu_disabled) { disabled_cpus++; continue; } if (cpu_info[node->hwid].cpu_hyperthread) nhyper++; cpu_apic_ids[mp_ncpus] = node->hwid; apic_cpuids[node->hwid] = mp_ncpus; topo_set_pu_id(node, mp_ncpus); mp_ncpus++; } KASSERT(mp_maxid >= mp_ncpus - 1, ("%s: counters out of sync: max %d, count %d", __func__, mp_maxid, mp_ncpus)); mp_ncores = mp_ncpus - nhyper; smp_threads_per_core = mp_ncpus / mp_ncores; } /* * Print various information about the SMP system hardware and setup. */ void cpu_mp_announce(void) { struct topo_node *node; const char *hyperthread; struct topo_analysis topology; printf("FreeBSD/SMP: "); if (topo_analyze(&topo_root, 1, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); if (disabled_cpus) { printf("FreeBSD/SMP Online: "); if (topo_analyze(&topo_root, 0, &topology)) { printf("%d package(s)", topology.entities[TOPO_LEVEL_PKG]); if (topology.entities[TOPO_LEVEL_GROUP] > 1) printf(" x %d groups", topology.entities[TOPO_LEVEL_GROUP]); if (topology.entities[TOPO_LEVEL_CACHEGROUP] > 1) printf(" x %d cache groups", topology.entities[TOPO_LEVEL_CACHEGROUP]); if (topology.entities[TOPO_LEVEL_CORE] > 0) printf(" x %d core(s)", topology.entities[TOPO_LEVEL_CORE]); if (topology.entities[TOPO_LEVEL_THREAD] > 1) printf(" x %d hardware threads", topology.entities[TOPO_LEVEL_THREAD]); } else { printf("Non-uniform topology"); } printf("\n"); } if (!bootverbose) return; TOPO_FOREACH(node, &topo_root) { switch (node->type) { case TOPO_TYPE_PKG: printf("Package HW ID = %u\n", node->hwid); break; case TOPO_TYPE_CORE: printf("\tCore HW ID = %u\n", node->hwid); break; case TOPO_TYPE_PU: if (cpu_info[node->hwid].cpu_hyperthread) hyperthread = "/HT"; else hyperthread = ""; if (node->subtype == 0) printf("\t\tCPU (AP%s): APIC ID: %u" "(disabled)\n", hyperthread, node->hwid); else if (node->id == 0) printf("\t\tCPU0 (BSP): APIC ID: %u\n", node->hwid); else printf("\t\tCPU%u (AP%s): APIC ID: %u\n", node->id, hyperthread, node->hwid); break; default: /* ignored */ break; } } } /* * Add a scheduling group, a group of logical processors sharing * a particular cache (and, thus having an affinity), to the scheduling * topology. * This function recursively works on lower level caches. */ static void x86topo_add_sched_group(struct topo_node *root, struct cpu_group *cg_root) { struct topo_node *node; int nchildren; int ncores; int i; KASSERT(root->type == TOPO_TYPE_SYSTEM || root->type == TOPO_TYPE_CACHE || - root->type == TOPO_TYPE_GROUP, + root->type == TOPO_TYPE_NODE || root->type == TOPO_TYPE_GROUP, ("x86topo_add_sched_group: bad type: %u", root->type)); CPU_COPY(&root->cpuset, &cg_root->cg_mask); cg_root->cg_count = root->cpu_count; - if (root->type == TOPO_TYPE_SYSTEM) + if (root->type == TOPO_TYPE_CACHE) + cg_root->cg_level = root->subtype; + else cg_root->cg_level = CG_SHARE_NONE; + if (root->type == TOPO_TYPE_NODE) + cg_root->cg_flags = CG_FLAG_NODE; else - cg_root->cg_level = root->subtype; + cg_root->cg_flags = 0; /* * Check how many core nodes we have under the given root node. * If we have multiple logical processors, but not multiple * cores, then those processors must be hardware threads. */ ncores = 0; node = root; while (node != NULL) { if (node->type != TOPO_TYPE_CORE) { node = topo_next_node(root, node); continue; } ncores++; node = topo_next_nonchild_node(root, node); } if (cg_root->cg_level != CG_SHARE_NONE && root->cpu_count > 1 && ncores < 2) - cg_root->cg_flags = CG_FLAG_SMT; + cg_root->cg_flags |= CG_FLAG_SMT; /* * Find out how many cache nodes we have under the given root node. * We ignore cache nodes that cover all the same processors as the * root node. Also, we do not descend below found cache nodes. * That is, we count top-level "non-redundant" caches under the root * node. */ nchildren = 0; node = root; while (node != NULL) { - if ((node->type != TOPO_TYPE_GROUP && - node->type != TOPO_TYPE_CACHE) || - (root->type != TOPO_TYPE_SYSTEM && - CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { + if (CPU_CMP(&node->cpuset, &root->cpuset) == 0) { + if (node->type == TOPO_TYPE_CACHE && + cg_root->cg_level < node->subtype) + cg_root->cg_level = node->subtype; + if (node->type == TOPO_TYPE_NODE) + cg_root->cg_flags |= CG_FLAG_NODE; + node = topo_next_node(root, node); + continue; + } + if (node->type != TOPO_TYPE_GROUP && + node->type != TOPO_TYPE_NODE && + node->type != TOPO_TYPE_CACHE) { node = topo_next_node(root, node); continue; } nchildren++; node = topo_next_nonchild_node(root, node); } /* * We are not interested in nodes including only one CPU each. */ if (nchildren == root->cpu_count) return; cg_root->cg_child = smp_topo_alloc(nchildren); cg_root->cg_children = nchildren; /* * Now find again the same cache nodes as above and recursively * build scheduling topologies for them. */ node = root; i = 0; while (node != NULL) { if ((node->type != TOPO_TYPE_GROUP && + node->type != TOPO_TYPE_NODE && node->type != TOPO_TYPE_CACHE) || - (root->type != TOPO_TYPE_SYSTEM && - CPU_CMP(&node->cpuset, &root->cpuset) == 0)) { + CPU_CMP(&node->cpuset, &root->cpuset) == 0) { node = topo_next_node(root, node); continue; } cg_root->cg_child[i].cg_parent = cg_root; x86topo_add_sched_group(node, &cg_root->cg_child[i]); i++; node = topo_next_nonchild_node(root, node); } } /* * Build the MI scheduling topology from the discovered hardware topology. */ struct cpu_group * cpu_topo(void) { struct cpu_group *cg_root; if (mp_ncpus <= 1) return (smp_topo_none()); cg_root = smp_topo_alloc(1); x86topo_add_sched_group(&topo_root, cg_root); return (cg_root); } static void cpu_alloc(void *dummy __unused) { /* * Dynamically allocate the arrays that depend on the * maximum APIC ID. */ cpu_info = malloc(sizeof(*cpu_info) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); apic_cpuids = malloc(sizeof(*apic_cpuids) * (max_apic_id + 1), M_CPUS, M_WAITOK | M_ZERO); } SYSINIT(cpu_alloc, SI_SUB_CPU, SI_ORDER_FIRST, cpu_alloc, NULL); /* * Add a logical CPU to the topology. */ void cpu_add(u_int apic_id, char boot_cpu) { if (apic_id > max_apic_id) { panic("SMP: APIC ID %d too high", apic_id); return; } KASSERT(cpu_info[apic_id].cpu_present == 0, ("CPU %u added twice", apic_id)); cpu_info[apic_id].cpu_present = 1; if (boot_cpu) { KASSERT(boot_cpu_id == -1, ("CPU %u claims to be BSP, but CPU %u already is", apic_id, boot_cpu_id)); boot_cpu_id = apic_id; cpu_info[apic_id].cpu_bsp = 1; } if (bootverbose) printf("SMP: Added CPU %u (%s)\n", apic_id, boot_cpu ? "BSP" : "AP"); } void cpu_mp_setmaxid(void) { /* * mp_ncpus and mp_maxid should be already set by calls to cpu_add(). * If there were no calls to cpu_add() assume this is a UP system. */ if (mp_ncpus == 0) mp_ncpus = 1; } int cpu_mp_probe(void) { /* * Always record BSP in CPU map so that the mbuf init code works * correctly. */ CPU_SETOF(0, &all_cpus); return (mp_ncpus > 1); } /* * AP CPU's call this to initialize themselves. */ void init_secondary_tail(void) { u_int cpuid; pmap_activate_boot(vmspace_pmap(proc0.p_vmspace)); /* * On real hardware, switch to x2apic mode if possible. Do it * after aps_ready was signalled, to avoid manipulating the * mode while BSP might still want to send some IPI to us * (second startup IPI is ignored on modern hardware etc). */ lapic_xapic_mode(); /* Initialize the PAT MSR. */ pmap_init_pat(); /* set up CPU registers and state */ cpu_setregs(); /* set up SSE/NX */ initializecpu(); /* set up FPU state on the AP */ #ifdef __amd64__ fpuinit(); #else npxinit(false); #endif if (cpu_ops.cpu_init) cpu_ops.cpu_init(); /* A quick check from sanity claus */ cpuid = PCPU_GET(cpuid); if (PCPU_GET(apic_id) != lapic_id()) { printf("SMP: cpuid = %d\n", cpuid); printf("SMP: actual apic_id = %d\n", lapic_id()); printf("SMP: correct apic_id = %d\n", PCPU_GET(apic_id)); panic("cpuid mismatch! boom!!"); } /* Initialize curthread. */ KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread")); PCPU_SET(curthread, PCPU_GET(idlethread)); mtx_lock_spin(&ap_boot_mtx); mca_init(); /* Init local apic for irq's */ lapic_setup(1); /* Set memory range attributes for this CPU to match the BSP */ mem_range_AP_init(); smp_cpus++; CTR1(KTR_SMP, "SMP: AP CPU #%d Launched", cpuid); if (bootverbose) printf("SMP: AP CPU #%d Launched!\n", cpuid); else printf("%s%d%s", smp_cpus == 2 ? "Launching APs: " : "", cpuid, smp_cpus == mp_ncpus ? "\n" : " "); /* Determine if we are a logical CPU. */ if (cpu_info[PCPU_GET(apic_id)].cpu_hyperthread) CPU_SET(cpuid, &logical_cpus_mask); if (bootverbose) lapic_dump("AP"); if (smp_cpus == mp_ncpus) { /* enable IPI's, tlb shootdown, freezes etc */ atomic_store_rel_int(&smp_started, 1); } #ifdef __amd64__ if (pmap_pcid_enabled) load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel); load_es(_udatasel); load_fs(_ufssel); #endif mtx_unlock_spin(&ap_boot_mtx); /* Wait until all the AP's are up. */ while (atomic_load_acq_int(&smp_started) == 0) ia32_pause(); #ifndef EARLY_AP_STARTUP /* Start per-CPU event timers. */ cpu_initclocks_ap(); #endif kcsan_cpu_init(cpuid); /* * Assert that smp_after_idle_runnable condition is reasonable. */ MPASS(PCPU_GET(curpcb) == NULL); sched_throw(NULL); panic("scheduler returned us to %s", __func__); /* NOTREACHED */ } static void smp_after_idle_runnable(void *arg __unused) { struct pcpu *pc; int cpu; for (cpu = 1; cpu < mp_ncpus; cpu++) { pc = pcpu_find(cpu); while (atomic_load_ptr(&pc->pc_curpcb) == NULL) cpu_spinwait(); kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages * PAGE_SIZE); } } SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY, smp_after_idle_runnable, NULL); /* * We tell the I/O APIC code about all the CPUs we want to receive * interrupts. If we don't want certain CPUs to receive IRQs we * can simply not tell the I/O APIC code about them in this function. * We also do not tell it about the BSP since it tells itself about * the BSP internally to work with UP kernels and on UP machines. */ void set_interrupt_apic_ids(void) { u_int i, apic_id; for (i = 0; i < MAXCPU; i++) { apic_id = cpu_apic_ids[i]; if (apic_id == -1) continue; if (cpu_info[apic_id].cpu_bsp) continue; if (cpu_info[apic_id].cpu_disabled) continue; /* Don't let hyperthreads service interrupts. */ if (cpu_info[apic_id].cpu_hyperthread && !hyperthreading_intr_allowed) continue; intr_add_cpu(i); } } #ifdef COUNT_XINVLTLB_HITS u_int xhits_gbl[MAXCPU]; u_int xhits_pg[MAXCPU]; u_int xhits_rng[MAXCPU]; static SYSCTL_NODE(_debug, OID_AUTO, xhits, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, global, CTLFLAG_RW, &xhits_gbl, sizeof(xhits_gbl), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, page, CTLFLAG_RW, &xhits_pg, sizeof(xhits_pg), "IU", ""); SYSCTL_OPAQUE(_debug_xhits, OID_AUTO, range, CTLFLAG_RW, &xhits_rng, sizeof(xhits_rng), "IU", ""); u_int ipi_global; u_int ipi_page; u_int ipi_range; u_int ipi_range_size; SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_global, CTLFLAG_RW, &ipi_global, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_page, CTLFLAG_RW, &ipi_page, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range, CTLFLAG_RW, &ipi_range, 0, ""); SYSCTL_INT(_debug_xhits, OID_AUTO, ipi_range_size, CTLFLAG_RW, &ipi_range_size, 0, ""); #endif /* COUNT_XINVLTLB_HITS */ /* * Init and startup IPI. */ void ipi_startup(int apic_id, int vector) { /* * This attempts to follow the algorithm described in the * Intel Multiprocessor Specification v1.4 in section B.4. * For each IPI, we allow the local APIC ~20us to deliver the * IPI. If that times out, we panic. */ /* * first we do an INIT IPI: this INIT IPI might be run, resetting * and running the target CPU. OR this INIT IPI might be latched (P5 * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be * ignored. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); lapic_ipi_wait(100); /* Explicitly deassert the INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_LEVEL | APIC_LEVEL_DEASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_INIT, apic_id); DELAY(10000); /* wait ~10mS */ /* * next we do a STARTUP IPI: the previous INIT IPI might still be * latched, (P5 bug) this 1st STARTUP would then terminate * immediately, and the previously started INIT IPI would continue. OR * the previous INIT IPI has already run. and this STARTUP IPI will * run. OR the previous INIT IPI was ignored. and this STARTUP IPI * will run. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver first STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ /* * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is * recognized after hardware RESET or INIT IPI. */ lapic_ipi_raw(APIC_DEST_DESTFLD | APIC_TRIGMOD_EDGE | APIC_LEVEL_ASSERT | APIC_DESTMODE_PHY | APIC_DELMODE_STARTUP | vector, apic_id); if (!lapic_ipi_wait(100)) panic("Failed to deliver second STARTUP IPI to APIC %d", apic_id); DELAY(200); /* wait ~200uS */ } static bool ipi_bitmap_set(int cpu, u_int ipi) { u_int bitmap, old, new; u_int *cpu_bitmap; bitmap = 1 << ipi; cpu_bitmap = &cpuid_to_pcpu[cpu]->pc_ipi_bitmap; old = *cpu_bitmap; for (;;) { if ((old & bitmap) != 0) break; new = old | bitmap; if (atomic_fcmpset_int(cpu_bitmap, &old, new)) break; } return (old != 0); } /* * Send an IPI to specified CPU handling the bitmap logic. */ static void ipi_send_cpu(int cpu, u_int ipi) { KASSERT((u_int)cpu < MAXCPU && cpu_apic_ids[cpu] != -1, ("IPI to non-existent CPU %d", cpu)); if (IPI_IS_BITMAPED(ipi)) { if (ipi_bitmap_set(cpu, ipi)) return; ipi = IPI_BITMAP_VECTOR; } lapic_ipi_vectored(ipi, cpu_apic_ids[cpu]); } void ipi_bitmap_handler(struct trapframe frame) { struct trapframe *oldframe; struct thread *td; int cpu = PCPU_GET(cpuid); u_int ipi_bitmap; kasan_mark(&frame, sizeof(frame), sizeof(frame), 0); td = curthread; ipi_bitmap = atomic_readandclear_int(&cpuid_to_pcpu[cpu]-> pc_ipi_bitmap); /* * sched_preempt() must be called to clear the pending preempt * IPI to enable delivery of further preempts. However, the * critical section will cause extra scheduler lock thrashing * when used unconditionally. Only critical_enter() if * hardclock must also run, which requires the section entry. */ if (ipi_bitmap & (1 << IPI_HARDCLOCK)) critical_enter(); td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = &frame; #if defined(STACK) || defined(DDB) if (ipi_bitmap & (1 << IPI_TRACE)) stack_capture_intr(); #endif if (ipi_bitmap & (1 << IPI_PREEMPT)) { #ifdef COUNT_IPIS (*ipi_preempt_counts[cpu])++; #endif sched_preempt(td); } if (ipi_bitmap & (1 << IPI_AST)) { #ifdef COUNT_IPIS (*ipi_ast_counts[cpu])++; #endif /* Nothing to do for AST */ } if (ipi_bitmap & (1 << IPI_HARDCLOCK)) { #ifdef COUNT_IPIS (*ipi_hardclock_counts[cpu])++; #endif hardclockintr(); } td->td_intr_frame = oldframe; td->td_intr_nesting_level--; if (ipi_bitmap & (1 << IPI_HARDCLOCK)) critical_exit(); } /* * send an IPI to a set of cpus. */ void ipi_selected(cpuset_t cpus, u_int ipi) { int cpu; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &cpus); while ((cpu = CPU_FFS(&cpus)) != 0) { cpu--; CPU_CLR(cpu, &cpus); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } } /* * send an IPI to a specific CPU. */ void ipi_cpu(int cpu, u_int ipi) { /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) CPU_SET_ATOMIC(cpu, &ipi_stop_nmi_pending); CTR3(KTR_SMP, "%s: cpu: %d ipi: %x", __func__, cpu, ipi); ipi_send_cpu(cpu, ipi); } /* * send an IPI to all CPUs EXCEPT myself */ void ipi_all_but_self(u_int ipi) { cpuset_t other_cpus; int cpu, c; /* * IPI_STOP_HARD maps to a NMI and the trap handler needs a bit * of help in order to understand what is the source. * Set the mask of receiving CPUs for this purpose. */ if (ipi == IPI_STOP_HARD) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); CPU_OR_ATOMIC(&ipi_stop_nmi_pending, &other_cpus); } CTR2(KTR_SMP, "%s: ipi: %x", __func__, ipi); if (IPI_IS_BITMAPED(ipi)) { cpu = PCPU_GET(cpuid); CPU_FOREACH(c) { if (c != cpu) ipi_bitmap_set(c, ipi); } ipi = IPI_BITMAP_VECTOR; } lapic_ipi_vectored(ipi, APIC_IPI_DEST_OTHERS); } void ipi_self_from_nmi(u_int vector) { lapic_ipi_vectored(vector, APIC_IPI_DEST_SELF); /* Wait for IPI to finish. */ if (!lapic_ipi_wait(50000)) { if (KERNEL_PANICKED()) return; else panic("APIC: IPI is stuck"); } } int ipi_nmi_handler(void) { u_int cpuid; /* * As long as there is not a simple way to know about a NMI's * source, if the bitmask for the current CPU is present in * the global pending bitword an IPI_STOP_HARD has been issued * and should be handled. */ cpuid = PCPU_GET(cpuid); if (!CPU_ISSET(cpuid, &ipi_stop_nmi_pending)) return (1); CPU_CLR_ATOMIC(cpuid, &ipi_stop_nmi_pending); cpustop_handler(); return (0); } int nmi_kdb_lock; void nmi_call_kdb_smp(u_int type, struct trapframe *frame) { int cpu; bool call_post; cpu = PCPU_GET(cpuid); if (atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) { nmi_call_kdb(cpu, type, frame); call_post = false; } else { savectx(&stoppcbs[cpu]); CPU_SET_ATOMIC(cpu, &stopped_cpus); while (!atomic_cmpset_acq_int(&nmi_kdb_lock, 0, 1)) ia32_pause(); call_post = true; } atomic_store_rel_int(&nmi_kdb_lock, 0); if (call_post) cpustop_handler_post(cpu); } /* * Handle an IPI_STOP by saving our current context and spinning (or mwaiting, * if available) until we are resumed. */ void cpustop_handler(void) { struct monitorbuf *mb; u_int cpu; bool use_mwait; cpu = PCPU_GET(cpuid); savectx(&stoppcbs[cpu]); use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 && !mwait_cpustop_broken); if (use_mwait) { mb = PCPU_PTR(monitorbuf); atomic_store_int(&mb->stop_state, MONITOR_STOPSTATE_STOPPED); } /* Indicate that we are stopped */ CPU_SET_ATOMIC(cpu, &stopped_cpus); /* Wait for restart */ while (!CPU_ISSET(cpu, &started_cpus)) { if (use_mwait) { cpu_monitor(mb, 0, 0); if (atomic_load_int(&mb->stop_state) == MONITOR_STOPSTATE_STOPPED) cpu_mwait(0, MWAIT_C1); continue; } ia32_pause(); /* * Halt non-BSP CPUs on panic -- we're never going to need them * again, and might as well save power / release resources * (e.g., overprovisioned VM infrastructure). */ while (__predict_false(!IS_BSP() && KERNEL_PANICKED())) halt(); } cpustop_handler_post(cpu); } static void cpustop_handler_post(u_int cpu) { CPU_CLR_ATOMIC(cpu, &started_cpus); CPU_CLR_ATOMIC(cpu, &stopped_cpus); /* * We don't broadcast TLB invalidations to other CPUs when they are * stopped. Hence, we clear the TLB before resuming. */ invltlb_glob(); #if defined(__amd64__) && (defined(DDB) || defined(GDB)) amd64_db_resume_dbreg(); #endif if (cpu == 0 && cpustop_restartfunc != NULL) { cpustop_restartfunc(); cpustop_restartfunc = NULL; } } /* * Handle an IPI_SUSPEND by saving our current context and spinning until we * are resumed. */ void cpususpend_handler(void) { u_int cpu; mtx_assert(&smp_ipi_mtx, MA_NOTOWNED); cpu = PCPU_GET(cpuid); if (savectx(&susppcbs[cpu]->sp_pcb)) { #ifdef __amd64__ fpususpend(susppcbs[cpu]->sp_fpususpend); #else npxsuspend(susppcbs[cpu]->sp_fpususpend); #endif /* * suspended_cpus is cleared shortly after each AP is restarted * by a Startup IPI, so that the BSP can proceed to restarting * the next AP. * * resuming_cpus gets cleared when the AP completes * initialization after having been released by the BSP. * resuming_cpus is probably not the best name for the * variable, because it is actually a set of processors that * haven't resumed yet and haven't necessarily started resuming. * * Note that suspended_cpus is meaningful only for ACPI suspend * as it's not really used for Xen suspend since the APs are * automatically restored to the running state and the correct * context. For the same reason resumectx is never called in * that case. */ CPU_SET_ATOMIC(cpu, &suspended_cpus); CPU_SET_ATOMIC(cpu, &resuming_cpus); /* * Invalidate the cache after setting the global status bits. * The last AP to set its bit may end up being an Owner of the * corresponding cache line in MOESI protocol. The AP may be * stopped before the cache line is written to the main memory. */ wbinvd(); } else { #ifdef __amd64__ fpuresume(susppcbs[cpu]->sp_fpususpend); #else npxresume(susppcbs[cpu]->sp_fpususpend); #endif pmap_init_pat(); initializecpu(); PCPU_SET(switchtime, 0); PCPU_SET(switchticks, ticks); /* Indicate that we have restarted and restored the context. */ CPU_CLR_ATOMIC(cpu, &suspended_cpus); } /* Wait for resume directive */ while (!CPU_ISSET(cpu, &toresume_cpus)) ia32_pause(); /* Re-apply microcode updates. */ ucode_reload(); #ifdef __i386__ /* Finish removing the identity mapping of low memory for this AP. */ invltlb_glob(); #endif if (cpu_ops.cpu_resume) cpu_ops.cpu_resume(); #ifdef __amd64__ if (vmm_resume_p) vmm_resume_p(); #endif /* Resume MCA and local APIC */ lapic_xapic_mode(); mca_resume(); lapic_setup(0); /* Indicate that we are resumed */ CPU_CLR_ATOMIC(cpu, &resuming_cpus); CPU_CLR_ATOMIC(cpu, &suspended_cpus); CPU_CLR_ATOMIC(cpu, &toresume_cpus); } /* * Handle an IPI_SWI by waking delayed SWI thread. */ void ipi_swi_handler(struct trapframe frame) { intr_event_handle(clk_intr_event, &frame); } /* * This is called once the rest of the system is up and running and we're * ready to let the AP's out of the pen. */ static void release_aps(void *dummy __unused) { if (mp_ncpus == 1) return; atomic_store_rel_int(&aps_ready, 1); while (smp_started == 0) ia32_pause(); } SYSINIT(start_aps, SI_SUB_SMP, SI_ORDER_FIRST, release_aps, NULL); #ifdef COUNT_IPIS /* * Setup interrupt counters for IPI handlers. */ static void mp_ipi_intrcnt(void *dummy) { char buf[64]; int i; CPU_FOREACH(i) { snprintf(buf, sizeof(buf), "cpu%d:invltlb", i); intrcnt_add(buf, &ipi_invltlb_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlrng", i); intrcnt_add(buf, &ipi_invlrng_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlpg", i); intrcnt_add(buf, &ipi_invlpg_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:invlcache", i); intrcnt_add(buf, &ipi_invlcache_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:preempt", i); intrcnt_add(buf, &ipi_preempt_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:ast", i); intrcnt_add(buf, &ipi_ast_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:rendezvous", i); intrcnt_add(buf, &ipi_rendezvous_counts[i]); snprintf(buf, sizeof(buf), "cpu%d:hardclock", i); intrcnt_add(buf, &ipi_hardclock_counts[i]); } } SYSINIT(mp_ipi_intrcnt, SI_SUB_INTR, SI_ORDER_MIDDLE, mp_ipi_intrcnt, NULL); #endif