Index: sys/kern/kern_clocksource.c
===================================================================
--- sys/kern/kern_clocksource.c
+++ sys/kern/kern_clocksource.c
@@ -67,7 +67,7 @@
 
 struct pcpu_state;
 static sbintime_t	getnextcpuevent(struct pcpu_state *state, int idle);
-static sbintime_t	getnextevent(void);
+static sbintime_t	getnextevent(struct pcpu_state *state);
 static int		handleevents(sbintime_t now, int fake);
 
 static struct mtx	et_hw_mtx;
@@ -256,12 +256,10 @@
  * Schedule binuptime of the next event on all CPUs.
  */
 static sbintime_t
-getnextevent(void)
+getnextevent(struct pcpu_state *state)
 {
-	struct pcpu_state *state;
 	sbintime_t event;
 
-	state = DPCPU_PTR(timerstate);
 	ET_HW_ASSERT_LOCKED(state);
 	event = state->nextevent;
 #ifdef SMP
@@ -367,10 +365,10 @@
 	uint64_t	tmp;
 	int		eq;
 
-	if (timer->et_flags & ET_FLAGS_PERCPU) {
-		state = DPCPU_PTR(timerstate);
+	state = DPCPU_PTR(timerstate);
+	if (timer->et_flags & ET_FLAGS_PERCPU)
 		next = &state->nexttick;
-	} else
+	else
 		next = &nexttick;
 	ET_HW_ASSERT_LOCKED(state);
 	if (periodic) {
@@ -390,7 +388,7 @@
 			et_start(timer, new, timerperiod);
 		}
 	} else {
-		new = getnextevent();
+		new = getnextevent(state);
 		eq = (new == *next);
 		CTR3(KTR_SPARE2, "load: next %d.%08x eq %d",
 		    (int)(new >> 32), (u_int)(new & 0xffffffff), eq);
Index: sys/kern/sched_ule.c
===================================================================
--- sys/kern/sched_ule.c
+++ sys/kern/sched_ule.c
@@ -226,9 +226,8 @@
 static int __read_mostly sched_idlespinthresh = -1;
 
 /*
- * tdq - per processor runqs and statistics.  All fields are protected by the
- * tdq_lock.  The load and lowpri may be accessed without to avoid excess
- * locking in sched_pickcpu();
+ * tdq - per processor runqs and statistics.  A mutex synchronizes access to
+ * most fields.  Some fields are loaded or modified without the mutex.
  */
 struct tdq {
 	/*
@@ -239,12 +238,12 @@
 	struct mtx_padalign tdq_lock;		/* run queue lock. */
 	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
 	struct thread	*tdq_curthread;		/* Current executing thread. */
-	volatile int	tdq_load;		/* Aggregate load. */
-	volatile int	tdq_cpu_idle;		/* cpu_idle() is active. */
+	int		tdq_load;		/* Aggregate load. */
 	int		tdq_sysload;		/* For loadavg, !ITHD load. */
-	volatile int	tdq_transferable;	/* Transferable thread count. */
-	volatile short	tdq_switchcnt;		/* Switches this tick. */
-	volatile short	tdq_oldswitchcnt;	/* Switches last tick. */
+	int		tdq_cpu_idle;		/* cpu_idle() is active. */
+	int		tdq_transferable;	/* Transferable thread count. */
+	short		tdq_switchcnt;		/* Switches this tick. */
+	short		tdq_oldswitchcnt;	/* Switches last tick. */
 	u_char		tdq_lowpri;		/* Lowest priority thread. */
 	u_char		tdq_owepreempt;		/* Remote preemption pending. */
 	u_char		tdq_idx;		/* Current insert index. */
@@ -257,12 +256,20 @@
 #ifdef KTR
 	char		tdq_loadname[TDQ_LOADNAME_LEN];
 #endif
-} __aligned(64);
+};
 
 /* Idle thread states and config. */
 #define	TDQ_RUNNING	1
 #define	TDQ_IDLE	2
 
+/* Lockless accessors. */
+#define	TDQ_LOAD(tdq)		atomic_load_int(&(tdq)->tdq_load)
+#define	TDQ_TRANSFERABLE(tdq)	atomic_load_int(&(tdq)->tdq_transferable)
+#define	TDQ_SWITCHCNT(tdq)	(atomic_load_short(&(tdq)->tdq_switchcnt) + \
+	    atomic_load_short(&(tdq)->tdq_oldswitchcnt))
+#define	TDQ_SWITCHCNT_INC(tdq)	(atomic_store_short(&(tdq)->tdq_switchcnt, \
+	    atomic_load_short(&(tdq)->tdq_switchcnt) + 1))
+
 #ifdef SMP
 struct cpu_group __read_mostly *cpu_top;	/* CPU topology */
 
@@ -322,7 +329,7 @@
 static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
 static __inline void tdq_runq_rem(struct tdq *, struct thread *);
 static inline int sched_shouldpreempt(int, int, int);
-void tdq_print(int cpu);
+static void tdq_print(int cpu);
 static void runq_print(struct runq *rq);
 static int tdq_add(struct tdq *, struct thread *, int);
 #ifdef SMP
@@ -397,7 +404,7 @@
 /*
  * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
  */
-void
+static void __unused
 tdq_print(int cpu)
 {
 	struct tdq *tdq;
@@ -698,7 +705,7 @@
 		if (!CPU_ISSET(c, &cg->cg_mask))
 			continue;
 		tdq = TDQ_CPU(c);
-		l = tdq->tdq_load;
+		l = TDQ_LOAD(tdq);
 		if (c == s->cs_prefer) {
 			if (__predict_false(s->cs_running))
 				l--;
@@ -768,14 +775,14 @@
 		if (!CPU_ISSET(c, &cg->cg_mask))
 			continue;
 		tdq = TDQ_CPU(c);
-		l = tdq->tdq_load;
+		l = TDQ_LOAD(tdq);
 		load = l * 256;
 		total += load;
 
 		/*
 		 * Check this CPU is acceptable.
 		 */
-		if (l < s->cs_load || (tdq->tdq_transferable < s->cs_trans) ||
+		if (l < s->cs_load || TDQ_TRANSFERABLE(tdq) < s->cs_trans ||
 		    !CPU_ISSET(c, s->cs_mask))
 			continue;
 
@@ -847,7 +854,7 @@
 		if (CPU_EMPTY(&lmask))
 			break;
 		tdq = TDQ_CPU(high);
-		if (tdq->tdq_load == 1) {
+		if (TDQ_LOAD(tdq) == 1) {
 			/*
 			 * There is only one running thread.  We can't move
 			 * it from here, so tell it to pick new CPU by itself.
@@ -865,9 +872,9 @@
 		}
 		anylow = 1;
 nextlow:
-		if (tdq->tdq_transferable == 0)
+		if (TDQ_TRANSFERABLE(tdq) == 0)
 			continue;
-		low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high, 1);
+		low = sched_lowest(cg, &lmask, -1, TDQ_LOAD(tdq) - 1, high, 1);
 		/* Stop if we looked well and found no less loaded CPU. */
 		if (anylow && low == -1)
 			break;
@@ -1012,15 +1019,15 @@
 		return (1);
 	CPU_FILL(&mask);
 	CPU_CLR(PCPU_GET(cpuid), &mask);
-	restart:
-	switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+restart:
+	switchcnt = TDQ_SWITCHCNT(tdq);
 	for (cg = tdq->tdq_cg, goup = 0; ; ) {
 		cpu = sched_highest(cg, &mask, steal_thresh, 1);
 		/*
 		 * We were assigned a thread but not preempted.  Returning
 		 * 0 here will cause our caller to switch to it.
 		 */
-		if (tdq->tdq_load)
+		if (TDQ_LOAD(tdq))
 			return (0);
 
 		/*
@@ -1056,8 +1063,8 @@
 		 * this situation about 20% of the time on an 8 core
 		 * 16 thread Ryzen 7, but it still helps performance.
 		 */
-		if (steal->tdq_load < steal_thresh ||
-		    steal->tdq_transferable == 0)
+		if (TDQ_LOAD(steal) < steal_thresh ||
+		    TDQ_TRANSFERABLE(steal) == 0)
 			goto restart;
 		/*
 		 * Try to lock both queues.  If we are assigned a thread while
@@ -1082,9 +1089,9 @@
 		 * of date.  The latter is rare.  In either case restart
 		 * the search.
 		 */
-		if (steal->tdq_load < steal_thresh ||
-		    steal->tdq_transferable == 0 ||
-		    switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) {
+		if (TDQ_LOAD(steal) < steal_thresh ||
+		    TDQ_TRANSFERABLE(steal) == 0 ||
+		    switchcnt != TDQ_SWITCHCNT(tdq)) {
 			tdq_unlock_pair(tdq, steal);
 			goto restart;
 		}
@@ -1148,7 +1155,8 @@
 	 */
 	cpu = TDQ_ID(tdq);
 	if (TD_IS_IDLETHREAD(tdq->tdq_curthread) &&
-	    tdq->tdq_cpu_idle != 0 && cpu_idle_wakeup(cpu))
+	    atomic_load_int(&tdq->tdq_cpu_idle) != 0 &&
+	    cpu_idle_wakeup(cpu))
 		return;
 
 	/*
@@ -1419,7 +1427,7 @@
 	tdq = TDQ_CPU(cpu);
 	if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri &&
 	    tdq->tdq_lowpri < PRI_MIN_IDLE &&
-	    TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) {
+	    TDQ_LOAD(TDQ_SELF()) <= TDQ_LOAD(tdq) + 1) {
 		SCHED_STAT_INC(pickcpu_local);
 		cpu = self;
 	}
@@ -2015,7 +2023,7 @@
 		 * If a thread was added while interrupts were disabled don't
 		 * steal one here.
 		 */
-		if (tdq->tdq_load > 0) {
+		if (TDQ_LOAD(tdq) > 0) {
 			TDQ_LOCK(tdq);
 			break;
 		}
@@ -2057,8 +2065,8 @@
 		 * At this point unconditionally exit the loop to bound
 		 * the time spent in the critcal section.
 		 */
-		if (steal->tdq_load < steal_thresh ||
-		    steal->tdq_transferable == 0)
+		if (TDQ_LOAD(steal) < steal_thresh ||
+		    TDQ_TRANSFERABLE(steal) == 0)
 			continue;
 		/*
 		 * Try to lock both queues.  If we are assigned a thread while
@@ -2075,8 +2083,8 @@
 		 * The data returned by sched_highest() is stale and
 		 * the chosen CPU no longer has an eligible thread.
 		 */
-		if (steal->tdq_load < steal_thresh ||
-		    steal->tdq_transferable == 0) {
+		if (TDQ_LOAD(steal) < steal_thresh ||
+		    TDQ_TRANSFERABLE(steal) == 0) {
 			TDQ_UNLOCK(steal);
 			break;
 		}
@@ -2177,9 +2185,9 @@
 	    (flags & SW_PREEMPT) != 0;
 	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_PICKCPU | TDF_SLICEEND);
 	td->td_owepreempt = 0;
-	tdq->tdq_owepreempt = 0;
+	atomic_store_char(&tdq->tdq_owepreempt, 0);
 	if (!TD_IS_IDLETHREAD(td))
-		tdq->tdq_switchcnt++;
+		TDQ_SWITCHCNT_INC(tdq);
 
 	/*
 	 * Always block the thread lock so we can drop the tdq lock early.
@@ -2539,6 +2547,7 @@
 	 */
 	tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
 	tdq->tdq_switchcnt = tdq->tdq_load;
+
 	/*
 	 * Advance the insert index once for each tick to ensure that all
 	 * threads get a chance to run.
@@ -2595,10 +2604,10 @@
 
 	tdq = TDQ_SELF();
 	if ((curthread->td_flags & TDF_IDLETD) != 0) {
-		if (tdq->tdq_load > 0)
+		if (TDQ_LOAD(tdq) > 0)
 			goto out;
 	} else
-		if (tdq->tdq_load - 1 > 0)
+		if (TDQ_LOAD(tdq) - 1 > 0)
 			goto out;
 	load = 0;
 out:
@@ -2894,10 +2903,10 @@
 
 	total = 0;
 	CPU_FOREACH(i)
-		total += TDQ_CPU(i)->tdq_sysload;
+		total += atomic_load_int(&TDQ_CPU(i)->tdq_sysload);
 	return (total);
 #else
-	return (TDQ_SELF()->tdq_sysload);
+	return (atomic_load_int(&TDQ_SELF()->tdq_sysload));
 #endif
 }
 
@@ -2937,18 +2946,18 @@
 	THREAD_NO_SLEEPING();
 	oldswitchcnt = -1;
 	for (;;) {
-		if (tdq->tdq_load) {
+		if (TDQ_LOAD(tdq)) {
 			thread_lock(td);
 			mi_switch(SW_VOL | SWT_IDLE);
 		}
-		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+		switchcnt = TDQ_SWITCHCNT(tdq);
 #ifdef SMP
 		if (always_steal || switchcnt != oldswitchcnt) {
 			oldswitchcnt = switchcnt;
 			if (tdq_idled(tdq) == 0)
 				continue;
 		}
-		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+		switchcnt = TDQ_SWITCHCNT(tdq);
 #else
 		oldswitchcnt = switchcnt;
 #endif
@@ -2961,19 +2970,19 @@
 		 */
 		if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
 			for (i = 0; i < sched_idlespins; i++) {
-				if (tdq->tdq_load)
+				if (TDQ_LOAD(tdq))
 					break;
 				cpu_spinwait();
 			}
 		}
 
 		/* If there was context switch during spin, restart it. */
-		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
-		if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
+		switchcnt = TDQ_SWITCHCNT(tdq);
+		if (TDQ_LOAD(tdq) != 0 || switchcnt != oldswitchcnt)
 			continue;
 
 		/* Run main MD idle handler. */
-		tdq->tdq_cpu_idle = 1;
+		atomic_store_int(&tdq->tdq_cpu_idle, 1);
 		/*
 		 * Make sure that the tdq_cpu_idle update is globally visible
 		 * before cpu_idle() reads tdq_load.  The order is important
@@ -2985,21 +2994,21 @@
 		 * threads often enough to make it worthwhile to do so in
 		 * order to avoid calling cpu_idle().
 		 */
-		if (tdq->tdq_load != 0) {
-			tdq->tdq_cpu_idle = 0;
+		if (TDQ_LOAD(tdq) != 0) {
+			atomic_store_int(&tdq->tdq_cpu_idle, 0);
 			continue;
 		}
 		cpu_idle(switchcnt * 4 > sched_idlespinthresh);
-		tdq->tdq_cpu_idle = 0;
+		atomic_store_int(&tdq->tdq_cpu_idle, 0);
 
 		/*
 		 * Account thread-less hardware interrupts and
 		 * other wakeup reasons equal to context switches.
 		 */
-		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+		switchcnt = TDQ_SWITCHCNT(tdq);
 		if (switchcnt != oldswitchcnt)
 			continue;
-		tdq->tdq_switchcnt++;
+		TDQ_SWITCHCNT_INC(tdq);
 		oldswitchcnt++;
 	}
 }
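
Note (reviewer sketch, not part of the patch): the standalone fragment below restates the pattern the diff applies, namely plain non-volatile fields accessed through explicit atomic loads and stores, with a sequentially consistent fence between publishing tdq_cpu_idle and re-checking tdq_load, as described by the "globally visible" comment in sched_idletd(). It is a minimal userland illustration that uses C11 <stdatomic.h> in place of FreeBSD's atomic(9) atomic_load_int()/atomic_store_int() and atomic_thread_fence_seq_cst(); struct toy_tdq, TOY_LOAD(), toy_may_sleep() and toy_need_wakeup() are hypothetical names invented for the example, not kernel interfaces.

/*
 * Illustrative sketch only (hypothetical names): mirrors the lockless
 * tdq_load/tdq_cpu_idle handshake, not the actual sched_ule(4) code.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_tdq {
	/* Written under a queue lock in the real code, read locklessly. */
	atomic_int	load;		/* stands in for tdq_load */
	atomic_int	cpu_idle;	/* stands in for tdq_cpu_idle */
};

/* Relaxed load accessor, analogous in spirit to the TDQ_LOAD() macro. */
#define	TOY_LOAD(t)	atomic_load_explicit(&(t)->load, memory_order_relaxed)

/*
 * Idle path: publish cpu_idle = 1, then re-check load.  The seq_cst fence
 * keeps the store from being reordered after the load, which is the race
 * the ordering comment before cpu_idle() is guarding against.
 */
static bool
toy_may_sleep(struct toy_tdq *t)
{
	atomic_store_explicit(&t->cpu_idle, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (TOY_LOAD(t) != 0) {
		/* Work arrived concurrently; stay awake. */
		atomic_store_explicit(&t->cpu_idle, 0, memory_order_relaxed);
		return (false);
	}
	return (true);
}

/* Wakeup path: enqueue work (bump load), then decide whether to kick the CPU. */
static bool
toy_need_wakeup(struct toy_tdq *t)
{
	atomic_fetch_add_explicit(&t->load, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	return (atomic_load_explicit(&t->cpu_idle, memory_order_relaxed) != 0);
}

static struct toy_tdq tq;

int
main(void)
{
	/* Producer side: one unit of work arrives; the CPU is not idle yet. */
	printf("need wakeup: %d\n", toy_need_wakeup(&tq));
	/* Idle side: the load published above forbids going to sleep. */
	printf("may sleep:   %d\n", toy_may_sleep(&tq));
	return (0);
}

Both sides store their own flag and then load the other side's across a seq_cst fence, so at least one of them observes the other's update; dropping either fence reintroduces the lost-wakeup window that the patch's ordering comment warns about.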