Index: sys/kern/kern_clocksource.c
===================================================================
--- sys/kern/kern_clocksource.c
+++ sys/kern/kern_clocksource.c
@@ -67,7 +67,7 @@
 
 struct pcpu_state;
 static sbintime_t	getnextcpuevent(struct pcpu_state *state, int idle);
-static sbintime_t	getnextevent(void);
+static sbintime_t	getnextevent(struct pcpu_state *state);
 static int		handleevents(sbintime_t now, int fake);
 
 static struct mtx	et_hw_mtx;
@@ -256,12 +256,10 @@
  * Schedule binuptime of the next event on all CPUs.
  */
 static sbintime_t
-getnextevent(void)
+getnextevent(struct pcpu_state *state)
 {
-	struct pcpu_state *state;
 	sbintime_t event;
 
-	state = DPCPU_PTR(timerstate);
 	ET_HW_ASSERT_LOCKED(state);
 	event = state->nextevent;
 #ifdef SMP
@@ -367,10 +365,10 @@
 	uint64_t	tmp;
 	int		eq;
 
-	if (timer->et_flags & ET_FLAGS_PERCPU) {
-		state = DPCPU_PTR(timerstate);
+	state = DPCPU_PTR(timerstate);
+	if (timer->et_flags & ET_FLAGS_PERCPU)
 		next = &state->nexttick;
-	} else
+	else
 		next = &nexttick;
 	ET_HW_ASSERT_LOCKED(state);
 	if (periodic) {
@@ -390,7 +388,7 @@
 			et_start(timer, new, timerperiod);
 		}
 	} else {
-		new = getnextevent();
+		new = getnextevent(state);
 		eq = (new == *next);
 		CTR3(KTR_SPARE2, "load: next %d.%08x eq %d",
 		    (int)(new >> 32), (u_int)(new & 0xffffffff), eq);
Index: sys/kern/sched_ule.c
===================================================================
--- sys/kern/sched_ule.c
+++ sys/kern/sched_ule.c
@@ -226,9 +226,8 @@
 static int __read_mostly sched_idlespinthresh = -1;
 
 /*
- * tdq - per processor runqs and statistics.  All fields are protected by the
- * tdq_lock.  The load and lowpri may be accessed without to avoid excess
- * locking in sched_pickcpu();
+ * tdq - per processor runqs and statistics.  A mutex synchronizes access to
+ * most fields.  Some fields are loaded or modified without the mutex.
  */
 struct tdq {
 	/*
@@ -239,12 +238,12 @@
 	struct mtx_padalign tdq_lock;		/* run queue lock. */
 	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
 	struct thread	*tdq_curthread;		/* Current executing thread. */
-	volatile int	tdq_load;		/* Aggregate load. */
-	volatile int	tdq_cpu_idle;		/* cpu_idle() is active. */
+	int		tdq_load;		/* Aggregate load. */
 	int		tdq_sysload;		/* For loadavg, !ITHD load. */
-	volatile int	tdq_transferable;	/* Transferable thread count. */
-	volatile short	tdq_switchcnt;		/* Switches this tick. */
-	volatile short	tdq_oldswitchcnt;	/* Switches last tick. */
+	int		tdq_cpu_idle;		/* cpu_idle() is active. */
+	int		tdq_transferable;	/* Transferable thread count. */
+	short		tdq_switchcnt;		/* Switches this tick. */
+	short		tdq_oldswitchcnt;	/* Switches last tick. */
 	u_char		tdq_lowpri;		/* Lowest priority thread. */
 	u_char		tdq_owepreempt;		/* Remote preemption pending. */
 	u_char		tdq_idx;		/* Current insert index. */
@@ -257,12 +256,20 @@
 #ifdef KTR
 	char		tdq_loadname[TDQ_LOADNAME_LEN];
 #endif
-} __aligned(64);
+};
 
 /* Idle thread states and config. */
 #define	TDQ_RUNNING	1
 #define	TDQ_IDLE	2
 
+/* Lockless accessors. */
+#define	TDQ_LOAD(tdq)		atomic_load_int(&(tdq)->tdq_load)
+#define	TDQ_TRANSFERABLE(tdq)	atomic_load_int(&(tdq)->tdq_transferable)
+#define	TDQ_SWITCHCNT(tdq)	(atomic_load_short(&(tdq)->tdq_switchcnt) + \
+	    atomic_load_short(&(tdq)->tdq_oldswitchcnt))
+#define	TDQ_SWITCHCNT_INC(tdq)	(atomic_store_short(&(tdq)->tdq_switchcnt, \
+	    atomic_load_short(&(tdq)->tdq_switchcnt) + 1))
+
 #ifdef SMP
 struct cpu_group __read_mostly *cpu_top;	/* CPU topology */
 
@@ -322,7 +329,7 @@
 static __inline void tdq_runq_add(struct tdq *, struct thread *, int);
 static __inline void tdq_runq_rem(struct tdq *, struct thread *);
 static inline int sched_shouldpreempt(int, int, int);
-void tdq_print(int cpu);
+static void tdq_print(int cpu);
 static void runq_print(struct runq *rq);
 static int tdq_add(struct tdq *, struct thread *, int);
 #ifdef SMP
@@ -397,7 +404,7 @@
 /*
  * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
  */
-void
+static void __unused
 tdq_print(int cpu)
 {
 	struct tdq *tdq;
@@ -698,7 +705,7 @@
 		if (!CPU_ISSET(c, &cg->cg_mask))
 			continue;
 		tdq = TDQ_CPU(c);
-		l = tdq->tdq_load;
+		l = TDQ_LOAD(tdq);
 		if (c == s->cs_prefer) {
 			if (__predict_false(s->cs_running))
 				l--;
@@ -768,14 +775,14 @@
 		if (!CPU_ISSET(c, &cg->cg_mask))
 			continue;
 		tdq = TDQ_CPU(c);
-		l = tdq->tdq_load;
+		l = TDQ_LOAD(tdq);
 		load = l * 256;
 		total += load;
 
 		/*
 		 * Check this CPU is acceptable.
 		 */
-		if (l < s->cs_load || (tdq->tdq_transferable < s->cs_trans) ||
+		if (l < s->cs_load || TDQ_TRANSFERABLE(tdq) < s->cs_trans ||
 		    !CPU_ISSET(c, s->cs_mask))
 			continue;
 
@@ -847,7 +854,7 @@
 		if (CPU_EMPTY(&lmask))
 			break;
 		tdq = TDQ_CPU(high);
-		if (tdq->tdq_load == 1) {
+		if (TDQ_LOAD(tdq) == 1) {
 			/*
 			 * There is only one running thread.  We can't move
 			 * it from here, so tell it to pick new CPU by itself.
@@ -865,9 +872,9 @@
 		}
 		anylow = 1;
 nextlow:
-		if (tdq->tdq_transferable == 0)
+		if (TDQ_TRANSFERABLE(tdq) == 0)
 			continue;
-		low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high, 1);
+		low = sched_lowest(cg, &lmask, -1, TDQ_LOAD(tdq) - 1, high, 1);
 		/* Stop if we looked well and found no less loaded CPU. */
 		if (anylow && low == -1)
 			break;
@@ -1012,15 +1019,15 @@
 		return (1);
 	CPU_FILL(&mask);
 	CPU_CLR(PCPU_GET(cpuid), &mask);
-	restart:
-	switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+restart:
+	switchcnt = TDQ_SWITCHCNT(tdq);
 	for (cg = tdq->tdq_cg, goup = 0; ; ) {
 		cpu = sched_highest(cg, &mask, steal_thresh, 1);
 		/*
 		 * We were assigned a thread but not preempted.  Returning
 		 * 0 here will cause our caller to switch to it.
 		 */
-		if (tdq->tdq_load)
+		if (TDQ_LOAD(tdq))
 			return (0);
 
 		/*
@@ -1056,8 +1063,8 @@
 		 * this situation about 20% of the time on an 8 core
 		 * 16 thread Ryzen 7, but it still helps performance.
 		 */
-		if (steal->tdq_load < steal_thresh ||
-		    steal->tdq_transferable == 0)
+		if (TDQ_LOAD(steal) < steal_thresh ||
+		    TDQ_TRANSFERABLE(steal) == 0)
 			goto restart;
 		/*
 		 * Try to lock both queues.  If we are assigned a thread while
@@ -1082,9 +1089,9 @@
 		 * of date.  The latter is rare.  In either case restart
 		 * the search.
 		 */
-		if (steal->tdq_load < steal_thresh ||
-		    steal->tdq_transferable == 0 ||
-		    switchcnt != tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt) {
+		if (TDQ_LOAD(steal) < steal_thresh ||
+		    TDQ_TRANSFERABLE(steal) == 0 ||
+		    switchcnt != TDQ_SWITCHCNT(tdq)) {
 			tdq_unlock_pair(tdq, steal);
 			goto restart;
 		}
@@ -1148,7 +1155,8 @@
 	 */
 	cpu = TDQ_ID(tdq);
 	if (TD_IS_IDLETHREAD(tdq->tdq_curthread) &&
-	    tdq->tdq_cpu_idle != 0 && cpu_idle_wakeup(cpu))
+	    atomic_load_int(&tdq->tdq_cpu_idle) != 0 &&
+	    cpu_idle_wakeup(cpu))
 		return;
 
 	/*
@@ -1419,7 +1427,7 @@
 	tdq = TDQ_CPU(cpu);
 	if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri &&
 	    tdq->tdq_lowpri < PRI_MIN_IDLE &&
-	    TDQ_SELF()->tdq_load <= tdq->tdq_load + 1) {
+	    TDQ_LOAD(TDQ_SELF()) <= TDQ_LOAD(tdq) + 1) {
 		SCHED_STAT_INC(pickcpu_local);
 		cpu = self;
 	}
@@ -2015,7 +2023,7 @@
 		 * If a thread was added while interrupts were disabled don't
 		 * steal one here.
 		 */
-		if (tdq->tdq_load > 0) {
+		if (TDQ_LOAD(tdq) > 0) {
 			TDQ_LOCK(tdq);
 			break;
 		}
@@ -2057,8 +2065,8 @@
 		 * At this point unconditionally exit the loop to bound
 		 * the time spent in the critcal section.
 		 */
-		if (steal->tdq_load < steal_thresh ||
-		    steal->tdq_transferable == 0)
+		if (TDQ_LOAD(steal) < steal_thresh ||
+		    TDQ_TRANSFERABLE(steal) == 0)
 			continue;
 		/*
 		 * Try to lock both queues.  If we are assigned a thread while
@@ -2075,8 +2083,8 @@
 		 * The data returned by sched_highest() is stale and
 		 * the chosen CPU no longer has an eligible thread.
 		 */
-		if (steal->tdq_load < steal_thresh ||
-		    steal->tdq_transferable == 0) {
+		if (TDQ_LOAD(steal) < steal_thresh ||
+		    TDQ_TRANSFERABLE(steal) == 0) {
 			TDQ_UNLOCK(steal);
 			break;
 		}
@@ -2177,9 +2185,9 @@
 	    (flags & SW_PREEMPT) != 0;
 	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_PICKCPU | TDF_SLICEEND);
 	td->td_owepreempt = 0;
-	tdq->tdq_owepreempt = 0;
+	atomic_store_char(&tdq->tdq_owepreempt, 0);
 	if (!TD_IS_IDLETHREAD(td))
-		tdq->tdq_switchcnt++;
+		TDQ_SWITCHCNT_INC(tdq);
 
 	/*
 	 * Always block the thread lock so we can drop the tdq lock early.
@@ -2539,6 +2547,7 @@
 	 */
 	tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt;
 	tdq->tdq_switchcnt = tdq->tdq_load;
+
 	/*
 	 * Advance the insert index once for each tick to ensure that all
 	 * threads get a chance to run.
@@ -2595,10 +2604,10 @@
 
 	tdq = TDQ_SELF();
 	if ((curthread->td_flags & TDF_IDLETD) != 0) {
-		if (tdq->tdq_load > 0)
+		if (TDQ_LOAD(tdq) > 0)
 			goto out;
 	} else
-		if (tdq->tdq_load - 1 > 0)
+		if (TDQ_LOAD(tdq) - 1 > 0)
 			goto out;
 	load = 0;
 out:
@@ -2894,10 +2903,10 @@
 
 	total = 0;
 	CPU_FOREACH(i)
-		total += TDQ_CPU(i)->tdq_sysload;
+		total += atomic_load_int(&TDQ_CPU(i)->tdq_sysload);
 	return (total);
 #else
-	return (TDQ_SELF()->tdq_sysload);
+	return (atomic_load_int(&TDQ_SELF()->tdq_sysload));
 #endif
 }
 
@@ -2937,18 +2946,18 @@
 	THREAD_NO_SLEEPING();
 	oldswitchcnt = -1;
 	for (;;) {
-		if (tdq->tdq_load) {
+		if (TDQ_LOAD(tdq)) {
 			thread_lock(td);
 			mi_switch(SW_VOL | SWT_IDLE);
 		}
-		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+		switchcnt = TDQ_SWITCHCNT(tdq);
 #ifdef SMP
 		if (always_steal || switchcnt != oldswitchcnt) {
 			oldswitchcnt = switchcnt;
 			if (tdq_idled(tdq) == 0)
 				continue;
 		}
-		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+		switchcnt = TDQ_SWITCHCNT(tdq);
 #else
 		oldswitchcnt = switchcnt;
 #endif
@@ -2961,19 +2970,19 @@
 		 */
 		if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) {
 			for (i = 0; i < sched_idlespins; i++) {
-				if (tdq->tdq_load)
+				if (TDQ_LOAD(tdq))
 					break;
 				cpu_spinwait();
 			}
 		}
 
 		/* If there was context switch during spin, restart it. */
-		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
-		if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
+		switchcnt = TDQ_SWITCHCNT(tdq);
+		if (TDQ_LOAD(tdq) != 0 || switchcnt != oldswitchcnt)
 			continue;
 
 		/* Run main MD idle handler. */
-		tdq->tdq_cpu_idle = 1;
+		atomic_store_int(&tdq->tdq_cpu_idle, 1);
 		/*
 		 * Make sure that the tdq_cpu_idle update is globally visible
 		 * before cpu_idle() reads tdq_load.  The order is important
@@ -2985,21 +2994,21 @@
 		 * threads often enough to make it worthwhile to do so in
 		 * order to avoid calling cpu_idle().
 		 */
-		if (tdq->tdq_load != 0) {
-			tdq->tdq_cpu_idle = 0;
+		if (TDQ_LOAD(tdq) != 0) {
+			atomic_store_int(&tdq->tdq_cpu_idle, 0);
 			continue;
 		}
 		cpu_idle(switchcnt * 4 > sched_idlespinthresh);
-		tdq->tdq_cpu_idle = 0;
+		atomic_store_int(&tdq->tdq_cpu_idle, 0);
 
 		/*
 		 * Account thread-less hardware interrupts and
 		 * other wakeup reasons equal to context switches.
 		 */
-		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+		switchcnt = TDQ_SWITCHCNT(tdq);
 		if (switchcnt != oldswitchcnt)
 			continue;
-		tdq->tdq_switchcnt++;
+		TDQ_SWITCHCNT_INC(tdq);
 		oldswitchcnt++;
 	}
 }
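
Note (reviewer sketch, not part of the patch): the standalone fragment below restates the pattern the diff applies, namely plain non-volatile fields accessed through explicit atomic loads and stores, with a sequentially consistent fence between publishing tdq_cpu_idle and re-checking tdq_load, as described by the "globally visible" comment in sched_idletd(). It is a minimal userland illustration that uses C11 <stdatomic.h> in place of FreeBSD's atomic(9) atomic_load_int()/atomic_store_int() and atomic_thread_fence_seq_cst(); struct toy_tdq, TOY_LOAD(), toy_may_sleep() and toy_need_wakeup() are hypothetical names invented for the example, not kernel interfaces.

/*
 * Illustrative sketch only (hypothetical names): mirrors the lockless
 * tdq_load/tdq_cpu_idle handshake, not the actual sched_ule(4) code.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_tdq {
	/* Written under a queue lock in the real code, read locklessly. */
	atomic_int	load;		/* stands in for tdq_load */
	atomic_int	cpu_idle;	/* stands in for tdq_cpu_idle */
};

/* Relaxed load accessor, analogous in spirit to the TDQ_LOAD() macro. */
#define	TOY_LOAD(t)	atomic_load_explicit(&(t)->load, memory_order_relaxed)

/*
 * Idle path: publish cpu_idle = 1, then re-check load.  The seq_cst fence
 * keeps the store from being reordered after the load, which is the race
 * the ordering comment before cpu_idle() is guarding against.
 */
static bool
toy_may_sleep(struct toy_tdq *t)
{
	atomic_store_explicit(&t->cpu_idle, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (TOY_LOAD(t) != 0) {
		/* Work arrived concurrently; stay awake. */
		atomic_store_explicit(&t->cpu_idle, 0, memory_order_relaxed);
		return (false);
	}
	return (true);
}

/* Wakeup path: enqueue work (bump load), then decide whether to kick the CPU. */
static bool
toy_need_wakeup(struct toy_tdq *t)
{
	atomic_fetch_add_explicit(&t->load, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	return (atomic_load_explicit(&t->cpu_idle, memory_order_relaxed) != 0);
}

static struct toy_tdq tq;

int
main(void)
{
	/* Producer side: one unit of work arrives; the CPU is not idle yet. */
	printf("need wakeup: %d\n", toy_need_wakeup(&tq));
	/* Idle side: the load published above forbids going to sleep. */
	printf("may sleep:   %d\n", toy_may_sleep(&tq));
	return (0);
}

Both sides store their own flag and then load the other side's across a seq_cst fence, so at least one of them observes the other's update; dropping either fence reintroduces the lost-wakeup window that the patch's ordering comment warns about.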