Index: sys/amd64/include/pcpu.h
===================================================================
--- sys/amd64/include/pcpu.h
+++ sys/amd64/include/pcpu.h
@@ -44,7 +44,11 @@
  * other processors"
  */
 #define	PCPU_MD_FIELDS							\
-	char	pc_monitorbuf[128] __aligned(128); /* cache line */	\
+	struct monitorbuf {						\
+		int idle_state;		/* Used by cpu_idle_mwait. */	\
+		int stop_state;		/* Used by cpustop_handler. */	\
+		char padding[128 - (2 * sizeof(int))];			\
+	} pc_monitorbuf __aligned(128);	/* cache line */		\
 	struct	pcpu *pc_prvspace;	/* Self-reference */		\
 	struct	pmap *pc_curpmap;					\
 	struct	amd64tss *pc_tssp;	/* TSS segment active on CPU */	\
@@ -83,6 +87,9 @@

 #ifdef _KERNEL

+#define	MONITOR_STOPSTATE_RUNNING	0
+#define	MONITOR_STOPSTATE_STOPPED	1
+
 #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF)

 /*
Index: sys/i386/include/pcpu.h
===================================================================
--- sys/i386/include/pcpu.h
+++ sys/i386/include/pcpu.h
@@ -50,7 +50,11 @@
  */

 #define	PCPU_MD_FIELDS							\
-	char	pc_monitorbuf[128] __aligned(128); /* cache line */	\
+	struct monitorbuf {						\
+		int idle_state;		/* Used by cpu_idle_mwait. */	\
+		int stop_state;		/* Used by cpustop_handler. */	\
+		char padding[128 - (2 * sizeof(int))];			\
+	} pc_monitorbuf __aligned(128);	/* cache line */		\
 	struct	pcpu *pc_prvspace;	/* Self-reference */		\
 	struct	pmap *pc_curpmap;					\
 	struct	segment_descriptor pc_common_tssd;			\
@@ -84,6 +88,9 @@

 #ifdef _KERNEL

+#define	MONITOR_STOPSTATE_RUNNING	0
+#define	MONITOR_STOPSTATE_STOPPED	1
+
 #if defined(__GNUCLIKE_ASM) && defined(__GNUCLIKE___TYPEOF)

 /*
Index: sys/kern/subr_smp.c
===================================================================
--- sys/kern/subr_smp.c
+++ sys/kern/subr_smp.c
@@ -378,6 +378,26 @@
 		CPU_COPY_STORE_REL(&map, &started_cpus);

 #if X86
+	/*
+	 * Wake up any CPUs stopped with MWAIT.  From MI code we can't tell if
+	 * MONITOR/MWAIT is enabled, but the potentially redundant writes are
+	 * relatively inexpensive.
+	 */
+	if (type == IPI_STOP) {
+		struct monitorbuf *mb;
+		u_int id;
+
+		CPU_FOREACH(id) {
+			if (!CPU_ISSET(id, &map))
+				continue;
+
+			mb = &pcpu_find(id)->pc_monitorbuf;
+			atomic_store_int(&mb->stop_state,
+			    MONITOR_STOPSTATE_RUNNING);
+		}
+		wmb();
+	}
+
 	if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
 #endif
 		/* wait for each to clear its bit */
Index: sys/x86/include/x86_smp.h
===================================================================
--- sys/x86/include/x86_smp.h
+++ sys/x86/include/x86_smp.h
@@ -61,6 +61,11 @@
 };
 extern struct cpu_info *cpu_info;

+/*
+ * Set if MWAIT does not reliably wake when the MONITORed address is written.
+ */
+extern bool mwait_cpustop_broken;
+
 #ifdef COUNT_IPIS
 extern u_long *ipi_invltlb_counts[MAXCPU];
 extern u_long *ipi_invlrng_counts[MAXCPU];
Index: sys/x86/x86/cpu_machdep.c
===================================================================
--- sys/x86/x86/cpu_machdep.c
+++ sys/x86/x86/cpu_machdep.c
@@ -110,6 +110,13 @@
 static volatile u_int cpu_reset_proxy_active;
 #endif

+/*
+ * Automatically initialized per CPU errata in cpu_idle_tun below.
+ */
+bool mwait_cpustop_broken = false;
+SYSCTL_BOOL(_machdep, OID_AUTO, mwait_cpustop_broken, CTLFLAG_RDTUN,
+    &mwait_cpustop_broken, 0,
+    "Can not reliably wake MONITOR/MWAIT cpus without interrupts");

 /*
  * Machine dependent boot() routine
@@ -164,7 +171,7 @@
	 * but all Intel CPUs provide hardware coordination.
	 */

-	state = (int *)PCPU_PTR(monitorbuf);
+	state = &PCPU_PTR(monitorbuf)->idle_state;
 	KASSERT(atomic_load_int(state) == STATE_SLEEPING,
 	    ("cpu_mwait_cx: wrong monitorbuf state"));
 	atomic_store_int(state, STATE_MWAIT);
@@ -358,6 +365,7 @@
 cpu_reset(void)
 {
 #ifdef SMP
+	struct monitorbuf *mb;
 	cpuset_t map;
 	u_int cnt;

@@ -378,6 +386,9 @@

 		/* Restart CPU #0. */
 		CPU_SETOF(0, &started_cpus);
+		mb = &pcpu_find(0)->pc_monitorbuf;
+		atomic_store_int(&mb->stop_state,
+		    MONITOR_STOPSTATE_RUNNING);
 		wmb();

 		cnt = 0;
@@ -422,7 +433,7 @@
 {
 	int *state;

-	state = (int *)PCPU_PTR(monitorbuf);
+	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_SLEEPING);

 	/* See comments in cpu_idle_hlt(). */
@@ -441,7 +452,7 @@
 {
 	int *state;

-	state = (int *)PCPU_PTR(monitorbuf);
+	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_SLEEPING);

 	/*
@@ -473,7 +484,7 @@
 {
 	int *state;

-	state = (int *)PCPU_PTR(monitorbuf);
+	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_MWAIT);

 	/* See comments in cpu_idle_hlt(). */
@@ -498,7 +509,7 @@
 	int *state;
 	int i;

-	state = (int *)PCPU_PTR(monitorbuf);
+	state = &PCPU_PTR(monitorbuf)->idle_state;
 	atomic_store_int(state, STATE_RUNNING);

 	/*
@@ -598,9 +609,11 @@
 int
 cpu_idle_wakeup(int cpu)
 {
+	struct monitorbuf *mb;
 	int *state;

-	state = (int *)pcpu_find(cpu)->pc_monitorbuf;
+	mb = &pcpu_find(cpu)->pc_monitorbuf;
+	state = &mb->idle_state;
 	switch (atomic_load_int(state)) {
 	case STATE_SLEEPING:
 		return (0);
@@ -714,6 +727,7 @@
 		/* Ryzen erratas 1057, 1109. */
 		cpu_idle_selector("hlt");
 		idle_mwait = 0;
+		mwait_cpustop_broken = true;
 	}

 	if (cpu_vendor_id == CPU_VENDOR_INTEL && cpu_id == 0x506c9) {
@@ -725,6 +739,7 @@
 		 * sleep states.
 		 */
 		cpu_idle_apl31_workaround = 1;
+		mwait_cpustop_broken = true;
 	}
 	TUNABLE_INT_FETCH("machdep.idle_apl31", &cpu_idle_apl31_workaround);
 }
Index: sys/x86/x86/mp_x86.c
===================================================================
--- sys/x86/x86/mp_x86.c
+++ sys/x86/x86/mp_x86.c
@@ -161,6 +161,10 @@

 unsigned int boot_address;

+static bool stop_mwait = false;
+SYSCTL_BOOL(_machdep, OID_AUTO, stop_mwait, CTLFLAG_RWTUN, &stop_mwait, 0,
+    "Use MONITOR/MWAIT when stopping CPU, if available");
+
 #define MiB(v)	(v ## ULL << 20)

 void
@@ -1390,13 +1394,15 @@
 }

 /*
- * Handle an IPI_STOP by saving our current context and spinning until we
- * are resumed.
+ * Handle an IPI_STOP by saving our current context and spinning (or mwaiting,
+ * if available) until we are resumed.
  */
 void
 cpustop_handler(void)
 {
+	struct monitorbuf *mb;
 	u_int cpu;
+	bool use_mwait;

 	cpu = PCPU_GET(cpuid);
@@ -1405,8 +1411,23 @@
 	/* Indicate that we are stopped */
 	CPU_SET_ATOMIC(cpu, &stopped_cpus);

+	use_mwait = (stop_mwait && (cpu_feature2 & CPUID2_MON) != 0 &&
+	    !mwait_cpustop_broken);
+	if (use_mwait)
+		mb = PCPU_PTR(monitorbuf);
+
 	/* Wait for restart */
 	while (!CPU_ISSET(cpu, &started_cpus)) {
+		if (use_mwait) {
+			atomic_store_int(&mb->stop_state,
+			    MONITOR_STOPSTATE_STOPPED);
+			cpu_monitor(mb, 0, 0);
+			if (atomic_load_int(&mb->stop_state) ==
+			    MONITOR_STOPSTATE_STOPPED)
+				cpu_mwait(0, MWAIT_C1);
+			continue;
+		}
+
 		ia32_pause();

 		/*
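
For reference, the stop/resume handshake the patch implements can be modeled outside the kernel. The following is a minimal user-space C sketch, not part of the patch: MONITOR/MWAIT is privileged, so emulated_mwait() (a hypothetical stand-in for cpu_monitor()/cpu_mwait()) simply spins on the monitored word, and the illustrative started flag stands in for started_cpus. Only the ordering of the stores and the re-check before waiting are modeled.

/*
 * Minimal user-space sketch of cpustop_handler()'s stop/resume protocol.
 * Build with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define	MONITOR_STOPSTATE_RUNNING	0
#define	MONITOR_STOPSTATE_STOPPED	1

static _Atomic int stop_state = MONITOR_STOPSTATE_RUNNING;
static _Atomic bool started = false;	/* stands in for started_cpus */

/* Emulated MWAIT: wait until the monitored word leaves STOPPED. */
static void
emulated_mwait(void)
{
	while (atomic_load(&stop_state) == MONITOR_STOPSTATE_STOPPED)
		;	/* the real handler sleeps in MWAIT here */
}

/* Models the wait loop in cpustop_handler(). */
static void *
stopped_cpu(void *arg)
{
	(void)arg;
	while (!atomic_load(&started)) {
		/*
		 * Publish STOPPED, then re-check before waiting: if the
		 * resumer's RUNNING store already landed, skip the wait
		 * rather than sleep past the wakeup.
		 */
		atomic_store(&stop_state, MONITOR_STOPSTATE_STOPPED);
		if (atomic_load(&stop_state) == MONITOR_STOPSTATE_STOPPED)
			emulated_mwait();
	}
	printf("resumed\n");
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, stopped_cpu, NULL);
	sleep(1);

	/*
	 * Models generic_restart_cpus(): setting started_cpus alone is not
	 * enough; the monitored word must also be written so a CPU asleep
	 * in MWAIT wakes up and re-reads started_cpus.
	 */
	atomic_store(&started, true);
	atomic_store(&stop_state, MONITOR_STOPSTATE_RUNNING);

	pthread_join(t, NULL);
	return (0);
}

The re-check after publishing MONITOR_STOPSTATE_STOPPED mirrors the window between cpu_monitor() and cpu_mwait() in cpustop_handler(): if the resumer's RUNNING store lands in that window, the handler must skip the MWAIT instead of sleeping through the wakeup.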