diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c --- a/sys/amd64/amd64/initcpu.c +++ b/sys/amd64/amd64/initcpu.c @@ -255,6 +255,8 @@ { uint64_t msr; uint32_t cr4; + u_int r[4]; + int use_invlpg_pmap; cr4 = rcr4(); if ((cpu_feature & CPUID_XMM) && (cpu_feature & CPUID_FXSR)) { @@ -317,6 +319,21 @@ if ((amd_feature & AMDID_RDTSCP) != 0 || (cpu_stdext_feature2 & CPUID_STDEXT2_RDPID) != 0) wrmsr(MSR_TSC_AUX, cpu_auxmsr()); + + if (cpu_high >= 0x1a) { + cpuid_count(0x1a, 0, r); + if ((r[0] & CPUID_HYBRID_CORE_MASK) == + CPUID_HYBRID_SMALL_CORE) { + PCPU_SET(small_core, 1); + if (pmap_pcid_enabled) { + use_invlpg_pmap = 1; + TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_bug", + &use_invlpg_pmap); + PCPU_SET(pcid_invlpg_bug, use_invlpg_pmap); + pmap_pcid_invlpg_workaround = use_invlpg_pmap; + } + } + } } void diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -861,7 +861,7 @@ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ - invlpg(smp_tlb_addr1); + pmap_invlpg(smp_tlb_pmap, smp_tlb_addr1); if (smp_tlb_pmap == PCPU_GET(curpmap) && smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { @@ -886,7 +886,7 @@ (*ipi_invlpg_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ - invlpg(smp_tlb_addr1); + pmap_invlpg(smp_tlb_pmap, smp_tlb_addr1); if (smp_tlb_pmap == PCPU_GET(curpmap) && (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { @@ -931,10 +931,16 @@ #endif /* COUNT_IPIS */ addr = smp_tlb_addr1; - do { - invlpg(addr); - addr += PAGE_SIZE; - } while (addr < smp_tlb_addr2); + if (smp_tlb_pmap == kernel_pmap && PCPU_GET(pcid_invlpg_bug)) { + struct invpcid_descr d = { 0 }; + + invpcid(&d, INVPCID_CTXGLOB); + } else { + do { + invlpg(addr); + addr += PAGE_SIZE; + } while (addr < smp_tlb_addr2); + } if (smp_tlb_pmap == PCPU_GET(curpmap) && smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 && PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -529,6 +529,10 @@ int invpcid_works = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0, "Is the invpcid instruction available ?"); +int pmap_pcid_invlpg_workaround = 0; +SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, + &pmap_pcid_invlpg_workaround, 0, + "Enable small core PCID/INVLPG workaround"); int __read_frequently pti = 0; SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, @@ -2797,7 +2801,7 @@ if ((newpde & PG_PS) == 0) /* Demotion: flush a specific 2MB page mapping. */ - invlpg(va); + pmap_invlpg(pmap, va); else if ((newpde & PG_G) == 0) /* * Promotion: flush every 4KB page mapping from the TLB @@ -3136,7 +3140,7 @@ vm_offset_t addr2 __unused) { if (pmap == kernel_pmap) { - invlpg(va); + pmap_invlpg(kernel_pmap, va); } else if (pmap == PCPU_GET(curpmap)) { invlpg(va); pmap_invalidate_page_cb(pmap, va); @@ -3227,8 +3231,14 @@ vm_offset_t addr; if (pmap == kernel_pmap) { - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); + if (PCPU_GET(pcid_invlpg_bug)) { + struct invpcid_descr d = { 0 }; + + invpcid(&d, INVPCID_CTXGLOB); + } else { + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + } } else if (pmap == PCPU_GET(curpmap)) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); @@ -7674,7 +7684,7 @@ va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); - invlpg(va); + pmap_invlpg(kernel_pmap, va); return ((void *)crashdumpmap); } @@ -10377,7 +10387,7 @@ page[i]->md.pat_mode, 0); pte_store(pte, paddr | X86_PG_RW | X86_PG_V | cache_bits); - invlpg(vaddr[i]); + pmap_invlpg(kernel_pmap, vaddr[i]); } } } @@ -10426,7 +10436,13 @@ if (addr != qframe) return; pte_store(vtopte(qframe), 0); + + /* + * pmap_quick_enter_page() doesn't set PG_G, so we can use + * INVLPG there. + */ invlpg(qframe); + mtx_unlock_spin(&qframe_mtx); } diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h --- a/sys/amd64/include/pcpu.h +++ b/sys/amd64/include/pcpu.h @@ -99,7 +99,9 @@ uint32_t pc_smp_tlb_gen; \ u_int pc_smp_tlb_op; \ uint64_t pc_ucr3_load_mask; \ - char __pad[2916] /* pad to UMA_PCPU_ALLOC_SIZE */ + u_int pc_small_core; \ + u_int pc_pcid_invlpg_bug; \ + char __pad[2908] /* pad to UMA_PCPU_ALLOC_SIZE */ #define PC_DBREG_CMD_NONE 0 #define PC_DBREG_CMD_LOAD 1 diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -431,6 +431,7 @@ extern vm_paddr_t dmaplimit; extern int pmap_pcid_enabled; extern int invpcid_works; +extern int pmap_pcid_invlpg_workaround; #define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) #define pmap_page_is_write_mapped(m) (((m)->a.flags & PGA_WRITEABLE) != 0) @@ -514,6 +515,24 @@ return (&pmap->pm_active); } +/* + * It seems that AlderLake+ small cores have some microarchitectural + * bug, which results in the INVLPG instruction failing to flush all + * global TLB entries when PCID is enabled. Work around it for now, + * by doing global invalidation on small cores instead of INVLPG. + */ +static __inline void +pmap_invlpg(pmap_t pmap, vm_offset_t va) +{ + if (pmap == kernel_pmap && PCPU_GET(pcid_invlpg_bug)) { + struct invpcid_descr d = { 0 }; + + invpcid(&d, INVPCID_CTXGLOB); + } else { + invlpg(va); + } +} + #endif /* _KERNEL */ /* Return various clipped indexes for a given VA */ diff --git a/sys/x86/include/specialreg.h b/sys/x86/include/specialreg.h --- a/sys/x86/include/specialreg.h +++ b/sys/x86/include/specialreg.h @@ -490,6 +490,11 @@ #define CPUID_STDEXT3_CORE_CAP 0x40000000 #define CPUID_STDEXT3_SSBD 0x80000000 +/* CPUID_HYBRID_ID leaf 0x1a */ +#define CPUID_HYBRID_CORE_MASK 0xff000000 +#define CPUID_HYBRID_SMALL_CORE 0x20000000 +#define CPUID_HYBRID_LARGE_CORE 0x40000000 + /* MSR IA32_ARCH_CAP(ABILITIES) bits */ #define IA32_ARCH_CAP_RDCL_NO 0x00000001 #define IA32_ARCH_CAP_IBRS_ALL 0x00000002