Index: head/sys/amd64/amd64/exception.S
===================================================================
--- head/sys/amd64/amd64/exception.S
+++ head/sys/amd64/amd64/exception.S
@@ -47,6 +47,7 @@
 #include <machine/psl.h>
 #include <machine/trap.h>
 #include <machine/specialreg.h>
+#include <machine/pmap.h>
 
 #ifdef KDTRACE_HOOKS
 	.bss
@@ -607,8 +608,10 @@
 	cmpq	$~0,PCPU(UCR3)
 	je	2f
 	movq	PCPU(UCR3),%r9
+	andq	PCPU(UCR3_LOAD_MASK),%r9
 	movq	%r9,%cr3
 2:	xorl	%r9d,%r9d
+	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
 	swapgs
 	sysretq
@@ -1262,6 +1265,8 @@
 	movq	TF_SS(%rsp),%rax
 	movq	%rax,PTI_SS(%rdx)
 	movq	PCPU(UCR3),%rax
+	andq	PCPU(UCR3_LOAD_MASK),%rax
+	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
 	swapgs
 	movq	%rdx,%rsp
 	movq	%rax,%cr3
Index: head/sys/amd64/amd64/genassym.c
===================================================================
--- head/sys/amd64/amd64/genassym.c
+++ head/sys/amd64/amd64/genassym.c
@@ -230,6 +230,7 @@
 ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
 ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3));
 ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3));
+ASSYM(PC_UCR3_LOAD_MASK, offsetof(struct pcpu, pc_ucr3_load_mask));
 ASSYM(PC_SAVED_UCR3, offsetof(struct pcpu, pc_saved_ucr3));
 ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack));
 ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ);
Index: head/sys/amd64/amd64/machdep.c
===================================================================
--- head/sys/amd64/amd64/machdep.c
+++ head/sys/amd64/amd64/machdep.c
@@ -1562,6 +1562,7 @@
 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
+	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
 	PCPU_SET(smp_tlb_gen, 1);
 }
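The exception.S hunks above are the consumer side of the change: on every return to user mode, the user %cr3 value is ANDed with the per-CPU pc_ucr3_load_mask before being loaded, and the mask is then rearmed to PMAP_UCR3_NOMASK (all ones). When a TLB shootdown has set the mask to ~CR3_PCID_SAVE, the masked load clears bit 63 of %cr3 (the PCID "no flush" bit), so the %cr3 reload that PTI must perform anyway also flushes the user PCID's stale entries. A minimal userland sketch of that logic, not kernel code: pc_ucr3, pc_ucr3_load_mask, and return_to_user_cr3() are stand-in names, while the two constants match the amd64 definitions.

/*
 * Sketch: simulate the masked UCR3 reload done in exception.S.
 * CR3_PCID_SAVE is bit 63 of %cr3; when set, the CPU keeps the TLB
 * entries for the loaded PCID, when clear the load flushes them.
 */
#include <stdint.h>
#include <stdio.h>

#define	CR3_PCID_SAVE		0x8000000000000000ULL	/* "no flush" bit */
#define	PMAP_UCR3_NOMASK	(~0ULL)			/* identity mask */

static uint64_t pc_ucr3;		/* stands in for PCPU(UCR3) */
static uint64_t pc_ucr3_load_mask;	/* stands in for PCPU(UCR3_LOAD_MASK) */

/* Model of the sysret/iret path: the value actually loaded into %cr3. */
static uint64_t
return_to_user_cr3(void)
{
	uint64_t cr3 = pc_ucr3 & pc_ucr3_load_mask;

	/* The mask is one-shot: rearm it for the next return. */
	pc_ucr3_load_mask = PMAP_UCR3_NOMASK;
	return (cr3);
}

int
main(void)
{
	/* Hypothetical user page-table root with the no-flush bit set. */
	pc_ucr3 = CR3_PCID_SAVE | 0x123000 | 0x801;

	pc_ucr3_load_mask = PMAP_UCR3_NOMASK;
	printf("no flush pending: %#jx\n", (uintmax_t)return_to_user_cr3());

	pc_ucr3_load_mask = ~CR3_PCID_SAVE;	/* a flush was deferred */
	printf("flush pending:    %#jx\n", (uintmax_t)return_to_user_cr3());
	printf("mask rearmed to:  %#jx\n", (uintmax_t)pc_ucr3_load_mask);
	return (0);
}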
Index: head/sys/amd64/amd64/mp_machdep.c
===================================================================
--- head/sys/amd64/amd64/mp_machdep.c
+++ head/sys/amd64/amd64/mp_machdep.c
@@ -283,6 +283,7 @@
 	pc->pc_fs32p = &gdt[GUFS32_SEL];
 	pc->pc_gs32p = &gdt[GUGS32_SEL];
 	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL];
+	pc->pc_ucr3_load_mask = PMAP_UCR3_NOMASK;
 	/* See comment in pmap_bootstrap(). */
 	pc->pc_pcid_next = PMAP_PCID_KERN + 2;
 	pc->pc_pcid_gen = 1;
@@ -821,15 +822,14 @@
 		invpcid(&d, INVPCID_CTXGLOB);
 	} else {
 		invpcid(&d, INVPCID_CTX);
-		d.pcid |= PMAP_PCID_USER_PT;
-		invpcid(&d, INVPCID_CTX);
+		if (smp_tlb_pmap == PCPU_GET(curpmap))
+			PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 	}
 }
 
 static void
 invltlb_pcid_handler(pmap_t smp_tlb_pmap)
 {
-	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 
 #ifdef COUNT_XINVLTLB_HITS
@@ -849,15 +849,11 @@
 		 * invalidation when switching to the pmap on this
 		 * CPU.
 		 */
-		if (PCPU_GET(curpmap) == smp_tlb_pmap) {
+		if (smp_tlb_pmap == PCPU_GET(curpmap)) {
 			pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
-			kcr3 = smp_tlb_pmap->pm_cr3 | pcid;
-			ucr3 = smp_tlb_pmap->pm_ucr3;
-			if (ucr3 != PMAP_NO_CR3) {
-				ucr3 |= PMAP_PCID_USER_PT | pcid;
-				pmap_pti_pcid_invalidate(ucr3, kcr3);
-			} else
-				load_cr3(kcr3);
+			load_cr3(smp_tlb_pmap->pm_cr3 | pcid);
+			if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3)
+				PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 		}
 	}
 }
@@ -888,7 +884,9 @@
 #endif /* COUNT_IPIS */
 
 	invlpg(smp_tlb_addr1);
-	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
+	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
+	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
+	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 		    PMAP_PCID_USER_PT;
 		d.pad = 0;
@@ -912,7 +910,8 @@
 
 	invlpg(smp_tlb_addr1);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
-	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
+	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3 &&
+	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
@@ -960,7 +959,9 @@
 		invlpg(addr);
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
-	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
+	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
+	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
+	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 		    PMAP_PCID_USER_PT;
 		d.pad = 0;
@@ -994,7 +995,8 @@
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
-	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
+	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3 &&
+	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
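With the mask in place, the IPI handlers above no longer invalidate the user page-table PCID synchronously. The full-flush handlers mark the deferred flush when the shootdown targets the current pmap and it has a user page table; the page and range handlers additionally skip their explicit user-PT invalidation when a full deferred flush is already pending. A sketch of those two decisions under the same userland stand-in model as before; invltlb_model(), invlpg_needs_user_invalidation(), and struct pmap_model are hypothetical names, not the kernel functions.

/*
 * Sketch: the decisions the reworked TLB-shootdown handlers make for
 * the user page table, modeled in userland.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	CR3_PCID_SAVE		0x8000000000000000ULL
#define	PMAP_UCR3_NOMASK	(~0ULL)
#define	PMAP_NO_CR3		(~0ULL)

struct pmap_model {			/* tiny stand-in for struct pmap */
	uint64_t pm_ucr3;
};

static struct pmap_model *pc_curpmap;	/* stands in for PCPU(curpmap) */
static uint64_t pc_ucr3_load_mask = PMAP_UCR3_NOMASK;

/*
 * Full-flush handler: instead of invalidating the user PCID now,
 * defer the flush to the next return to user mode.
 */
static void
invltlb_model(struct pmap_model *smp_tlb_pmap)
{
	/* ... the kernel PCID is invalidated here ... */
	if (smp_tlb_pmap == pc_curpmap && smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3)
		pc_ucr3_load_mask = ~CR3_PCID_SAVE;
}

/*
 * Single-page/range handler: an explicit user-PT invalidation is only
 * worthwhile if no full deferred flush is already pending.
 */
static bool
invlpg_needs_user_invalidation(struct pmap_model *smp_tlb_pmap)
{
	return (smp_tlb_pmap == pc_curpmap &&
	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
	    pc_ucr3_load_mask == PMAP_UCR3_NOMASK);
}

int
main(void)
{
	struct pmap_model p = { .pm_ucr3 = 0x45000 };	/* hypothetical */

	pc_curpmap = &p;
	invltlb_model(&p);
	printf("user invlpg still needed: %s\n",
	    invlpg_needs_user_invalidation(&p) ? "yes" : "no");
	return (0);
}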
Index: head/sys/amd64/amd64/pmap.c
===================================================================
--- head/sys/amd64/amd64/pmap.c
+++ head/sys/amd64/amd64/pmap.c
@@ -2520,7 +2520,16 @@
 
 	cpuid = PCPU_GET(cpuid);
 	if (pmap == PCPU_GET(curpmap)) {
-		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+		if (pmap->pm_ucr3 != PMAP_NO_CR3 &&
+		    /*
+		     * If we context-switched right after
+		     * PCPU_GET(ucr3_load_mask), we could read the
+		     * ~CR3_PCID_SAVE mask, which causes us to skip
+		     * the code below to invalidate user pages.  This
+		     * is handled in pmap_activate_sw_pcid_pti() by
+		     * clearing pm_gen if ucr3_load_mask is ~CR3_PCID_SAVE.
+		     */
+		    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 			/*
 			 * Because pm_pcid is recalculated on a
 			 * context switch, we must disable switching.
@@ -2635,7 +2644,8 @@
 
 	cpuid = PCPU_GET(cpuid);
 	if (pmap == PCPU_GET(curpmap)) {
-		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+		if (pmap->pm_ucr3 != PMAP_NO_CR3 &&
+		    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 			critical_enter();
 			pcid = pmap->pm_pcids[cpuid].pm_pcid;
 			if (invpcid_works1) {
@@ -2736,7 +2746,7 @@
 pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
 {
 	struct invpcid_descr d;
-	uint64_t kcr3, ucr3;
+	uint64_t kcr3;
 	uint32_t pcid;
 	u_int cpuid, i;
 
@@ -2757,20 +2767,12 @@
 			d.pad = 0;
 			d.addr = 0;
 			invpcid(&d, INVPCID_CTX);
-			if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-				d.pcid |= PMAP_PCID_USER_PT;
-				invpcid(&d, INVPCID_CTX);
-			}
 		} else {
 			kcr3 = pmap->pm_cr3 | pcid;
-			ucr3 = pmap->pm_ucr3;
-			if (ucr3 != PMAP_NO_CR3) {
-				ucr3 |= pcid | PMAP_PCID_USER_PT;
-				pmap_pti_pcid_invalidate(ucr3, kcr3);
-			} else {
-				load_cr3(kcr3);
-			}
+			load_cr3(kcr3);
 		}
+		if (pmap->pm_ucr3 != PMAP_NO_CR3)
+			PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 		critical_exit();
 	} else
 		pmap->pm_pcids[cpuid].pm_gen = 0;
@@ -8816,12 +8818,23 @@
 	    PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base;
 }
 
-static void inline
-pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1)
+static void
+pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
 {
-	struct invpcid_descr d;
+	pmap_t old_pmap;
 	uint64_t cached, cr3, kcr3, ucr3;
 
+	KASSERT((read_rflags() & PSL_I) == 0,
+	    ("PCID needs interrupts disabled in pmap_activate_sw()"));
+
+	/* See the comment in pmap_invalidate_page_pcid(). */
+	if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) {
+		PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
+		old_pmap = PCPU_GET(curpmap);
+		MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3);
+		old_pmap->pm_pcids[cpuid].pm_gen = 0;
+	}
+
 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
 	cr3 = rcr3();
 	if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
@@ -8831,77 +8844,26 @@
 	ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
 	    PMAP_PCID_USER_PT;
 
-	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
-		/*
-		 * Explicitly invalidate translations cached from the
-		 * user page table.  They are not automatically
-		 * flushed by reload of cr3 with the kernel page table
-		 * pointer above.
-		 *
-		 * Note that the if() condition is resolved statically
-		 * by using the function argument instead of
-		 * runtime-evaluated invpcid_works value.
-		 */
-		if (invpcid_works1) {
-			d.pcid = PMAP_PCID_USER_PT |
-			    pmap->pm_pcids[cpuid].pm_pcid;
-			d.pad = 0;
-			d.addr = 0;
-			invpcid(&d, INVPCID_CTX);
-		} else {
-			pmap_pti_pcid_invalidate(ucr3, kcr3);
-		}
-	}
+	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3)
+		PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 
 	PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
 	PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
 	if (cached)
 		PCPU_INC(pm_save_cnt);
-}
 
-static void
-pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
-{
-
-	pmap_activate_sw_pcid_pti(pmap, cpuid, true);
 	pmap_activate_sw_pti_post(td, pmap);
 }
 
 static void
-pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap,
-    u_int cpuid)
-{
-	register_t rflags;
-
-	/*
-	 * If the INVPCID instruction is not available,
-	 * invltlb_pcid_handler() is used to handle an invalidate_all
-	 * IPI, which checks for curpmap == smp_tlb_pmap.  The below
-	 * sequence of operations has a window where %CR3 is loaded
-	 * with the new pmap's PML4 address, but the curpmap value has
-	 * not yet been updated.  This causes the invltlb IPI handler,
-	 * which is called between the updates, to execute as a NOP,
-	 * which leaves stale TLB entries.
-	 *
-	 * Note that the most typical use of pmap_activate_sw(), from
-	 * the context switch, is immune to this race, because
-	 * interrupts are disabled (while the thread lock is owned),
-	 * and the IPI happens after curpmap is updated.  Protect
-	 * other callers in a similar way, by disabling interrupts
-	 * around the %cr3 register reload and curpmap assignment.
-	 */
-	rflags = intr_disable();
-	pmap_activate_sw_pcid_pti(pmap, cpuid, false);
-	intr_restore(rflags);
-	pmap_activate_sw_pti_post(td, pmap);
-}
-
-static void
 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap,
     u_int cpuid)
 {
 	uint64_t cached, cr3;
 
+	KASSERT((read_rflags() & PSL_I) == 0,
+	    ("PCID needs interrupts disabled in pmap_activate_sw()"));
+
 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
 	cr3 = rcr3();
 	if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
@@ -8913,17 +8875,6 @@
 }
 
 static void
-pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap,
-    u_int cpuid)
-{
-	register_t rflags;
-
-	rflags = intr_disable();
-	pmap_activate_sw_pcid_nopti(td, pmap, cpuid);
-	intr_restore(rflags);
-}
-
-static void
 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap,
     u_int cpuid __unused)
 {
@@ -8947,14 +8898,10 @@
     u_int))
 {
 
-	if (pmap_pcid_enabled && pti && invpcid_works)
-		return (pmap_activate_sw_pcid_invpcid_pti);
-	else if (pmap_pcid_enabled && pti && !invpcid_works)
-		return (pmap_activate_sw_pcid_noinvpcid_pti);
-	else if (pmap_pcid_enabled && !pti && invpcid_works)
+	if (pmap_pcid_enabled && pti)
+		return (pmap_activate_sw_pcid_pti);
+	else if (pmap_pcid_enabled && !pti)
 		return (pmap_activate_sw_pcid_nopti);
-	else if (pmap_pcid_enabled && !pti && !invpcid_works)
-		return (pmap_activate_sw_pcid_noinvpcid_nopti);
 	else if (!pmap_pcid_enabled && pti)
 		return (pmap_activate_sw_nopcid_pti);
 	else /* if (!pmap_pcid_enabled && !pti) */
@@ -8991,10 +8938,26 @@
 void
 pmap_activate(struct thread *td)
 {
-
-	critical_enter();
+	/*
+	 * invltlb_{invpcid,}_pcid_handler() is used to handle an
+	 * invalidate_all IPI, which checks for curpmap ==
+	 * smp_tlb_pmap.  The below sequence of operations has a
+	 * window where %CR3 is loaded with the new pmap's PML4
+	 * address, but the curpmap value has not yet been updated.
+	 * This causes the invltlb IPI handler, which is called
+	 * between the updates, to execute as a NOP, which leaves
+	 * stale TLB entries.
+	 *
+	 * Note that the most common use of pmap_activate_sw(), from
+	 * a context switch, is immune to this race, because
+	 * interrupts are disabled (while the thread lock is owned),
+	 * so the IPI is delayed until after curpmap is updated.  Protect
	 * other callers in a similar way, by disabling interrupts
+	 * around the %cr3 register reload and curpmap assignment.
+	 */
+	spinlock_enter();
 	pmap_activate_sw(td);
-	critical_exit();
+	spinlock_exit();
 }
 
 void
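The pmap.c half closes the race that the new comment in pmap_invalidate_page_pcid() describes: a context switch can land between reading ucr3_load_mask and acting on it, so pmap_activate_sw_pcid_pti() consumes any still-pending mask and zeroes the outgoing pmap's pm_gen, forcing a full PCID invalidation the next time that pmap is activated. Continuing the userland model from the earlier sketches; activate_model() and the flat pm_gen field are simplified stand-ins for the real per-CPU pm_pcids[] state.

/*
 * Sketch: the recovery step performed at pmap activation when a
 * deferred user-PT flush is still pending from the previous pmap.
 */
#include <stdint.h>
#include <stdio.h>

#define	CR3_PCID_SAVE		0x8000000000000000ULL
#define	PMAP_UCR3_NOMASK	(~0ULL)

struct pmap_model {
	uint64_t pm_ucr3;
	uint64_t pm_gen;	/* per-CPU PCID generation (simplified) */
};

static struct pmap_model *pc_curpmap;
static uint64_t pc_ucr3_load_mask = PMAP_UCR3_NOMASK;

/*
 * On switching pmaps, a still-pending deferred flush belongs to the
 * outgoing pmap: consume the mask and force the old pmap to take a
 * full flush on its next activation by zeroing its generation.
 */
static void
activate_model(struct pmap_model *new_pmap)
{
	struct pmap_model *old_pmap;

	if (pc_ucr3_load_mask != PMAP_UCR3_NOMASK) {
		pc_ucr3_load_mask = PMAP_UCR3_NOMASK;
		old_pmap = pc_curpmap;
		old_pmap->pm_gen = 0;	/* invalidate on next use */
	}
	pc_curpmap = new_pmap;
}

int
main(void)
{
	struct pmap_model a = { .pm_ucr3 = 0x45000, .pm_gen = 7 };
	struct pmap_model b = { .pm_ucr3 = 0x67000, .pm_gen = 3 };

	pc_curpmap = &a;
	pc_ucr3_load_mask = ~CR3_PCID_SAVE;	/* deferred flush pending */
	activate_model(&b);
	printf("old pmap pm_gen: %ju (0 forces a full flush on reuse)\n",
	    (uintmax_t)a.pm_gen);
	return (0);
}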
Index: head/sys/amd64/include/pcpu.h
===================================================================
--- head/sys/amd64/include/pcpu.h
+++ head/sys/amd64/include/pcpu.h
@@ -99,7 +99,8 @@
 	uint64_t pc_smp_tlb_addr2;					\
 	uint32_t pc_smp_tlb_gen;					\
 	u_int	pc_smp_tlb_op;						\
-	char	__pad[2924]	/* pad to UMA_PCPU_ALLOC_SIZE */
+	uint64_t pc_ucr3_load_mask;					\
+	char	__pad[2916]	/* pad to UMA_PCPU_ALLOC_SIZE */
 
 #define	PC_DBREG_CMD_NONE	0
 #define	PC_DBREG_CMD_LOAD	1
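The pcpu.h hunk shrinks the explicit padding by exactly sizeof(uint64_t), so struct pcpu still fills UMA_PCPU_ALLOC_SIZE after the new field is appended. A trivial check of that arithmetic; the two literals are the ones from the diff, everything else is scaffolding.

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	/*
	 * 2924 - sizeof(uint64_t) == 2916: the pad shrinks by exactly
	 * the size of pc_ucr3_load_mask, keeping sizeof(struct pcpu)
	 * unchanged.
	 */
	assert(2924 - sizeof(uint64_t) == 2916);
	return (0);
}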
Index: head/sys/amd64/include/pmap.h
===================================================================
--- head/sys/amd64/include/pmap.h
+++ head/sys/amd64/include/pmap.h
@@ -241,6 +241,7 @@
 #define	PMAP_PCID_USER_PT	0x800
 
 #define	PMAP_NO_CR3		(~0UL)
+#define	PMAP_UCR3_NOMASK	(~0UL)
 
 #ifndef LOCORE