Index: head/sys/amd64/amd64/pmap.c
===================================================================
--- head/sys/amd64/amd64/pmap.c
+++ head/sys/amd64/amd64/pmap.c
@@ -146,6 +146,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -7435,17 +7436,177 @@
 	return (0);
 }
 
+static uint64_t
+pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid)
+{
+	uint64_t cached;
+
+	cached = pmap_pcid_alloc(pmap, cpuid);
+	KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 &&
+	    pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
+	    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
+	    pmap->pm_pcids[cpuid].pm_pcid));
+	KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
+	    pmap == kernel_pmap,
+	    ("non-kernel pmap pmap %p cpu %d pcid %#x",
+	    pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
+	return (cached);
+}
+
+static void
+pmap_activate_sw_pti_post(pmap_t pmap)
+{
+
+	if (pmap->pm_ucr3 != PMAP_NO_CR3)
+		PCPU_GET(tssp)->tss_rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) +
+		    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful;
+}
+
+static void inline
+pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1)
+{
+	struct invpcid_descr d;
+	uint64_t cached, cr3, kcr3, ucr3;
+
+	cached = pmap_pcid_alloc_checked(pmap, cpuid);
+	cr3 = rcr3();
+	if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
+		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid);
+	PCPU_SET(curpmap, pmap);
+	kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
+	ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
+	    PMAP_PCID_USER_PT;
+
+	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
+		/*
+		 * Explicitly invalidate translations cached from the
+		 * user page table.  They are not automatically
+		 * flushed by reload of cr3 with the kernel page table
+		 * pointer above.
+		 *
+		 * Note that the if() condition is resolved statically
+		 * by using the function argument instead of
+		 * runtime-evaluated invpcid_works value.
+		 */
+		if (invpcid_works1) {
+			d.pcid = PMAP_PCID_USER_PT |
+			    pmap->pm_pcids[cpuid].pm_pcid;
+			d.pad = 0;
+			d.addr = 0;
+			invpcid(&d, INVPCID_CTX);
+		} else {
+			pmap_pti_pcid_invalidate(ucr3, kcr3);
+		}
+	}
+
+	PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
+	PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
+	if (cached)
+		PCPU_INC(pm_save_cnt);
+}
+
+static void
+pmap_activate_sw_pcid_invpcid_pti(pmap_t pmap, u_int cpuid)
+{
+
+	pmap_activate_sw_pcid_pti(pmap, cpuid, true);
+	pmap_activate_sw_pti_post(pmap);
+}
+
+static void
+pmap_activate_sw_pcid_noinvpcid_pti(pmap_t pmap, u_int cpuid)
+{
+	register_t rflags;
+
+	/*
+	 * If the INVPCID instruction is not available,
+	 * invltlb_pcid_handler() is used to handle an invalidate_all
+	 * IPI, which checks for curpmap == smp_tlb_pmap.  The below
+	 * sequence of operations has a window where %CR3 is loaded
+	 * with the new pmap's PML4 address, but the curpmap value has
+	 * not yet been updated.  This causes the invltlb IPI handler,
+	 * which is called between the updates, to execute as a NOP,
+	 * which leaves stale TLB entries.
+	 *
+	 * Note that the most typical use of pmap_activate_sw(), from
+	 * the context switch, is immune to this race, because
+	 * interrupts are disabled (while the thread lock is owned),
+	 * and the IPI happens after curpmap is updated.  Protect
+	 * other callers in a similar way, by disabling interrupts
+	 * around the %cr3 register reload and curpmap assignment.
+	 */
+	rflags = intr_disable();
+	pmap_activate_sw_pcid_pti(pmap, cpuid, false);
+	intr_restore(rflags);
+	pmap_activate_sw_pti_post(pmap);
+}
+
+static void
+pmap_activate_sw_pcid_nopti(pmap_t pmap, u_int cpuid)
+{
+	uint64_t cached, cr3;
+
+	cached = pmap_pcid_alloc_checked(pmap, cpuid);
+	cr3 = rcr3();
+	if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
+		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
+		    cached);
+	PCPU_SET(curpmap, pmap);
+	if (cached)
+		PCPU_INC(pm_save_cnt);
+}
+
+static void
+pmap_activate_sw_pcid_noinvpcid_nopti(pmap_t pmap, u_int cpuid)
+{
+	register_t rflags;
+
+	rflags = intr_disable();
+	pmap_activate_sw_pcid_nopti(pmap, cpuid);
+	intr_restore(rflags);
+}
+
+static void
+pmap_activate_sw_nopcid_nopti(pmap_t pmap, u_int cpuid __unused)
+{
+
+	load_cr3(pmap->pm_cr3);
+	PCPU_SET(curpmap, pmap);
+}
+
+static void
+pmap_activate_sw_nopcid_pti(pmap_t pmap, u_int cpuid __unused)
+{
+
+	pmap_activate_sw_nopcid_nopti(pmap, cpuid);
+	PCPU_SET(kcr3, pmap->pm_cr3);
+	PCPU_SET(ucr3, pmap->pm_ucr3);
+	pmap_activate_sw_pti_post(pmap);
+}
+
+DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (pmap_t, u_int), static)
+{
+
+	if (pmap_pcid_enabled && pti && invpcid_works)
+		return (pmap_activate_sw_pcid_invpcid_pti);
+	else if (pmap_pcid_enabled && pti && !invpcid_works)
+		return (pmap_activate_sw_pcid_noinvpcid_pti);
+	else if (pmap_pcid_enabled && !pti && invpcid_works)
+		return (pmap_activate_sw_pcid_nopti);
+	else if (pmap_pcid_enabled && !pti && !invpcid_works)
+		return (pmap_activate_sw_pcid_noinvpcid_nopti);
+	else if (!pmap_pcid_enabled && pti)
+		return (pmap_activate_sw_nopcid_pti);
+	else /* if (!pmap_pcid_enabled && !pti) */
+		return (pmap_activate_sw_nopcid_nopti);
+}
+
 void
 pmap_activate_sw(struct thread *td)
 {
 	pmap_t oldpmap, pmap;
-	struct invpcid_descr d;
-	uint64_t cached, cr3, kcr3, kern_pti_cached, rsp0, ucr3;
-	register_t rflags;
 	u_int cpuid;
-	struct amd64tss *tssp;
 
-	rflags = 0;
 	oldpmap = PCPU_GET(curpmap);
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	if (oldpmap == pmap)
@@ -7456,91 +7617,7 @@
 #else
 	CPU_SET(cpuid, &pmap->pm_active);
 #endif
-	cr3 = rcr3();
-	if (pmap_pcid_enabled) {
-		cached = pmap_pcid_alloc(pmap, cpuid);
-		KASSERT(pmap->pm_pcids[cpuid].pm_pcid >= 0 &&
-		    pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
-		    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
-		    pmap->pm_pcids[cpuid].pm_pcid));
-		KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
-		    pmap == kernel_pmap,
-		    ("non-kernel pmap thread %p pmap %p cpu %d pcid %#x",
-		    td, pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
-
-		/*
-		 * If the INVPCID instruction is not available,
-		 * invltlb_pcid_handler() is used for handle
-		 * invalidate_all IPI, which checks for curpmap ==
-		 * smp_tlb_pmap.  Below operations sequence has a
-		 * window where %CR3 is loaded with the new pmap's
-		 * PML4 address, but curpmap value is not yet updated.
-		 * This causes invltlb IPI handler, called between the
-		 * updates, to execute as NOP, which leaves stale TLB
-		 * entries.
-		 *
-		 * Note that the most typical use of
-		 * pmap_activate_sw(), from the context switch, is
-		 * immune to this race, because interrupts are
-		 * disabled (while the thread lock is owned), and IPI
-		 * happens after curpmap is updated.  Protect other
-		 * callers in a similar way, by disabling interrupts
-		 * around the %cr3 register reload and curpmap
-		 * assignment.
-		 */
-		if (!invpcid_works)
-			rflags = intr_disable();
-
-		kern_pti_cached = pti ? 0 : cached;
-		if (!kern_pti_cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3) {
-			load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
-			    kern_pti_cached);
-		}
-		PCPU_SET(curpmap, pmap);
-		if (pti) {
-			kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
-			ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
-			    PMAP_PCID_USER_PT;
-
-			if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
-				/*
-				 * Manually invalidate translations cached
-				 * from the user page table.  They are not
-				 * flushed by reload of cr3 with the kernel
-				 * page table pointer above.
-				 */
-				if (invpcid_works) {
-					d.pcid = PMAP_PCID_USER_PT |
-					    pmap->pm_pcids[cpuid].pm_pcid;
-					d.pad = 0;
-					d.addr = 0;
-					invpcid(&d, INVPCID_CTX);
-				} else {
-					pmap_pti_pcid_invalidate(ucr3, kcr3);
-				}
-			}
-
-			PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
-			PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
-		}
-		if (!invpcid_works)
-			intr_restore(rflags);
-		if (cached)
-			PCPU_INC(pm_save_cnt);
-	} else {
-		load_cr3(pmap->pm_cr3);
-		PCPU_SET(curpmap, pmap);
-		if (pti) {
-			PCPU_SET(kcr3, pmap->pm_cr3);
-			PCPU_SET(ucr3, pmap->pm_ucr3);
-		}
-	}
-	if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-		rsp0 = ((vm_offset_t)PCPU_PTR(pti_stack) +
-		    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful;
-		tssp = PCPU_GET(tssp);
-		tssp->tss_rsp0 = rsp0;
-	}
+	pmap_activate_sw_mode(pmap, cpuid);
 #ifdef SMP
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 #else
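
Background note on the dispatch mechanism: DEFINE_IFUNC above emits an ELF ifunc, so the resolver body runs once, when relocations are processed at boot or module load, and pmap_activate_sw_mode() is bound directly to the variant matching the boot-time pmap_pcid_enabled/pti/invpcid_works configuration.  The per-context-switch path then pays one direct call instead of re-testing those flags.  The fragment below is a minimal stand-alone sketch of the same ifunc pattern in user-space C, assuming a GNU toolchain on an ELF target; the feature flag, function names, and printf bodies are illustrative only and are not part of this change.

/*
 * Minimal user-space sketch of the ifunc dispatch pattern used by
 * DEFINE_IFUNC above.  Assumes a GNU toolchain on an ELF target;
 * all names below are illustrative, not kernel interfaces.
 */
#include <stdbool.h>
#include <stdio.h>

typedef void (*activate_fn)(int);

/* Stand-in for a boot-time capability flag such as invpcid_works. */
static const bool feature_present = true;

static void
activate_with_feature(int cpu)
{
	printf("feature path on cpu %d\n", cpu);
}

static void
activate_fallback(int cpu)
{
	printf("fallback path on cpu %d\n", cpu);
}

/*
 * Resolver: evaluated once when relocations are processed, before
 * main() runs.  The returned pointer is what callers of activate()
 * actually invoke, so no per-call flag test remains.
 */
static activate_fn
resolve_activate(void)
{
	return (feature_present ? activate_with_feature : activate_fallback);
}

/* All callers see an ordinary function named activate(). */
void activate(int cpu) __attribute__((ifunc("resolve_activate")));

int
main(void)
{
	activate(0);
	return (0);
}

Built with a stock cc on such a target, the branchy resolver disappears from the hot path; that is the same effect the ifunc gives pmap_activate_sw() in this change.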