Index: sys/amd64/amd64/mp_machdep.c =================================================================== --- sys/amd64/amd64/mp_machdep.c +++ sys/amd64/amd64/mp_machdep.c @@ -635,11 +635,11 @@ } /* - * Used by pmap to request cache or TLB invalidation on local and - * remote processors. Mask provides the set of remote CPUs which are + * Used by the pmap to request cache or TLB invalidation on local and + * remote processors. Mask provides the set of remote CPUs that are * to be signalled with the invalidation IPI. As an optimization, the - * curcpu_cb callback is invoked on the calling CPU while waiting for - * remote CPUs to complete the operation. + * curcpu_cb callback is invoked on the calling CPU in a critical + * section while waiting for the remote CPUs to complete the operation. * * The callback function is called unconditionally on the caller's * underlying processor, even when this processor is not set in the @@ -649,6 +649,9 @@ * Interrupts must be enabled when calling the function with smp * started, to avoid deadlock with other IPIs that are protected with * smp_ipi_mtx spinlock at the initiator side. + * + * The function must be called with the thread pinned, and it unpins + * the thread on completion. */ static void smp_targeted_tlb_shootdown(cpuset_t mask, pmap_t pmap, vm_offset_t addr1, @@ -662,23 +665,21 @@ * It is not necessary to signal other CPUs while booting or * when in the debugger. */ - if (kdb_active || KERNEL_PANICKED() || !smp_started) { - curcpu_cb(pmap, addr1, addr2); - return; - } + if (kdb_active || KERNEL_PANICKED() || !smp_started) + goto local_cb; - sched_pin(); + KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); /* * Check for other cpus. Return if none. 
*/ if (CPU_ISFULLSET(&mask)) { if (mp_ncpus <= 1) - goto nospinexit; + goto local_cb; } else { CPU_CLR(PCPU_GET(cpuid), &mask); if (CPU_EMPTY(&mask)) - goto nospinexit; + goto local_cb; } /* @@ -734,13 +735,22 @@ while (atomic_load_int(p_cpudone) != generation) ia32_pause(); } - critical_exit(); + + /* + * Unpin before leaving the critical section. If the thread + * owes preemption, this allows the scheduler to select the + * thread on any CPU from its cpuset. + */ sched_unpin(); + critical_exit(); + return; -nospinexit: +local_cb: + critical_enter(); curcpu_cb(pmap, addr1, addr2); sched_unpin(); + critical_exit(); } void Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c +++ sys/amd64/amd64/pmap.c @@ -2773,54 +2773,19 @@ static cpuset_t pmap_invalidate_cpu_mask(pmap_t pmap) { - return (pmap == kernel_pmap ? all_cpus : pmap->pm_active); } static inline void -pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va, - const bool invpcid_works1) +pmap_invalidate_preipi_pcid(pmap_t pmap) { - struct invpcid_descr d; - uint64_t kcr3, ucr3; - uint32_t pcid; u_int cpuid, i; + sched_pin(); + cpuid = PCPU_GET(cpuid); - if (pmap == PCPU_GET(curpmap)) { - if (pmap->pm_ucr3 != PMAP_NO_CR3 && - /* - * If we context-switched right after - * PCPU_GET(ucr3_load_mask), we could read the - * ~CR3_PCID_SAVE mask, which causes us to skip - * the code below to invalidate user pages. This - * is handled in pmap_activate_sw_pcid_pti() by - * clearing pm_gen if ucr3_load_mask is ~CR3_PCID_SAVE. - */ - PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { - /* - * Because pm_pcid is recalculated on a - * context switch, we must disable switching. - * Otherwise, we might use a stale value - * below. 
- */ - critical_enter(); - pcid = pmap->pm_pcids[cpuid].pm_pcid; - if (invpcid_works1) { - d.pcid = pcid | PMAP_PCID_USER_PT; - d.pad = 0; - d.addr = va; - invpcid(&d, INVPCID_ADDR); - } else { - kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; - ucr3 = pmap->pm_ucr3 | pcid | - PMAP_PCID_USER_PT | CR3_PCID_SAVE; - pmap_pti_pcid_invlpg(ucr3, kcr3, va); - } - critical_exit(); - } - } else - pmap->pm_pcids[cpuid].pm_gen = 0; + if (pmap != PCPU_GET(curpmap)) + cpuid = 0xffffffff; /* An impossible value */ CPU_FOREACH(i) { if (cpuid != i) @@ -2841,51 +2806,98 @@ } static void -pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va) +pmap_invalidate_preipi_nopcid(pmap_t pmap __unused) { + sched_pin(); +} - pmap_invalidate_page_pcid(pmap, va, true); +DEFINE_IFUNC(static, void, pmap_invalidate_preipi, (pmap_t)) +{ + return (pmap_pcid_enabled ? pmap_invalidate_preipi_pcid : + pmap_invalidate_preipi_nopcid); } -static void -pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va) +static inline void +pmap_invalidate_page_pcid_cb(pmap_t pmap, vm_offset_t va, + const bool invpcid_works1) { + struct invpcid_descr d; + uint64_t kcr3, ucr3; + uint32_t pcid; + u_int cpuid; + + /* + * Because pm_pcid is recalculated on a context switch, we + * must ensure there is no preemption, not just pinning. + * Otherwise, we might use a stale value below. + */ + CRITICAL_ASSERT(curthread); + + /* + * No need to do anything with user page tables invalidation + * if there is no user page table, or invalidation is deferred + * until the return to userspace. ucr3_load_mask is stable + * because we have preemption disabled. 
+ */ + if (pmap->pm_ucr3 == PMAP_NO_CR3 || + PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) + return; + + cpuid = PCPU_GET(cpuid); + + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works1) { + d.pcid = pcid | PMAP_PCID_USER_PT; + d.pad = 0; + d.addr = va; + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlpg(ucr3, kcr3, va); + } +} - pmap_invalidate_page_pcid(pmap, va, false); +static void +pmap_invalidate_page_pcid_invpcid_cb(pmap_t pmap, vm_offset_t va) +{ + pmap_invalidate_page_pcid_cb(pmap, va, true); } static void -pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va) +pmap_invalidate_page_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t va) { + pmap_invalidate_page_pcid_cb(pmap, va, false); } -DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t)) +static void +pmap_invalidate_page_nopcid_cb(pmap_t pmap __unused, vm_offset_t va __unused) { +} +DEFINE_IFUNC(static, void, pmap_invalidate_page_cb, (pmap_t, vm_offset_t)) +{ if (pmap_pcid_enabled) - return (invpcid_works ? pmap_invalidate_page_pcid_invpcid : - pmap_invalidate_page_pcid_noinvpcid); - return (pmap_invalidate_page_nopcid); + return (invpcid_works ? 
pmap_invalidate_page_pcid_invpcid_cb : + pmap_invalidate_page_pcid_noinvpcid_cb); + return (pmap_invalidate_page_nopcid_cb); } static void pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va, vm_offset_t addr2 __unused) { - if (pmap == kernel_pmap) { invlpg(va); - } else { - if (pmap == PCPU_GET(curpmap)) - invlpg(va); - pmap_invalidate_page_mode(pmap, va); + } else if (pmap == PCPU_GET(curpmap)) { + invlpg(va); + pmap_invalidate_page_cb(pmap, va); } } void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { - if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; @@ -2894,6 +2906,7 @@ KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_page: invalid type %d", pmap->pm_type)); + pmap_invalidate_preipi(pmap); smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap, pmap_invalidate_page_curcpu_cb); } @@ -2902,74 +2915,63 @@ #define PMAP_INVLPG_THRESHOLD (4 * 1024 * PAGE_SIZE) static void -pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, +pmap_invalidate_range_pcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, const bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3, ucr3; uint32_t pcid; - u_int cpuid, i; + u_int cpuid; + + CRITICAL_ASSERT(curthread); + + if (pmap != PCPU_GET(curpmap) || + pmap->pm_ucr3 == PMAP_NO_CR3 || + PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) + return; cpuid = PCPU_GET(cpuid); - if (pmap == PCPU_GET(curpmap)) { - if (pmap->pm_ucr3 != PMAP_NO_CR3 && - PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) { - critical_enter(); - pcid = pmap->pm_pcids[cpuid].pm_pcid; - if (invpcid_works1) { - d.pcid = pcid | PMAP_PCID_USER_PT; - d.pad = 0; - d.addr = sva; - for (; d.addr < eva; d.addr += PAGE_SIZE) - invpcid(&d, INVPCID_ADDR); - } else { - kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; - ucr3 = pmap->pm_ucr3 | pcid | - PMAP_PCID_USER_PT | CR3_PCID_SAVE; - pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); - } - critical_exit(); - } - } else - pmap->pm_pcids[cpuid].pm_gen = 0; - CPU_FOREACH(i) { - if (cpuid 
!= i) - pmap->pm_pcids[i].pm_gen = 0; + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works1) { + d.pcid = pcid | PMAP_PCID_USER_PT; + d.pad = 0; + for (d.addr = sva; d.addr < eva; d.addr += PAGE_SIZE) + invpcid(&d, INVPCID_ADDR); + } else { + kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE; + ucr3 = pmap->pm_ucr3 | pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE; + pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva); } - /* See the comment in pmap_invalidate_page_pcid(). */ - atomic_thread_fence_seq_cst(); } static void -pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva, +pmap_invalidate_range_pcid_invpcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { - - pmap_invalidate_range_pcid(pmap, sva, eva, true); + pmap_invalidate_range_pcid_cb(pmap, sva, eva, true); } static void -pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva, +pmap_invalidate_range_pcid_noinvpcid_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { - - pmap_invalidate_range_pcid(pmap, sva, eva, false); + pmap_invalidate_range_pcid_cb(pmap, sva, eva, false); } static void -pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) +pmap_invalidate_range_nopcid_cb(pmap_t pmap __unused, vm_offset_t sva __unused, + vm_offset_t eva __unused) { } -DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t, +DEFINE_IFUNC(static, void, pmap_invalidate_range_cb, (pmap_t, vm_offset_t, vm_offset_t)) { - if (pmap_pcid_enabled) - return (invpcid_works ? pmap_invalidate_range_pcid_invpcid : - pmap_invalidate_range_pcid_noinvpcid); - return (pmap_invalidate_range_nopcid); + return (invpcid_works ? 
pmap_invalidate_range_pcid_invpcid_cb : + pmap_invalidate_range_pcid_noinvpcid_cb); + return (pmap_invalidate_range_nopcid_cb); } static void @@ -2980,19 +2982,16 @@ if (pmap == kernel_pmap) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); - } else { - if (pmap == PCPU_GET(curpmap)) { - for (addr = sva; addr < eva; addr += PAGE_SIZE) - invlpg(addr); - } - pmap_invalidate_range_mode(pmap, sva, eva); + } else if (pmap == PCPU_GET(curpmap)) { + for (addr = sva; addr < eva; addr += PAGE_SIZE) + invlpg(addr); + pmap_invalidate_range_cb(pmap, sva, eva); } } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { - if (eva - sva >= PMAP_INVLPG_THRESHOLD) { pmap_invalidate_all(pmap); return; @@ -3006,17 +3005,18 @@ KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_range: invalid type %d", pmap->pm_type)); + pmap_invalidate_preipi(pmap); smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap, pmap_invalidate_range_curcpu_cb); } static inline void -pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1) +pmap_invalidate_all_pcid_cb(pmap_t pmap, bool invpcid_works1) { struct invpcid_descr d; uint64_t kcr3; uint32_t pcid; - u_int cpuid, i; + u_int cpuid; if (pmap == kernel_pmap) { if (invpcid_works1) { @@ -3025,79 +3025,64 @@ } else { invltlb_glob(); } - } else { + } else if (pmap == PCPU_GET(curpmap)) { + CRITICAL_ASSERT(curthread); cpuid = PCPU_GET(cpuid); - if (pmap == PCPU_GET(curpmap)) { - critical_enter(); - pcid = pmap->pm_pcids[cpuid].pm_pcid; - if (invpcid_works1) { - d.pcid = pcid; - d.pad = 0; - d.addr = 0; - invpcid(&d, INVPCID_CTX); - } else { - kcr3 = pmap->pm_cr3 | pcid; - load_cr3(kcr3); - } - if (pmap->pm_ucr3 != PMAP_NO_CR3) - PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); - critical_exit(); - } else - pmap->pm_pcids[cpuid].pm_gen = 0; - CPU_FOREACH(i) { - if (cpuid != i) - pmap->pm_pcids[i].pm_gen = 0; + + pcid = pmap->pm_pcids[cpuid].pm_pcid; + if (invpcid_works1) { + d.pcid = pcid; + d.pad = 0; + d.addr = 
0; + invpcid(&d, INVPCID_CTX); + } else { + kcr3 = pmap->pm_cr3 | pcid; + load_cr3(kcr3); } + if (pmap->pm_ucr3 != PMAP_NO_CR3) + PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE); } - /* See the comment in pmap_invalidate_page_pcid(). */ - atomic_thread_fence_seq_cst(); } static void -pmap_invalidate_all_pcid_invpcid(pmap_t pmap) +pmap_invalidate_all_pcid_invpcid_cb(pmap_t pmap) { - - pmap_invalidate_all_pcid(pmap, true); + pmap_invalidate_all_pcid_cb(pmap, true); } static void -pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap) +pmap_invalidate_all_pcid_noinvpcid_cb(pmap_t pmap) { - - pmap_invalidate_all_pcid(pmap, false); + pmap_invalidate_all_pcid_cb(pmap, false); } static void -pmap_invalidate_all_nopcid(pmap_t pmap) +pmap_invalidate_all_nopcid_cb(pmap_t pmap) { - if (pmap == kernel_pmap) invltlb_glob(); else if (pmap == PCPU_GET(curpmap)) invltlb(); } -DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t)) +DEFINE_IFUNC(static, void, pmap_invalidate_all_cb, (pmap_t)) { - if (pmap_pcid_enabled) - return (invpcid_works ? pmap_invalidate_all_pcid_invpcid : - pmap_invalidate_all_pcid_noinvpcid); - return (pmap_invalidate_all_nopcid); + return (invpcid_works ? 
pmap_invalidate_all_pcid_invpcid_cb : + pmap_invalidate_all_pcid_noinvpcid_cb); + return (pmap_invalidate_all_nopcid_cb); } static void pmap_invalidate_all_curcpu_cb(pmap_t pmap, vm_offset_t addr1 __unused, vm_offset_t addr2 __unused) { - - pmap_invalidate_all_mode(pmap); + pmap_invalidate_all_cb(pmap); } void pmap_invalidate_all(pmap_t pmap) { - if (pmap_type_guest(pmap)) { pmap_invalidate_ept(pmap); return; @@ -3106,6 +3091,7 @@ KASSERT(pmap->pm_type == PT_X86, ("pmap_invalidate_all: invalid type %d", pmap->pm_type)); + pmap_invalidate_preipi(pmap); smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap, pmap_invalidate_all_curcpu_cb); } @@ -3114,14 +3100,13 @@ pmap_invalidate_cache_curcpu_cb(pmap_t pmap __unused, vm_offset_t va __unused, vm_offset_t addr2 __unused) { - wbinvd(); } void pmap_invalidate_cache(void) { - + sched_pin(); smp_cache_flush(pmap_invalidate_cache_curcpu_cb); }