Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -468,21 +468,45 @@
 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
     LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
 static struct mtx invl_gen_mtx;
-static u_long pmap_invl_gen = 0;
 /* Fake lock object to satisfy turnstiles interface. */
 static struct lock_object invl_gen_ts = {
	.lo_name = "invlts",
 };
+static struct pmap_invl_gen pmap_invl_gen_head = {
+	.gen = 1,
+	.next = NULL,
+};
+static u_long pmap_invl_gen = 1;
+
+#define	PMAP_ASSERT_NOT_IN_DI() \
+    KASSERT(pmap_not_in_di(), ("DI already started"))
+
+static bool pmap_not_in_di_l(void);
+static bool pmap_not_in_di_u(void);
+DEFINE_IFUNC(, bool, pmap_not_in_di, (void), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ? pmap_not_in_di_l :
+	    pmap_not_in_di_u);
+}
 
 static bool
-pmap_not_in_di(void)
+pmap_not_in_di_l(void)
 {
+	struct pmap_invl_gen *invl_gen;
 
-	return (curthread->td_md.md_invl_gen.gen == 0);
+	invl_gen = &curthread->td_md.md_invl_gen;
+	return (invl_gen->gen == 0);
 }
 
-#define	PMAP_ASSERT_NOT_IN_DI() \
-    KASSERT(pmap_not_in_di(), ("DI already started"))
+static void
+pmap_thread_init_invl_gen_l(struct thread *td)
+{
+	struct pmap_invl_gen *invl_gen;
+
+	invl_gen = &td->td_md.md_invl_gen;
+	invl_gen->gen = 0;
+}
 
 /*
  * Start a new Delayed Invalidation (DI) block of code, executed by
@@ -493,7 +517,7 @@
  * pmap active.
  */
 static void
-pmap_delayed_invl_started(void)
+pmap_delayed_invl_started_l(void)
 {
 	struct pmap_invl_gen *invl_gen;
 	u_long currgen;
@@ -525,7 +549,7 @@
  * current thread's DI.
  */
 static void
-pmap_delayed_invl_finished(void)
+pmap_delayed_invl_finished_l(void)
 {
 	struct pmap_invl_gen *invl_gen, *next;
 	struct turnstile *ts;
@@ -551,6 +575,215 @@
 	invl_gen->gen = 0;
 }
 
+static bool
+pmap_not_in_di_u(void)
+{
+	struct pmap_invl_gen *invl_gen;
+
+	invl_gen = &curthread->td_md.md_invl_gen;
+	return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
+}
+
+static void
+pmap_thread_init_invl_gen_u(struct thread *td)
+{
+	struct pmap_invl_gen *invl_gen;
+
+	invl_gen = &td->td_md.md_invl_gen;
+	invl_gen->gen = 0;
+	invl_gen->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
+}
+
+static bool
+pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
+{
+	uint64_t new_high, new_low, old_high, old_low;
+	char res;
+
+	old_low = new_low = 0;
+	old_high = new_high = (uintptr_t)0;
+
+	__asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
+	    : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
+	    : "b"(new_low), "c" (new_high)
+	    : "memory", "cc");
+	if (res == 0) {
+		if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
+			return (false);
+		out->gen = old_low;
+		out->next = (void *)old_high;
+	} else {
+		out->gen = new_low;
+		out->next = (void *)new_high;
+	}
+	return (true);
+}
+
+static bool
+pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
+    struct pmap_invl_gen *new_val)
+{
+	uint64_t new_high, new_low, old_high, old_low;
+	char res;
+
+	new_low = new_val->gen;
+	new_high = (uintptr_t)new_val->next;
+	old_low = old_val->gen;
+	old_high = (uintptr_t)old_val->next;
+
+	__asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
+	    : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
+	    : "b"(new_low), "c" (new_high)
+	    : "memory", "cc");
+	return (res);
+}
+
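The two helpers above implement a 16-byte atomic load and compare-and-swap over the (gen, next) pair; pmap_di_load_invl() issues cmpxchg16b with a zero expected/new pair, so a successful compare leaves memory unchanged and the instruction doubles as an atomic 128-bit read. Below is a minimal userspace sketch of the same primitive, assuming GCC or Clang on amd64 and a CPU with CMPXCHG16B; the names are illustrative, not from the patch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pair {
	uint64_t gen;		/* stands in for pmap_invl_gen.gen */
	uint64_t next;		/* stands in for pmap_invl_gen.next */
} __attribute__((aligned(16)));	/* cmpxchg16b requires 16-byte alignment */

static bool
pair_cmpset(struct pair *ptr, struct pair *exp, const struct pair *new_val)
{
	char res;

	/*
	 * RDX:RAX holds the expected value, RCX:RBX the replacement.
	 * ZF reports success, captured by sete.  On failure cmpxchg16b
	 * writes the current memory contents back into RDX:RAX, so
	 * *exp is updated, which is the conventional CAS interface.
	 */
	__asm volatile("lock; cmpxchg16b %1; sete %0"
	    : "=r" (res), "+m" (*ptr), "+a" (exp->gen), "+d" (exp->next)
	    : "b" (new_val->gen), "c" (new_val->next)
	    : "memory", "cc");
	return (res != 0);
}

int
main(void)
{
	struct pair p = { .gen = 1, .next = 0 };
	struct pair exp = { .gen = 1, .next = 0 };
	struct pair nv = { .gen = 2, .next = 0x10 };

	printf("cas ok: %d, gen now %ju\n", pair_cmpset(&p, &exp, &nv),
	    (uintmax_t)p.gen);
	return (0);
}

Unlike this sketch, pmap_di_store_invl() deliberately works on local copies and only reports failure, leaving the caller to re-walk the list from the head.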
+#ifdef PV_STATS
+static long invl_start_restart;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD,
+    &invl_start_restart, 0,
+    "Number of delayed invalidation start restarts");
+static long invl_finish_restart;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
+    &invl_finish_restart, 0,
+    "Number of delayed invalidation finish restarts");
+static int invl_max_qlen;
+SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
+    &invl_max_qlen, 0,
+    "Maximum delayed invalidation request queue length");
+#endif
+
+static struct lock_delay_config __read_frequently di_delay;
+LOCK_DELAY_SYSINIT_DEFAULT(di_delay);
+
+static void
+pmap_delayed_invl_started_u(void)
+{
+	struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
+	struct lock_delay_arg lda;
+	uintptr_t prevl;
+#ifdef PV_STATS
+	int i, ii;
+#endif
+
+	invl_gen = &curthread->td_md.md_invl_gen;
+	PMAP_ASSERT_NOT_IN_DI();
+	lock_delay_arg_init(&lda, &di_delay);
+
+again:
+	PV_STAT(i = 0);
+	for (p = &pmap_invl_gen_head;; p = prev.next) {
+		PV_STAT(i++);
+		prevl = atomic_load_ptr(&p->next);
+		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
+			PV_STAT(atomic_add_long(&invl_start_restart, 1));
+			lock_delay(&lda);
+			goto again;
+		}
+		if (prevl == 0)
+			break;
+		prev.next = (void *)prevl;
+	}
+#ifdef PV_STATS
+	if ((ii = invl_max_qlen) < i)
+		atomic_cmpset_int(&invl_max_qlen, ii, i);
+#endif
+
+	if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
+		PV_STAT(atomic_add_long(&invl_start_restart, 1));
+		lock_delay(&lda);
+		goto again;
+	}
+
+	new_prev.gen = prev.gen;
+	new_prev.next = invl_gen;
+	invl_gen->gen = prev.gen + 1;
+
+	/* Formal fence between the store to invl_gen->gen and updating *p. */
+	atomic_thread_fence_rel();
+
+	/*
+	 * An ABA situation for *p is impossible here, since p->gen
+	 * can only increase.  So if the thread owning *p finished its
+	 * DI, then started a new one and was inserted into the list
+	 * at the same place, its gen will appear greater than the
+	 * previously read gen.
+	 */
+	if (!pmap_di_store_invl(p, &prev, &new_prev)) {
+		PV_STAT(atomic_add_long(&invl_start_restart, 1));
+		lock_delay(&lda);
+		goto again;
+	}
+
+	/*
+	 * Here we clear PMAP_INVL_GEN_NEXT_INVALID in
+	 * invl_gen->next, allowing other threads to iterate past us.
+	 * pmap_di_store_invl() provides a fence between the
+	 * generation write and the update of next.
+	 */
+	invl_gen->next = NULL;
+	return;
+}
+
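The function above appends the current thread's pmap_invl_gen to the tail of the DI queue: walk until next is NULL, restart if any node carries the INVALID mark, then publish with the 16-byte CAS. A simplified sketch of that append-with-restart pattern, using a plain single-pointer CAS with C11 atomics and omitting the generation pairing and INVALID handshake (all names invented):

#include <stdatomic.h>
#include <stddef.h>

struct node {
	unsigned long gen;
	_Atomic(struct node *) next;
};

static struct node head;	/* gen 0, next NULL */

static void
list_append(struct node *self)
{
	struct node *p, *expect;

again:
	/* Walk to the current tail, as the kernel loop does. */
	for (p = &head; atomic_load(&p->next) != NULL;
	    p = atomic_load(&p->next))
		;
	self->gen = p->gen + 1;	/* a new block gets a larger generation */
	atomic_store(&self->next, NULL);
	expect = NULL;
	/* Another thread may have appended first; rewalk from the head. */
	if (!atomic_compare_exchange_strong(&p->next, &expect, self))
		goto again;
}

int
main(void)
{
	struct node a, b;

	list_append(&a);
	list_append(&b);
	return (head.next == &a && a.next == &b ? 0 : 1);
}

The kernel version must CAS the (gen, next) pair as a unit because a waiter reads the head's generation concurrently; a lone pointer CAS, as sketched here, could not keep the two fields consistent.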
+static void
+pmap_delayed_invl_finished_u(void)
+{
+	struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
+	struct lock_delay_arg lda;
+	uintptr_t prevl;
+	u_long mygen;
+
+	invl_gen = &curthread->td_md.md_invl_gen;
+	KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
+	KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
+	    ("missed invl_start: INVALID"));
+	lock_delay_arg_init(&lda, &di_delay);
+
+	atomic_set_ptr((uintptr_t *)&invl_gen->next,
+	    PMAP_INVL_GEN_NEXT_INVALID);
+
+	/*
+	 * Load invl_gen->gen after setting PMAP_INVL_GEN_NEXT_INVALID
+	 * in invl_gen->next.  This prevents larger generations from
+	 * propagating to our invl_gen->gen.  The lock prefix in
+	 * atomic_set_ptr() acts as a seq_cst fence.
+	 */
+	mygen = atomic_load_long(&invl_gen->gen);
+
+again:
+	for (p = &pmap_invl_gen_head; p != NULL; p = prev.next) {
+		prevl = atomic_load_ptr(&p->next);
+		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
+			PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+			lock_delay(&lda);
+			goto again;
+		}
+		prev.next = (void *)prevl;
+		if (prev.next == invl_gen)
+			break;
+	}
+	KASSERT(p != NULL, ("invl_gen not found on the DI list"));
+	if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen) {
+		PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+		lock_delay(&lda);
+		goto again;
+	}
+	KASSERT(prev.gen < mygen,
+	    ("invalid di gen sequence %lu %lu", prev.gen, mygen));
+	new_prev.gen = mygen;
+	new_prev.next = (void *)((uintptr_t)invl_gen->next &
+	    ~PMAP_INVL_GEN_NEXT_INVALID);
+
+	/* Formal fence between the load of prev and storing the update. */
+	atomic_thread_fence_rel();
+
+	if (!pmap_di_store_invl(p, &prev, &new_prev)) {
+		PV_STAT(atomic_add_long(&invl_finish_restart, 1));
+		lock_delay(&lda);
+		goto again;
+	}
+}
+
 #ifdef PV_STATS
 static long invl_wait;
 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
@@ -579,7 +812,7 @@
  * processor.
  */
 static void
-pmap_delayed_invl_wait(vm_page_t m)
+pmap_delayed_invl_wait_l(vm_page_t m)
 {
 	struct turnstile *ts;
 	u_long *m_gen;
@@ -603,6 +836,54 @@
 	}
 }
 
+static void
+pmap_delayed_invl_wait_u(vm_page_t m)
+{
+	u_long *m_gen;
+#ifdef PV_STATS
+	bool accounted = false;
+#endif
+
+	m_gen = pmap_delayed_invl_genp(m);
+	while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
+#ifdef PV_STATS
+		if (!accounted) {
+			atomic_add_long(&invl_wait, 1);
+			accounted = true;
+		}
+#endif
+		kern_yield(PRI_USER);
+	}
+}
+
+DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+	    pmap_thread_init_invl_gen_l : pmap_thread_init_invl_gen_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_started, (void), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+	    pmap_delayed_invl_started_l : pmap_delayed_invl_started_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_finished, (void), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+	    pmap_delayed_invl_finished_l : pmap_delayed_invl_finished_u);
+}
+
+DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t), static)
+{
+
+	return ((cpu_feature2 & CPUID2_CX16) == 0 ?
+	    pmap_delayed_invl_wait_l : pmap_delayed_invl_wait_u);
+}
+
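The resolvers above run once, at ifunc resolution time, so every later pmap_delayed_invl_*() call is a plain call into whichever implementation the boot CPU supports; the lockless variants require the CMPXCHG16B feature (CPUID.01H:ECX bit 13). A userspace sketch of the same dispatch using the GNU ifunc attribute (GCC or Clang on ELF; the helper names are invented):

#include <cpuid.h>
#include <stdio.h>

static void di_start_locked(void)   { puts("locked DI"); }
static void di_start_lockless(void) { puts("lockless DI"); }

/* CPUID leaf 1, ECX bit 13 reports cmpxchg16b support. */
static int
have_cx16(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return (0);
	return ((ecx & bit_CMPXCHG16B) != 0);
}

/*
 * The resolver runs during relocation processing, before main(),
 * and its return value is patched in as the function's address.
 */
static void (*resolve_di_start(void))(void)
{
	return (have_cx16() ? di_start_lockless : di_start_locked);
}

void di_start(void) __attribute__((ifunc("resolve_di_start")));

int
main(void)
{
	di_start();	/* calls whichever implementation was selected */
	return (0);
}

This is why the selection costs nothing per call: there is no runtime branch, only an indirect-to-direct binding performed once.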
 /*
  * Mark the page m's PV list as participating in the current thread's
  * DI block.  Any threads concurrently using m's PV list to remove or
@@ -2854,6 +3135,7 @@
 pmap_pinit0(pmap_t pmap)
 {
 	struct proc *p;
+	struct thread *td;
 	int i;
 
 	PMAP_LOCK_INIT(pmap);
@@ -2872,12 +3154,14 @@
 		pmap->pm_pcids[i].pm_gen = 1;
 	}
 	pmap_activate_boot(pmap);
+	td = curthread;
 	if (pti) {
-		p = curproc;
+		p = td->td_proc;
 		PROC_LOCK(p);
 		p->p_md.md_flags |= P_MD_KPTI;
 		PROC_UNLOCK(p);
 	}
+	pmap_thread_init_invl_gen(td);
 
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 		pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
Index: sys/amd64/amd64/trap.c
===================================================================
--- sys/amd64/amd64/trap.c
+++ sys/amd64/amd64/trap.c
@@ -1183,7 +1183,7 @@
 	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
 	    ("System call %s returning with mangled pcb_save",
 	     syscallname(td->td_proc, td->td_sa.code)));
-	KASSERT(td->td_md.md_invl_gen.gen == 0,
+	KASSERT(pmap_not_in_di(),
 	    ("System call %s returning with leaked invl_gen %lu",
 	     syscallname(td->td_proc, td->td_sa.code),
 	     td->td_md.md_invl_gen.gen));
Index: sys/amd64/amd64/vm_machdep.c
===================================================================
--- sys/amd64/amd64/vm_machdep.c
+++ sys/amd64/amd64/vm_machdep.c
@@ -228,7 +228,7 @@
 	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
-	td2->td_md.md_invl_gen.gen = 0;
+	pmap_thread_init_invl_gen(td2);
 
 	/* As an i386, do not copy io permission bitmap. */
 	pcb2->pcb_tssp = NULL;
@@ -544,6 +544,7 @@
 	/* Setup to release spin count in fork_exit(). */
 	td->td_md.md_spinlock_count = 1;
 	td->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
+	pmap_thread_init_invl_gen(td);
 }
 
 /*
Index: sys/amd64/conf/X
===================================================================
--- sys/amd64/conf/X
+++ sys/amd64/conf/X
@@ -86,3 +86,4 @@
 device		uart
 device		random
 
+options 	PV_STATS
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h
+++ sys/amd64/include/pmap.h
@@ -441,6 +441,7 @@
 void	*pmap_mapdev(vm_paddr_t, vm_size_t);
 void	*pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
 void	*pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size);
+bool	pmap_not_in_di(void);
 boolean_t pmap_page_is_mapped(vm_page_t m);
 void	pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
 void	pmap_pinit_pml4(vm_page_t);
@@ -465,6 +466,7 @@
 int	pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
 int	pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 	    u_int keyidx, int flags);
+void	pmap_thread_init_invl_gen(struct thread *td);
 int	pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
 #endif /* _KERNEL */
Index: sys/amd64/include/proc.h
===================================================================
--- sys/amd64/include/proc.h
+++ sys/amd64/include/proc.h
@@ -50,10 +50,14 @@
 	int	ldt_refcnt;
 };
 
+#define	PMAP_INVL_GEN_NEXT_INVALID	0x1ULL
 struct pmap_invl_gen {
 	u_long gen;			/* (k) */
-	LIST_ENTRY(pmap_invl_gen) link;	/* (pp) */
-};
+	union {
+		LIST_ENTRY(pmap_invl_gen) link;	/* (pp) */
+		struct pmap_invl_gen *next;
+	};
+} __aligned(16);
 
 /*
  * Machine-dependent part of the proc structure for AMD64.
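The proc.h hunk is the heart of the layout change: __aligned(16) makes the structure legal for cmpxchg16b, and because a 16-byte-aligned object's address has its low bits clear, the low bit of a struct pmap_invl_gen pointer is free to carry the PMAP_INVL_GEN_NEXT_INVALID tag. A small standalone sketch of the tagged-pointer trick (names invented):

#include <assert.h>
#include <stdint.h>

#define	NEXT_INVALID	0x1UL	/* analogous to PMAP_INVL_GEN_NEXT_INVALID */

struct node {
	unsigned long gen;
	struct node *next;
} __attribute__((aligned(16)));	/* guarantees the low pointer bit is zero */

static struct node *
mark_invalid(struct node *p)
{
	return ((struct node *)((uintptr_t)p | NEXT_INVALID));
}

static struct node *
clear_invalid(struct node *p)
{
	return ((struct node *)((uintptr_t)p & ~NEXT_INVALID));
}

int
main(void)
{
	static struct node n;	/* 16-byte aligned, low bit available */
	struct node *tagged;

	tagged = mark_invalid(&n);
	assert(((uintptr_t)tagged & NEXT_INVALID) != 0);
	assert(clear_invalid(tagged) == &n);
	return (0);
}

Setting the tag with a single locked or-instruction, as atomic_set_ptr() does in pmap_delayed_invl_finished_u(), lets a thread atomically take its node out of circulation without touching the gen word.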
Index: sys/kern/kern_thread.c
===================================================================
--- sys/kern/kern_thread.c
+++ sys/kern/kern_thread.c
@@ -84,7 +84,7 @@
     "struct thread KBI td_pflags");
 _Static_assert(offsetof(struct thread, td_frame) == 0x478,
     "struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x530,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x548,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0xb0,
     "struct proc KBI p_flag");
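The assertion moves because growing and realigning md_invl_gen shifts every later member of struct thread, here by 0x18 bytes. The mechanism is ordinary padding arithmetic, sketched below with invented layouts; the numbers printed do not reproduce the kernel's exact 0x530 to 0x548 change, only how __aligned(16) plus the union produces such a shift:

#include <stddef.h>
#include <stdio.h>

struct invl_old {
	unsigned long gen;
	void *le_next, **le_prev;	/* what LIST_ENTRY expands to */
};

struct invl_new {
	unsigned long gen;
	union {
		struct { void *le_next, **le_prev; } link;
		void *next;
	};
} __attribute__((aligned(16)));	/* pads size and raises alignment */

struct td_old { char md_other[40]; struct invl_old g; void *emuldata; };
struct td_new { char md_other[40]; struct invl_new g; void *emuldata; };

int
main(void)
{
	/* The new variant gains tail padding and shifts its successor. */
	printf("old: sizeof gen tracker %zu, emuldata at %zu\n",
	    sizeof(struct invl_old), offsetof(struct td_old, emuldata));
	printf("new: sizeof gen tracker %zu, emuldata at %zu\n",
	    sizeof(struct invl_new), offsetof(struct td_new, emuldata));
	return (0);
}

Updating the _Static_assert rather than silencing it keeps the KBI checks doing their job: any out-of-tree module compiled against the old offsets now fails to build instead of corrupting memory.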