Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -116,6 +116,7 @@
 #include
 #include
 #include
+#include <sys/limits.h>
 #include
 #include
 #include
@@ -124,7 +125,6 @@
 #include
 #include
 #include
-#include <sys/turnstile.h>
 #include
 #include
 #include
@@ -400,7 +400,6 @@
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx __exclusive_cache_line pv_chunks_mutex;
 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
-static u_long pv_invl_gen[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 
@@ -465,25 +464,90 @@
     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
     "Count of saved TLB context on switch");
 
-static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
-    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
-static struct mtx invl_gen_mtx;
-static u_long pmap_invl_gen = 0;
-/* Fake lock object to satisfy turnstiles interface. */
-static struct lock_object invl_gen_ts = {
-	.lo_name = "invlts",
-};
+#ifdef PV_STATS
+static long invl_busy_slots;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_busy_slots, CTLFLAG_RD,
+    &invl_busy_slots, 0,
+    "Number of times DI invalidation blocked pmap_remove/pmap_protect");
+static long invl_wait;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD,
+    &invl_wait, 0,
+    "Number of times DI invalidation blocked pmap_remove_all/write");
+static long invl_gen_wrapped;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_gen_wrapped, CTLFLAG_RD,
+    &invl_gen_wrapped, 0,
+    "Number of times DI generation wrapped");
+static long invl_busy_next_gen;
+SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_busy_next_gen, CTLFLAG_RD,
+    &invl_busy_next_gen, 0,
+    "Number of times DI blocked waiting for next generation");
+#endif
+
+static u_int pmap_invl_gen = 1;
+static u_int pmap_finished_invl_gen = 0;
+static u_int pmap_invl_slots[MAXCPU * 2];
+_Static_assert((nitems(pmap_invl_slots) & (nitems(pmap_invl_slots) - 1)) == 0,
+    "pmap_invl_slots size must be a power of 2");
 
 static bool
 pmap_not_in_di(void)
 {
 
-	return (curthread->td_md.md_invl_gen.gen == 0);
+	return (curthread->td_md.md_invl_gen == 0);
 }
 
 #define	PMAP_ASSERT_NOT_IN_DI() \
     KASSERT(pmap_not_in_di(), ("DI already started"))
 
+static u_int *
+pmap_delayed_invl_slot(u_int gen)
+{
+
+	return (&pmap_invl_slots[gen & (nitems(pmap_invl_slots) - 1)]);
+}
+
+static u_int
+pmap_delayed_invl_next_gen(void)
+{
+	u_int gen;
+
+	gen = atomic_load_int(&pmap_invl_gen);
+	for (;;) {
+		if (gen == UINT_MAX) {
+			/*
+			 * Gen wrap-around.  The winner of the race
+			 * waits for all DIs from the previous cycle
+			 * to finish, resets pmap_finished_invl_gen,
+			 * and only then opens the new cycle at gen 1.
+			 */
+			if (atomic_cmpset_int(&pmap_invl_gen, gen, 0) == 1) {
+				while (atomic_load_int(&pmap_finished_invl_gen)
+				    != gen)
+					kern_yield(PRI_USER);
+				atomic_store_int(&pmap_finished_invl_gen, 0);
+#ifdef PV_STATS
+				atomic_add_long(&invl_gen_wrapped, 1);
+#endif
+				gen = 1;
+				atomic_store_int(&pmap_invl_gen, gen);
+				return (gen);
+			}
+			/* Lost the race to wrap; reload and retry. */
+			gen = atomic_load_int(&pmap_invl_gen);
+		} else if (gen == 0) {
+			/*
+			 * Gen wrap-around in progress.  Another
+			 * thread waits for all DIs from the previous
+			 * cycle to finish.
+			 */
+			kern_yield(PRI_USER);
+			gen = atomic_load_int(&pmap_invl_gen);
+#ifdef PV_STATS
+			atomic_add_long(&invl_busy_next_gen, 1);
+#endif
+		} else if (atomic_fcmpset_int(&pmap_invl_gen, &gen,
+		    gen + 1) == 1) {
+			return (gen + 1);
+		}
+	}
+}
+
 /*
  * Start a new Delayed Invalidation (DI) block of code, executed by
  * the current thread.  Within a DI block, the current thread may
@@ -495,19 +559,25 @@
 static void
 pmap_delayed_invl_started(void)
 {
-	struct pmap_invl_gen *invl_gen;
-	u_long currgen;
+	u_int gen, i;
 
-	invl_gen = &curthread->td_md.md_invl_gen;
 	PMAP_ASSERT_NOT_IN_DI();
-	mtx_lock(&invl_gen_mtx);
-	if (LIST_EMPTY(&pmap_invl_gen_tracker))
-		currgen = pmap_invl_gen;
-	else
-		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
-	invl_gen->gen = currgen + 1;
-	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
-	mtx_unlock(&invl_gen_mtx);
+	for (i = 0;;) {
+		gen = pmap_delayed_invl_next_gen();
+		if (atomic_cmpset_int(pmap_delayed_invl_slot(gen), 0,
+		    gen) == 0) {
+			if (++i == nitems(pmap_invl_slots)) {
+#ifdef PV_STATS
+				atomic_add_long(&invl_busy_slots, 1);
+#endif
+				kern_yield(PRI_USER);
+				i = 0;
+			}
+			continue;
+		}
+		curthread->td_md.md_invl_gen = gen;
+		break;
+	}
 }
 
 /*
@@ -527,41 +597,24 @@
 static void
 pmap_delayed_invl_finished(void)
 {
-	struct pmap_invl_gen *invl_gen, *next;
-	struct turnstile *ts;
-
-	invl_gen = &curthread->td_md.md_invl_gen;
-	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
-	mtx_lock(&invl_gen_mtx);
-	next = LIST_NEXT(invl_gen, link);
-	if (next == NULL) {
-		turnstile_chain_lock(&invl_gen_ts);
-		ts = turnstile_lookup(&invl_gen_ts);
-		pmap_invl_gen = invl_gen->gen;
-		if (ts != NULL) {
-			turnstile_broadcast(ts, TS_SHARED_QUEUE);
-			turnstile_unpend(ts);
-		}
-		turnstile_chain_unlock(&invl_gen_ts);
-	} else {
-		next->gen = invl_gen->gen;
-	}
-	LIST_REMOVE(invl_gen, link);
-	mtx_unlock(&invl_gen_mtx);
-	invl_gen->gen = 0;
-}
+	struct thread *td;
+	u_int fin_gen, invl_gen, *slot;
 
-#ifdef PV_STATS
-static long invl_wait;
-SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
-    "Number of times DI invalidation blocked pmap_remove_all/write");
-#endif
+	td = curthread;
+	invl_gen = td->td_md.md_invl_gen;
+	td->td_md.md_invl_gen = 0;
+	KASSERT(invl_gen != 0, ("missed invl_started"));
+	slot = pmap_delayed_invl_slot(invl_gen);
+	KASSERT(*slot != 0, ("cleared slot"));
+	*slot = 0;
 
-static u_long *
-pmap_delayed_invl_genp(vm_page_t m)
-{
-
-	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
+	fin_gen = atomic_load_int(&pmap_finished_invl_gen);
+	for (;;) {
+		if (fin_gen >= invl_gen ||
+		    atomic_fcmpset_int(&pmap_finished_invl_gen, &fin_gen,
+		    invl_gen) == 1)
+			break;
+	}
 }
 
 /*
@@ -581,25 +634,19 @@
 static void
 pmap_delayed_invl_wait(vm_page_t m)
 {
-	struct turnstile *ts;
-	u_long *m_gen;
-#ifdef PV_STATS
-	bool accounted = false;
-#endif
+	u_int m_gen, slot;
 
-	m_gen = pmap_delayed_invl_genp(m);
-	while (*m_gen > pmap_invl_gen) {
+	m_gen = m->md.di_gen;
+	if (m_gen == 0)
+		return;
+	for (;;) {
+		slot = *pmap_delayed_invl_slot(m_gen);
+		if (slot == 0 || slot > m_gen)
+			break;
 #ifdef PV_STATS
-		if (!accounted) {
-			atomic_add_long(&invl_wait, 1);
-			accounted = true;
-		}
+		atomic_add_long(&invl_wait, 1);
 #endif
-		ts = turnstile_trywait(&invl_gen_ts);
-		if (*m_gen > pmap_invl_gen)
-			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
-		else
-			turnstile_cancel(ts);
+		kern_yield(PRI_USER);
 	}
 }
 
@@ -617,15 +664,13 @@
 static void
 pmap_delayed_invl_page(vm_page_t m)
 {
-	u_long gen, *m_gen;
+	u_int gen;
 
 	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
-	gen = curthread->td_md.md_invl_gen.gen;
+	gen = curthread->td_md.md_invl_gen;
 	if (gen == 0)
 		return;
-	m_gen = pmap_delayed_invl_genp(m);
-	if (*m_gen < gen)
-		*m_gen = gen;
+	m->md.di_gen = gen;
 }
 
 /*
@@ -1180,11 +1225,6 @@
 	kernel_pmap->pm_stats.resident_count = res;
 	kernel_pmap->pm_flags = pmap_flags;
 
-	/*
-	 * Initialize the TLB invalidations generation number lock.
-	 */
-	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
-
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
Index: sys/amd64/amd64/trap.c
===================================================================
--- sys/amd64/amd64/trap.c
+++ sys/amd64/amd64/trap.c
@@ -1183,10 +1183,10 @@
 	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
 	    ("System call %s returning with mangled pcb_save",
 	    syscallname(td->td_proc, td->td_sa.code)));
-	KASSERT(td->td_md.md_invl_gen.gen == 0,
+	KASSERT(td->td_md.md_invl_gen == 0,
 	    ("System call %s returning with leaked invl_gen %lu",
 	    syscallname(td->td_proc, td->td_sa.code),
-	    td->td_md.md_invl_gen.gen));
+	    td->td_md.md_invl_gen));
 
 	syscallret(td, error);
Index: sys/amd64/amd64/vm_machdep.c
===================================================================
--- sys/amd64/amd64/vm_machdep.c
+++ sys/amd64/amd64/vm_machdep.c
@@ -228,7 +228,7 @@
 	/* Setup to release spin count in fork_exit(). */
 	td2->td_md.md_spinlock_count = 1;
 	td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I;
-	td2->td_md.md_invl_gen.gen = 0;
+	td2->td_md.md_invl_gen = 0;
 
 	/* As an i386, do not copy io permission bitmap. */
 	pcb2->pcb_tssp = NULL;
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h
+++ sys/amd64/include/pmap.h
@@ -312,6 +312,7 @@
 	TAILQ_HEAD(, pv_entry)	pv_list;	/* (p) */
 	int			pv_gen;		/* (p) */
 	int			pat_mode;
+	u_int			di_gen;		/* (p) */
 };
 
 enum pmap_type {
Index: sys/amd64/include/proc.h
===================================================================
--- sys/amd64/include/proc.h
+++ sys/amd64/include/proc.h
@@ -42,7 +42,6 @@
  * List of locks
  *	c - proc lock
  *	k - only accessed by curthread
- *	pp - pmap.c:invl_gen_mtx
  */
 
 struct proc_ldt {
@@ -50,11 +49,6 @@
 	int	ldt_refcnt;
 };
 
-struct pmap_invl_gen {
-	u_long gen;			/* (k) */
-	LIST_ENTRY(pmap_invl_gen) link;	/* (pp) */
-};
-
 /*
  * Machine-dependent part of the proc structure for AMD64.
  */
@@ -62,7 +56,8 @@
 	int	md_spinlock_count;	/* (k) */
 	register_t md_saved_flags;	/* (k) */
 	register_t md_spurflt_addr;	/* (k) Spurious page fault address. */
-	struct pmap_invl_gen md_invl_gen;
+	u_long	md_invl_gen;		/* (k) */
+	void	*md_pad0[2];		/* Preserve mdthread layout. */
 	register_t md_efirt_tmp;	/* (k) */
 	int	md_efirt_dis_pf;	/* (k) */
 };
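
Not part of the patch: to make review easier, below is a minimal userland
sketch of the slot/generation protocol that pmap_delayed_invl_next_gen(),
pmap_delayed_invl_started(), pmap_delayed_invl_finished(), and
pmap_delayed_invl_wait() implement above.  It is illustrative only, under
stated assumptions: C11 <stdatomic.h> stands in for the kernel's atomic(9)
ops, sched_yield() for kern_yield(PRI_USER), NSLOTS for MAXCPU * 2, and the
busy-slot backoff in di_start() is simplified to an immediate yield.  The
names next_gen, di_start, di_finish, and di_wait are ad hoc stand-ins, not
names from the patch.

	/*
	 * Userland sketch of the lock-free DI generation scheme.
	 * Build: cc -std=c11 di_sketch.c (file name is hypothetical).
	 */
	#include <limits.h>
	#include <sched.h>
	#include <stdatomic.h>
	#include <stdio.h>

	#define	NSLOTS	64	/* must be a power of 2, like MAXCPU * 2 */

	static atomic_uint invl_gen = 1;	/* last generation handed out */
	static atomic_uint finished_gen;	/* newest retired generation */
	static atomic_uint slots[NSLOTS];	/* generations still in flight */

	static atomic_uint *
	slot_of(unsigned int gen)
	{
		return (&slots[gen & (NSLOTS - 1)]);
	}

	/* Allocate a fresh generation, handling counter wrap-around. */
	static unsigned int
	next_gen(void)
	{
		unsigned int gen = atomic_load(&invl_gen);

		for (;;) {
			if (gen == UINT_MAX) {
				/* Winner of the wrap drains the old cycle. */
				if (atomic_compare_exchange_strong(&invl_gen,
				    &gen, 0)) {
					while (atomic_load(&finished_gen) !=
					    UINT_MAX)
						sched_yield();
					atomic_store(&finished_gen, 0);
					atomic_store(&invl_gen, 1);
					return (1);
				}
				/* Lost the race; the CAS reloaded gen. */
			} else if (gen == 0) {
				/* A wrap is in progress; wait it out. */
				sched_yield();
				gen = atomic_load(&invl_gen);
			} else if (atomic_compare_exchange_weak(&invl_gen,
			    &gen, gen + 1)) {
				return (gen + 1);
			}
		}
	}

	/* Enter a DI block: publish our generation in a free slot. */
	static unsigned int
	di_start(void)
	{
		unsigned int gen, zero;

		for (;;) {
			gen = next_gen();
			zero = 0;
			if (atomic_compare_exchange_strong(slot_of(gen),
			    &zero, gen))
				return (gen);
			sched_yield();	/* slot still held by an older DI */
		}
	}

	/* Leave a DI block: free the slot, then advance finished_gen. */
	static void
	di_finish(unsigned int gen)
	{
		unsigned int fin;

		atomic_store(slot_of(gen), 0);
		fin = atomic_load(&finished_gen);
		while (fin < gen &&
		    !atomic_compare_exchange_weak(&finished_gen, &fin, gen))
			;
	}

	/* Wait until the DI block that tagged a page with gen finished. */
	static void
	di_wait(unsigned int gen)
	{
		unsigned int s;

		if (gen == 0)
			return;
		while ((s = atomic_load(slot_of(gen))) != 0 && s <= gen)
			sched_yield();
	}

	int
	main(void)
	{
		unsigned int gen = di_start();	/* pmap_delayed_invl_started */
		printf("in DI block, generation %u\n", gen);
		di_finish(gen);			/* pmap_delayed_invl_finished */
		di_wait(gen);			/* returns at once: slot free */
		printf("generation %u retired\n", gen);
		return (0);
	}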
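
The sketch makes the patch's invariants easier to see: a slot holds either 0
or the generation of an in-flight DI block; pmap_delayed_invl_finished()
clears the slot before advancing pmap_finished_invl_gen; and the 0/UINT_MAX
sentinel values of pmap_invl_gen serialize wrap-around so that duplicate
generations are never handed out.  One point worth flagging in review: after
a wrap, a stale di_gen left on a page can alias a slot occupied by a
new-cycle DI block, so pmap_delayed_invl_wait() may spin on an unrelated
block.  The wait is spurious but safe, and typically short, since slots are
cleared as each DI block finishes.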