Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -114,6 +114,7 @@
 #include
 #include
 #include
+#include <sys/epoch.h>
 #include
 #include
 #include
@@ -349,6 +350,7 @@
 vm_paddr_t dmaplimit;
 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
 pt_entry_t pg_nx;
+static epoch_t pmap_epoch;
 
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0,
     "VM/pmap parameters");
@@ -386,13 +388,10 @@
 /*
  * Data for the pv entry allocation mechanism.
- * Updates to pv_invl_gen are protected by the pv_list_locks[]
- * elements, but reads are not.
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx __exclusive_cache_line pv_chunks_mutex;
 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
-static u_long pv_invl_gen[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 
@@ -438,14 +437,13 @@
     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
     "Count of saved TLB context on switch");
 
-static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
-    LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
-static struct mtx invl_gen_mtx;
-static u_long pmap_invl_gen = 0;
-/* Fake lock object to satisfy turnstiles interface. */
-static struct lock_object invl_gen_ts = {
-	.lo_name = "invlts",
-};
+static void
+pmap_epoch_init(void *arg __unused)
+{
+
+	pmap_epoch = epoch_alloc(EPOCH_PREEMPT|EPOCH_LOCKED);
+}
+SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_ANY, pmap_epoch_init, NULL);
 
 static bool
 pmap_not_in_di(void)
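For readers unfamiliar with epoch(9), the hunks above replace the turnstile-protected DI generation counter with a preemptible epoch. Below is a minimal, self-contained sketch of that pattern, assuming the same epoch(9) API revision the patch targets; all example_* names are illustrative and not part of the patch.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>

/* Allocated once at boot, e.g. from a SYSINIT, as the patch does for pmap_epoch. */
static epoch_t example_epoch;

static void
example_init(void)
{
	example_epoch = epoch_alloc(EPOCH_PREEMPT | EPOCH_LOCKED);
}

static void
example_di_block(void)
{
	struct epoch_tracker et;	/* lives on the caller's stack */

	epoch_enter_preempt(example_epoch, &et);
	/* ... mappings may be torn down here; stale TLB entries can persist ... */
	epoch_exit_preempt(example_epoch, &et);
}

static void
example_waiter(void)
{
	/* Returns once every section entered before this call has exited. */
	epoch_wait_preempt(example_epoch);
}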
@@ -466,62 +464,24 @@
  * pmap active.
  */
 static void
-pmap_delayed_invl_started(void)
+pmap_delayed_invl_started(epoch_tracker_t et)
 {
-	struct pmap_invl_gen *invl_gen;
-	u_long currgen;
 
-	invl_gen = &curthread->td_md.md_invl_gen;
-	PMAP_ASSERT_NOT_IN_DI();
-	mtx_lock(&invl_gen_mtx);
-	if (LIST_EMPTY(&pmap_invl_gen_tracker))
-		currgen = pmap_invl_gen;
-	else
-		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
-	invl_gen->gen = currgen + 1;
-	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
-	mtx_unlock(&invl_gen_mtx);
+	epoch_enter_preempt(pmap_epoch, et);
+	curthread->td_md.md_invl_gen.gen = 1;
 }
 
 /*
- * Finish the DI block, previously started by the current thread.  All
- * required TLB flushes for the pages marked by
- * pmap_delayed_invl_page() must be finished before this function is
- * called.
- *
- * This function works by bumping the global DI generation number to
- * the generation number of the current thread's DI, unless there is a
- * pending DI that started earlier.  In the latter case, bumping the
- * global DI generation number would incorrectly signal that the
- * earlier DI had finished.  Instead, this function bumps the earlier
- * DI's generation number to match the generation number of the
- * current thread's DI.
+ * This function works by checking that either there are no callers
+ * within a DI block or, if there are, that a grace period elapses for
+ * any callers that were in an epoch section when it was initially
+ * called.
  */
 static void
-pmap_delayed_invl_finished(void)
-{
-	struct pmap_invl_gen *invl_gen, *next;
-	struct turnstile *ts;
-
-	invl_gen = &curthread->td_md.md_invl_gen;
-	KASSERT(invl_gen->gen != 0, ("missed invl_started"));
-	mtx_lock(&invl_gen_mtx);
-	next = LIST_NEXT(invl_gen, link);
-	if (next == NULL) {
-		turnstile_chain_lock(&invl_gen_ts);
-		ts = turnstile_lookup(&invl_gen_ts);
-		pmap_invl_gen = invl_gen->gen;
-		if (ts != NULL) {
-			turnstile_broadcast(ts, TS_SHARED_QUEUE);
-			turnstile_unpend(ts);
-		}
-		turnstile_chain_unlock(&invl_gen_ts);
-	} else {
-		next->gen = invl_gen->gen;
-	}
-	LIST_REMOVE(invl_gen, link);
-	mtx_unlock(&invl_gen_mtx);
-	invl_gen->gen = 0;
+pmap_delayed_invl_finished(epoch_tracker_t et)
+{
+
+	curthread->td_md.md_invl_gen.gen = 0;
+	epoch_exit_preempt(pmap_epoch, et);
 }
 
 #ifdef PV_STATS
@@ -530,13 +490,6 @@
     "Number of times DI invalidation blocked pmap_remove_all/write");
 #endif
 
-static u_long *
-pmap_delayed_invl_genp(vm_page_t m)
-{
-
-	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
-}
-
 /*
  * Ensure that all currently executing DI blocks, that need to flush
  * TLB for the given page m, actually flushed the TLB at the time the
@@ -554,51 +507,8 @@
 static void
 pmap_delayed_invl_wait(vm_page_t m)
 {
-	struct turnstile *ts;
-	u_long *m_gen;
-#ifdef PV_STATS
-	bool accounted = false;
-#endif
-
-	m_gen = pmap_delayed_invl_genp(m);
-	while (*m_gen > pmap_invl_gen) {
-#ifdef PV_STATS
-		if (!accounted) {
-			atomic_add_long(&invl_wait, 1);
-			accounted = true;
-		}
-#endif
-		ts = turnstile_trywait(&invl_gen_ts);
-		if (*m_gen > pmap_invl_gen)
-			turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
-		else
-			turnstile_cancel(ts);
-	}
-}
-
-/*
- * Mark the page m's PV list as participating in the current thread's
- * DI block.  Any threads concurrently using m's PV list to remove or
- * restrict all mappings to m will wait for the current thread's DI
- * block to complete before proceeding.
- *
- * The function works by setting the DI generation number for m's PV
- * list to at least the DI generation number of the current thread.
- * This forces a caller of pmap_delayed_invl_wait() to block until
- * current thread calls pmap_delayed_invl_finished().
- */
-static void
-pmap_delayed_invl_page(vm_page_t m)
-{
-	u_long gen, *m_gen;
-
-	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
-	gen = curthread->td_md.md_invl_gen.gen;
-	if (gen == 0)
-		return;
-	m_gen = pmap_delayed_invl_genp(m);
-	if (*m_gen < gen)
-		*m_gen = gen;
+
+	epoch_wait_preempt(pmap_epoch);
 }
 
 /*
@@ -1130,11 +1040,6 @@
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	kernel_pmap->pm_flags = pmap_flags;
 
-	/*
-	 * Initialize the TLB invalidations generation number lock.
-	 */
-	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
-
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
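With pv_invl_gen, pmap_delayed_invl_genp() and pmap_delayed_invl_page() removed, the wait side no longer tracks per-page generations; a single grace-period wait covers every in-flight DI block. A hedged sketch of how a pmap_remove_write()-style caller, which this diff does not touch, would use the simplified wait:

/*
 * Illustrative only: a simplified stand-in for the real pmap_remove_write().
 */
static void
example_remove_write(vm_page_t m)
{
	struct rwlock *lock;

	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
	rw_wlock(lock);
	/* ... demote superpages and clear PG_RW/PG_M in every PTE mapping m ... */
	rw_wunlock(lock);
	/* Wait out any DI block that could still expose a stale writeable PTE. */
	pmap_delayed_invl_wait(m);
	vm_page_aflag_clear(m, PGA_WRITEABLE);
}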
@@ -3160,7 +3065,8 @@
 #endif
 
 static void
-reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
+reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di,
+    epoch_tracker_t et)
 {
 
 	if (pmap == NULL)
@@ -3169,7 +3075,7 @@
 	if (pmap != locked_pmap)
 		PMAP_UNLOCK(pmap);
 	if (start_di)
-		pmap_delayed_invl_finished();
+		pmap_delayed_invl_finished(et);
 }
 
 /*
@@ -3200,6 +3106,7 @@
 	uint64_t inuse;
 	int bit, field, freed;
 	bool start_di;
+	struct epoch_tracker et;
 	static int active_reclaims = 0;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
@@ -3245,20 +3152,20 @@
 		 */
 		if (pmap != next_pmap) {
 			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
-			    start_di);
+			    start_di, &et);
 			pmap = next_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap) {
 				RELEASE_PV_LIST_LOCK(lockp);
 				PMAP_LOCK(pmap);
 				if (start_di)
-					pmap_delayed_invl_started();
+					pmap_delayed_invl_started(&et);
 				mtx_lock(&pv_chunks_mutex);
 				continue;
 			} else if (pmap != locked_pmap) {
 				if (PMAP_TRYLOCK(pmap)) {
 					if (start_di)
-						pmap_delayed_invl_started();
+						pmap_delayed_invl_started(&et);
 					mtx_lock(&pv_chunks_mutex);
 					continue;
 				} else {
@@ -3271,7 +3178,7 @@
 				goto next_chunk;
 			}
 		} else if (start_di)
-			pmap_delayed_invl_started();
+			pmap_delayed_invl_started(&et);
 		PG_G = pmap_global_bit(pmap);
 		PG_A = pmap_accessed_bit(pmap);
 		PG_M = pmap_modified_bit(pmap);
@@ -3313,7 +3220,6 @@
 						PGA_WRITEABLE);
 				}
 			}
-			pmap_delayed_invl_page(m);
 			pc->pc_map[field] |= 1UL << bit;
 			pmap_unuse_pt(pmap, va, *pde, &free);
 			freed++;
@@ -3368,7 +3274,7 @@
 	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
 	active_reclaims--;
 	mtx_unlock(&pv_chunks_mutex);
-	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
+	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di, &et);
 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 		m_pc = SLIST_FIRST(&free);
 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
@@ -4040,7 +3946,6 @@
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
-			pmap_delayed_invl_page(m);
 		}
 	}
 	if (pmap == kernel_pmap) {
@@ -4092,7 +3997,6 @@
 			if (TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
-		pmap_delayed_invl_page(m);
 	}
 	return (pmap_unuse_pt(pmap, va, ptepde, free));
 }
@@ -4175,6 +4079,7 @@
 	pd_entry_t ptpaddr, *pde;
 	pt_entry_t PG_G, PG_V;
 	struct spglist free;
+	struct epoch_tracker et;
 	int anyvalid;
 
 	PG_G = pmap_global_bit(pmap);
@@ -4189,7 +4094,7 @@
 	anyvalid = 0;
 	SLIST_INIT(&free);
 
-	pmap_delayed_invl_started();
+	pmap_delayed_invl_started(&et);
 	PMAP_LOCK(pmap);
 
 	/*
@@ -4285,7 +4190,7 @@
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
-	pmap_delayed_invl_finished();
+	pmap_delayed_invl_finished(&et);
 	vm_page_free_pages_toq(&free, true);
 }
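The tracker is now an on-stack struct epoch_tracker owned by reclaim_pv_chunk() and threaded through reclaim_pv_chunk_leave_pmap(), so whichever function ends the DI block exits the same epoch section the caller entered. A reduced sketch of that calling pattern, with illustrative names and error paths omitted:

static void
example_helper_finish(bool started, struct epoch_tracker *et)
{
	if (started)
		pmap_delayed_invl_finished(et);	/* exits the caller's section */
}

static void
example_reclaim(void)
{
	struct epoch_tracker et;	/* must stay in scope until the DI block ends */
	bool started;

	started = pmap_not_in_di();	/* only start a block if not already in one */
	if (started)
		pmap_delayed_invl_started(&et);
	/* ... reclaim pv chunks, handing &et to any helper that may end the block ... */
	example_helper_finish(started, &et);
}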
@@ -4724,9 +4629,6 @@
  * or lose information.  That is, this routine must actually
  * insert this page into the given map NOW.
  *
- * When destroying both a page table and PV entry, this function
- * performs the TLB invalidation before releasing the PV list
- * lock, so we do not need pmap_delayed_invl_page() calls here.
  */
 int
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
@@ -5032,6 +4934,7 @@
 	pd_entry_t oldpde, *pde;
 	pt_entry_t PG_G, PG_RW, PG_V;
 	vm_page_t mt, pdpg;
+	struct epoch_tracker et;
 
 	PG_G = pmap_global_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
@@ -5071,11 +4974,11 @@
 			if ((oldpde & PG_G) == 0)
 				pmap_invalidate_pde_page(pmap, va, oldpde);
 		} else {
-			pmap_delayed_invl_started();
+			pmap_delayed_invl_started(&et);
 			if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
 			    lockp))
 				pmap_invalidate_all(pmap);
-			pmap_delayed_invl_finished();
+			pmap_delayed_invl_finished(&et);
 		}
 		vm_page_free_pages_toq(&free, true);
 		if (va >= VM_MAXUSER_ADDRESS) {
@@ -6589,6 +6492,7 @@
 	vm_offset_t va, va_next;
 	vm_page_t m;
 	boolean_t anychanged;
+	struct epoch_tracker et;
 
 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
 		return;
@@ -6608,7 +6512,7 @@
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	anychanged = FALSE;
-	pmap_delayed_invl_started();
+	pmap_delayed_invl_started(&et);
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		pml4e = pmap_pml4e(pmap, sva);
@@ -6705,7 +6609,7 @@
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
-	pmap_delayed_invl_finished();
+	pmap_delayed_invl_finished(&et);
 }
 
 /*
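The same bracket is applied in pmap_remove(), pmap_enter_pde() and pmap_advise(): every TLB invalidation for the affected range is issued before pmap_delayed_invl_finished(), because pmap_delayed_invl_wait() only waits for the epoch section itself, not for any flush issued afterwards. A condensed, purely illustrative version of the ordering:

static void
example_remove_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct epoch_tracker et;
	bool anyvalid = false;

	pmap_delayed_invl_started(&et);		/* open the DI block */
	PMAP_LOCK(pmap);
	/* ... clear PTEs in [sva, eva), setting anyvalid when a valid PTE is removed ... */
	if (anyvalid)
		pmap_invalidate_all(pmap);	/* flush while still inside the block */
	PMAP_UNLOCK(pmap);
	pmap_delayed_invl_finished(&et);	/* waiters may now trust the TLB */
}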