Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c +++ sys/amd64/amd64/pmap.c @@ -121,6 +121,7 @@ #include #include #include +#include #include #include #include @@ -171,6 +172,13 @@ #define PMAP_MEMDOM 1 #endif +#define PC_FREE0 0xfffffffffffffffful +#define PC_FREE1 0xfffffffffffffffful +#define PC_FREE2 0x000000fffffffffful + +_Static_assert(sizeof(struct pv_chunk) <= PAGE_SIZE, "_NPCM too large"); +_Static_assert(NBBY * sizeof(uint64_t) * _NPCM >= _NPCPV, "_NPCM too large"); + static __inline boolean_t pmap_type_guest(pmap_t pmap) { @@ -316,6 +324,9 @@ #define PMAP_INLINE #endif +static TAILQ_HEAD(, pmap) all_pmaps; +static struct mtx all_pmaps_lock; + #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else @@ -331,51 +342,46 @@ }) #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) -#define PHYS_TO_PV_LIST_LOCK(pa) ({ \ - struct rwlock *_lock; \ - if (__predict_false((pa) > pmap_last_pa)) \ - _lock = &pv_dummy_large.pv_lock; \ - else \ - _lock = &(pa_to_pmdp(pa)->pv_lock); \ - _lock; \ -}) #else #define pa_index(pa) ((pa) >> PDRSHIFT) #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) #define NPV_LIST_LOCKS MAXCPU -#define PHYS_TO_PV_LIST_LOCK(pa) \ - (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #endif +#define PHYS_TO_PV_LIST_LOCK(pa) PHYS_TO_VM_PAGE(pa) + #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ - struct rwlock **_lockp = (lockp); \ - struct rwlock *_new_lock; \ + vm_page_t _m; \ \ - _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ - if (_new_lock != *_lockp) { \ + _m = PHYS_TO_VM_PAGE(pa); \ + if (_m == NULL) \ + _m = &pv_fake_page; \ + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, _m); \ +} while (0) + +#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) do { \ + PVLL **_lockp = (lockp); \ + \ + if (m != *_lockp) { \ if (*_lockp != NULL) \ - rw_wunlock(*_lockp); \ - *_lockp = _new_lock; \ - rw_wlock(*_lockp); \ + pmap_pv_list_unlock(*_lockp); \ + *_lockp = m; \ + pmap_pv_list_lock(m); \ } \ } while (0) -#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ - CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) - #define RELEASE_PV_LIST_LOCK(lockp) do { \ - struct rwlock **_lockp = (lockp); \ + PVLL **_lockp = (lockp); \ \ if (*_lockp != NULL) { \ - rw_wunlock(*_lockp); \ + pmap_pv_list_unlock(*_lockp); \ *_lockp = NULL; \ } \ } while (0) -#define VM_PAGE_TO_PV_LIST_LOCK(m) \ - PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) +#define VM_PAGE_TO_PV_LIST_LOCK(m) (m) struct pmap kernel_pmap_store; @@ -444,46 +450,161 @@ * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv list lock but reads are not. */ -#ifdef NUMA -static __inline int -pc_to_domain(struct pv_chunk *pc) -{ - return (_vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); -} -#else -static __inline int -pc_to_domain(struct pv_chunk *pc __unused) -{ - - return (0); -} -#endif - -struct pv_chunks_list { - struct mtx pvc_lock; - TAILQ_HEAD(pch, pv_chunk) pvc_list; - int active_reclaims; -} __aligned(CACHE_LINE_SIZE); - -struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; +typedef struct vm_page PVLL; #ifdef NUMA struct pmap_large_md_page { - struct rwlock pv_lock; - struct md_page pv_page; + struct lock_object lo; + uintptr_t pad; + struct md_page pv_page; u_long pv_invl_gen; }; +/* + * We only depend on the size being a power of two, so the assert + * is overzealous. 
However, should the struct be resized to a
+ * different power of two, the code below needs to be revisited.
+ */
+_Static_assert(sizeof(struct pmap_large_md_page) == 64, "pmap_large_md_page");
+
 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
 #define pv_dummy pv_dummy_large.pv_page
 __read_mostly static struct pmap_large_md_page *pv_table;
 __read_mostly vm_paddr_t pmap_last_pa;
+static struct lock_object *
+pv_list_lock_object(vm_paddr_t pa)
+{
+    if (__predict_false(pa > pmap_last_pa))
+        return (&pv_dummy_large.lo);
+    return (&pa_to_pmdp(pa)->lo);
+}
 #else
-static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
+static struct lock_object __exclusive_cache_line pv_lo[NPV_LIST_LOCKS];
 static u_long pv_invl_gen[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
+static struct lock_object *
+pv_list_lock_object(vm_paddr_t pa)
+{
+    return (&pv_lo[pa_index(pa) % NPV_LIST_LOCKS]);
+}
 #endif
+__read_mostly static struct vm_page pv_fake_page;
+
+static void
+pmap_pv_list_lock(vm_page_t m)
+{
+    obm_lock(&m->md.pv_lock, pv_list_lock_object(VM_PAGE_TO_PHYS(m)));
+}
+
+static void
+pmap_pv_list_unlock(vm_page_t m)
+{
+    obm_unlock(&m->md.pv_lock, pv_list_lock_object(VM_PAGE_TO_PHYS(m)));
+}
+
+/*
+ * Helper for pmap_pv_list_lock_pde(). The pte_locked argument
+ * indicates whether the PV list for m is already locked.
+ */
+static void
+pmap_pv_list_lock_pde1(vm_page_t m, bool pte_locked)
+{
+    vm_page_t mt, sm;
+    struct lock_object *lo;
+    int i;
+
+    if (pte_locked)
+        obm_assert_locked(&m->md.pv_lock);
+
+    sm = m - atop(VM_PAGE_TO_PHYS(m) & (PG_FRAME & PDRMASK));
+    lo = pv_list_lock_object(VM_PAGE_TO_PHYS(m));
+
+    if (pte_locked) {
+        /*
+         * Fast attempt. If we either own or can get the pv
+         * list lock of the first page in the superpage, all
+         * other owners must release their locks without
+         * waiting for us.
+         */
+        if (m == sm || obm_trylock(&sm->md.pv_lock)) {
+            for (i = 1, mt = sm + 1; i < NPTEPG; i++, mt++) {
+                if (m != mt)
+                    obm_lock(&mt->md.pv_lock, lo);
+            }
+            return;
+        }
+
+        obm_unlock(&m->md.pv_lock, lo);
+    }
+
+    for (i = 0, mt = sm; i < NPTEPG; i++, mt++) {
+        obm_lock(&mt->md.pv_lock, lo);
+    }
+}
+
+/*
+ * Locks all pv lists for ordinary pages constituting the superpage
+ * that contains the passed page.
+ */
+static void
+pmap_pv_list_lock_pde(vm_paddr_t pa, PVLL **lockp)
+{
+    vm_page_t m;
+
+    m = PHYS_TO_VM_PAGE(pa);
+    KASSERT(m != NULL,
+        ("pmap_pv_list_lock_pde: unmanaged phys addr %#lx", pa));
+
+    if (*lockp == NULL) {
+        pmap_pv_list_lock_pde1(m, false);
+        return;
+    }
+    if ((VM_PAGE_TO_PHYS(*lockp) & PG_PS_FRAME) != (pa & PG_PS_FRAME)) {
+        pmap_pv_list_unlock(*lockp);
+        *lockp = NULL;
+        pmap_pv_list_lock_pde1(m, false);
+        return;
+    }
+    pmap_pv_list_lock_pde1(*lockp, true);
+}
+
+/*
+ * Unlock all pv lists for ordinary pages constituting the superpage
+ * at the physical address pa.
+ *
+ * If *lockp points to one of the ordinary pages from the superpage we
+ * are demoting or promoting, then we keep this page's pv list locked
+ * after pmap_pv_list_unlock_pde(). Otherwise, we just unlock
+ * whatever was locked, and unlock the whole run of pages constituting
+ * the superpage in pmap_pv_list_unlock_pde().
+ */ +static void +pmap_pv_list_unlock_pde(vm_paddr_t pa, PVLL **lockp) +{ + vm_page_t m, mt, sm; + struct lock_object *lo; + int i; + bool pte_locked; + + m = *lockp; + pte_locked = m != NULL; + if (!pte_locked) { + m = PHYS_TO_VM_PAGE(pa); + if (m == NULL) + m = &pv_fake_page; + } + + sm = m - atop(VM_PAGE_TO_PHYS(m) & (PG_FRAME & PDRMASK)); + lo = pv_list_lock_object(VM_PAGE_TO_PHYS(m)); + obm_assert_locked(&m->md.pv_lock); + obm_assert_locked(&sm->md.pv_lock); + + for (i = 0, mt = sm; i < NPTEPG; i++, mt++) { + if (!pte_locked || mt != m) + obm_unlock(&mt->md.pv_lock, lo); + } +} /* * All those kernel PT submaps that BSD is so fond of @@ -1169,7 +1290,7 @@ { u_long gen, *m_gen; - rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); + obm_assert_locked(&m->md.pv_lock); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; @@ -1202,37 +1323,35 @@ static void free_pv_chunk(struct pv_chunk *pc); static void free_pv_chunk_batch(struct pv_chunklist *batch); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); -static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); +static pv_entry_t get_pv_entry(pmap_t pmap, PVLL **lockp); static int popcnt_pc_map_pq(uint64_t *map); -static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); -static void reserve_pv_entries(pmap_t pmap, int needed, - struct rwlock **lockp); -static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, - struct rwlock **lockp); +static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, PVLL **lockp, + bool avoid_locked_pmap); +static void reserve_pv_entries(pmap_t pmap, int needed, PVLL **lockp); +static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, - u_int flags, struct rwlock **lockp); + u_int flags, PVLL **lockp); #if VM_NRESERVLEVEL > 0 -static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, - struct rwlock **lockp); +static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, - vm_offset_t va); + vm_offset_t va); static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte); static int pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot, int mode, int flags); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, - vm_offset_t va, struct rwlock **lockp); + vm_offset_t va, PVLL **lockp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, - vm_prot_t prot, struct rwlock **lockp); + vm_prot_t prot, PVLL **lockp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, - u_int flags, vm_page_t m, struct rwlock **lockp); + u_int flags, vm_page_t m, PVLL **lockp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, - vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); + vm_page_t m, vm_prot_t prot, vm_page_t mpte, PVLL **lockp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, @@ -1240,13 +1359,13 @@ static void pmap_invalidate_cache_range_all(vm_offset_t sva, 
vm_offset_t eva); static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, - pd_entry_t pde); + pd_entry_t pde); static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode); static vm_page_t pmap_large_map_getptp_unlocked(void); static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, - struct rwlock **lockp); + PVLL **lockp); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); @@ -1257,27 +1376,26 @@ static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, - struct spglist *free, struct rwlock **lockp); + struct spglist *free, PVLL **lockp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, - pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); + pd_entry_t ptepde, struct spglist *free, PVLL **lockp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, - pd_entry_t *pde, struct spglist *free, - struct rwlock **lockp); + pd_entry_t *pde, struct spglist *free, PVLL **lockp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, - vm_page_t m, struct rwlock **lockp); + vm_page_t m, PVLL **lockp); static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, - struct rwlock **lockp, vm_offset_t va); + PVLL **lockp, vm_offset_t va); static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, - struct rwlock **lockp); + PVLL **lockp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, - struct rwlock **lockp); + PVLL **lockp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); @@ -1823,6 +1941,9 @@ cr4 |= CR4_SMAP; load_cr4(cr4); + TAILQ_INIT(&all_pmaps); + mtx_init(&all_pmaps_lock, "allpms", NULL, MTX_DEF); + /* * Initialize the kernel pmap (which is statically allocated). * Count bootstrap data as being resident in case any of this data is @@ -2141,6 +2262,7 @@ TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; + obm_init(&m->md.pv_lock); } static int pmap_allow_2m_x_ept; @@ -2197,13 +2319,6 @@ long start, end, highest, pv_npg; int domain, i, j, pages; - /* - * We strongly depend on the size being a power of two, so the assert - * is overzealous. However, should the struct be resized to a - * different power of two, the code below needs to be revisited. - */ - CTASSERT((sizeof(*pvd) == 64)); - /* * Calculate the size of the array. 
*/ @@ -2238,12 +2353,13 @@ vm_page_t m = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); if (m == NULL) - panic("vm_page_alloc_domain failed for %lx\n", (vm_offset_t)pvd + j); + panic("vm_page_alloc_domain failed for %lx\n", + (vm_offset_t)pvd + j); pmap_qenter((vm_offset_t)pvd + j, &m, 1); } for (j = 0; j < s / sizeof(*pvd); j++) { - rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); + obm_init_lo(&pvd->lo, "pmap pv list"); TAILQ_INIT(&pvd->pv_page.pv_list); pvd->pv_page.pv_gen = 0; pvd->pv_page.pat_mode = 0; @@ -2252,8 +2368,18 @@ } } pvd = &pv_dummy_large; - rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); + obm_init_lo(&pvd->lo, "pmap pv list dummy"); TAILQ_INIT(&pvd->pv_page.pv_list); + + /* + * Initialize pv_fake page, which is used to make pv_list + * locking work for physical addresses not covered by + * vm_page_array[]. In particular, it is needed by + * pv_list_lock_object() and CHANGE_PV_LIST_LOCK_TO_PHYS(). + */ + pmap_page_init(&pv_fake_page); + pv_fake_page.phys_addr = pmap_last_pa + PAGE_SIZE; + pvd->pv_page.pv_gen = 0; pvd->pv_page.pat_mode = 0; pvd->pv_invl_gen = 0; @@ -2269,7 +2395,7 @@ * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) - rw_init(&pv_list_locks[i], "pmap pv list"); + obm_init_lo(&pv_lo[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. @@ -2285,6 +2411,10 @@ for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); + + /* See explanation above for NUMA case. */ + pmap_page_init(&pv_fake_page); + pv_fake_page.phys_addr = vm_phys_segs[vm_phys_nsegs - 1].end + PAGE_SIZE; } #endif @@ -2300,6 +2430,10 @@ vm_page_t m, mpte; int error, i, ret, skz63; + /* Compiler cannot evaluate this at compile time. */ + MPASS(__bitcount64(PC_FREE0) + __bitcount64(PC_FREE1) + + __bitcount64(PC_FREE2) == _NPCPV); + /* L1TF, reserve page @0 unconditionally */ vm_page_blacklist_add(0, bootverbose); @@ -2384,13 +2518,6 @@ pagesizes[1] = NBPDR; } - /* - * Initialize pv chunk lists. - */ - for (i = 0; i < PMAP_MEMDOM; i++) { - mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); - TAILQ_INIT(&pv_chunks[i].pvc_list); - } pmap_init_pv_table(); pmap_initialized = 1; @@ -3906,6 +4033,22 @@ } } +void +pmap_lock_init(pmap_t pmap) +{ + mtx_init(&pmap->pm_mtx, "pmap", NULL, MTX_DEF | MTX_DUPOK); + + /* + * Add the pmap to the global list of pmaps, which is used + * during pv chunk reclamation. The pmap is never removed + * from the list, relying on type-stability of the vmspace + * zone. + */ + mtx_lock(&all_pmaps_lock); + TAILQ_INSERT_TAIL(&all_pmaps, pmap, pm_allpmaps); + mtx_unlock(&all_pmaps_lock); +} + void pmap_pinit0(pmap_t pmap) { @@ -4103,8 +4246,7 @@ } static pml4_entry_t * -pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, - bool addref) +pmap_allocpte_getpml4(pmap_t pmap, PVLL **lockp, vm_offset_t va, bool addref) { vm_pindex_t pml5index; pml5_entry_t *pml5; @@ -4140,8 +4282,7 @@ } static pdp_entry_t * -pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, - bool addref) +pmap_allocpte_getpdp(pmap_t pmap, PVLL **lockp, vm_offset_t va, bool addref) { vm_page_t pdppg; pml4_entry_t *pml4; @@ -4220,8 +4361,8 @@ * since it is statically allocated by pmap_pinit() and not by _pmap_allocpte(). 
*/ static vm_page_t -_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, - vm_offset_t va __unused) +_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, PVLL **lockp, + vm_offset_t va) { vm_pindex_t pml5index, pml4index; pml5_entry_t *pml5, *pml5u; @@ -4358,7 +4499,7 @@ static pd_entry_t * pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, - struct rwlock **lockp) + PVLL **lockp) { pdp_entry_t *pdpe, PG_V; pd_entry_t *pde; @@ -4397,7 +4538,7 @@ } static vm_page_t -pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +pmap_allocpte(pmap_t pmap, vm_offset_t va, PVLL **lockp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; @@ -4650,10 +4791,6 @@ * page management routines. ***************************************************/ -CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); -CTASSERT(_NPCM == 3); -CTASSERT(_NPCPV == 168); - static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { @@ -4663,10 +4800,6 @@ #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) -#define PC_FREE0 0xfffffffffffffffful -#define PC_FREE1 0xfffffffffffffffful -#define PC_FREE2 0x000000fffffffffful - static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; #ifdef PV_STATS @@ -4694,129 +4827,32 @@ "Current number of spare pv entries"); #endif -static void -reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) -{ - - if (pmap == NULL) - return; - pmap_invalidate_all(pmap); - if (pmap != locked_pmap) - PMAP_UNLOCK(pmap); - if (start_di) - pmap_delayed_invl_finish(); -} - -/* - * We are in a serious low memory condition. Resort to - * drastic measures to free some pages so we can allocate - * another pv entry chunk. - * - * Returns NULL if PV entries were reclaimed from the specified pmap. - * - * We do not, however, unmap 2mpages because subsequent accesses will - * allocate per-page pv entries until repromotion occurs, thereby - * exacerbating the shortage of free pv entries. - */ -static vm_page_t -reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) +static bool +reclaim_pv_chunk_handle_pmap(pmap_t pmap, pmap_t locked_pmap, + bool avoid_locked_pmap, PVLL **lockp, struct spglist *free) { - struct pv_chunks_list *pvc; - struct pv_chunk *pc, *pc_marker, *pc_marker_end; - struct pv_chunk_header pc_marker_b, pc_marker_end_b; + struct pv_chunk *pc, *pcn; + pv_entry_t pv; + vm_offset_t va; + vm_page_t m, m_pc; struct md_page *pvh; pd_entry_t *pde; - pmap_t next_pmap, pmap; pt_entry_t *pte, tpte; pt_entry_t PG_G, PG_A, PG_M, PG_RW; - pv_entry_t pv; - vm_offset_t va; - vm_page_t m, m_pc; - struct spglist free; uint64_t inuse; int bit, field, freed; - bool start_di, restart; - - PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); - KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); - pmap = NULL; - m_pc = NULL; - PG_G = PG_A = PG_M = PG_RW = 0; - SLIST_INIT(&free); - bzero(&pc_marker_b, sizeof(pc_marker_b)); - bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); - pc_marker = (struct pv_chunk *)&pc_marker_b; - pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; + bool ret; - /* - * A delayed invalidation block should already be active if - * pmap_advise() or pmap_remove() called this function by way - * of pmap_demote_pde_locked(). 
- */ - start_di = pmap_not_in_di(); - - pvc = &pv_chunks[domain]; - mtx_lock(&pvc->pvc_lock); - pvc->active_reclaims++; - TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); - TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); - while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && - SLIST_EMPTY(&free)) { - next_pmap = pc->pc_pmap; - if (next_pmap == NULL) { - /* - * The next chunk is a marker. However, it is - * not our marker, so active_reclaims must be - * > 1. Consequently, the next_chunk code - * will not rotate the pv_chunks list. - */ - goto next_chunk; - } - mtx_unlock(&pvc->pvc_lock); + PMAP_LOCK_ASSERT(pmap, MA_OWNED); - /* - * A pv_chunk can only be removed from the pc_lru list - * when both pc_chunks_mutex is owned and the - * corresponding pmap is locked. - */ - if (pmap != next_pmap) { - restart = false; - reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, - start_di); - pmap = next_pmap; - /* Avoid deadlock and lock recursion. */ - if (pmap > locked_pmap) { - RELEASE_PV_LIST_LOCK(lockp); - PMAP_LOCK(pmap); - if (start_di) - pmap_delayed_invl_start(); - mtx_lock(&pvc->pvc_lock); - restart = true; - } else if (pmap != locked_pmap) { - if (PMAP_TRYLOCK(pmap)) { - if (start_di) - pmap_delayed_invl_start(); - mtx_lock(&pvc->pvc_lock); - restart = true; - } else { - pmap = NULL; /* pmap is not locked */ - mtx_lock(&pvc->pvc_lock); - pc = TAILQ_NEXT(pc_marker, pc_lru); - if (pc == NULL || - pc->pc_pmap != next_pmap) - continue; - goto next_chunk; - } - } else if (start_di) - pmap_delayed_invl_start(); - PG_G = pmap_global_bit(pmap); - PG_A = pmap_accessed_bit(pmap); - PG_M = pmap_modified_bit(pmap); - PG_RW = pmap_rw_bit(pmap); - if (restart) - continue; - } + ret = false; + PG_G = pmap_global_bit(pmap); + PG_A = pmap_accessed_bit(pmap); + PG_M = pmap_modified_bit(pmap); + PG_RW = pmap_rw_bit(pmap); + TAILQ_FOREACH_REVERSE_SAFE(pc, &pmap->pm_pvchunk, pvchunks, + pc_list, pcn) { /* * Destroy every non-wired, 4 KB page mapping in the chunk. */ @@ -4854,84 +4890,156 @@ } pmap_delayed_invl_page(m); pc->pc_map[field] |= 1UL << bit; - pmap_unuse_pt(pmap, va, *pde, &free); + pmap_unuse_pt(pmap, va, *pde, free); freed++; } } - if (freed == 0) { - mtx_lock(&pvc->pvc_lock); - goto next_chunk; - } + if (freed == 0) + continue; + /* Every freed mapping is for a 4 KB page. */ pmap_resident_count_dec(pmap, freed); PV_STAT(atomic_add_long(&pv_entry_frees, freed)); PV_STAT(atomic_add_int(&pv_entry_spare, freed)); PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && pc->pc_map[2] == PC_FREE2) { - PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); - PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); - PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); - /* Entire chunk is free; return it. */ - m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); - dump_drop_page(m_pc->phys_addr); - mtx_lock(&pvc->pvc_lock); - TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); - break; + if (!avoid_locked_pmap || locked_pmap != pmap) { + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + PV_STAT(atomic_subtract_int(&pv_entry_spare, + _NPCPV)); + PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); + PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); + /* Entire chunk is free; return it. 
*/ + m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS( + (vm_offset_t)pc)); + dump_drop_page(m_pc->phys_addr); + m_pc->ref_count = 0; + SLIST_INSERT_HEAD(free, m_pc, plinks.s.ss); + break; + } + } else { + /* + * Re-insert at head because allocator bails + * out if it finds fully populated chunk. + */ + TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); + TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); } - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - mtx_lock(&pvc->pvc_lock); /* One freed pv entry in locked_pmap is sufficient. */ - if (pmap == locked_pmap) + if (pmap == locked_pmap) { + ret = true; break; -next_chunk: - TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); - TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); - if (pvc->active_reclaims == 1 && pmap != NULL) { - /* - * Rotate the pv chunks list so that we do not - * scan the same pv chunks that could not be - * freed (because they contained a wired - * and/or superpage mapping) on every - * invocation of reclaim_pv_chunk(). - */ - while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { - MPASS(pc->pc_pmap != NULL); - TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); - TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); - } } } - TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); - TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); - pvc->active_reclaims--; - mtx_unlock(&pvc->pvc_lock); - reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); - if (m_pc == NULL && !SLIST_EMPTY(&free)) { - m_pc = SLIST_FIRST(&free); - SLIST_REMOVE_HEAD(&free, plinks.s.ss); - /* Recycle a freed page table page. */ - m_pc->ref_count = 1; - } - vm_page_free_pages_toq(&free, true); - return (m_pc); + return (ret); } +/* + * We are in a serious low memory condition. Resort to + * drastic measures to free some pages so we can allocate + * another pv entry chunk. + * + * Returns NULL if PV entries were reclaimed from the specified pmap, + * otherwise, returns a free page to be used for a PV chunk. + * + * If avoid_locked_pmap is true, chunks are not freed from the + * locked_pmap (but pv entries are). + * + * We do not, however, unmap 2mpages because subsequent accesses will + * allocate per-page pv entries until repromotion occurs, thereby + * exacerbating the shortage of free pv entries. + */ static vm_page_t -reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) +reclaim_pv_chunk(pmap_t locked_pmap, PVLL **lockp, bool avoid_locked_pmap) { vm_page_t m; - int i, domain; + pmap_t next_pmap, pmap; + struct spglist free; + bool res, start_di; - domain = PCPU_GET(domain); - for (i = 0; i < vm_ndomains; i++) { - m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); - if (m != NULL) + PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); + KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); + pmap = NULL; + m = NULL; + res = false; + SLIST_INIT(&free); + + /* + * A delayed invalidation block should already be active if + * pmap_advise() or pmap_remove() called this function by way + * of pmap_demote_pde_locked(). + */ + start_di = pmap_not_in_di(); + + for (;;) { + /* + * A parallel reclaim_pv_chunk() could move our cursor + * to the end of the list, which causes earlier + * termination of the loop. Since all callers are + * prepared to the reclaim_pv_chunk() failure, it only + * means that callers retry with the page allocator + * before trying to reclaim one more time. + */ + mtx_lock(&all_pmaps_lock); + next_pmap = pmap == NULL ? 
TAILQ_FIRST(&all_pmaps) : + TAILQ_NEXT(pmap, pm_allpmaps); + mtx_unlock(&all_pmaps_lock); + if (next_pmap == NULL) break; - domain = (domain + 1) % vm_ndomains; - } + pmap = next_pmap; + + /* + * This lockless check is fine, we would either + * process a pmap without any pv chunks or skip some + * potentially consumable pmap. But it is still + * useful to cheaply skip freed pmaps which are kept + * on the list due to type stability. + */ + if (pmap->pm_stats.resident_count == 0) + continue; + + /* Avoid deadlock and lock recursion. */ + if (pmap > locked_pmap) { + RELEASE_PV_LIST_LOCK(lockp); + PMAP_LOCK(pmap); + if (start_di) + pmap_delayed_invl_start(); + } else if (pmap != locked_pmap) { + if (PMAP_TRYLOCK(pmap)) { + if (start_di) + pmap_delayed_invl_start(); + } else { + /* The pmap is not locked, skip it. */ + continue; + } + } else if (start_di) + pmap_delayed_invl_start(); + if (pmap->pm_stats.resident_count != 0) { + res = reclaim_pv_chunk_handle_pmap(pmap, locked_pmap, + avoid_locked_pmap, lockp, &free); + } + pmap_invalidate_all(pmap); + if (pmap != locked_pmap) + PMAP_UNLOCK(pmap); + if (start_di) + pmap_delayed_invl_finish(); + if (res || !SLIST_EMPTY(&free)) { + mtx_lock(&all_pmaps_lock); + TAILQ_REMOVE(&all_pmaps, pmap, pm_allpmaps); + TAILQ_INSERT_TAIL(&all_pmaps, pmap, pm_allpmaps); + mtx_unlock(&all_pmaps_lock); + break; + } + } + if (!res && !SLIST_EMPTY(&free)) { + m = SLIST_FIRST(&free); + SLIST_REMOVE_HEAD(&free, plinks.s.ss); + /* Recycle a freed page table page. */ + m->ref_count = 1; + } + vm_page_free_pages_toq(&free, true); return (m); } @@ -4967,7 +5075,7 @@ } static void -free_pv_chunk_dequeued(struct pv_chunk *pc) +free_pv_chunk(struct pv_chunk *pc) { vm_page_t m; @@ -4981,40 +5089,13 @@ vm_page_free(m); } -static void -free_pv_chunk(struct pv_chunk *pc) -{ - struct pv_chunks_list *pvc; - - pvc = &pv_chunks[pc_to_domain(pc)]; - mtx_lock(&pvc->pvc_lock); - TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); - mtx_unlock(&pvc->pvc_lock); - free_pv_chunk_dequeued(pc); -} - static void free_pv_chunk_batch(struct pv_chunklist *batch) { - struct pv_chunks_list *pvc; struct pv_chunk *pc, *npc; - int i; - - for (i = 0; i < vm_ndomains; i++) { - if (TAILQ_EMPTY(&batch[i])) - continue; - pvc = &pv_chunks[i]; - mtx_lock(&pvc->pvc_lock); - TAILQ_FOREACH(pc, &batch[i], pc_list) { - TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); - } - mtx_unlock(&pvc->pvc_lock); - } - for (i = 0; i < vm_ndomains; i++) { - TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { - free_pv_chunk_dequeued(pc); - } + TAILQ_FOREACH_SAFE(pc, batch, pc_list, npc) { + free_pv_chunk(pc); } } @@ -5027,9 +5108,8 @@ * The given PV list lock may be released. */ static pv_entry_t -get_pv_entry(pmap_t pmap, struct rwlock **lockp) +get_pv_entry(pmap_t pmap, PVLL **lockp) { - struct pv_chunks_list *pvc; int bit, field; pv_entry_t pv; struct pv_chunk *pc; @@ -5069,7 +5149,7 @@ PV_STAT(pc_chunk_tryfail++); return (NULL); } - m = reclaim_pv_chunk(pmap, lockp); + m = reclaim_pv_chunk(pmap, lockp, false); if (m == NULL) goto retry; } @@ -5081,10 +5161,6 @@ pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; - pvc = &pv_chunks[_vm_phys_domain(m->phys_addr)]; - mtx_lock(&pvc->pvc_lock); - TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); - mtx_unlock(&pvc->pvc_lock); pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(atomic_add_long(&pv_entry_count, 1)); @@ -5128,26 +5204,16 @@ * The given PV list lock may be released. 
*/ static void -reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) +reserve_pv_entries(pmap_t pmap, int needed, PVLL **lockp) { - struct pv_chunks_list *pvc; - struct pch new_tail[PMAP_MEMDOM]; struct pv_chunk *pc; vm_page_t m; - int avail, free, i; + int avail, free; bool reclaimed; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); - /* - * Newly allocated PV chunks must be stored in a private list until - * the required number of PV chunks have been allocated. Otherwise, - * reclaim_pv_chunk() could recycle one of these chunks. In - * contrast, these chunks must be added to the pmap upon allocation. - */ - for (i = 0; i < PMAP_MEMDOM; i++) - TAILQ_INIT(&new_tail[i]); retry: avail = 0; TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { @@ -5168,7 +5234,7 @@ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL) { - m = reclaim_pv_chunk(pmap, lockp); + m = reclaim_pv_chunk(pmap, lockp, true); if (m == NULL) goto retry; reclaimed = true; @@ -5182,7 +5248,6 @@ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&new_tail[pc_to_domain(pc)], pc, pc_lru); PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); /* @@ -5193,14 +5258,6 @@ if (reclaimed) goto retry; } - for (i = 0; i < vm_ndomains; i++) { - if (TAILQ_EMPTY(&new_tail[i])) - continue; - pvc = &pv_chunks[i]; - mtx_lock(&pvc->pvc_lock); - TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); - mtx_unlock(&pvc->pvc_lock); - } } /* @@ -5230,8 +5287,7 @@ * entries for each of the 4KB page mappings. */ static void -pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, - struct rwlock **lockp) +pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; struct pv_chunk *pc; @@ -5243,7 +5299,6 @@ PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); - CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first @@ -5299,8 +5354,7 @@ * for the 2MB page mapping. 
*/ static void -pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, - struct rwlock **lockp) +pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa) { struct md_page *pvh; pv_entry_t pv; @@ -5309,7 +5363,6 @@ KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); - CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* * Transfer the first page's pv entry for this mapping to the 2mpage's @@ -5356,7 +5409,7 @@ */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, - struct rwlock **lockp) + PVLL **lockp) { pv_entry_t pv; @@ -5379,7 +5432,7 @@ */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, - struct rwlock **lockp) + PVLL **lockp) { struct md_page *pvh; pv_entry_t pv; @@ -5420,13 +5473,13 @@ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { - struct rwlock *lock; + PVLL *lock; boolean_t rv; lock = NULL; rv = pmap_demote_pde_locked(pmap, pde, va, &lock); if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); return (rv); } @@ -5459,7 +5512,7 @@ static void pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, - pd_entry_t oldpde, struct rwlock **lockp) + pd_entry_t oldpde, PVLL **lockp) { struct spglist free; vm_offset_t sva; @@ -5476,7 +5529,7 @@ static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, - struct rwlock **lockp) + PVLL **lockp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; @@ -5610,8 +5663,11 @@ /* * Demote the PV entry. */ - if ((oldpde & PG_MANAGED) != 0) - pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); + if ((oldpde & PG_MANAGED) != 0) { + pmap_pv_list_lock_pde(oldpde & PG_PS_FRAME, lockp); + pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME); + pmap_pv_list_unlock_pde(oldpde & PG_PS_FRAME, lockp); + } atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", @@ -5664,7 +5720,7 @@ */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, - struct spglist *free, struct rwlock **lockp) + struct spglist *free, PVLL **lockp) { struct md_page *pvh; pd_entry_t oldpde; @@ -5693,6 +5749,7 @@ eva = sva + NBPDR; for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME); va < eva; va += PAGE_SIZE, m++) { + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) vm_page_dirty(m); if (oldpde & PG_A) @@ -5725,7 +5782,7 @@ */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, - pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) + pd_entry_t ptepde, struct spglist *free, PVLL **lockp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; @@ -5766,7 +5823,7 @@ pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { - struct rwlock *lock; + PVLL *lock; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); @@ -5779,7 +5836,7 @@ lock = NULL; pmap_remove_pte(pmap, pte, va, *pde, free, &lock); if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); pmap_invalidate_page(pmap, va); } @@ -5788,7 +5845,7 @@ */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, - pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) + pd_entry_t *pde, struct spglist *free, PVLL **lockp) { pt_entry_t PG_G, *pte; vm_offset_t va; @@ -5830,7 +5887,7 @@ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { - struct rwlock *lock; + PVLL *lock; vm_offset_t va_next; pml5_entry_t 
*pml5e; pml4_entry_t *pml4e; @@ -5955,7 +6012,7 @@ anyvalid = 1; } if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); out: if (anyvalid) pmap_invalidate_all(pmap); @@ -5983,7 +6040,7 @@ struct md_page *pvh; pv_entry_t pv; pmap_t pmap; - struct rwlock *lock; + PVLL *lock; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; @@ -5997,16 +6054,16 @@ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: - rw_wlock(lock); + pmap_pv_list_lock(lock); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(lock); if (pvh_gen != pvh->pv_gen) { - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_UNLOCK(pmap); goto retry; } @@ -6015,17 +6072,19 @@ pde = pmap_pde(pmap, va); (void)pmap_demote_pde_locked(pmap, pde, va, &lock); PMAP_UNLOCK(pmap); + if (lock != m) + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_UNLOCK(pmap); goto retry; } @@ -6057,7 +6116,7 @@ PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); - rw_wunlock(lock); + pmap_pv_list_unlock(lock); pmap_delayed_invl_wait(m); vm_page_free_pages_toq(&free, true); } @@ -6273,8 +6332,7 @@ * identical characteristics. */ static void -pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, - struct rwlock **lockp) +pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, PVLL **lockp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; @@ -6376,8 +6434,11 @@ /* * Promote the pv entries. */ - if ((newpde & PG_MANAGED) != 0) - pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); + if ((newpde & PG_MANAGED) != 0) { + pmap_pv_list_lock_pde(newpde & PG_PS_FRAME, lockp); + pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME); + pmap_pv_list_unlock_pde(newpde & PG_PS_FRAME, lockp); + } /* * Propagate the PAT index to its proper position. 
@@ -6418,7 +6479,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { - struct rwlock *lock; + PVLL *lock; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; @@ -6624,7 +6685,7 @@ pv = get_pv_entry(pmap, &lock); pv->pv_va = va; } - CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) @@ -6677,7 +6738,7 @@ rv = KERN_SUCCESS; out: if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_UNLOCK(pmap); return (rv); } @@ -6691,7 +6752,7 @@ */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - struct rwlock **lockp) + PVLL **lockp) { pd_entry_t newpde; pt_entry_t PG_V; @@ -6742,7 +6803,7 @@ */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, - vm_page_t m, struct rwlock **lockp) + vm_page_t m, PVLL **lockp) { struct spglist free; pd_entry_t oldpde, *pde; @@ -6890,7 +6951,7 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { - struct rwlock *lock; + PVLL *lock; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; @@ -6915,7 +6976,7 @@ m = TAILQ_NEXT(m, listq); } if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_UNLOCK(pmap); } @@ -6931,19 +6992,19 @@ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { - struct rwlock *lock; + PVLL *lock; lock = NULL; PMAP_LOCK(pmap); (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, - vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) + vm_prot_t prot, vm_page_t mpte, PVLL **lockp) { pt_entry_t newpte, *pte, PG_V; @@ -7238,7 +7299,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { - struct rwlock *lock; + PVLL *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde, srcptepaddr; @@ -7375,7 +7436,7 @@ } out: if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } @@ -7490,7 +7551,7 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; - struct rwlock *lock; + PVLL *lock; pv_entry_t pv; int loops = 0; boolean_t rv; @@ -7499,7 +7560,7 @@ ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_rlock(lock); + pmap_pv_list_lock(lock); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; @@ -7521,7 +7582,7 @@ break; } } - rw_runlock(lock); + pmap_pv_list_unlock(lock); return (rv); } @@ -7534,7 +7595,7 @@ int pmap_page_wired_mappings(vm_page_t m) { - struct rwlock *lock; + PVLL *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; @@ -7544,16 +7605,16 @@ if ((m->oflags & VPO_UNMANAGED) != 0) return (0); lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_rlock(lock); + pmap_pv_list_lock(lock); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; - rw_runlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_rlock(lock); + pmap_pv_list_lock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; @@ -7571,9 +7632,9 @@ if (!PMAP_TRYLOCK(pmap)) { md_gen = 
m->md.pv_gen; pvh_gen = pvh->pv_gen; - rw_runlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_rlock(lock); + pmap_pv_list_lock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); @@ -7586,7 +7647,7 @@ PMAP_UNLOCK(pmap); } } - rw_runlock(lock); + pmap_pv_list_unlock(lock); return (count); } @@ -7597,17 +7658,17 @@ boolean_t pmap_page_is_mapped(vm_page_t m) { - struct rwlock *lock; + PVLL *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_rlock(lock); + pmap_pv_list_lock(lock); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); - rw_runlock(lock); + pmap_pv_list_unlock(lock); return (rv); } @@ -7643,15 +7704,14 @@ pt_entry_t *pte, tpte; pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; - struct pv_chunklist free_chunks[PMAP_MEMDOM]; + struct pv_chunklist free_chunks; vm_page_t m, mpte, mt; pv_entry_t pv; struct md_page *pvh; struct pv_chunk *pc, *npc; - struct rwlock *lock; int64_t bit; uint64_t inuse, bitmask; - int allfree, field, freed, i, idx; + int allfree, field, freed, idx; boolean_t superpage; vm_paddr_t pa; @@ -7674,13 +7734,11 @@ } #endif - lock = NULL; PG_M = pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); PG_RW = pmap_rw_bit(pmap); - for (i = 0; i < PMAP_MEMDOM; i++) - TAILQ_INIT(&free_chunks[i]); + TAILQ_INIT(&free_chunks); SLIST_INIT(&free); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { @@ -7762,12 +7820,11 @@ vm_page_dirty(m); } - CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); + pmap_pv_list_lock(m); /* Mark free */ pc->pc_map[field] |= bitmask; if (superpage) { - pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); pvh = pa_to_pvh(tpte & PG_PS_FRAME); TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; @@ -7777,6 +7834,7 @@ TAILQ_EMPTY(&mt->md.pv_list)) vm_page_aflag_clear(mt, PGA_WRITEABLE); } + pmap_pv_list_unlock(m); mpte = pmap_remove_pt_page(pmap, pv->pv_va); if (mpte != NULL) { KASSERT(mpte->valid == VM_PAGE_BITS_ALL, @@ -7787,8 +7845,9 @@ mpte->ref_count = 0; pmap_add_delayed_free_list(mpte, &free, FALSE); } + pmap_resident_count_dec(pmap, NBPDR / + PAGE_SIZE); } else { - pmap_resident_count_dec(pmap, 1); TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((m->a.flags & PGA_WRITEABLE) != 0 && @@ -7798,6 +7857,8 @@ if (TAILQ_EMPTY(&pvh->pv_list)) vm_page_aflag_clear(m, PGA_WRITEABLE); } + pmap_pv_list_unlock(m); + pmap_resident_count_dec(pmap, 1); } pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); freed++; @@ -7808,11 +7869,9 @@ PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); if (allfree) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); + TAILQ_INSERT_TAIL(&free_chunks, pc, pc_list); } } - if (lock != NULL) - rw_wunlock(lock); pmap_invalidate_all(pmap); pmap_pkru_deassign_all(pmap); free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); @@ -7823,7 +7882,7 @@ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { - struct rwlock *lock; + PVLL *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; @@ -7834,15 +7893,15 @@ rv = FALSE; lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_rlock(lock); + pmap_pv_list_lock(lock); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; - rw_runlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_rlock(lock); + 
pmap_pv_list_lock(lock); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; @@ -7872,9 +7931,9 @@ if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; - rw_runlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_rlock(lock); + pmap_pv_list_lock(lock); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); @@ -7900,7 +7959,7 @@ } } out: - rw_runlock(lock); + pmap_pv_list_unlock(lock); return (rv); } @@ -7973,7 +8032,7 @@ { struct md_page *pvh; pmap_t pmap; - struct rwlock *lock; + PVLL *lock; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; @@ -7991,17 +8050,18 @@ pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: - rw_wlock(lock); + pmap_pv_list_lock(lock); +retry_pv_loop_locked: + pvh_gen = pvh->pv_gen; TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { - pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); - rw_wunlock(lock); + pmap_pv_list_unlock(lock); goto retry_pv_loop; } } @@ -8010,23 +8070,24 @@ pde = pmap_pde(pmap, va); if ((*pde & PG_RW) != 0) (void)pmap_demote_pde_locked(pmap, pde, va, &lock); - KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), - ("inconsistent pv lock %p %p for page %p", - lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); PMAP_UNLOCK(pmap); + if (lock != m || pvh_gen != pvh->pv_gen) { + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); + goto retry_pv_loop_locked; + } } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); - rw_wunlock(lock); + pmap_pv_list_unlock(lock); goto retry_pv_loop; } } @@ -8049,7 +8110,7 @@ } PMAP_UNLOCK(pmap); } - rw_wunlock(lock); + pmap_pv_list_unlock(lock); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } @@ -8107,7 +8168,7 @@ struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; - struct rwlock *lock; + PVLL *lock; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; @@ -8123,21 +8184,21 @@ pa = VM_PAGE_TO_PHYS(m); lock = PHYS_TO_PV_LIST_LOCK(pa); pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(pa); - rw_wlock(lock); + pmap_pv_list_lock(lock); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) goto small_mappings; pv = pvf; do { + pvh_gen = pvh->pv_gen; if (pvf == NULL) pvf = pv; pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { - pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; @@ -8202,7 +8263,6 @@ pmap_invalidate_page(pmap, va); } else demoted = TRUE; - if (demoted) { /* * The superpage mapping was removed @@ -8214,9 +8274,10 @@ pv = NULL; } cleared++; - KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), - ("inconsistent pv lock %p %p for page %p", - lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + if (lock != m || pvh_gen != pvh->pv_gen) { + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); + goto retry; + } } else not_cleared++; } @@ -8241,9 +8302,9 @@ if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; @@ -8294,7 +8355,7 @@ } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: - rw_wunlock(lock); + pmap_pv_list_unlock(lock); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } @@ -8307,7 +8368,7 @@ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { - struct rwlock *lock; + PVLL *lock; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; @@ -8364,7 +8425,7 @@ lock = NULL; if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); /* * The large page mapping was destroyed. @@ -8396,7 +8457,7 @@ anychanged = true; } if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); } if (va_next > eva) va_next = eva; @@ -8453,7 +8514,7 @@ pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_M, PG_RW; - struct rwlock *lock; + PVLL *lock; vm_offset_t va; int md_gen, pvh_gen; @@ -8466,15 +8527,15 @@ pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_wlock(lock); + pmap_pv_list_lock(lock); restart: + pvh_gen = pvh->pv_gen; TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { - pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(lock); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; @@ -8500,15 +8561,19 @@ pmap_invalidate_page(pmap, va); } PMAP_UNLOCK(pmap); + if (lock != m || pvh_gen != pvh->pv_gen) { + CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); + goto restart; + } } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(lock); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; @@ -8526,7 +8591,7 @@ } PMAP_UNLOCK(pmap); } - rw_wunlock(lock); + pmap_pv_list_unlock(lock); } /* @@ -9438,7 +9503,7 @@ pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; - struct rwlock *lock; + PVLL *lock; #if VM_NRESERVLEVEL > 0 vm_page_t m, mpte; #endif @@ -9525,7 +9590,7 @@ rv = 0; /* success */ done: if (lock != NULL) - rw_wunlock(lock); + pmap_pv_list_unlock(lock); PMAP_UNLOCK(pmap); return (rv); } Index: sys/amd64/include/pmap.h =================================================================== --- sys/amd64/include/pmap.h +++ sys/amd64/include/pmap.h @@ -268,6 +268,7 @@ #include #include #include +#include #include #include @@ -341,7 +342,9 @@ struct md_page { TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ int pv_gen; /* (p) */ - int pat_mode; + obm_lock_t pv_lock; + uint8_t pat_mode; + uint8_t pad0[2]; }; enum pmap_type { @@ -355,6 +358,8 @@ uint32_t pm_gen; }; +TAILQ_HEAD(pvchunks, pv_chunk); + /* * The kernel virtual address (KVA) of the level 4 page table page is always * within the direct map (DMAP) region. 
@@ -365,7 +370,7 @@ pml4_entry_t *pm_pmltopu; /* KVA of user top page table */ uint64_t pm_cr3; uint64_t pm_ucr3; - TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + struct pvchunks pm_pvchunk; /* list of mappings in pmap */ cpuset_t pm_active; /* active on cpus */ enum pmap_type pm_type; /* regular or nested tables */ struct pmap_statistics pm_stats; /* pmap statistics */ @@ -374,6 +379,7 @@ int pm_flags; struct pmap_pcids pm_pcids[MAXCPU]; struct rangeset pm_pkru; + TAILQ_ENTRY(pmap) pm_allpmaps; }; /* flags */ @@ -392,8 +398,7 @@ #define PMAP_LOCK_ASSERT(pmap, type) \ mtx_assert(&(pmap)->pm_mtx, (type)) #define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) -#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ - NULL, MTX_DEF | MTX_DUPOK) +#define PMAP_LOCK_INIT(pmap) pmap_lock_init(pmap) #define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) #define PMAP_MTX(pmap) (&(pmap)->pm_mtx) #define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) @@ -421,8 +426,7 @@ #define PV_CHUNK_HEADER \ pmap_t pc_pmap; \ TAILQ_ENTRY(pv_chunk) pc_list; \ - uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ \ - TAILQ_ENTRY(pv_chunk) pc_lru; + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ struct pv_chunk_header { PV_CHUNK_HEADER @@ -467,6 +471,7 @@ int pmap_large_map(vm_paddr_t, vm_size_t, void **, vm_memattr_t); void pmap_large_map_wb(void *sva, vm_size_t len); void pmap_large_unmap(void *sva, vm_size_t len); +void pmap_lock_init(pmap_t pmap); void *pmap_mapbios(vm_paddr_t, vm_size_t); void *pmap_mapdev(vm_paddr_t, vm_size_t); void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -3746,6 +3746,7 @@ kern/kern_mtxpool.c standard kern/kern_mutex.c standard kern/kern_ntptime.c standard +kern/kern_obm.c standard kern/kern_osd.c standard kern/kern_physio.c standard kern/kern_pmc.c standard Index: sys/kern/kern_obm.c =================================================================== --- /dev/null +++ sys/kern/kern_obm.c @@ -0,0 +1,122 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef OBM_DEBUG +static SYSCTL_NODE(_debug, OID_AUTO, obm, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + ""); +static u_long obm_slow_lock; +SYSCTL_LONG(_debug_obm, OID_AUTO, slow_lock, CTLFLAG_RD, + &obm_slow_lock, 0, + ""); +static u_long obm_slow_unlock; +SYSCTL_LONG(_debug_obm, OID_AUTO, slow_unlock, CTLFLAG_RD, + &obm_slow_unlock, 0, + ""); +#endif + +void +obm_init_lo(struct lock_object *lo, const char *name) +{ + bzero(lo, sizeof(*lo)); + lo->lo_name = name; +} + +void +obm_init(obm_lock_t *obm) +{ + obm->lk = OBM_UNLOCKED; +} + +void +obm_lock_slow(obm_lock_t *obm, struct lock_object *lo, uint8_t v) +{ + struct turnstile *ts; + struct lock_delay_arg lda; + +#ifdef OBM_DEBUG + atomic_add_long(&obm_slow_lock, 1); +#endif + lock_delay_arg_init(&lda, &locks_delay); + for (;;) { + if (v == OBM_UNLOCKED) { + if (atomic_fcmpset_acq_char(&obm->lk, &v, + OBM_LOCKED) != 0) + break; + continue; + } + + ts = turnstile_trywait(lo); + v = atomic_load_8(&obm->lk); +retry_ts: + if (v == OBM_UNLOCKED) { + turnstile_cancel(ts); + continue; + } + if ((v & OBM_CONTESTED) == 0 && + atomic_fcmpset_8(&obm->lk, &v, v | OBM_CONTESTED) == 0) { + goto retry_ts; + } + turnstile_wait(ts, NULL, TS_SHARED_QUEUE); + v = atomic_load_8(&obm->lk); + } + TD_LOCKS_INC(curthread); +} +void +obm_unlock_slow(obm_lock_t *obm, struct lock_object *lo) +{ + struct turnstile *ts; + +#ifdef OBM_DEBUG + atomic_add_long(&obm_slow_unlock, 1); +#endif + turnstile_chain_lock(lo); + atomic_store_rel_8(&obm->lk, OBM_UNLOCKED); + ts = turnstile_lookup(lo); + if (ts != NULL) { + turnstile_broadcast(ts, TS_SHARED_QUEUE); + turnstile_unpend(ts); + } + turnstile_chain_unlock(lo); + TD_LOCKS_DEC(curthread); +} Index: sys/sys/_obm.h =================================================================== --- /dev/null +++ sys/sys/_obm.h @@ -0,0 +1,47 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS__OBM_H +#define _SYS__OBM_H + +/* One-Byte Lock */ + +#define OBM_UNLOCKED 0x00 +#define OBM_LOCKED 0x02 +#define OBM_CONTESTED 0x01 + +typedef struct obm_lock_tag { + uint8_t lk; +} obm_lock_t; + +#endif Index: sys/sys/obm.h =================================================================== --- /dev/null +++ sys/sys/obm.h @@ -0,0 +1,103 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_OBM_H +#define _SYS_OBM_H + +/* One-Byte Lock */ + +#ifdef _KERNEL + +#include +#include +#include +#include +#include + +void obm_init_lo(struct lock_object *lo, const char *name); +void obm_init(obm_lock_t *obm); +void obm_lock_slow(obm_lock_t *obm, struct lock_object *lo, uint8_t v); +void obm_unlock_slow(obm_lock_t *obm, struct lock_object *lo); + +__used static void +obm_assert_locked(obm_lock_t *obm) +{ +#ifdef INVARIANTS + uint8_t v; + + v = atomic_load_8(&obm->lk); + MPASS(v == OBM_LOCKED || v == (OBM_LOCKED | OBM_CONTESTED)); +#endif +} + +static inline bool +obm_trylock(obm_lock_t *obm) +{ + if (atomic_cmpset_acq_8(&obm->lk, OBM_UNLOCKED, OBM_LOCKED) != 0) { + TD_LOCKS_INC(curthread); + return (true); + } + return (false); +} + +static inline void +obm_lock(obm_lock_t *obm, struct lock_object *lo) +{ + uint8_t v; + + v = OBM_UNLOCKED; + if (__predict_true(atomic_fcmpset_acq_8(&obm->lk, &v, OBM_LOCKED))) { + TD_LOCKS_INC(curthread); + } else { + MPASS(v == OBM_LOCKED || v == (OBM_LOCKED | OBM_CONTESTED) || + v == OBM_UNLOCKED); + obm_lock_slow(obm, lo, v); + } +} + +static inline void +obm_unlock(obm_lock_t *obm, struct lock_object *lo) +{ + uint8_t v; + + v = OBM_LOCKED; + if (atomic_fcmpset_rel_8(&obm->lk, &v, OBM_UNLOCKED)) { + TD_LOCKS_DEC(curthread); + } else { + MPASS(v == OBM_LOCKED || v == (OBM_LOCKED | OBM_CONTESTED)); + obm_unlock_slow(obm, lo); + } +} + +#endif +#endif
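
A few illustrative sketches follow; they are commentary appended after the diff, not part of it.

First, a minimal sketch of how the obm(9) one-byte lock introduced in sys/sys/obm.h and kern/kern_obm.c is meant to be consumed. Everything named obm_demo_* below is hypothetical, and the #include lines are my guess, since the include directives in the diff above lost their bracketed header names.

    /*
     * Hypothetical consumer of the OBM API: many small objects each embed
     * a one-byte obm_lock_t, while a single shared struct lock_object is
     * used only to key the turnstile in the contested slow path.
     */
    #include <sys/param.h>
    #include <sys/systm.h>
    #include <sys/lock.h>
    #include <sys/obm.h>

    struct obm_demo {
        obm_lock_t  lock;       /* one byte of lock state */
        int         counter;    /* protected by lock */
    };

    static struct lock_object obm_demo_lo;  /* shared slow-path identity */

    static void
    obm_demo_setup(void)
    {
        /* Once, at subsystem init; the name shows up in lock diagnostics. */
        obm_init_lo(&obm_demo_lo, "obm demo");
    }

    static void
    obm_demo_obj_init(struct obm_demo *d)
    {
        obm_init(&d->lock);     /* byte starts out OBM_UNLOCKED */
        d->counter = 0;
    }

    static void
    obm_demo_bump(struct obm_demo *d)
    {
        obm_lock(&d->lock, &obm_demo_lo);   /* fast path: one fcmpset */
        d->counter++;
        obm_unlock(&d->lock, &obm_demo_lo); /* wakes waiters if contested */
    }

Because the lock_object is shared by many byte locks, an unlock of one object may wake waiters blocked on a different object that hashed to the same turnstile; obm_lock_slow() just rechecks its own byte and loops. That is the same trade-off the pv-list code above makes by sharing one lock_object per pv_table entry (NUMA) or per pv_lo[] slot (non-NUMA).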
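Next, the shape every former rwlock call site now takes. The function below, pv_walk_page(), is not in the patch; it only condenses the pattern visible in, e.g., pmap_remove_page() and pmap_enter() above, where the lock cookie changed from struct rwlock * to PVLL * (a vm_page pointer).

    /*
     * Illustrative only: canonical pv-list locking pattern after the
     * change.  The cookie is the vm_page itself; CHANGE_PV_LIST_LOCK_*
     * drops any previously held page lock before taking the new one,
     * and unmanaged physical addresses fall back to pv_fake_page.
     */
    static void
    pv_walk_page(pmap_t pmap, vm_page_t m)
    {
        PVLL *lock;

        lock = NULL;
        PMAP_LOCK(pmap);
        CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
        /* ... walk or modify m->md.pv_list here ... */
        RELEASE_PV_LIST_LOCK(&lock);
        PMAP_UNLOCK(pmap);
    }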
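Promotion and demotion no longer pass lockp down into pmap_pv_{promote,demote}_pde(); the callers bracket those calls with the new pde-wide helpers instead. Condensed from the pmap_demote_pde_locked() and pmap_promote_pde() hunks above, for readability:

    /* In pmap_demote_pde_locked(): */
    if ((oldpde & PG_MANAGED) != 0) {
        pmap_pv_list_lock_pde(oldpde & PG_PS_FRAME, lockp);
        pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
        pmap_pv_list_unlock_pde(oldpde & PG_PS_FRAME, lockp);
    }

    /* In pmap_promote_pde(): */
    if ((newpde & PG_MANAGED) != 0) {
        pmap_pv_list_lock_pde(newpde & PG_PS_FRAME, lockp);
        pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
        pmap_pv_list_unlock_pde(newpde & PG_PS_FRAME, lockp);
    }

pmap_pv_list_lock_pde() acquires the NPTEPG constituent locks in ascending page order (after the optional trylock of the superpage's first page), which is what keeps concurrent promote/demote of the same 2M frame deadlock-free.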
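Finally, the struct md_page change in sys/amd64/include/pmap.h trades the old int pat_mode for pv_lock plus uint8_t pat_mode plus pad0[2]. By my arithmetic (16-byte TAILQ head, 4-byte pv_gen, 4 bytes of lock/mode/padding) the struct should stay at 24 bytes on amd64; the patch does not assert this, so the check below is only a suggested sanity check with an assumed size.

    /* Suggested (not in the patch): catch accidental md_page growth. */
    _Static_assert(sizeof(struct md_page) == 24,
        "struct md_page layout changed; vm_page may grow");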