Index: sys/amd64/amd64/pmap.c =================================================================== --- sys/amd64/amd64/pmap.c +++ sys/amd64/amd64/pmap.c @@ -121,6 +121,7 @@ #include #include #include +#include #include #include #include @@ -331,50 +332,18 @@ #ifdef NUMA #define pa_to_pmdp(pa) (&pv_table[pa_index(pa)]) #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page)) -#define PHYS_TO_PV_LIST_LOCK(pa) ({ \ - struct rwlock *_lock; \ - if (__predict_false((pa) > pmap_last_pa)) \ - _lock = &pv_dummy_large.pv_lock; \ - else \ - _lock = &(pa_to_pmdp(pa)->pv_lock); \ - _lock; \ -}) #else #define pa_to_pvh(pa) (&pv_table[pa_index(pa)]) - #define NPV_LIST_LOCKS MAXCPU - -#define PHYS_TO_PV_LIST_LOCK(pa) \ - (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS]) #endif -#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \ - struct rwlock **_lockp = (lockp); \ - struct rwlock *_new_lock; \ - \ - _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \ - if (_new_lock != *_lockp) { \ - if (*_lockp != NULL) \ - rw_wunlock(*_lockp); \ - *_lockp = _new_lock; \ - rw_wlock(*_lockp); \ - } \ -} while (0) - -#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \ - CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m)) - -#define RELEASE_PV_LIST_LOCK(lockp) do { \ - struct rwlock **_lockp = (lockp); \ - \ - if (*_lockp != NULL) { \ - rw_wunlock(*_lockp); \ - *_lockp = NULL; \ - } \ -} while (0) - -#define VM_PAGE_TO_PV_LIST_LOCK(m) \ - PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m)) +#define PV_PMAP(pv) ((pv)->pv_pmap) + +/* get_pv_entry flags */ +#define PMAP_GPV_NOWAIT 0x0001 +#define PMAP_GPV_RESERV 0x0002 + +static uma_zone_t pmap_pv_zone; struct pmap kernel_pmap_store; @@ -427,46 +396,126 @@ * Data for the pv entry allocation mechanism. * Updates to pv_invl_gen are protected by the pv list lock but reads are not. 
*/ -#ifdef NUMA -static __inline int -pc_to_domain(struct pv_chunk *pc) -{ - - return (_vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc))); -} -#else -static __inline int -pc_to_domain(struct pv_chunk *pc __unused) -{ - - return (0); -} -#endif - -struct pv_chunks_list { - struct mtx pvc_lock; - TAILQ_HEAD(pch, pv_chunk) pvc_list; - int active_reclaims; -} __aligned(CACHE_LINE_SIZE); - -struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM]; #ifdef NUMA struct pmap_large_md_page { - struct rwlock pv_lock; - struct md_page pv_page; + struct lock_object lo; + uintptr_t pad; + struct md_page pv_page; u_long pv_invl_gen; }; +/* + * We strongly depend on the size being a power of two, so the assert + * is overzealous. However, should the struct be resized to a + * different power of two, the code below needs to be revisited. + */ +_Static_assert(sizeof(struct pmap_large_md_page) == 64, "pmap_large_md_page"); + __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large; #define pv_dummy pv_dummy_large.pv_page __read_mostly static struct pmap_large_md_page *pv_table; __read_mostly vm_paddr_t pmap_last_pa; +static struct lock_object * +pv_list_lock_object(vm_paddr_t pa) +{ + if (__predict_false(pa > pmap_last_pa)) + return (&pv_dummy_large.lo); + return (&(pa_to_pmdp(pa)->lo)); +} #else -static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; +static struct lock_object __exclusive_cache_line pv_lo[NPV_LIST_LOCKS]; static u_long pv_invl_gen[NPV_LIST_LOCKS]; static struct md_page *pv_table; static struct md_page pv_dummy; +static struct lock_object * +pv_list_lock_object(vm_paddr_t pa) +{ + return (&pv_lo[pa_index(pa) % NPV_LIST_LOCKS]); +} #endif +__read_mostly static struct vm_page pv_fake_page; + +static void +pmap_pv_list_lock(vm_page_t m) +{ + obm_lock(&m->md.pv_lock, pv_list_lock_object(VM_PAGE_TO_PHYS(m))); +} + +static void +pmap_pv_list_unlock(vm_page_t m) +{ + obm_unlock(&m->md.pv_lock, 
pv_list_lock_object(VM_PAGE_TO_PHYS(m))); +} + +/* + * Locks all pv lists for 4k pages constituting the superpage that + * contains the passed page. The page's pv list must be locked. + * Returns false if trylock failed and the page's pv list was unlocked + * in the process, which typically means that the caller must restart. + */ +static bool +pmap_pv_list_lock_pde(vm_page_t m) +{ + vm_page_t mt, sm; + struct lock_object *lo; + int i; + + obm_assert_locked(&m->md.pv_lock); + + sm = m - ((VM_PAGE_TO_PHYS(m) - (VM_PAGE_TO_PHYS(m) & + PG_PS_FRAME)) >> PAGE_SHIFT); + lo = pv_list_lock_object(VM_PAGE_TO_PHYS(m)); + + /* + * Fast attempt. If we either own or can get the pv list lock + * of the first page in the superpage, all other owners must + * release their locks without waiting for us. + */ + if (m == sm || obm_trylock(&sm->md.pv_lock)) { + for (i = 1, mt = sm + 1; i < NPDEPG; i++, mt++) { + if (m != mt) + obm_lock(&mt->md.pv_lock, lo); + } + return (true); + } + + obm_unlock(&m->md.pv_lock, lo); + for (i = 0, mt = sm; i < NPDEPG; i++, mt++) { + obm_lock(&mt->md.pv_lock, lo); + } + return (false); +} + +static void +change_pv_list_lock_to_vm_page(vm_page_t *mp, vm_page_t m) +{ + if (*mp != m) { + if (*mp != NULL) + pmap_pv_list_unlock(*mp); + *mp = m; + pmap_pv_list_lock(m); + } +} + +static void +change_pv_list_lock_to_phys(vm_page_t *mp, vm_paddr_t pa) +{ + vm_page_t m; + + m = PHYS_TO_VM_PAGE(pa); + if (m == NULL) + m = &pv_fake_page; + change_pv_list_lock_to_vm_page(mp, m); +} + +static void +release_pv_list_lock(vm_page_t *mp) +{ + if (*mp != NULL) { + pmap_pv_list_unlock(*mp); + *mp = NULL; + } +} /* * All those kernel PT submaps that BSD is so fond of @@ -1152,7 +1201,7 @@ { u_long gen, *m_gen; - rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); +//XXXKIB rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED); gen = curthread->td_md.md_invl_gen.gen; if (gen == 0) return; @@ -1169,8 +1218,7 @@ /* * Internal flags for pmap_enter()'s helper functions. 
*/ -#define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */ -#define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */ +#define PMAP_ENTER_NOREPLACE 0x1000000 /* Don't replace mappings. */ /* * Internal flags for pmap_mapdev_internal() and @@ -1182,21 +1230,16 @@ TAILQ_HEAD(pv_chunklist, pv_chunk); -static void free_pv_chunk(struct pv_chunk *pc); -static void free_pv_chunk_batch(struct pv_chunklist *batch); static void free_pv_entry(pmap_t pmap, pv_entry_t pv); -static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp); -static int popcnt_pc_map_pq(uint64_t *map); -static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp); -static void reserve_pv_entries(pmap_t pmap, int needed, - struct rwlock **lockp); +static pv_entry_t get_pv_entry(pmap_t pmap, int flag); +static bool reserve_pv_entries(pmap_t pmap, int needed, bool locked); static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, - struct rwlock **lockp); + vm_page_t *pv_lock_mp); static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, - u_int flags, struct rwlock **lockp); + u_int flags, vm_page_t *pv_lock_mp); #if VM_NRESERVLEVEL > 0 static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, - struct rwlock **lockp); + vm_page_t *pv_lock_mp); #endif static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va); static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, @@ -1207,15 +1250,15 @@ vm_prot_t prot, int mode, int flags); static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va); static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, - vm_offset_t va, struct rwlock **lockp); + vm_offset_t va, vm_page_t *pv_lock_mp); static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va); static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, - vm_prot_t prot, struct rwlock **lockp); + vm_prot_t prot, 
vm_page_t *vm_lock_mp); static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, - u_int flags, vm_page_t m, struct rwlock **lockp); + u_int flags, vm_page_t m, vm_page_t *pv_lock_mp); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, - vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp); + vm_page_t m, vm_prot_t prot, vm_page_t mpte, vm_page_t *pv_lock_mp); static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte); static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted); static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, @@ -1229,7 +1272,7 @@ static vm_paddr_t pmap_large_map_kextract(vm_offset_t va); #if VM_NRESERVLEVEL > 0 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, - struct rwlock **lockp); + vm_page_t *pv_lock_mp); #endif static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot); @@ -1240,27 +1283,26 @@ static pd_entry_t *pmap_pti_pde(vm_offset_t va); static void pmap_pti_wire_pte(void *pte); static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, - struct spglist *free, struct rwlock **lockp); + struct spglist *free, vm_page_t *pv_lock_mp); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, - pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp); + pd_entry_t ptepde, struct spglist *free, vm_page_t *pv_lock_mp); static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free); static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, - pd_entry_t *pde, struct spglist *free, - struct rwlock **lockp); + pd_entry_t *pde, struct spglist *free, vm_page_t *pv_lock_mp); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, - vm_page_t m, struct rwlock **lockp); + vm_page_t m, vm_page_t *pv_lock_mp); static void 
pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde); static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, - struct rwlock **lockp); + vm_page_t *pv_lock_mp); static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, - struct rwlock **lockp); + vm_page_t *pv_lock_mp); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, - struct rwlock **lockp); + vm_page_t *pv_lock_mp); static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free); @@ -1729,7 +1771,6 @@ kernel_pmap->pm_cr3 = KPML4phys; kernel_pmap->pm_ucr3 = PMAP_NO_CR3; CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */ - TAILQ_INIT(&kernel_pmap->pm_pvchunk); kernel_pmap->pm_stats.resident_count = res; kernel_pmap->pm_flags = pmap_flags; @@ -1895,6 +1936,7 @@ TAILQ_INIT(&m->md.pv_list); m->md.pat_mode = PAT_WRITE_BACK; + obm_init(&m->md.pv_lock); } static int pmap_allow_2m_x_ept; @@ -1951,13 +1993,6 @@ long start, end, highest, pv_npg; int domain, i, j, pages; - /* - * We strongly depend on the size being a power of two, so the assert - * is overzealous. However, should the struct be resized to a - * different power of two, the code below needs to be revisited. - */ - CTASSERT((sizeof(*pvd) == 64)); - /* * Calculate the size of the array. 
*/ @@ -1992,12 +2027,13 @@ vm_page_t m = vm_page_alloc_domain(NULL, 0, domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); if (m == NULL) - panic("vm_page_alloc_domain failed for %lx\n", (vm_offset_t)pvd + j); + panic("vm_page_alloc_domain failed for %lx\n", + (vm_offset_t)pvd + j); pmap_qenter((vm_offset_t)pvd + j, &m, 1); } for (j = 0; j < s / sizeof(*pvd); j++) { - rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW); + obm_init_lo(&pvd->lo, "pmap pv list"); TAILQ_INIT(&pvd->pv_page.pv_list); pvd->pv_page.pv_gen = 0; pvd->pv_page.pat_mode = 0; @@ -2006,8 +2042,10 @@ } } pvd = &pv_dummy_large; - rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW); + obm_init_lo(&pvd->lo, "pmap pv list dummy"); TAILQ_INIT(&pvd->pv_page.pv_list); + pmap_page_init(&pv_fake_page); + pv_fake_page.phys_addr = pmap_last_pa + PAGE_SIZE; pvd->pv_page.pv_gen = 0; pvd->pv_page.pat_mode = 0; pvd->pv_invl_gen = 0; @@ -2023,7 +2061,7 @@ * Initialize the pool of pv list locks. */ for (i = 0; i < NPV_LIST_LOCKS; i++) - rw_init(&pv_list_locks[i], "pmap pv list"); + obm_init_lo(&pv_lo[i], "pmap pv list"); /* * Calculate the size of the pv head table for superpages. @@ -2039,6 +2077,8 @@ for (i = 0; i < pv_npg; i++) TAILQ_INIT(&pv_table[i].pv_list); TAILQ_INIT(&pv_dummy.pv_list); + pmap_page_init(&pv_fake_page); + pv_fake_page.phys_addr = vm_phys_segs[vm_phys_nsegs - 1].end + PAGE_SIZE; } #endif @@ -2138,14 +2178,9 @@ pagesizes[1] = NBPDR; } - /* - * Initialize pv chunk lists. 
- */ - for (i = 0; i < PMAP_MEMDOM; i++) { - mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF); - TAILQ_INIT(&pv_chunks[i].pvc_list); - } pmap_init_pv_table(); + pmap_pv_zone = uma_zcreate("pmap pv", sizeof(struct pv_entry), + NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct pv_entry), 0); pmap_initialized = 1; for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) { @@ -3637,9 +3672,9 @@ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); - TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = pmap_flags; + TAILQ_INIT(&pmap->pm_pv_spares); CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1; pmap->pm_pcids[i].pm_gen = 1; @@ -3755,10 +3790,10 @@ pmap->pm_root.rt_root = 0; CPU_ZERO(&pmap->pm_active); - TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_flags = flags; pmap->pm_eptgen = 0; + TAILQ_INIT(&pmap->pm_pv_spares); return (1); } @@ -3782,7 +3817,7 @@ * race conditions. 
*/ static vm_page_t -_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) +_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, vm_page_t *pv_lock_mp) { vm_page_t m, pdppg, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; @@ -3799,8 +3834,8 @@ */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { - if (lockp != NULL) { - RELEASE_PV_LIST_LOCK(lockp); + if (pv_lock_mp != NULL) { + release_pv_list_lock(pv_lock_mp); PMAP_UNLOCK(pmap); PMAP_ASSERT_NOT_IN_DI(); vm_wait(NULL); @@ -3859,7 +3894,7 @@ if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, - lockp) == NULL) { + pv_lock_mp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); @@ -3891,7 +3926,7 @@ if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, - lockp) == NULL) { + pv_lock_mp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); @@ -3904,7 +3939,7 @@ if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, - lockp) == NULL) { + pv_lock_mp) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); @@ -3929,7 +3964,7 @@ static pd_entry_t * pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, - struct rwlock **lockp) + vm_page_t *pv_lock_mp) { pdp_entry_t *pdpe, PG_V; pd_entry_t *pde; @@ -3951,12 +3986,11 @@ } else if (va < VM_MAXUSER_ADDRESS) { /* Allocate a pd page. 
*/ pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; - pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); + pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, pv_lock_mp); if (pdpg == NULL) { - if (lockp != NULL) + if (pv_lock_mp != NULL) goto retry; - else - return (NULL); + return (NULL); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; @@ -3968,7 +4002,7 @@ } static vm_page_t -pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp) +pmap_allocpte(pmap_t pmap, vm_offset_t va, vm_page_t *pv_lock_mp) { vm_pindex_t ptepindex; pd_entry_t *pd, PG_V; @@ -3991,7 +4025,7 @@ * normal 4K page. */ if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { - if (!pmap_demote_pde_locked(pmap, pd, va, lockp)) { + if (!pmap_demote_pde_locked(pmap, pd, va, pv_lock_mp)) { /* * Invalidation of the 2MB page mapping may have caused * the deallocation of the underlying PD page. @@ -4012,8 +4046,8 @@ * Here if the pte page isn't mapped, or if it has been * deallocated. */ - m = _pmap_allocpte(pmap, ptepindex, lockp); - if (m == NULL && lockp != NULL) + m = _pmap_allocpte(pmap, ptepindex, pv_lock_mp); + if (m == NULL && pv_lock_mp != NULL) goto retry; } return (m); @@ -4214,25 +4248,6 @@ * page management routines. 
***************************************************/ -CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); -CTASSERT(_NPCM == 3); -CTASSERT(_NPCPV == 168); - -static __inline struct pv_chunk * -pv_to_chunk(pv_entry_t pv) -{ - - return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK)); -} - -#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) - -#define PC_FREE0 0xfffffffffffffffful -#define PC_FREE1 0xfffffffffffffffful -#define PC_FREE2 0x000000fffffffffful - -static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; - #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; @@ -4258,512 +4273,94 @@ "Current number of spare pv entries"); #endif -static void -reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di) -{ - - if (pmap == NULL) - return; - pmap_invalidate_all(pmap); - if (pmap != locked_pmap) - PMAP_UNLOCK(pmap); - if (start_di) - pmap_delayed_invl_finish(); -} - /* - * We are in a serious low memory condition. Resort to - * drastic measures to free some pages so we can allocate - * another pv entry chunk. - * - * Returns NULL if PV entries were reclaimed from the specified pmap. - * - * We do not, however, unmap 2mpages because subsequent accesses will - * allocate per-page pv entries until repromotion occurs, thereby - * exacerbating the shortage of free pv entries. + * Returns a new PV entry, allocating a new PV chunk from the system + * when needed. 
*/ -static vm_page_t -reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain) +static pv_entry_t +get_pv_entry(pmap_t pmap, int flag) { - struct pv_chunks_list *pvc; - struct pv_chunk *pc, *pc_marker, *pc_marker_end; - struct pv_chunk_header pc_marker_b, pc_marker_end_b; - struct md_page *pvh; - pd_entry_t *pde; - pmap_t next_pmap, pmap; - pt_entry_t *pte, tpte; - pt_entry_t PG_G, PG_A, PG_M, PG_RW; pv_entry_t pv; - vm_offset_t va; - vm_page_t m, m_pc; - struct spglist free; - uint64_t inuse; - int bit, field, freed; - bool start_di, restart; - - PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); - KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL")); - pmap = NULL; - m_pc = NULL; - PG_G = PG_A = PG_M = PG_RW = 0; - SLIST_INIT(&free); - bzero(&pc_marker_b, sizeof(pc_marker_b)); - bzero(&pc_marker_end_b, sizeof(pc_marker_end_b)); - pc_marker = (struct pv_chunk *)&pc_marker_b; - pc_marker_end = (struct pv_chunk *)&pc_marker_end_b; - - /* - * A delayed invalidation block should already be active if - * pmap_advise() or pmap_remove() called this function by way - * of pmap_demote_pde_locked(). - */ - start_di = pmap_not_in_di(); - - pvc = &pv_chunks[domain]; - mtx_lock(&pvc->pvc_lock); - pvc->active_reclaims++; - TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru); - TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru); - while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end && - SLIST_EMPTY(&free)) { - next_pmap = pc->pc_pmap; - if (next_pmap == NULL) { - /* - * The next chunk is a marker. However, it is - * not our marker, so active_reclaims must be - * > 1. Consequently, the next_chunk code - * will not rotate the pv_chunks list. - */ - goto next_chunk; - } - mtx_unlock(&pvc->pvc_lock); - - /* - * A pv_chunk can only be removed from the pc_lru list - * when both pc_chunks_mutex is owned and the - * corresponding pmap is locked. 
- */ - if (pmap != next_pmap) { - restart = false; - reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, - start_di); - pmap = next_pmap; - /* Avoid deadlock and lock recursion. */ - if (pmap > locked_pmap) { - RELEASE_PV_LIST_LOCK(lockp); - PMAP_LOCK(pmap); - if (start_di) - pmap_delayed_invl_start(); - mtx_lock(&pvc->pvc_lock); - restart = true; - } else if (pmap != locked_pmap) { - if (PMAP_TRYLOCK(pmap)) { - if (start_di) - pmap_delayed_invl_start(); - mtx_lock(&pvc->pvc_lock); - restart = true; - } else { - pmap = NULL; /* pmap is not locked */ - mtx_lock(&pvc->pvc_lock); - pc = TAILQ_NEXT(pc_marker, pc_lru); - if (pc == NULL || - pc->pc_pmap != next_pmap) - continue; - goto next_chunk; - } - } else if (start_di) - pmap_delayed_invl_start(); - PG_G = pmap_global_bit(pmap); - PG_A = pmap_accessed_bit(pmap); - PG_M = pmap_modified_bit(pmap); - PG_RW = pmap_rw_bit(pmap); - if (restart) - continue; - } - - /* - * Destroy every non-wired, 4 KB page mapping in the chunk. - */ - freed = 0; - for (field = 0; field < _NPCM; field++) { - for (inuse = ~pc->pc_map[field] & pc_freemask[field]; - inuse != 0; inuse &= ~(1UL << bit)) { - bit = bsfq(inuse); - pv = &pc->pc_pventry[field * 64 + bit]; - va = pv->pv_va; - pde = pmap_pde(pmap, va); - if ((*pde & PG_PS) != 0) - continue; - pte = pmap_pde_to_pte(pde, va); - if ((*pte & PG_W) != 0) - continue; - tpte = pte_load_clear(pte); - if ((tpte & PG_G) != 0) - pmap_invalidate_page(pmap, va); - m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); - if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) - vm_page_dirty(m); - if ((tpte & PG_A) != 0) - vm_page_aflag_set(m, PGA_REFERENCED); - CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - m->md.pv_gen++; - if (TAILQ_EMPTY(&m->md.pv_list) && - (m->flags & PG_FICTITIOUS) == 0) { - pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - if (TAILQ_EMPTY(&pvh->pv_list)) { - vm_page_aflag_clear(m, - PGA_WRITEABLE); - } - } - pmap_delayed_invl_page(m); - pc->pc_map[field] |= 1UL << bit; - 
pmap_unuse_pt(pmap, va, *pde, &free); - freed++; - } - } - if (freed == 0) { - mtx_lock(&pvc->pvc_lock); - goto next_chunk; - } - /* Every freed mapping is for a 4 KB page. */ - pmap_resident_count_dec(pmap, freed); - PV_STAT(atomic_add_long(&pv_entry_frees, freed)); - PV_STAT(atomic_add_int(&pv_entry_spare, freed)); - PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 && - pc->pc_map[2] == PC_FREE2) { - PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); - PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); - PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); - /* Entire chunk is free; return it. */ - m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); - dump_drop_page(m_pc->phys_addr); - mtx_lock(&pvc->pvc_lock); - TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); - break; - } - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - mtx_lock(&pvc->pvc_lock); - /* One freed pv entry in locked_pmap is sufficient. */ - if (pmap == locked_pmap) - break; -next_chunk: - TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); - TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru); - if (pvc->active_reclaims == 1 && pmap != NULL) { - /* - * Rotate the pv chunks list so that we do not - * scan the same pv chunks that could not be - * freed (because they contained a wired - * and/or superpage mapping) on every - * invocation of reclaim_pv_chunk(). 
- */ - while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) { - MPASS(pc->pc_pmap != NULL); - TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); - TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); - } - } - } - TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru); - TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru); - pvc->active_reclaims--; - mtx_unlock(&pvc->pvc_lock); - reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di); - if (m_pc == NULL && !SLIST_EMPTY(&free)) { - m_pc = SLIST_FIRST(&free); - SLIST_REMOVE_HEAD(&free, plinks.s.ss); - /* Recycle a freed page table page. */ - m_pc->ref_count = 1; - } - vm_page_free_pages_toq(&free, true); - return (m_pc); -} - -static vm_page_t -reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp) -{ - vm_page_t m; - int i, domain; - - domain = PCPU_GET(domain); - for (i = 0; i < vm_ndomains; i++) { - m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain); - if (m != NULL) - break; - domain = (domain + 1) % vm_ndomains; - } - - return (m); -} - -/* - * free the pv_entry back to the free list - */ -static void -free_pv_entry(pmap_t pmap, pv_entry_t pv) -{ - struct pv_chunk *pc; - int idx, field, bit; - - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - PV_STAT(atomic_add_long(&pv_entry_frees, 1)); - PV_STAT(atomic_add_int(&pv_entry_spare, 1)); - PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); - pc = pv_to_chunk(pv); - idx = pv - &pc->pc_pventry[0]; - field = idx / 64; - bit = idx % 64; - pc->pc_map[field] |= 1ul << bit; - if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || - pc->pc_map[2] != PC_FREE2) { - /* 98% of the time, pc is already at the head of the list. 
*/ - if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) { - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - } - return; - } - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - free_pv_chunk(pc); -} - -static void -free_pv_chunk_dequeued(struct pv_chunk *pc) -{ - vm_page_t m; - - PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV)); - PV_STAT(atomic_subtract_int(&pc_chunk_count, 1)); - PV_STAT(atomic_add_int(&pc_chunk_frees, 1)); - /* entire chunk is free, return it */ - m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); - dump_drop_page(m->phys_addr); - vm_page_unwire_noq(m); - vm_page_free(m); -} - -static void -free_pv_chunk(struct pv_chunk *pc) -{ - struct pv_chunks_list *pvc; - - pvc = &pv_chunks[pc_to_domain(pc)]; - mtx_lock(&pvc->pvc_lock); - TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); - mtx_unlock(&pvc->pvc_lock); - free_pv_chunk_dequeued(pc); -} + int mflag; -static void -free_pv_chunk_batch(struct pv_chunklist *batch) -{ - struct pv_chunks_list *pvc; - struct pv_chunk *pc, *npc; - int i; + MPASS((flag & ~(PMAP_GPV_NOWAIT | PMAP_GPV_RESERV)) == 0); - for (i = 0; i < vm_ndomains; i++) { - if (TAILQ_EMPTY(&batch[i])) - continue; - pvc = &pv_chunks[i]; - mtx_lock(&pvc->pvc_lock); - TAILQ_FOREACH(pc, &batch[i], pc_list) { - TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru); - } - mtx_unlock(&pvc->pvc_lock); + if ((flag & PMAP_GPV_RESERV) != 0) { + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + MPASS(!TAILQ_EMPTY(&pmap->pm_pv_spares)); + MPASS(curthread->td_md.md_pmap_reserv > 0); + pv = TAILQ_FIRST(&pmap->pm_pv_spares); + TAILQ_REMOVE(&pmap->pm_pv_spares, pv, pv_next); + curthread->td_md.md_pmap_reserv -= 1; + return (pv); } - for (i = 0; i < vm_ndomains; i++) { - TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) { - free_pv_chunk_dequeued(pc); - } + if ((flag & PMAP_GPV_NOWAIT) != 0) { + PMAP_LOCK_ASSERT(pmap, MA_OWNED); + mflag = M_NOWAIT; + } else { + mflag = M_WAITOK; } + pv = uma_zalloc(pmap_pv_zone, mflag); + 
if (pv != NULL) { + pv->pv_pmap = pmap; + PV_STAT(atomic_add_long(&pv_entry_count, 1)); + } + return (pv); } /* - * Returns a new PV entry, allocating a new PV chunk from the system when - * needed. If this PV chunk allocation fails and a PV list lock pointer was - * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is - * returned. - * - * The given PV list lock may be released. + * Ensure that the number of spare PV entries in the specified pmap meets or + * exceeds the given count, "needed". */ -static pv_entry_t -get_pv_entry(pmap_t pmap, struct rwlock **lockp) +static bool +reserve_pv_entries(pmap_t pmap, int needed, bool locked) { - struct pv_chunks_list *pvc; - int bit, field; + TAILQ_HEAD(, pv_entry) pvs; pv_entry_t pv; - struct pv_chunk *pc; - vm_page_t m; + int i, j; - PMAP_LOCK_ASSERT(pmap, MA_OWNED); - PV_STAT(atomic_add_long(&pv_entry_allocs, 1)); -retry: - pc = TAILQ_FIRST(&pmap->pm_pvchunk); - if (pc != NULL) { - for (field = 0; field < _NPCM; field++) { - if (pc->pc_map[field]) { - bit = bsfq(pc->pc_map[field]); - break; - } - } - if (field < _NPCM) { - pv = &pc->pc_pventry[field * 64 + bit]; - pc->pc_map[field] &= ~(1ul << bit); - /* If this was the last item, move it to tail */ - if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && - pc->pc_map[2] == 0) { - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, - pc_list); + MPASS(pmap == PCPU_GET(curpmap)); + TAILQ_INIT(&pvs); + for (i = 0; i < needed; i++) { + pv = uma_zalloc(pmap_pv_zone, locked ? 
M_NOWAIT : M_WAITOK); + if (pv == NULL) { + for (j = 0; j < i; j++) { + pv = TAILQ_FIRST(&pvs); + TAILQ_REMOVE(&pvs, pv, pv_next); + uma_zfree(pmap_pv_zone, pv); } - PV_STAT(atomic_add_long(&pv_entry_count, 1)); - PV_STAT(atomic_subtract_int(&pv_entry_spare, 1)); - return (pv); - } - } - /* No free items, allocate another chunk */ - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); - if (m == NULL) { - if (lockp == NULL) { - PV_STAT(pc_chunk_tryfail++); - return (NULL); + return (false); } - m = reclaim_pv_chunk(pmap, lockp); - if (m == NULL) - goto retry; + pv->pv_pmap = pmap; + TAILQ_INSERT_TAIL(&pvs, pv, pv_next); } - PV_STAT(atomic_add_int(&pc_chunk_count, 1)); - PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); - dump_add_page(m->phys_addr); - pc = (void *)PHYS_TO_DMAP(m->phys_addr); - pc->pc_pmap = pmap; - pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ - pc->pc_map[1] = PC_FREE1; - pc->pc_map[2] = PC_FREE2; - pvc = &pv_chunks[_vm_phys_domain(m->phys_addr)]; - mtx_lock(&pvc->pvc_lock); - TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru); - mtx_unlock(&pvc->pvc_lock); - pv = &pc->pc_pventry[0]; - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - PV_STAT(atomic_add_long(&pv_entry_count, 1)); - PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1)); - return (pv); + if (!locked) + PMAP_LOCK(pmap); + TAILQ_CONCAT(&pmap->pm_pv_spares, &pvs, pv_next); + if (!locked) + PMAP_UNLOCK(pmap); + curthread->td_md.md_pmap_reserv += needed; + return (true); } -/* - * Returns the number of one bits within the given PV chunk map. - * - * The erratas for Intel processors state that "POPCNT Instruction May - * Take Longer to Execute Than Expected". It is believed that the - * issue is the spurious dependency on the destination register. - * Provide a hint to the register rename logic that the destination - * value is overwritten, by clearing it, as suggested in the - * optimization manual. 
It should be cheap for unaffected processors - * as well. - * - * Reference numbers for erratas are - * 4th Gen Core: HSD146 - * 5th Gen Core: BDM85 - * 6th Gen Core: SKL029 - */ -static int -popcnt_pc_map_pq(uint64_t *map) +static void +free_pv_entry(pmap_t pmap, struct pv_entry *pv) { - u_long result, tmp; - - __asm __volatile("xorl %k0,%k0;popcntq %2,%0;" - "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;" - "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0" - : "=&r" (result), "=&r" (tmp) - : "m" (map[0]), "m" (map[1]), "m" (map[2])); - return (result); + uma_zfree(pmap_pv_zone, pv); + PV_STAT(atomic_add_long(&pv_entry_count, -1)); } -/* - * Ensure that the number of spare PV entries in the specified pmap meets or - * exceeds the given count, "needed". - * - * The given PV list lock may be released. - */ static void -reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp) +unreserve_pv_entries(pmap_t pmap) { - struct pv_chunks_list *pvc; - struct pch new_tail[PMAP_MEMDOM]; - struct pv_chunk *pc; - vm_page_t m; - int avail, free, i; - bool reclaimed; + struct pv_entry *pv; + MPASS(pmap == PCPU_GET(curpmap)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); - KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL")); - - /* - * Newly allocated PV chunks must be stored in a private list until - * the required number of PV chunks have been allocated. Otherwise, - * reclaim_pv_chunk() could recycle one of these chunks. In - * contrast, these chunks must be added to the pmap upon allocation. 
- */ - for (i = 0; i < PMAP_MEMDOM; i++) - TAILQ_INIT(&new_tail[i]); -retry: - avail = 0; - TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) { -#ifndef __POPCNT__ - if ((cpu_feature2 & CPUID2_POPCNT) == 0) - bit_count((bitstr_t *)pc->pc_map, 0, - sizeof(pc->pc_map) * NBBY, &free); - else -#endif - free = popcnt_pc_map_pq(pc->pc_map); - if (free == 0) - break; - avail += free; - if (avail >= needed) - break; - } - for (reclaimed = false; avail < needed; avail += _NPCPV) { - m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED); - if (m == NULL) { - m = reclaim_pv_chunk(pmap, lockp); - if (m == NULL) - goto retry; - reclaimed = true; - } - PV_STAT(atomic_add_int(&pc_chunk_count, 1)); - PV_STAT(atomic_add_int(&pc_chunk_allocs, 1)); - dump_add_page(m->phys_addr); - pc = (void *)PHYS_TO_DMAP(m->phys_addr); - pc->pc_pmap = pmap; - pc->pc_map[0] = PC_FREE0; - pc->pc_map[1] = PC_FREE1; - pc->pc_map[2] = PC_FREE2; - TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&new_tail[pc_to_domain(pc)], pc, pc_lru); - PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV)); - - /* - * The reclaim might have freed a chunk from the current pmap. - * If that chunk contained available entries, we need to - * re-count the number of available entries. 
- */ - if (reclaimed) - goto retry; - } - for (i = 0; i < vm_ndomains; i++) { - if (TAILQ_EMPTY(&new_tail[i])) - continue; - pvc = &pv_chunks[i]; - mtx_lock(&pvc->pvc_lock); - TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru); - mtx_unlock(&pvc->pvc_lock); + while ((curthread->td_md.md_pmap_reserv)-- > 0) { + pv = TAILQ_FIRST(&pmap->pm_pv_spares); + TAILQ_REMOVE(&pmap->pm_pv_spares, pv, pv_next); + free_pv_entry(pmap, pv); } } @@ -4795,65 +4392,47 @@ */ static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, - struct rwlock **lockp) + vm_page_t *pv_lock_mp) { struct md_page *pvh; - struct pv_chunk *pc; pv_entry_t pv; vm_offset_t va_last; vm_page_t m; - int bit, field; PMAP_LOCK_ASSERT(pmap, MA_OWNED); KASSERT((pa & PDRMASK) == 0, ("pmap_pv_demote_pde: pa is not 2mpage aligned")); - CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); + + pvh = pa_to_pvh(pa); + va = trunc_2mpage(va); + va_last = va + NBPDR; + m = PHYS_TO_VM_PAGE(pa); + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_pv_demote_pde: page %p is not managed", m)); + change_pv_list_lock_to_phys(pv_lock_mp, pa); /* * Transfer the 2mpage's pv entry for this mapping to the first * page's pv list. Once this transfer begins, the pv list lock * must not be released until the last pv entry is reinstantiated. */ - pvh = pa_to_pvh(pa); - va = trunc_2mpage(va); pv = pmap_pvh_remove(pvh, pmap, va); KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); - m = PHYS_TO_VM_PAGE(pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; - /* Instantiate the remaining NPTEPG - 1 pv entries. 
*/ - PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1)); - va_last = va + NBPDR - PAGE_SIZE; - for (;;) { - pc = TAILQ_FIRST(&pmap->pm_pvchunk); - KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 || - pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare")); - for (field = 0; field < _NPCM; field++) { - while (pc->pc_map[field]) { - bit = bsfq(pc->pc_map[field]); - pc->pc_map[field] &= ~(1ul << bit); - pv = &pc->pc_pventry[field * 64 + bit]; - va += PAGE_SIZE; - pv->pv_va = va; - m++; - KASSERT((m->oflags & VPO_UNMANAGED) == 0, - ("pmap_pv_demote_pde: page %p is not managed", m)); - TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); - m->md.pv_gen++; - if (va == va_last) - goto out; - } - } - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); - } -out: - if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); + va += PAGE_SIZE; + m++; + + for (; va < va_last; m++, va += PAGE_SIZE) { + KASSERT((m->oflags & VPO_UNMANAGED) == 0, + ("pmap_pv_demote_pde: page %p is not managed", m)); + pv = get_pv_entry(pmap, PMAP_GPV_RESERV); + KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found")); + pv->pv_va = va; + TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; } PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1)); - PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1)); } #if VM_NRESERVLEVEL > 0 @@ -4864,7 +4443,7 @@ */ static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa, - struct rwlock **lockp) + vm_page_t *pv_lock_mp) { struct md_page *pvh; pv_entry_t pv; @@ -4873,28 +4452,33 @@ KASSERT((pa & PDRMASK) == 0, ("pmap_pv_promote_pde: pa is not 2mpage aligned")); - CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); /* - * Transfer the first page's pv entry for this mapping to the 2mpage's - * pv list. 
Aside from avoiding the cost of a call to get_pv_entry(), - * a transfer avoids the possibility that get_pv_entry() calls - * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the - * mappings that is being promoted. + * Transfer the first page's pv entry for this mapping to the + * 2mpage's pv list. This avoids potentially failing call to + * get_pv_entry(). */ m = PHYS_TO_VM_PAGE(pa); + change_pv_list_lock_to_vm_page(pv_lock_mp, m); va = trunc_2mpage(va); pv = pmap_pvh_remove(&m->md, pmap, va); KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found")); pvh = pa_to_pvh(pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; - /* Free the remaining NPTEPG - 1 pv entries. */ + /* Free or stash the remaining NPTEPG - 1 pv entries. */ va_last = va + NBPDR - PAGE_SIZE; do { m++; va += PAGE_SIZE; - pmap_pvh_free(&m->md, pmap, va); + pv = pmap_pvh_remove(&m->md, pmap, va); + MPASS(pv != NULL); + if (va >= VM_MAXUSER_ADDRESS) { + MPASS(pmap == kernel_pmap); + TAILQ_INSERT_TAIL(&pmap->pm_pv_spares, pv, pv_next); + } else { + free_pv_entry(pmap, pv); + } } while (va < va_last); } #endif /* VM_NRESERVLEVEL > 0 */ @@ -4920,15 +4504,14 @@ */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m, - struct rwlock **lockp) + vm_page_t *pv_lock_mp) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - /* Pass NULL instead of the lock pointer to disable reclamation. */ - if ((pv = get_pv_entry(pmap, NULL)) != NULL) { + if ((pv = get_pv_entry(pmap, PMAP_GPV_NOWAIT)) != NULL) { pv->pv_va = va; - CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + change_pv_list_lock_to_vm_page(pv_lock_mp, m); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; return (TRUE); @@ -4937,27 +4520,25 @@ } /* - * Create the PV entry for a 2MB page mapping. Always returns true unless the - * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns - * false if the PV entry cannot be allocated without resorting to reclamation. 
+ * Create the PV entry for a 2MB page mapping. Returns false if the + * PV entry cannot be allocated. */ static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags, - struct rwlock **lockp) + vm_page_t *pv_lock_mp) { struct md_page *pvh; pv_entry_t pv; vm_paddr_t pa; PMAP_LOCK_ASSERT(pmap, MA_OWNED); - /* Pass NULL instead of the lock pointer to disable reclamation. */ - if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ? - NULL : lockp)) == NULL) + pv = get_pv_entry(pmap, PMAP_GPV_NOWAIT); + if (pv == NULL) return (false); pv->pv_va = va; pa = pde & PG_PS_FRAME; - CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa); pvh = pa_to_pvh(pa); + change_pv_list_lock_to_phys(pv_lock_mp, pa); TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next); pvh->pv_gen++; return (true); @@ -4984,13 +4565,12 @@ static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va) { - struct rwlock *lock; + vm_page_t pv_lock_m; boolean_t rv; - lock = NULL; - rv = pmap_demote_pde_locked(pmap, pde, va, &lock); - if (lock != NULL) - rw_wunlock(lock); + pv_lock_m = NULL; + rv = pmap_demote_pde_locked(pmap, pde, va, &pv_lock_m); + release_pv_list_lock(&pv_lock_m); return (rv); } @@ -5023,14 +4603,14 @@ static void pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, - pd_entry_t oldpde, struct rwlock **lockp) + pd_entry_t oldpde, vm_page_t *pv_lock_mp) { struct spglist free; vm_offset_t sva; SLIST_INIT(&free); sva = trunc_2mpage(va); - pmap_remove_pde(pmap, pde, sva, &free, lockp); + pmap_remove_pde(pmap, pde, sva, &free, pv_lock_mp); if ((oldpde & pmap_global_bit(pmap)) == 0) pmap_invalidate_pde_page(pmap, sva, oldpde); vm_page_free_pages_toq(&free, true); @@ -5040,7 +4620,7 @@ static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, - struct rwlock **lockp) + vm_page_t *pv_lock_mp) { pd_entry_t newpde, oldpde; pt_entry_t *firstpte, newpte; @@ -5071,7 +4651,7 @@ if ((oldpde & PG_A) == 0) { KASSERT((oldpde & PG_W) 
== 0, ("pmap_demote_pde: a wired mapping is missing PG_A")); - pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); + pmap_demote_pde_abort(pmap, va, pde, oldpde, pv_lock_mp); return (FALSE); } @@ -5109,7 +4689,8 @@ * invalidate the 2MB page mapping and return "failure". */ if (mpte == NULL) { - pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp); + pmap_demote_pde_abort(pmap, va, pde, oldpde, + pv_lock_mp); return (FALSE); } @@ -5144,14 +4725,32 @@ /* * The spare PV entries must be reserved prior to demoting the - * mapping, that is, prior to changing the PDE. Otherwise, the state - * of the PDE and the PV lists will be inconsistent, which can result - * in reclaim_pv_chunk() attempting to remove a PV entry from the - * wrong PV list and pmap_pv_demote_pde() failing to find the expected - * PV entry for the 2MB page mapping that is being demoted. + * mapping, that is, prior to changing the PDE. Otherwise, + * the state of the PDE and the PV lists will be inconsistent, + * and current pmap_pv_demote_pde() cannot handle failure to + * get PV entry. */ - if ((oldpde & PG_MANAGED) != 0) - reserve_pv_entries(pmap, NPTEPG - 1, lockp); + if ((oldpde & PG_MANAGED) != 0) { + if (in_kernel) { +#ifdef INVARIANTS + struct pv_entry *pv1; + int i; + + for (i = 1, pv1 = TAILQ_FIRST(&pmap->pm_pv_spares); + i < NPTEPG; i++, pv1 = TAILQ_NEXT(pv1, pv_next)) { + MPASS(pv1 != NULL); + } +#endif + curthread->td_md.md_pmap_reserv += NPTEPG - 1; + } else if (!reserve_pv_entries(pmap, NPTEPG - 1, true)) { + pmap_demote_pde_abort(pmap, va, pde, oldpde, + pv_lock_mp); + vm_page_free(mpte); + vm_wire_sub(1); + pmap_resident_count_dec(pmap, 1); + return (FALSE); + } + } /* * Demote the mapping. This pmap is locked. The old PDE has @@ -5175,7 +4774,7 @@ * Demote the PV entry. 
*/ if ((oldpde & PG_MANAGED) != 0) - pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp); + pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, pv_lock_mp); atomic_add_long(&pmap_pde_demotions, 1); CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p", @@ -5228,7 +4827,7 @@ */ static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva, - struct spglist *free, struct rwlock **lockp) + struct spglist *free, vm_page_t *pv_lock_mp) { struct md_page *pvh; pd_entry_t oldpde; @@ -5251,7 +4850,7 @@ pmap_invalidate_pde_page(kernel_pmap, sva, oldpde); pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); if (oldpde & PG_MANAGED) { - CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME); + change_pv_list_lock_to_phys(pv_lock_mp, oldpde & PG_PS_FRAME); pvh = pa_to_pvh(oldpde & PG_PS_FRAME); pmap_pvh_free(pvh, pmap, sva); eva = sva + NBPDR; @@ -5289,7 +4888,7 @@ */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, - pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp) + pd_entry_t ptepde, struct spglist *free, vm_page_t *pv_lock_mp) { struct md_page *pvh; pt_entry_t oldpte, PG_A, PG_M, PG_RW; @@ -5310,7 +4909,7 @@ vm_page_dirty(m); if (oldpte & PG_A) vm_page_aflag_set(m, PGA_REFERENCED); - CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m); + change_pv_list_lock_to_vm_page(pv_lock_mp, m); pmap_pvh_free(&m->md, pmap, va); if (TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { @@ -5330,7 +4929,7 @@ pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, struct spglist *free) { - struct rwlock *lock; + vm_page_t pv_lock_m; pt_entry_t *pte, PG_V; PG_V = pmap_valid_bit(pmap); @@ -5340,10 +4939,9 @@ pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; - lock = NULL; - pmap_remove_pte(pmap, pte, va, *pde, free, &lock); - if (lock != NULL) - rw_wunlock(lock); + pv_lock_m = NULL; + pmap_remove_pte(pmap, pte, va, *pde, free, &pv_lock_m); + release_pv_list_lock(&pv_lock_m); pmap_invalidate_page(pmap, 
va); } @@ -5352,7 +4950,7 @@ */ static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, - pd_entry_t *pde, struct spglist *free, struct rwlock **lockp) + pd_entry_t *pde, struct spglist *free, vm_page_t *pv_lock_mp) { pt_entry_t PG_G, *pte; vm_offset_t va; @@ -5375,7 +4973,7 @@ anyvalid = true; else if (va == eva) va = sva; - if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) { + if (pmap_remove_pte(pmap, pte, sva, *pde, free, pv_lock_mp)) { sva += PAGE_SIZE; break; } @@ -5394,7 +4992,7 @@ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { - struct rwlock *lock; + vm_page_t pv_lock_m; vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; @@ -5432,7 +5030,7 @@ } } - lock = NULL; + pv_lock_m = NULL; for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) @@ -5485,10 +5083,11 @@ */ if ((ptpaddr & PG_G) == 0) anyvalid = 1; - pmap_remove_pde(pmap, pde, sva, &free, &lock); + pmap_remove_pde(pmap, pde, sva, &free, + &pv_lock_m); continue; } else if (!pmap_demote_pde_locked(pmap, pde, sva, - &lock)) { + &pv_lock_m)) { /* The large page mapping was destroyed. */ continue; } else @@ -5503,11 +5102,11 @@ if (va_next > eva) va_next = eva; - if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock)) + if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, + &pv_lock_m)) anyvalid = 1; } - if (lock != NULL) - rw_wunlock(lock); + release_pv_list_lock(&pv_lock_m); out: if (anyvalid) pmap_invalidate_all(pmap); @@ -5535,7 +5134,7 @@ struct md_page *pvh; pv_entry_t pv; pmap_t pmap; - struct rwlock *lock; + vm_page_t pv_lock_m; pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW; pd_entry_t *pde; vm_offset_t va; @@ -5545,27 +5144,27 @@ KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_remove_all: page %p is not managed", m)); SLIST_INIT(&free); - lock = VM_PAGE_TO_PV_LIST_LOCK(m); + pv_lock_m = m; pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry: - rw_wlock(lock); + pmap_pv_list_lock(m); while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(m); if (pvh_gen != pvh->pv_gen) { - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_UNLOCK(pmap); goto retry; } } va = pv->pv_va; pde = pmap_pde(pmap, va); - (void)pmap_demote_pde_locked(pmap, pde, va, &lock); + (void)pmap_demote_pde_locked(pmap, pde, va, &pv_lock_m); PMAP_UNLOCK(pmap); } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { @@ -5573,11 +5172,11 @@ if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(m); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_UNLOCK(pmap); goto retry; } @@ -5609,7 +5208,7 @@ PMAP_UNLOCK(pmap); } vm_page_aflag_clear(m, PGA_WRITEABLE); - rw_wunlock(lock); + pmap_pv_list_unlock(m); pmap_delayed_invl_wait(m); vm_page_free_pages_toq(&free, true); } @@ -5826,7 +5425,7 @@ */ static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, - struct rwlock **lockp) + vm_page_t *pv_lock_mp) { pd_entry_t newpde; pt_entry_t *firstpte, oldpte, pa, *pte; @@ -5928,8 +5527,10 @@ /* * Promote the pv entries. */ - if ((newpde & PG_MANAGED) != 0) - pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp); + if ((newpde & PG_MANAGED) != 0) { + pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, + pv_lock_mp); + } /* * Propagate the PAT index to its proper position. @@ -5960,17 +5561,18 @@ * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually - * insert this page into the given map NOW. + * insert this page into the given map NOW, unless PMAP_ENTER_NOPVSLEEP + * flag is specified. 
* * When destroying both a page table and PV entry, this function - * performs the TLB invalidation before releasing the PV list - * lock, so we do not need pmap_delayed_invl_page() calls here. + * releases PV list lock before performing the TLB invalidation, + * so we need pmap_delayed_invl_page() calls here. */ int pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, u_int flags, int8_t psind) { - struct rwlock *lock; + vm_page_t pv_lock_m; pd_entry_t *pde; pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V; pt_entry_t newpte, origpte; @@ -6027,13 +5629,14 @@ } else newpte |= PG_MANAGED; - lock = NULL; + pv_lock_m = NULL; PMAP_LOCK(pmap); if (psind == 1) { /* Assert the required virtual and physical alignment. */ KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned")); KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind")); - rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock); + rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, + &pv_lock_m); goto out; } mpte = NULL; @@ -6045,7 +5648,7 @@ retry: pde = pmap_pde(pmap, va); if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 || - pmap_demote_pde_locked(pmap, pde, va, &lock))) { + pmap_demote_pde_locked(pmap, pde, va, &pv_lock_m))) { pte = pmap_pde_to_pte(pde, va); if (va < VM_MAXUSER_ADDRESS && mpte == NULL) { mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME); @@ -6058,7 +5661,7 @@ */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), - nosleep ? NULL : &lock); + nosleep ? 
NULL : &pv_lock_m); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; @@ -6140,17 +5743,18 @@ pmap_invalidate_page(pmap, va); vm_page_aflag_set(om, PGA_REFERENCED); } - CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa); + change_pv_list_lock_to_phys(&pv_lock_m, opa); pv = pmap_pvh_remove(&om->md, pmap, va); KASSERT(pv != NULL, ("pmap_enter: no PV entry for %#lx", va)); - if ((newpte & PG_MANAGED) == 0) - free_pv_entry(pmap, pv); if ((om->a.flags & PGA_WRITEABLE) != 0 && TAILQ_EMPTY(&om->md.pv_list) && ((om->flags & PG_FICTITIOUS) != 0 || TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list))) vm_page_aflag_clear(om, PGA_WRITEABLE); + release_pv_list_lock(&pv_lock_m); + if ((newpte & PG_MANAGED) == 0) + free_pv_entry(pmap, pv); } else { /* * Since this mapping is unmanaged, assume that PG_A @@ -6173,14 +5777,20 @@ */ if ((newpte & PG_MANAGED) != 0) { if (pv == NULL) { - pv = get_pv_entry(pmap, &lock); + pv = get_pv_entry(pmap, PMAP_GPV_NOWAIT); + if (pv == NULL) { + MPASS((flags & PMAP_ENTER_NOPVSLEEP) != 0); + rv = KERN_RESOURCE_SHORTAGE; + goto out; + } pv->pv_va = va; } - CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa); + change_pv_list_lock_to_phys(&pv_lock_m, pa); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next); m->md.pv_gen++; if ((newpte & PG_RW) != 0) vm_page_aflag_set(m, PGA_WRITEABLE); + release_pv_list_lock(&pv_lock_m); } /* @@ -6223,13 +5833,12 @@ pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) - pmap_promote_pde(pmap, pde, va, &lock); + pmap_promote_pde(pmap, pde, va, &pv_lock_m); #endif rv = KERN_SUCCESS; out: - if (lock != NULL) - rw_wunlock(lock); + release_pv_list_lock(&pv_lock_m); PMAP_UNLOCK(pmap); return (rv); } @@ -6243,7 +5852,7 @@ */ static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - struct rwlock **lockp) + vm_page_t *pv_lock_mp) { pd_entry_t newpde; pt_entry_t PG_V; @@ -6259,8 +5868,7 @@ if (va < VM_MAXUSER_ADDRESS) newpde |= PG_U; return (pmap_enter_pde(pmap, va, newpde, 
PMAP_ENTER_NOSLEEP | - PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) == - KERN_SUCCESS); + PMAP_ENTER_NOREPLACE, NULL, pv_lock_mp) == KERN_SUCCESS); } /* @@ -6287,14 +5895,13 @@ * otherwise. Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and * a mapping already exists at the specified virtual address. Returns * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table - * page allocation failed. Returns KERN_RESOURCE_SHORTAGE if - * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed. + * page allocation failed, or if PV entry allocation failed. * * The parameter "m" is only used when creating a managed, writeable mapping. */ static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags, - vm_page_t m, struct rwlock **lockp) + vm_page_t m, vm_page_t *pv_lock_mp) { struct spglist free; pd_entry_t oldpde, *pde; @@ -6317,7 +5924,7 @@ return (KERN_FAILURE); } if ((pde = pmap_alloc_pde(pmap, va, &pdpg, (flags & - PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) { + PMAP_ENTER_NOSLEEP) != 0 ? NULL : pv_lock_mp)) == NULL) { CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" " in pmap %p", va, pmap); return (KERN_RESOURCE_SHORTAGE); @@ -6362,13 +5969,14 @@ * However, if the PDE resulted from a promotion, then * a reserved PT page could be freed. */ - (void)pmap_remove_pde(pmap, pde, va, &free, lockp); + (void)pmap_remove_pde(pmap, pde, va, &free, + pv_lock_mp); if ((oldpde & PG_G) == 0) pmap_invalidate_pde_page(pmap, va, oldpde); } else { pmap_delayed_invl_start(); if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free, - lockp)) + pv_lock_mp)) pmap_invalidate_all(pmap); pmap_delayed_invl_finish(); } @@ -6394,7 +6002,7 @@ /* * Abort this mapping if its PV entry could not be created. 
*/ - if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) { + if (!pmap_pv_insert_pde(pmap, va, newpde, flags, pv_lock_mp)) { if (pdpg != NULL) pmap_abort_ptp(pmap, va, pdpg); CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx" @@ -6442,7 +6050,7 @@ pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { - struct rwlock *lock; + vm_page_t pv_lock_m; vm_offset_t va; vm_page_t m, mpte; vm_pindex_t diff, psize; @@ -6452,22 +6060,21 @@ psize = atop(end - start); mpte = NULL; m = m_start; - lock = NULL; + pv_lock_m = NULL; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { va = start + ptoa(diff); if ((va & PDRMASK) == 0 && va + NBPDR <= end && m->psind == 1 && pmap_ps_enabled(pmap) && pmap_allow_2m_x_page(pmap, (prot & VM_PROT_EXECUTE) != 0) && - pmap_enter_2mpage(pmap, va, m, prot, &lock)) + pmap_enter_2mpage(pmap, va, m, prot, &pv_lock_m)) m = &m[NBPDR / PAGE_SIZE - 1]; else mpte = pmap_enter_quick_locked(pmap, va, m, prot, - mpte, &lock); + mpte, &pv_lock_m); m = TAILQ_NEXT(m, listq); } - if (lock != NULL) - rw_wunlock(lock); + release_pv_list_lock(&pv_lock_m); PMAP_UNLOCK(pmap); } @@ -6483,19 +6090,18 @@ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { - struct rwlock *lock; + vm_page_t pv_lock_m; - lock = NULL; + pv_lock_m = NULL; PMAP_LOCK(pmap); - (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock); - if (lock != NULL) - rw_wunlock(lock); + (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &pv_lock_m); + release_pv_list_lock(&pv_lock_m); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, - vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp) + vm_prot_t prot, vm_page_t mpte, vm_page_t *pv_lock_mp) { pt_entry_t newpte, *pte, PG_V; @@ -6562,7 +6168,7 @@ * Enter on the PV list if part of our managed memory. 
*/ if ((m->oflags & VPO_UNMANAGED) == 0 && - !pmap_try_insert_pv_entry(pmap, va, m, lockp)) { + !pmap_try_insert_pv_entry(pmap, va, m, pv_lock_mp)) { if (mpte != NULL) pmap_abort_ptp(pmap, va, mpte); return (NULL); @@ -6789,7 +6395,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { - struct rwlock *lock; + vm_page_t pv_lock_m; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde, srcptepaddr; @@ -6815,7 +6421,7 @@ return; end_addr = src_addr + len; - lock = NULL; + pv_lock_m = NULL; if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); @@ -6865,7 +6471,7 @@ break; if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 || pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr, - PMAP_ENTER_NORECLAIM, &lock))) { + 0, &pv_lock_m))) { *pde = srcptepaddr & ~PG_W; pmap_resident_count_inc(dst_pmap, NBPDR / PAGE_SIZE); @@ -6908,7 +6514,7 @@ dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, - PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) { + PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &pv_lock_m)) { /* * Clear the wired, modified, and accessed * (referenced) bits during the copy. 
@@ -6925,8 +6531,7 @@ } } out: - if (lock != NULL) - rw_wunlock(lock); + release_pv_list_lock(&pv_lock_m); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } @@ -7041,16 +6646,15 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { struct md_page *pvh; - struct rwlock *lock; pv_entry_t pv; - int loops = 0; + int loops; boolean_t rv; KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("pmap_page_exists_quick: page %p is not managed", m)); rv = FALSE; - lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_rlock(lock); + loops = 0; + pmap_pv_list_lock(m); TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { if (PV_PMAP(pv) == pmap) { rv = TRUE; @@ -7072,7 +6676,7 @@ break; } } - rw_runlock(lock); + pmap_pv_list_unlock(m); return (rv); } @@ -7085,7 +6689,6 @@ int pmap_page_wired_mappings(vm_page_t m) { - struct rwlock *lock; struct md_page *pvh; pmap_t pmap; pt_entry_t *pte; @@ -7094,17 +6697,16 @@ if ((m->oflags & VPO_UNMANAGED) != 0) return (0); - lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_rlock(lock); + pmap_pv_list_lock(m); restart: count = 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; - rw_runlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_rlock(lock); + pmap_pv_list_lock(m); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; @@ -7122,9 +6724,9 @@ if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; - rw_runlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_rlock(lock); + pmap_pv_list_lock(m); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); @@ -7137,7 +6739,7 @@ PMAP_UNLOCK(pmap); } } - rw_runlock(lock); + pmap_pv_list_unlock(m); return (count); } @@ -7148,20 +6750,115 @@ boolean_t pmap_page_is_mapped(vm_page_t m) { - struct rwlock *lock; boolean_t rv; if ((m->oflags & VPO_UNMANAGED) != 0) return (FALSE); - lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_rlock(lock); + pmap_pv_list_lock(m); rv = !TAILQ_EMPTY(&m->md.pv_list) || ((m->flags & PG_FICTITIOUS) == 
0 && !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)); - rw_runlock(lock); + pmap_pv_list_unlock(m); return (rv); } +static void +pmap_remove_pages_superpage(pmap_t pmap, vm_offset_t va, pd_entry_t *pdep, + pd_entry_t pde, struct spglist *free, pd_entry_t pg_rwm) +{ + vm_page_t m, mpte, mt; + struct pv_entry *pv; + struct md_page *pvh; + vm_paddr_t pa; + + pa = pde & PG_PS_FRAME; + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m->phys_addr == pa, + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, (uintmax_t)pde)); + KASSERT((m->flags & PG_FICTITIOUS) != 0 || + m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages_superpage: bad pte %#jx", (uintmax_t)pde)); + pte_clear(pdep); + if ((pde & pg_rwm) == pg_rwm) { + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + vm_page_dirty(mt); + } + pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); + pvh = pa_to_pvh(pde & PG_PS_FRAME); + pmap_pv_list_lock(m); + TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) { + if (PV_PMAP(pv) == pmap && pv->pv_va == va) + break; + } + MPASS(pv != NULL); + TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); + pvh->pv_gen++; + if (TAILQ_EMPTY(&pvh->pv_list)) { + for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) + if ((mt->a.flags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&mt->md.pv_list)) + vm_page_aflag_clear(mt, PGA_WRITEABLE); + } + pmap_pv_list_unlock(m); + mpte = pmap_remove_pt_page(pmap, va); + if (mpte != NULL) { + KASSERT(mpte->valid == VM_PAGE_BITS_ALL, + ("pmap_remove_pages_superpage: pte page not promoted")); + pmap_resident_count_dec(pmap, 1); + KASSERT(mpte->ref_count == NPTEPG, + ("pmap_remove_pages_superpage: " + "pte page reference count error")); + mpte->ref_count = 0; + pmap_add_delayed_free_list(mpte, free, FALSE); + } + free_pv_entry(pmap, pv); + PV_STAT(atomic_add_long(&pv_entry_frees, NBPDR / PAGE_SIZE)); + PV_STAT(atomic_subtract_long(&pv_entry_count, NBPDR / PAGE_SIZE)); +} + +static void +pmap_remove_pages_page(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep, + 
pt_entry_t pte, struct spglist *free, pt_entry_t pg_rwm) +{ + vm_page_t m; + struct pv_entry *pv; + struct md_page *pvh; + vm_paddr_t pa; + + pa = pte & PG_FRAME; + m = PHYS_TO_VM_PAGE(pa); + KASSERT(m->phys_addr == pa, + ("vm_page_t %p phys_addr mismatch %016jx %016jx", + m, (uintmax_t)m->phys_addr, (uintmax_t)pte)); + KASSERT((m->flags & PG_FICTITIOUS) != 0 || + m < &vm_page_array[vm_page_array_size], + ("pmap_remove_pages_page: bad pte %#jx", (uintmax_t)pte)); + pte_clear(ptep); + /* Update the vm_page_t clean/reference bits. */ + if ((pte & pg_rwm) == pg_rwm) + vm_page_dirty(m); + pmap_resident_count_dec(pmap, 1); + pmap_pv_list_lock(m); + TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { + if (PV_PMAP(pv) == pmap && pv->pv_va == va) + break; + } + MPASS(pv != NULL); + TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); + m->md.pv_gen++; + if ((m->a.flags & PGA_WRITEABLE) != 0 && + TAILQ_EMPTY(&m->md.pv_list) && (m->flags & PG_FICTITIOUS) == 0) { + pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); + if (TAILQ_EMPTY(&pvh->pv_list)) + vm_page_aflag_clear(m, PGA_WRITEABLE); + } + pmap_pv_list_unlock(m); + free_pv_entry(pmap, pv); + PV_STAT(atomic_add_long(&pv_entry_frees, 1)); + PV_STAT(atomic_subtract_long(&pv_entry_count, 1)); +} + /* * Destroy all managed, non-wired mappings in the given user-space * pmap. This pmap cannot be active on any processor besides the @@ -7170,13 +6867,11 @@ * This function cannot be applied to the kernel pmap. Moreover, it * is not intended for general use. It is only to be used during * process termination. Consequently, it can be implemented in ways - * that make it faster than pmap_remove(). First, it can more quickly - * destroy mappings by iterating over the pmap's collection of PV - * entries, rather than searching the page table. Second, it doesn't - * have to test and clear the page table entries atomically, because - * no processor is currently accessing the user address space. 
In - * particular, a page table entry's dirty bit won't change state once - * this function starts. + * that make it faster than pmap_remove(). It doesn't have to test + * and clear the page table entries atomically, because no processor + * is currently accessing the user address space. In particular, a + * page table entry's dirty bit won't change state once this function + * starts. * * Although this function destroys all of the pmap's managed, * non-wired mappings, it can delay and batch the invalidation of TLB @@ -7190,21 +6885,13 @@ void pmap_remove_pages(pmap_t pmap) { - pd_entry_t ptepde; - pt_entry_t *pte, tpte; - pt_entry_t PG_M, PG_RW, PG_V; struct spglist free; - struct pv_chunklist free_chunks[PMAP_MEMDOM]; - vm_page_t m, mpte, mt; - pv_entry_t pv; - struct md_page *pvh; - struct pv_chunk *pc, *npc; - struct rwlock *lock; - int64_t bit; - uint64_t inuse, bitmask; - int allfree, field, freed, i, idx; - boolean_t superpage; - vm_paddr_t pa; + vm_offset_t va; + pml4_entry_t pml4e; + pdp_entry_t *pdpp, pdpe; + pd_entry_t *pdp, pde; + pt_entry_t *ptp, pte, pg_rwm, PG_V; + int i4, ipdp, ipd, ipe; /* * Assert that the given pmap is only active on the current @@ -7225,148 +6912,73 @@ } #endif - lock = NULL; - PG_M = pmap_modified_bit(pmap); + pg_rwm = pmap_rw_bit(pmap) | pmap_modified_bit(pmap); PG_V = pmap_valid_bit(pmap); - PG_RW = pmap_rw_bit(pmap); - for (i = 0; i < PMAP_MEMDOM; i++) - TAILQ_INIT(&free_chunks[i]); SLIST_INIT(&free); PMAP_LOCK(pmap); - TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { - allfree = 1; - freed = 0; - for (field = 0; field < _NPCM; field++) { - inuse = ~pc->pc_map[field] & pc_freemask[field]; - while (inuse != 0) { - bit = bsfq(inuse); - bitmask = 1UL << bit; - idx = field * 64 + bit; - pv = &pc->pc_pventry[idx]; - inuse &= ~bitmask; - - pte = pmap_pdpe(pmap, pv->pv_va); - ptepde = *pte; - pte = pmap_pdpe_to_pde(pte, pv->pv_va); - tpte = *pte; - if ((tpte & (PG_PS | PG_V)) == PG_V) { - superpage = FALSE; - ptepde = 
tpte; - pte = (pt_entry_t *)PHYS_TO_DMAP(tpte & - PG_FRAME); - pte = &pte[pmap_pte_index(pv->pv_va)]; - tpte = *pte; - } else { - /* - * Keep track whether 'tpte' is a - * superpage explicitly instead of - * relying on PG_PS being set. - * - * This is because PG_PS is numerically - * identical to PG_PTE_PAT and thus a - * regular page could be mistaken for - * a superpage. - */ - superpage = TRUE; - } - if ((tpte & PG_V) == 0) { - panic("bad pte va %lx pte %lx", - pv->pv_va, tpte); - } + for (i4 = 0; i4 < NUPML4E; i4++) { + pml4e = pmap->pm_pml4[i4]; + if ((pml4e & PG_V) == 0) { + MPASS(pml4e == 0); + continue; + } -/* - * We cannot remove wired pages from a process' mapping at this time - */ - if (tpte & PG_W) { - allfree = 0; + pdpp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & PG_FRAME); + for (ipdp = 0; ipdp < NPDPEPG; ipdp++) { + pdpe = pdpp[ipdp]; + if ((pdpe & PG_V) == 0) { + MPASS(pdpe == 0); + continue; + } + + pdp = (pd_entry_t *)PHYS_TO_DMAP(pdpe & PG_FRAME); + for (ipd = 0; ipd < NPDEPG; ipd++) { + pde = pdp[ipd]; + if ((pde & PG_V) == 0) { + MPASS(pde == 0); continue; } + if ((pde & PG_PS) != 0) { + /* + * We cannot remove wired pages from + * a process' mapping at this time. + * Also skip unmanaged ptes. + */ + if ((pde & (PG_W | PG_MANAGED)) != + PG_MANAGED) + continue; - if (superpage) - pa = tpte & PG_PS_FRAME; - else - pa = tpte & PG_FRAME; - - m = PHYS_TO_VM_PAGE(pa); - KASSERT(m->phys_addr == pa, - ("vm_page_t %p phys_addr mismatch %016jx %016jx", - m, (uintmax_t)m->phys_addr, - (uintmax_t)tpte)); - - KASSERT((m->flags & PG_FICTITIOUS) != 0 || - m < &vm_page_array[vm_page_array_size], - ("pmap_remove_pages: bad tpte %#jx", - (uintmax_t)tpte)); - - pte_clear(pte); - - /* - * Update the vm_page_t clean/reference bits. 
- */ - if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) { - if (superpage) { - for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) - vm_page_dirty(mt); - } else - vm_page_dirty(m); + va = UVADDR(i4, ipdp, ipd, 0); + pmap_remove_pages_superpage(pmap, va, + &pdp[ipd], pde, &free, pg_rwm); + pmap_unuse_pt(pmap, va, pdpe, &free); + continue; } - CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m); - - /* Mark free */ - pc->pc_map[field] |= bitmask; - if (superpage) { - pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE); - pvh = pa_to_pvh(tpte & PG_PS_FRAME); - TAILQ_REMOVE(&pvh->pv_list, pv, pv_next); - pvh->pv_gen++; - if (TAILQ_EMPTY(&pvh->pv_list)) { - for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++) - if ((mt->a.flags & PGA_WRITEABLE) != 0 && - TAILQ_EMPTY(&mt->md.pv_list)) - vm_page_aflag_clear(mt, PGA_WRITEABLE); - } - mpte = pmap_remove_pt_page(pmap, pv->pv_va); - if (mpte != NULL) { - KASSERT(mpte->valid == VM_PAGE_BITS_ALL, - ("pmap_remove_pages: pte page not promoted")); - pmap_resident_count_dec(pmap, 1); - KASSERT(mpte->ref_count == NPTEPG, - ("pmap_remove_pages: pte page reference count error")); - mpte->ref_count = 0; - pmap_add_delayed_free_list(mpte, &free, FALSE); - } - } else { - pmap_resident_count_dec(pmap, 1); - TAILQ_REMOVE(&m->md.pv_list, pv, pv_next); - m->md.pv_gen++; - if ((m->a.flags & PGA_WRITEABLE) != 0 && - TAILQ_EMPTY(&m->md.pv_list) && - (m->flags & PG_FICTITIOUS) == 0) { - pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m)); - if (TAILQ_EMPTY(&pvh->pv_list)) - vm_page_aflag_clear(m, PGA_WRITEABLE); + ptp = (pt_entry_t *)PHYS_TO_DMAP(pde & + PG_FRAME); + for (ipe = 0; ipe < NPTEPG; ipe++) { + pte = ptp[ipe]; + if ((pte & PG_V) == 0) { + MPASS(pte == 0); + continue; } + if ((pte & (PG_W | PG_MANAGED)) != + PG_MANAGED) + continue; + va = UVADDR(i4, ipdp, ipd, ipe); + pmap_remove_pages_page(pmap, va, + &ptp[ipe], pte, &free, pg_rwm); + pmap_unuse_pt(pmap, va, pde, &free); } - pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); - freed++; } } - 
PV_STAT(atomic_add_long(&pv_entry_frees, freed)); - PV_STAT(atomic_add_int(&pv_entry_spare, freed)); - PV_STAT(atomic_subtract_long(&pv_entry_count, freed)); - if (allfree) { - TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); - TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list); - } } - if (lock != NULL) - rw_wunlock(lock); + pmap_invalidate_all(pmap); pmap_pkru_deassign_all(pmap); - free_pv_chunk_batch((struct pv_chunklist *)&free_chunks); PMAP_UNLOCK(pmap); vm_page_free_pages_toq(&free, true); } @@ -7374,7 +6986,6 @@ static boolean_t pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified) { - struct rwlock *lock; pv_entry_t pv; struct md_page *pvh; pt_entry_t *pte, mask; @@ -7384,16 +6995,15 @@ boolean_t rv; rv = FALSE; - lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_rlock(lock); + pmap_pv_list_lock(m); restart: TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; - rw_runlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_rlock(lock); + pmap_pv_list_lock(m); if (md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; @@ -7423,9 +7033,9 @@ if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; - rw_runlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_rlock(lock); + pmap_pv_list_lock(m); if (md_gen != m->md.pv_gen || pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); @@ -7451,7 +7061,7 @@ } } out: - rw_runlock(lock); + pmap_pv_list_unlock(m); return (rv); } @@ -7524,7 +7134,7 @@ { struct md_page *pvh; pmap_t pmap; - struct rwlock *lock; + vm_page_t pv_lock_m; pv_entry_t next_pv, pv; pd_entry_t *pde; pt_entry_t oldpte, *pte, PG_M, PG_RW; @@ -7538,32 +7148,33 @@ if (!pmap_page_is_write_mapped(m)) return; - lock = VM_PAGE_TO_PV_LIST_LOCK(m); + pv_lock_m = NULL; pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); retry_pv_loop: - rw_wlock(lock); + change_pv_list_lock_to_vm_page(&pv_lock_m, m); TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(m); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); - rw_wunlock(lock); + pmap_pv_list_unlock(m); goto retry_pv_loop; } } PG_RW = pmap_rw_bit(pmap); va = pv->pv_va; pde = pmap_pde(pmap, va); - if ((*pde & PG_RW) != 0) - (void)pmap_demote_pde_locked(pmap, pde, va, &lock); - KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), - ("inconsistent pv lock %p %p for page %p", - lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + if ((*pde & PG_RW) != 0) { + (void)pmap_demote_pde_locked(pmap, pde, va, + &pv_lock_m); + } + KASSERT(pv_lock_m == m, + ("inconsistent pv lock %p for page %p", pv_lock_m, m)); PMAP_UNLOCK(pmap); } TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) { @@ -7571,13 +7182,13 @@ if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(m); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); - rw_wunlock(lock); + pmap_pv_list_unlock(m); goto retry_pv_loop; } } @@ -7600,7 +7211,7 @@ } PMAP_UNLOCK(pmap); } - rw_wunlock(lock); + pmap_pv_list_unlock(m); vm_page_aflag_clear(m, PGA_WRITEABLE); pmap_delayed_invl_wait(m); } @@ -7658,7 +7269,7 @@ struct md_page *pvh; pv_entry_t pv, pvf; pmap_t pmap; - struct rwlock *lock; + vm_page_t pv_lock_m; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_A, PG_M, PG_RW; vm_offset_t va; @@ -7672,9 +7283,9 @@ SLIST_INIT(&free); cleared = 0; pa = VM_PAGE_TO_PHYS(m); - lock = PHYS_TO_PV_LIST_LOCK(pa); + pv_lock_m = m; pvh = (m->flags & PG_FICTITIOUS) != 0 ?
&pv_dummy : pa_to_pvh(pa); - rw_wlock(lock); + pmap_pv_list_lock(m); retry: not_cleared = 0; if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL) @@ -7686,9 +7297,9 @@ pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(m); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto retry; @@ -7735,7 +7346,7 @@ pmap_invalidate_page(pmap, pv->pv_va); demoted = FALSE; } else if (pmap_demote_pde_locked(pmap, pde, - pv->pv_va, &lock)) { + pv->pv_va, &pv_lock_m)) { /* * Remove the mapping to a single page * so that a subsequent access may @@ -7749,7 +7360,7 @@ PG_PS_FRAME); pte = pmap_pde_to_pte(pde, va); pmap_remove_pte(pmap, pte, va, *pde, - NULL, &lock); + NULL, &pv_lock_m); pmap_invalidate_page(pmap, va); } else demoted = TRUE; @@ -7765,9 +7376,9 @@ pv = NULL; } cleared++; - KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), - ("inconsistent pv lock %p %p for page %p", - lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + KASSERT(pv_lock_m == m, + ("inconsistent pv lock %p for page %p", + pv_lock_m, m)); } else not_cleared++; } @@ -7792,9 +7403,9 @@ if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; md_gen = m->md.pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(m); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto retry; @@ -7823,15 +7434,15 @@ * hard work for unwired pages only. 
*/ pmap_remove_pte(pmap, pte, pv->pv_va, - *pde, &free, &lock); + *pde, &free, &pv_lock_m); pmap_invalidate_page(pmap, pv->pv_va); cleared++; if (pvf == pv) pvf = NULL; pv = NULL; - KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m), - ("inconsistent pv lock %p %p for page %p", - lock, VM_PAGE_TO_PV_LIST_LOCK(m), m)); + KASSERT(pv_lock_m == m, + ("inconsistent pv lock %p for page %p", + pv_lock_m, m)); } else not_cleared++; } @@ -7845,7 +7456,7 @@ } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared + not_cleared < PMAP_TS_REFERENCED_MAX); out: - rw_wunlock(lock); + pmap_pv_list_unlock(m); vm_page_free_pages_toq(&free, true); return (cleared + not_cleared); } @@ -7858,7 +7469,7 @@ void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice) { - struct rwlock *lock; + vm_page_t pv_lock_m; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t oldpde, *pde; @@ -7912,10 +7523,10 @@ else if ((oldpde & PG_PS) != 0) { if ((oldpde & PG_MANAGED) == 0) continue; - lock = NULL; - if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) { - if (lock != NULL) - rw_wunlock(lock); + pv_lock_m = NULL; + if (!pmap_demote_pde_locked(pmap, pde, sva, + &pv_lock_m)) { + release_pv_list_lock(&pv_lock_m); /* * The large page mapping was destroyed. @@ -7943,11 +7554,10 @@ KASSERT((*pte & PG_V) != 0, ("pmap_advise: invalid PTE")); pmap_remove_pte(pmap, pte, va, *pde, NULL, - &lock); + &pv_lock_m); anychanged = true; } - if (lock != NULL) - rw_wunlock(lock); + release_pv_list_lock(&pv_lock_m); } if (va_next > eva) va_next = eva; @@ -8004,7 +7614,7 @@ pv_entry_t next_pv, pv; pd_entry_t oldpde, *pde; pt_entry_t *pte, PG_M, PG_RW; - struct rwlock *lock; + vm_page_t pv_lock_m; vm_offset_t va; int md_gen, pvh_gen; @@ -8016,16 +7626,16 @@ return; pvh = (m->flags & PG_FICTITIOUS) != 0 ? 
&pv_dummy : pa_to_pvh(VM_PAGE_TO_PHYS(m)); - lock = VM_PAGE_TO_PV_LIST_LOCK(m); - rw_wlock(lock); + pv_lock_m = m; + pmap_pv_list_lock(m); restart: TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) { pmap = PV_PMAP(pv); if (!PMAP_TRYLOCK(pmap)) { pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(m); if (pvh_gen != pvh->pv_gen) { PMAP_UNLOCK(pmap); goto restart; @@ -8038,7 +7648,7 @@ oldpde = *pde; /* If oldpde has PG_RW set, then it also has PG_M set. */ if ((oldpde & PG_RW) != 0 && - pmap_demote_pde_locked(pmap, pde, va, &lock) && + pmap_demote_pde_locked(pmap, pde, va, &pv_lock_m) && (oldpde & PG_W) == 0) { /* * Write protect the mapping to a single page so that @@ -8057,9 +7667,9 @@ if (!PMAP_TRYLOCK(pmap)) { md_gen = m->md.pv_gen; pvh_gen = pvh->pv_gen; - rw_wunlock(lock); + pmap_pv_list_unlock(m); PMAP_LOCK(pmap); - rw_wlock(lock); + pmap_pv_list_lock(m); if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) { PMAP_UNLOCK(pmap); goto restart; @@ -8077,7 +7687,7 @@ } PMAP_UNLOCK(pmap); } - rw_wunlock(lock); + pmap_pv_list_unlock(m); } /* @@ -9026,7 +8636,7 @@ pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) { int rv; - struct rwlock *lock; + vm_page_t pv_lock_m; #if VM_NRESERVLEVEL > 0 vm_page_t m, mpte; #endif @@ -9045,7 +8655,7 @@ PG_RW = pmap_rw_bit(pmap); rv = -1; - lock = NULL; + pv_lock_m = NULL; PMAP_LOCK(pmap); pde = pmap_pde(pmap, va); @@ -9097,7 +8707,7 @@ pmap_ps_enabled(pmap) && (m->flags & PG_FICTITIOUS) == 0 && vm_reserv_level_iffullpop(m) == 0) { - pmap_promote_pde(pmap, pde, va, &lock); + pmap_promote_pde(pmap, pde, va, &pv_lock_m); #ifdef INVARIANTS atomic_add_long(&ad_emulation_superpage_promotions, 1); #endif @@ -9112,8 +8722,7 @@ #endif rv = 0; /* success */ done: - if (lock != NULL) - rw_wunlock(lock); + release_pv_list_lock(&pv_lock_m); PMAP_UNLOCK(pmap); return (rv); } Index: sys/amd64/amd64/vm_machdep.c 
=================================================================== --- sys/amd64/amd64/vm_machdep.c +++ sys/amd64/amd64/vm_machdep.c @@ -364,6 +364,7 @@ bzero(xhdr, sizeof(*xhdr)); xhdr->xstate_bv = xsave_mask; } + td->td_md.md_pmap_reserv = 0; } void Index: sys/amd64/include/pmap.h =================================================================== --- sys/amd64/include/pmap.h +++ sys/amd64/include/pmap.h @@ -303,7 +303,6 @@ * Pmap stuff */ struct pv_entry; -struct pv_chunk; /* * Locks @@ -312,7 +311,9 @@ struct md_page { TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ int pv_gen; /* (p) */ - int pat_mode; + uint8_t pv_lock; + uint8_t pat_mode; + uint8_t pad0[2]; }; enum pmap_type { @@ -336,7 +337,6 @@ pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */ uint64_t pm_cr3; uint64_t pm_ucr3; - TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ cpuset_t pm_active; /* active on cpus */ enum pmap_type pm_type; /* regular or nested tables */ struct pmap_statistics pm_stats; /* pmap statistics */ @@ -345,6 +345,7 @@ int pm_flags; struct pmap_pcids pm_pcids[MAXCPU]; struct rangeset pm_pkru; + TAILQ_HEAD(, pv_entry) pm_pv_spares; }; /* flags */ @@ -379,31 +380,11 @@ * mappings of that page. An entry is a pv_entry_t, the list is pv_list. */ typedef struct pv_entry { + pmap_t pv_pmap; vm_offset_t pv_va; /* virtual address for mapping */ TAILQ_ENTRY(pv_entry) pv_next; } *pv_entry_t; -/* - * pv_entries are allocated in chunks per-process. This avoids the - * need to track per-pmap assignments. 
- */ -#define _NPCM 3 -#define _NPCPV 168 -#define PV_CHUNK_HEADER \ - pmap_t pc_pmap; \ - TAILQ_ENTRY(pv_chunk) pc_list; \ - uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ \ - TAILQ_ENTRY(pv_chunk) pc_lru; - -struct pv_chunk_header { - PV_CHUNK_HEADER -}; - -struct pv_chunk { - PV_CHUNK_HEADER - struct pv_entry pc_pventry[_NPCPV]; -}; - #ifdef _KERNEL extern caddr_t CADDR1; Index: sys/amd64/include/proc.h =================================================================== --- sys/amd64/include/proc.h +++ sys/amd64/include/proc.h @@ -75,6 +75,7 @@ int md_efirt_dis_pf; /* (k) */ struct pcb md_pcb; vm_offset_t md_stack_base; + int md_pmap_reserv; }; struct mdproc { Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -3740,6 +3740,7 @@ kern/kern_mtxpool.c standard kern/kern_mutex.c standard kern/kern_ntptime.c standard +kern/kern_obm.c standard kern/kern_osd.c standard kern/kern_physio.c standard kern/kern_pmc.c standard Index: sys/kern/kern_obm.c =================================================================== --- /dev/null +++ sys/kern/kern_obm.c @@ -0,0 +1,132 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +static SYSCTL_NODE(_debug, OID_AUTO, obm, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + ""); +static u_long obm_slow_lock; +SYSCTL_ULONG(_debug_obm, OID_AUTO, slow_lock, CTLFLAG_RD, + &obm_slow_lock, 0, + ""); +static u_long obm_slow_unlock; +SYSCTL_ULONG(_debug_obm, OID_AUTO, slow_unlock, CTLFLAG_RD, + &obm_slow_unlock, 0, + ""); + +void +obm_init_lo(struct lock_object *lo, const char *name) +{ + bzero(lo, sizeof(*lo)); + lo->lo_name = name; +} + +void +obm_init(uint8_t *byte) +{ + *byte = OBM_UNLOCKED; +} + +bool +obm_trylock(uint8_t *byte) +{ + return (atomic_cmpset_acq_char(byte, OBM_UNLOCKED, OBM_LOCKED) != 0); +} + +void +obm_lock(uint8_t *byte, struct lock_object *lo) +{ + struct turnstile *ts; + struct lock_delay_arg lda; + uint8_t v; + + v = OBM_UNLOCKED; + if (atomic_fcmpset_acq_char(byte, &v, OBM_LOCKED)) + return; + + atomic_add_long(&obm_slow_lock, 1); + lock_delay_arg_init(&lda, &locks_delay); + for (;;) { + lock_delay(&lda); +reload: + v = atomic_load_char(byte); + if (v == OBM_UNLOCKED) { + if (atomic_fcmpset_acq_char(byte, &v, OBM_LOCKED)) + break; + continue; + } + +
ts = turnstile_trywait(lo); + v = atomic_load_char(byte); +cancel_turnstile: + if (v == OBM_UNLOCKED) { + turnstile_cancel(ts); + continue; + } + if ((v & OBM_CONTESTED) == 0 && + atomic_fcmpset_char(byte, &v, v | OBM_CONTESTED) == 0) + goto cancel_turnstile; + turnstile_wait(ts, NULL, TS_SHARED_QUEUE); + goto reload; + } +} + +void +obm_unlock(uint8_t *byte, struct lock_object *lo) +{ + struct turnstile *ts; + uint8_t v; + + v = OBM_LOCKED; + if (atomic_fcmpset_rel_char(byte, &v, OBM_UNLOCKED)) + return; + MPASS(v == OBM_LOCKED || v == (OBM_LOCKED | OBM_CONTESTED)); + atomic_add_long(&obm_slow_unlock, 1); + turnstile_chain_lock(lo); + atomic_store_rel_char(byte, OBM_UNLOCKED); + ts = turnstile_lookup(lo); + if (ts != NULL) { + turnstile_broadcast(ts, TS_SHARED_QUEUE); + turnstile_unpend(ts); + } + turnstile_chain_unlock(lo); +} + Index: sys/sys/obm.h =================================================================== --- /dev/null +++ sys/sys/obm.h @@ -0,0 +1,66 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_OBM_H +#define _SYS_OBM_H + +/* One-Byte Lock */ + +#ifdef _KERNEL + +#include +#include +#include + +#define OBM_UNLOCKED 0x00 +#define OBM_LOCKED 0x02 +#define OBM_CONTESTED 0x01 + +void obm_init_lo(struct lock_object *lo, const char *name); +void obm_init(uint8_t *byte); +void obm_lock(uint8_t *byte, struct lock_object *lo); +void obm_unlock(uint8_t *byte, struct lock_object *lo); +bool obm_trylock(uint8_t *byte); + +static __inline void +obm_assert_locked(uint8_t *byte) +{ +#ifdef INVARIANTS + uint8_t v; + + v = atomic_load_char(byte); + MPASS(v == OBM_LOCKED || v == (OBM_LOCKED | OBM_CONTESTED)); +#endif +} +#endif +#endif Index: sys/vm/pmap.h =================================================================== --- sys/vm/pmap.h +++ sys/vm/pmap.h @@ -106,6 +106,7 @@ */ #define PMAP_ENTER_NOSLEEP 0x00000100 #define PMAP_ENTER_WIRED 0x00000200 +#define PMAP_ENTER_NOPVSLEEP 0x00000400 #define PMAP_ENTER_RESERVED 0xFF000000 /* Index: sys/vm/vm_fault.c =================================================================== --- sys/vm/vm_fault.c +++ sys/vm/vm_fault.c @@ -1005,6 +1005,28 @@ vm_page_valid(fs->m); } +static void +vm_fault_waitpfault(struct faultstate *fs, struct domainset *dset) +{ + + if (dset == NULL) { + dset = fs->object->domain.dr_policy; + if (dset == NULL) + dset = curthread->td_domain.dr_policy; + } + if (vm_pfault_oom_attempts < 0 || fs->oom < vm_pfault_oom_attempts) { + fs->oom++; +
vm_waitpfault(dset, vm_pfault_oom_wait * hz); + } else { + if (bootverbose) + printf( + "proc %d (%s) failed to alloc page on fault, starting OOM\n", + curproc->p_pid, curproc->p_comm); + vm_pageout_oom(VM_OOM_MEM_PF); + fs->oom = 0; + } +} + /* * Allocate a page directly or via the object populate method. */ @@ -1069,18 +1091,7 @@ } if (fs->m == NULL) { unlock_and_deallocate(fs); - if (vm_pfault_oom_attempts < 0 || - fs->oom < vm_pfault_oom_attempts) { - fs->oom++; - vm_waitpfault(dset, vm_pfault_oom_wait * hz); - } else { - if (bootverbose) - printf( - "proc %d (%s) failed to alloc page on fault, starting OOM\n", - curproc->p_pid, curproc->p_comm); - vm_pageout_oom(VM_OOM_MEM_PF); - fs->oom = 0; - } + vm_fault_waitpfault(fs, dset); return (KERN_RESOURCE_SHORTAGE); } fs->oom = 0; @@ -1264,6 +1275,10 @@ rv = vm_fault_soft_fast(&fs); if (rv == KERN_SUCCESS) return (rv); + if (rv == KERN_RESOURCE_SHORTAGE) { + VM_OBJECT_RUNLOCK(fs.first_object); + goto RetryFault; + } if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) { VM_OBJECT_RUNLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.first_object); @@ -1493,8 +1508,19 @@ * back on the active queue until later so that the pageout daemon * won't find it (yet). */ - pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, - fs.fault_type | (fs.wired ? PMAP_ENTER_WIRED : 0), 0); + rv = pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, + fs.fault_type | (fs.wired ? 
PMAP_ENTER_WIRED : 0) | + PMAP_ENTER_NOPVSLEEP, 0); + MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE); + KASSERT((fs.m->oflags & VPO_UNMANAGED) == 0 || rv == KERN_SUCCESS, + ("pmap_enter failure rv %d page %p fs %p", rv, fs.m, &fs)); + if (rv == KERN_RESOURCE_SHORTAGE) { + fault_page_release(&fs.m); + fault_deallocate(&fs); + vm_fault_waitpfault(&fs, NULL); + goto RetryFault; + } + if (faultcount != 1 && (fs.fault_flags & VM_FAULT_WIRE) == 0 && fs.wired == 0) vm_fault_prefault(&fs, vaddr, @@ -1800,12 +1826,14 @@ vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { + struct faultstate fs; vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; + int rv; boolean_t upgrade; #ifdef lint @@ -1836,6 +1864,8 @@ dst_object->domain = src_object->domain; dst_object->charge = dst_entry->end - dst_entry->start; } + fs.oom = 0; + fs.object = dst_object; VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, @@ -1964,8 +1994,16 @@ * backing pages. */ if (vm_page_all_valid(dst_m)) { - pmap_enter(dst_map->pmap, vaddr, dst_m, prot, - access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); + for (;;) { + rv = pmap_enter(dst_map->pmap, vaddr, dst_m, + prot, access | (upgrade ? 
PMAP_ENTER_WIRED : + 0), 0); + MPASS(rv == KERN_SUCCESS || + rv == KERN_RESOURCE_SHORTAGE); + if (rv == KERN_SUCCESS) + break; + vm_fault_waitpfault(&fs, NULL); + } } /* Index: sys/vm/vm_kern.c =================================================================== --- sys/vm/vm_kern.c +++ sys/vm/vm_kern.c @@ -185,7 +185,7 @@ vm_object_t object = kernel_object; vm_offset_t addr, i, offset; vm_page_t m; - int pflags, tries; + int pflags, rv, tries; vm_prot_t prot; size = round_page(size); @@ -224,8 +224,9 @@ if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); - pmap_enter(kernel_pmap, addr + i, m, prot, - prot | PMAP_ENTER_WIRED, 0); + rv = pmap_enter(kernel_pmap, addr + i, m, prot, + prot | PMAP_ENTER_WIRED | PMAP_ENTER_NOPVSLEEP, 0); + MPASS(rv == KERN_SUCCESS); } VM_OBJECT_WUNLOCK(object); return (addr); @@ -277,7 +278,7 @@ vm_offset_t addr, offset, tmp; vm_page_t end_m, m; u_long npages; - int pflags, tries; + int pflags, rv, tries; size = round_page(size); vmem = vm_dom[domain].vmd_kernel_arena; @@ -316,8 +317,9 @@ if ((flags & M_ZERO) && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_valid(m); - pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW, - VM_PROT_RW | PMAP_ENTER_WIRED, 0); + rv = pmap_enter(kernel_pmap, tmp, m, VM_PROT_RW, + VM_PROT_RW | PMAP_ENTER_WIRED | PMAP_ENTER_NOPVSLEEP, 0); + MPASS(rv == KERN_SUCCESS); tmp += PAGE_SIZE; } VM_OBJECT_WUNLOCK(object); @@ -459,7 +461,7 @@ vm_offset_t offset, i; vm_page_t m, mpred; vm_prot_t prot; - int pflags; + int pflags, rv; KASSERT(object == kernel_object, ("kmem_back_domain: only supports kernel object.")); @@ -499,8 +501,9 @@ KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("kmem_malloc: page %p is managed", m)); vm_page_valid(m); - pmap_enter(kernel_pmap, addr + i, m, prot, - prot | PMAP_ENTER_WIRED, 0); + rv = pmap_enter(kernel_pmap, addr + i, m, prot, + prot | PMAP_ENTER_WIRED | PMAP_ENTER_NOPVSLEEP, 0); + MPASS(rv == KERN_SUCCESS); #if VM_NRESERVLEVEL > 0 if 
(__predict_false((prot & VM_PROT_EXECUTE) != 0)) m->oflags |= VPO_KMEM_EXEC; Index: sys/vm/vm_page.c =================================================================== --- sys/vm/vm_page.c +++ sys/vm/vm_page.c @@ -5463,7 +5463,7 @@ else m = (vm_page_t)addr; db_printf( - "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref %u\n" + "page %p obj %p pidx 0x%jx phys 0x%jx q %d ref 0x%x\n" " af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n", m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr, m->a.queue, m->ref_count, m->a.flags, m->oflags,