Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -165,6 +165,12 @@
 #include <machine/sysarch.h>
 #include <machine/tss.h>
 
+#ifdef NUMA
+#define	PMAP_MEMDOM	MAXMEMDOM
+#else
+#define	PMAP_MEMDOM	1
+#endif
+
 static __inline boolean_t
 pmap_type_guest(pmap_t pmap)
 {
@@ -420,8 +426,30 @@
  * Data for the pv entry allocation mechanism.
  * Updates to pv_invl_gen are protected by the pv list lock but reads are not.
  */
-static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
-static struct mtx __exclusive_cache_line pv_chunks_mutex;
+#ifdef NUMA
+static __inline int
+pc_to_domain(struct pv_chunk *pc)
+{
+
+	return (_vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
+}
+#else
+static __inline int
+pc_to_domain(struct pv_chunk *pc __unused)
+{
+
+	return (0);
+}
+#endif
+
+struct pv_chunks_list {
+	struct mtx pvc_lock;
+	TAILQ_HEAD(pch, pv_chunk) pvc_list;
+	int active_reclaims;
+} __aligned(CACHE_LINE_SIZE);
+
+struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
+
 #if VM_NRESERVLEVEL > 0
 struct pmap_large_md_page {
 	struct rwlock pv_lock;
@@ -2035,10 +2063,12 @@
 	}
 
 	/*
-	 * Initialize the pv chunk list mutex.
+	 * Initialize pv chunk lists.
 	 */
-	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
-
+	for (i = 0; i < PMAP_MEMDOM; i++) {
+		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF);
+		TAILQ_INIT(&pv_chunks[i].pvc_list);
+	}
 	pmap_init_pv_table();
 
 	pmap_initialized = 1;
@@ -4141,8 +4171,9 @@
  * exacerbating the shortage of free pv entries.
  */
 static vm_page_t
-reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
+reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
 {
+	struct pv_chunks_list *pvc;
 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
 	struct md_page *pvh;
@@ -4157,7 +4188,6 @@
 	uint64_t inuse;
 	int bit, field, freed;
 	bool start_di;
-	static int active_reclaims = 0;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
@@ -4177,10 +4207,11 @@
 	 */
 	start_di = pmap_not_in_di();
 
-	mtx_lock(&pv_chunks_mutex);
-	active_reclaims++;
-	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
-	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
+	pvc = &pv_chunks[domain];
+	mtx_lock(&pvc->pvc_lock);
+	pvc->active_reclaims++;
+	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
+	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
 	    SLIST_EMPTY(&free)) {
 		next_pmap = pc->pc_pmap;
@@ -4193,7 +4224,7 @@
 			 */
 			goto next_chunk;
 		}
-		mtx_unlock(&pv_chunks_mutex);
+		mtx_unlock(&pvc->pvc_lock);
 
 		/*
 		 * A pv_chunk can only be removed from the pc_lru list
@@ -4210,17 +4241,17 @@
 			PMAP_LOCK(pmap);
 			if (start_di)
 				pmap_delayed_invl_start();
-			mtx_lock(&pv_chunks_mutex);
+			mtx_lock(&pvc->pvc_lock);
 			continue;
 		} else if (pmap != locked_pmap) {
 			if (PMAP_TRYLOCK(pmap)) {
 				if (start_di)
 					pmap_delayed_invl_start();
-				mtx_lock(&pv_chunks_mutex);
+				mtx_lock(&pvc->pvc_lock);
 				continue;
 			} else {
 				pmap = NULL; /* pmap is not locked */
-				mtx_lock(&pv_chunks_mutex);
+				mtx_lock(&pvc->pvc_lock);
 				pc = TAILQ_NEXT(pc_marker, pc_lru);
 				if (pc == NULL ||
 				    pc->pc_pmap != next_pmap)
@@ -4277,7 +4308,7 @@
 			}
 		}
 		if (freed == 0) {
-			mtx_lock(&pv_chunks_mutex);
+			mtx_lock(&pvc->pvc_lock);
 			goto next_chunk;
 		}
 		/* Every freed mapping is for a 4 KB page. */
@@ -4294,19 +4325,19 @@
 			/* Entire chunk is free; return it. */
 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 			dump_drop_page(m_pc->phys_addr);
-			mtx_lock(&pv_chunks_mutex);
-			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+			mtx_lock(&pvc->pvc_lock);
+			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
 			break;
 		}
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
-		mtx_lock(&pv_chunks_mutex);
+		mtx_lock(&pvc->pvc_lock);
 		/* One freed pv entry in locked_pmap is sufficient. */
 		if (pmap == locked_pmap)
 			break;
 next_chunk:
-		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
-		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
-		if (active_reclaims == 1 && pmap != NULL) {
+		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
+		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
+		if (pvc->active_reclaims == 1 && pmap != NULL) {
 			/*
 			 * Rotate the pv chunks list so that we do not
 			 * scan the same pv chunks that could not be
@@ -4314,17 +4345,17 @@
 			 * and/or superpage mapping) on every
 			 * invocation of reclaim_pv_chunk().
 			 */
-			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
+			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) {
 				MPASS(pc->pc_pmap != NULL);
-				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
-				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
+				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
+				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
 			}
 		}
 	}
-	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
-	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
-	active_reclaims--;
-	mtx_unlock(&pv_chunks_mutex);
+	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
+	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
+	pvc->active_reclaims--;
+	mtx_unlock(&pvc->pvc_lock);
 	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 		m_pc = SLIST_FIRST(&free);
@@ -4336,6 +4367,23 @@
 	return (m_pc);
 }
 
+static vm_page_t
+reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
+{
+	vm_page_t m;
+	int i, domain;
+
+	domain = PCPU_GET(domain);
+	for (i = 0; i < vm_ndomains; i++) {
+		m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
+		if (m != NULL)
+			break;
+		domain = (domain + 1) % vm_ndomains;
+	}
+
+	return (m);
+}
+
 /*
  * free the pv_entry back to the free list
  */
@@ -4385,29 +4433,37 @@
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
+	struct pv_chunks_list *pvc;
 
-	mtx_lock(&pv_chunks_mutex);
-	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
-	mtx_unlock(&pv_chunks_mutex);
+	pvc = &pv_chunks[pc_to_domain(pc)];
+	mtx_lock(&pvc->pvc_lock);
+	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
+	mtx_unlock(&pvc->pvc_lock);
 	free_pv_chunk_dequeued(pc);
 }
 
 static void
 free_pv_chunk_batch(struct pv_chunklist *batch)
 {
+	struct pv_chunks_list *pvc;
 	struct pv_chunk *pc, *npc;
+	int i;
 
-	if (TAILQ_EMPTY(batch))
-		return;
-
-	mtx_lock(&pv_chunks_mutex);
-	TAILQ_FOREACH(pc, batch, pc_list) {
-		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
+	for (i = 0; i < vm_ndomains; i++) {
+		if (TAILQ_EMPTY(&batch[i]))
+			continue;
+		pvc = &pv_chunks[i];
+		mtx_lock(&pvc->pvc_lock);
+		TAILQ_FOREACH(pc, &batch[i], pc_list) {
+			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
+		}
+		mtx_unlock(&pvc->pvc_lock);
 	}
-	mtx_unlock(&pv_chunks_mutex);
-	TAILQ_FOREACH_SAFE(pc, batch, pc_list, npc) {
-		free_pv_chunk_dequeued(pc);
+	for (i = 0; i < vm_ndomains; i++) {
+		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
+			free_pv_chunk_dequeued(pc);
+		}
 	}
 }
@@ -4422,6 +4478,7 @@
 static pv_entry_t
 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
+	struct pv_chunks_list *pvc;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
@@ -4473,9 +4530,10 @@
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
-	mtx_lock(&pv_chunks_mutex);
-	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
-	mtx_unlock(&pv_chunks_mutex);
+	pvc = &pv_chunks[_vm_phys_domain(m->phys_addr)];
+	mtx_lock(&pvc->pvc_lock);
+	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
+	mtx_unlock(&pvc->pvc_lock);
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
@@ -4521,10 +4579,11 @@
 static void
 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 {
-	struct pch new_tail;
+	struct pv_chunks_list *pvc;
+	struct pch new_tail[PMAP_MEMDOM];
 	struct pv_chunk *pc;
 	vm_page_t m;
-	int avail, free;
+	int avail, free, i;
 	bool reclaimed;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -4536,7 +4595,8 @@
 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
 	 * contrast, these chunks must be added to the pmap upon allocation.
 	 */
-	TAILQ_INIT(&new_tail);
+	for (i = 0; i < PMAP_MEMDOM; i++)
+		TAILQ_INIT(&new_tail[i]);
 retry:
 	avail = 0;
 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
@@ -4571,7 +4631,7 @@
 		pc->pc_map[1] = PC_FREE1;
 		pc->pc_map[2] = PC_FREE2;
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
-		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
+		TAILQ_INSERT_TAIL(&new_tail[pc_to_domain(pc)], pc, pc_lru);
 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 
 		/*
@@ -4582,10 +4642,13 @@
 		if (reclaimed)
 			goto retry;
 	}
-	if (!TAILQ_EMPTY(&new_tail)) {
-		mtx_lock(&pv_chunks_mutex);
-		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
-		mtx_unlock(&pv_chunks_mutex);
+	for (i = 0; i < vm_ndomains; i++) {
+		if (TAILQ_EMPTY(&new_tail[i]))
+			continue;
+		pvc = &pv_chunks[i];
+		mtx_lock(&pvc->pvc_lock);
+		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
+		mtx_unlock(&pvc->pvc_lock);
 	}
 }
@@ -7011,7 +7074,7 @@
 	pt_entry_t *pte, tpte;
 	pt_entry_t PG_M, PG_RW, PG_V;
 	struct spglist free;
-	struct pv_chunklist free_chunks;
+	struct pv_chunklist free_chunks[PMAP_MEMDOM];
 	vm_page_t m, mpte, mt;
 	pv_entry_t pv;
 	struct md_page *pvh;
@@ -7019,7 +7082,7 @@
 	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
-	int allfree, field, freed, idx;
+	int allfree, field, freed, i, idx;
 	boolean_t superpage;
 	vm_paddr_t pa;
 
@@ -7047,7 +7110,8 @@
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
-	TAILQ_INIT(&free_chunks);
+	for (i = 0; i < PMAP_MEMDOM; i++)
+		TAILQ_INIT(&free_chunks[i]);
 	SLIST_INIT(&free);
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
@@ -7175,14 +7239,14 @@
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
-			TAILQ_INSERT_TAIL(&free_chunks, pc, pc_list);
+			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list);
 		}
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	pmap_invalidate_all(pmap);
 	pmap_pkru_deassign_all(pmap);
-	free_pv_chunk_batch(&free_chunks);
+	free_pv_chunk_batch((struct pv_chunklist *)&free_chunks);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, true);
 }
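
Reviewer note: the pattern above replaces the single global pv_chunks list and its mutex with one LRU per memory domain, each behind its own cache-line-aligned lock, plus a reclaim wrapper that starts at the calling CPU's domain and falls back to the other domains round-robin. The standalone userspace sketch below illustrates that pattern; the names (chunk, chunk_list, reclaim), the fixed NDOM standing in for PMAP_MEMDOM, and the pthread mutexes standing in for mtx(9) are all illustrative, not part of the patch.

/*
 * Sketch of per-domain chunk lists with round-robin reclaim.
 * Hypothetical names; compile with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>
#include <sys/queue.h>

#define	NDOM	4

struct chunk {
	int domain;			/* domain of the page backing this chunk */
	TAILQ_ENTRY(chunk) link;
};

/* One LRU per domain, each behind its own lock (cf. struct pv_chunks_list). */
static struct chunk_list {
	pthread_mutex_t lock;
	TAILQ_HEAD(, chunk) head;
} chunks[NDOM];

static void
chunk_insert(struct chunk *c)
{
	struct chunk_list *cl = &chunks[c->domain];

	/* Only the owning domain's lock is taken, as in get_pv_entry(). */
	pthread_mutex_lock(&cl->lock);
	TAILQ_INSERT_TAIL(&cl->head, c, link);
	pthread_mutex_unlock(&cl->lock);
}

/* Reclaim the oldest chunk of one domain; NULL if that list is empty. */
static struct chunk *
reclaim_domain(int domain)
{
	struct chunk_list *cl = &chunks[domain];
	struct chunk *c;

	pthread_mutex_lock(&cl->lock);
	if ((c = TAILQ_FIRST(&cl->head)) != NULL)
		TAILQ_REMOVE(&cl->head, c, link);
	pthread_mutex_unlock(&cl->lock);
	return (c);
}

/*
 * Prefer the local domain, then rotate through the others, mirroring the
 * new reclaim_pv_chunk() wrapper.
 */
static struct chunk *
reclaim(int local_domain)
{
	struct chunk *c = NULL;
	int dom, i;

	dom = local_domain;
	for (i = 0; i < NDOM; i++) {
		if ((c = reclaim_domain(dom)) != NULL)
			break;
		dom = (dom + 1) % NDOM;
	}
	return (c);
}

int
main(void)
{
	struct chunk c = { .domain = 2 };
	int i;

	for (i = 0; i < NDOM; i++) {
		pthread_mutex_init(&chunks[i].lock, NULL);
		TAILQ_INIT(&chunks[i].head);
	}
	chunk_insert(&c);
	/* Domains 0 and 1 are empty, so reclaim(0) falls through to 2. */
	printf("reclaimed from domain %d\n", reclaim(0)->domain);
	return (0);
}

The point of the per-domain split is that insertion and batched freeing only take the lock of the domain backing the chunk's page, so threads on different domains no longer serialize on one global mutex; the round-robin fallback keeps reclaim correct when the local domain has nothing to give back.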