
D24217.id83480.diff

diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -121,6 +121,7 @@
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
+#include <sys/obm.h>
#include <sys/proc.h>
#include <sys/rangeset.h>
#include <sys/rwlock.h>
@@ -174,6 +175,13 @@
#define PMAP_MEMDOM 1
#endif
+#define PC_FREE0 0xfffffffffffffffful
+#define PC_FREE1 0xfffffffffffffffful
+#define PC_FREE2 0x000000fffffffffful
+
+_Static_assert(sizeof(struct pv_chunk) <= PAGE_SIZE, "_NPCM too large");
+_Static_assert(NBBY * sizeof(uint64_t) * _NPCM >= _NPCPV, "_NPCM too small");
+
static __inline boolean_t
pmap_type_guest(pmap_t pmap)
{
@@ -319,6 +327,9 @@
#define PMAP_INLINE
#endif
+static TAILQ_HEAD(, pmap) all_pmaps;
+static struct mtx all_pmaps_lock;
+
#ifdef PV_STATS
#define PV_STAT(x) do { x ; } while (0)
#else
@@ -334,51 +345,46 @@
})
#define pa_to_pmdp(pa) (&pv_table[pa_index(pa)])
#define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page))
-#define PHYS_TO_PV_LIST_LOCK(pa) ({ \
- struct rwlock *_lock; \
- if (__predict_false((pa) > pmap_last_pa)) \
- _lock = &pv_dummy_large.pv_lock; \
- else \
- _lock = &(pa_to_pmdp(pa)->pv_lock); \
- _lock; \
-})
#else
#define pa_index(pa) ((pa) >> PDRSHIFT)
#define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
#define NPV_LIST_LOCKS MAXCPU
-#define PHYS_TO_PV_LIST_LOCK(pa) \
- (&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
#endif
+#define PHYS_TO_PV_LIST_LOCK(pa) PHYS_TO_VM_PAGE(pa)
+
#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
- struct rwlock **_lockp = (lockp); \
- struct rwlock *_new_lock; \
+ vm_page_t _m; \
+ \
+ _m = PHYS_TO_VM_PAGE(pa); \
+ if (_m == NULL) \
+ _m = &pv_fake_page; \
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, _m); \
+} while (0)
+
+#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) do { \
+ PVLL **_lockp = (lockp); \
\
- _new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
- if (_new_lock != *_lockp) { \
+ if (m != *_lockp) { \
if (*_lockp != NULL) \
- rw_wunlock(*_lockp); \
- *_lockp = _new_lock; \
- rw_wlock(*_lockp); \
+ pmap_pv_list_unlock(*_lockp); \
+ *_lockp = m; \
+ pmap_pv_list_lock(m); \
} \
} while (0)
-#define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
- CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
-
#define RELEASE_PV_LIST_LOCK(lockp) do { \
- struct rwlock **_lockp = (lockp); \
+ PVLL **_lockp = (lockp); \
\
if (*_lockp != NULL) { \
- rw_wunlock(*_lockp); \
+ pmap_pv_list_unlock(*_lockp); \
*_lockp = NULL; \
} \
} while (0)
-#define VM_PAGE_TO_PV_LIST_LOCK(m) \
- PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
+#define VM_PAGE_TO_PV_LIST_LOCK(m) (m)
struct pmap kernel_pmap_store;
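The hunk above replaces the per-domain rwlock pool with per-page PV list locks: PHYS_TO_PV_LIST_LOCK(pa) now resolves to the vm_page itself, and the lock-switching macros call pmap_pv_list_lock()/pmap_pv_list_unlock() on it. A minimal sketch of the fallback path for physical addresses not covered by vm_page_array[], mirroring CHANGE_PV_LIST_LOCK_TO_PHYS() and using pv_fake_page as defined later in this diff (example_lock_pv_list_by_pa is a hypothetical name, not part of the patch):

	static void
	example_lock_pv_list_by_pa(vm_paddr_t pa)
	{
		vm_page_t m;

		m = PHYS_TO_VM_PAGE(pa);
		if (m == NULL)
			m = &pv_fake_page; /* pa not covered by vm_page_array[] */
		pmap_pv_list_lock(m);
		/* ... manipulate the PV list keyed by pa ... */
		pmap_pv_list_unlock(m);
	}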
@@ -447,46 +453,161 @@
* Data for the pv entry allocation mechanism.
* Updates to pv_invl_gen are protected by the pv list lock but reads are not.
*/
-#ifdef NUMA
-static __inline int
-pc_to_domain(struct pv_chunk *pc)
-{
-
- return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
-}
-#else
-static __inline int
-pc_to_domain(struct pv_chunk *pc __unused)
-{
- return (0);
-}
-#endif
-
-struct pv_chunks_list {
- struct mtx pvc_lock;
- TAILQ_HEAD(pch, pv_chunk) pvc_list;
- int active_reclaims;
-} __aligned(CACHE_LINE_SIZE);
-
-struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
+typedef struct vm_page PVLL;
#ifdef NUMA
struct pmap_large_md_page {
- struct rwlock pv_lock;
- struct md_page pv_page;
+ struct lock_object lo;
+ uintptr_t pad;
+ struct md_page pv_page;
u_long pv_invl_gen;
};
+/*
+ * We only depend on the size being a power of two, so the assert
+ * is overzealous. However, should the struct be resized to a
+ * different power of two, the code below needs to be revisited.
+ */
+_Static_assert(sizeof(struct pmap_large_md_page) == 64, "pmap_large_md_page");
+
__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
#define pv_dummy pv_dummy_large.pv_page
__read_mostly static struct pmap_large_md_page *pv_table;
__read_mostly vm_paddr_t pmap_last_pa;
+static struct lock_object *
+pv_list_lock_object(vm_paddr_t pa)
+{
+ if (__predict_false(pa > pmap_last_pa))
+ return (&pv_dummy_large.lo);
+ return (&pa_to_pmdp(pa)->lo);
+}
#else
-static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
+static struct lock_object __exclusive_cache_line pv_lo[NPV_LIST_LOCKS];
static u_long pv_invl_gen[NPV_LIST_LOCKS];
static struct md_page *pv_table;
static struct md_page pv_dummy;
+static struct lock_object *
+pv_list_lock_object(vm_paddr_t pa)
+{
+ return (&pv_lo[pa_index(pa) % NPV_LIST_LOCKS]);
+}
#endif
+__read_mostly static struct vm_page pv_fake_page;
+
+#define pmap_pv_list_lock(m) do { \
+ vm_page_t _pvl_m = (m); \
+ obm_lock(&_pvl_m->md.pv_lock, \
+ pv_list_lock_object(VM_PAGE_TO_PHYS(_pvl_m))); \
+} while (0)
+
+#define pmap_pv_list_unlock(m) do { \
+ vm_page_t _pvl_m = (m); \
+ obm_unlock(&_pvl_m->md.pv_lock, \
+ pv_list_lock_object(VM_PAGE_TO_PHYS(_pvl_m))); \
+} while (0)
+
+/*
+ * Helper for pmap_pv_list_lock_pde(). The pte_locked argument
+ * indicates whether the PV list for m is already locked.
+ */
+static void
+pmap_pv_list_lock_pde1(vm_page_t m, bool pte_locked)
+{
+ vm_page_t mt, sm;
+ struct lock_object *lo;
+ int i;
+
+ if (pte_locked)
+ obm_assert_locked(&m->md.pv_lock);
+
+ sm = m - atop(VM_PAGE_TO_PHYS(m) & (PG_FRAME & PDRMASK));
+ lo = pv_list_lock_object(VM_PAGE_TO_PHYS(m));
+
+ if (pte_locked) {
+ /*
+ * Fast attempt. If we either own or can get the pv
+ * list lock of the first page in the superpage, all
+ * other owners must release their locks without
+ * waiting for us.
+ */
+ if (m == sm || obm_trylock(&sm->md.pv_lock, lo)) {
+ for (i = 1, mt = sm + 1; i < NPTEPG; i++, mt++) {
+ if (m != mt)
+ obm_lock(&mt->md.pv_lock, lo);
+ }
+ return;
+ }
+
+ obm_unlock(&m->md.pv_lock, lo);
+ }
+
+ for (i = 0, mt = sm; i < NPTEPG; i++, mt++) {
+ obm_lock(&mt->md.pv_lock, lo);
+ }
+}
+
+/*
+ * Locks all pv lists for ordinary pages constituting the superpage
+ * that contains the passed page.
+ */
+static void
+pmap_pv_list_lock_pde(vm_paddr_t pa, PVLL **lockp)
+{
+ vm_page_t m;
+
+ m = PHYS_TO_VM_PAGE(pa);
+ KASSERT(m != NULL,
+ ("pmap_pv_list_lock_pde: unmanaged phys addr %#lx", pa));
+
+ if (*lockp == NULL) {
+ pmap_pv_list_lock_pde1(m, false);
+ return;
+ }
+ if ((VM_PAGE_TO_PHYS(*lockp) & PG_PS_FRAME) != (pa & PG_PS_FRAME)) {
+ pmap_pv_list_unlock(*lockp);
+ *lockp = NULL;
+ pmap_pv_list_lock_pde1(m, false);
+ return;
+ }
+ pmap_pv_list_lock_pde1(*lockp, true);
+}
+
+/*
+ * Unlock all pv lists for ordinary pages constituting the superpage
+ * at the physical address pa.
+ *
+ * If *lockp points to one of the ordinary pages from the superpage we
+ * are demoting or promoting, then we keep this page's pv list locked
+ * after pmap_pv_list_unlock_pde(). Otherwise, we just unlock
+ * whatever was locked, and unlock the whole run of pages constituting
+ * the superpage in pmap_pv_list_unlock_pde().
+ */
+static void
+pmap_pv_list_unlock_pde(vm_paddr_t pa, PVLL **lockp)
+{
+ vm_page_t m, mt, sm;
+ struct lock_object *lo;
+ int i;
+ bool pte_locked;
+
+ m = *lockp;
+ pte_locked = m != NULL;
+ if (!pte_locked) {
+ m = PHYS_TO_VM_PAGE(pa);
+ if (m == NULL)
+ m = &pv_fake_page;
+ }
+
+ sm = m - atop(VM_PAGE_TO_PHYS(m) & (PG_FRAME & PDRMASK));
+ lo = pv_list_lock_object(VM_PAGE_TO_PHYS(m));
+ obm_assert_locked(&m->md.pv_lock);
+ obm_assert_locked(&sm->md.pv_lock);
+
+ for (i = 0, mt = sm; i < NPTEPG; i++, mt++) {
+ if (!pte_locked || mt != m)
+ obm_unlock(&mt->md.pv_lock, lo);
+ }
+}
/*
* All those kernel PT submaps that BSD is so fond of
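The helpers above replace the old rwlock-based PV list locking with per-page one-byte mutexes. Callers keep a cursor of type PVLL * (a vm_page_t) that records which page's PV list is currently held. A condensed sketch of that cursor pattern, as the later hunks (for example pmap_demote_pde() and pmap_remove_page()) use it; all names come from this diff and example_pv_cursor is only an illustrative wrapper:

	static void
	example_pv_cursor(vm_page_t m)
	{
		PVLL *lock;

		lock = NULL;
		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
		/* ... modify m->md.pv_list while the per-page lock is held ... */
		if (lock != NULL)
			pmap_pv_list_unlock(lock);
	}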
@@ -1172,7 +1293,7 @@
{
u_long gen, *m_gen;
- rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
+ obm_assert_locked(&m->md.pv_lock);
gen = curthread->td_md.md_invl_gen.gen;
if (gen == 0)
return;
@@ -1205,37 +1326,35 @@
static void free_pv_chunk(struct pv_chunk *pc);
static void free_pv_chunk_batch(struct pv_chunklist *batch);
static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
-static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
+static pv_entry_t get_pv_entry(pmap_t pmap, PVLL **lockp);
static int popcnt_pc_map_pq(uint64_t *map);
-static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
-static void reserve_pv_entries(pmap_t pmap, int needed,
- struct rwlock **lockp);
-static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
- struct rwlock **lockp);
+static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, PVLL **lockp,
+ bool avoid_locked_pmap);
+static void reserve_pv_entries(pmap_t pmap, int needed, PVLL **lockp);
+static void pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
static bool pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
- u_int flags, struct rwlock **lockp);
+ u_int flags, PVLL **lockp);
#if VM_NRESERVLEVEL > 0
-static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
- struct rwlock **lockp);
+static void pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
#endif
static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
- vm_offset_t va);
+ vm_offset_t va);
static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
vm_prot_t prot, int mode, int flags);
static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
- vm_offset_t va, struct rwlock **lockp);
+ vm_offset_t va, PVLL **lockp);
static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
vm_offset_t va);
static bool pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
- vm_prot_t prot, struct rwlock **lockp);
+ vm_prot_t prot, PVLL **lockp);
static int pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
- u_int flags, vm_page_t m, struct rwlock **lockp);
+ u_int flags, vm_page_t m, PVLL **lockp);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
- vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
+ vm_page_t m, vm_prot_t prot, vm_page_t mpte, PVLL **lockp);
static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
@@ -1243,13 +1362,13 @@
static void pmap_invalidate_cache_range_all(vm_offset_t sva,
vm_offset_t eva);
static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
- pd_entry_t pde);
+ pd_entry_t pde);
static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
static vm_page_t pmap_large_map_getptp_unlocked(void);
static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
#if VM_NRESERVLEVEL > 0
static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
- struct rwlock **lockp);
+ PVLL **lockp);
#endif
static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
vm_prot_t prot);
@@ -1260,29 +1379,28 @@
static pd_entry_t *pmap_pti_pde(vm_offset_t va);
static void pmap_pti_wire_pte(void *pte);
static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
- struct spglist *free, struct rwlock **lockp);
+ struct spglist *free, PVLL **lockp);
static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
- pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
+ pd_entry_t ptepde, struct spglist *free, PVLL **lockp);
static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
struct spglist *free);
static bool pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
- pd_entry_t *pde, struct spglist *free,
- struct rwlock **lockp);
+ pd_entry_t *pde, struct spglist *free, PVLL **lockp);
static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
- vm_page_t m, struct rwlock **lockp);
+ vm_page_t m, PVLL **lockp);
static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
pd_entry_t newpde);
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
- struct rwlock **lockp);
+ PVLL **lockp);
static vm_page_t pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex,
- struct rwlock **lockp, vm_offset_t va);
+ PVLL **lockp, vm_offset_t va);
static vm_page_t pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex,
- struct rwlock **lockp, vm_offset_t va);
+ PVLL **lockp, vm_offset_t va);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
- struct rwlock **lockp);
+ PVLL **lockp);
static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
struct spglist *free);
@@ -1830,6 +1948,9 @@
cr4 |= CR4_SMAP;
load_cr4(cr4);
+ TAILQ_INIT(&all_pmaps);
+ mtx_init(&all_pmaps_lock, "allpms", NULL, MTX_DEF);
+
/*
* Initialize the kernel pmap (which is statically allocated).
* Count bootstrap data as being resident in case any of this data is
@@ -2148,6 +2269,7 @@
TAILQ_INIT(&m->md.pv_list);
m->md.pat_mode = PAT_WRITE_BACK;
+ obm_init(&m->md.pv_lock);
}
static int pmap_allow_2m_x_ept;
@@ -2204,13 +2326,6 @@
long start, end, highest, pv_npg;
int domain, i, j, pages;
- /*
- * We strongly depend on the size being a power of two, so the assert
- * is overzealous. However, should the struct be resized to a
- * different power of two, the code below needs to be revisited.
- */
- CTASSERT((sizeof(*pvd) == 64));
-
/*
* Calculate the size of the array.
*/
@@ -2245,12 +2360,13 @@
vm_page_t m = vm_page_alloc_domain(NULL, 0,
domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
if (m == NULL)
- panic("vm_page_alloc_domain failed for %lx\n", (vm_offset_t)pvd + j);
+ panic("vm_page_alloc_domain failed for %lx\n",
+ (vm_offset_t)pvd + j);
pmap_qenter((vm_offset_t)pvd + j, &m, 1);
}
for (j = 0; j < s / sizeof(*pvd); j++) {
- rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
+ obm_init_lo(&pvd->lo, "pmap pv list");
TAILQ_INIT(&pvd->pv_page.pv_list);
pvd->pv_page.pv_gen = 0;
pvd->pv_page.pat_mode = 0;
@@ -2259,8 +2375,18 @@
}
}
pvd = &pv_dummy_large;
- rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
+ obm_init_lo(&pvd->lo, "pmap pv list dummy");
TAILQ_INIT(&pvd->pv_page.pv_list);
+
+ /*
+ * Initialize pv_fake_page, which is used to make pv_list
+ * locking work for physical addresses not covered by
+ * vm_page_array[]. In particular, it is needed by
+ * pv_list_lock_object() and CHANGE_PV_LIST_LOCK_TO_PHYS().
+ */
+ pmap_page_init(&pv_fake_page);
+ pv_fake_page.phys_addr = pmap_last_pa + PAGE_SIZE;
+
pvd->pv_page.pv_gen = 0;
pvd->pv_page.pat_mode = 0;
pvd->pv_invl_gen = 0;
@@ -2276,7 +2402,7 @@
* Initialize the pool of pv list locks.
*/
for (i = 0; i < NPV_LIST_LOCKS; i++)
- rw_init(&pv_list_locks[i], "pmap pv list");
+ obm_init_lo(&pv_lo[i], "pmap pv list");
/*
* Calculate the size of the pv head table for superpages.
@@ -2292,6 +2418,10 @@
for (i = 0; i < pv_npg; i++)
TAILQ_INIT(&pv_table[i].pv_list);
TAILQ_INIT(&pv_dummy.pv_list);
+
+ /* See the explanation above for the NUMA case. */
+ pmap_page_init(&pv_fake_page);
+ pv_fake_page.phys_addr = vm_phys_segs[vm_phys_nsegs - 1].end + PAGE_SIZE;
}
#endif
@@ -2307,6 +2437,10 @@
vm_page_t m, mpte;
int error, i, ret, skz63;
+ /* Compiler cannot evaluate this at compile time. */
+ MPASS(__bitcount64(PC_FREE0) + __bitcount64(PC_FREE1) +
+ __bitcount64(PC_FREE2) == _NPCPV);
+
/* L1TF, reserve page @0 unconditionally */
vm_page_blacklist_add(0, bootverbose);
@@ -2396,13 +2530,6 @@
}
}
- /*
- * Initialize pv chunk lists.
- */
- for (i = 0; i < PMAP_MEMDOM; i++) {
- mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL, MTX_DEF);
- TAILQ_INIT(&pv_chunks[i].pvc_list);
- }
pmap_init_pv_table();
pmap_initialized = 1;
@@ -4061,6 +4188,22 @@
}
}
+void
+pmap_lock_init(pmap_t pmap)
+{
+ mtx_init(&pmap->pm_mtx, "pmap", NULL, MTX_DEF | MTX_DUPOK);
+
+ /*
+ * Add the pmap to the global list of pmaps, which is used
+ * during pv chunk reclamation. The pmap is never removed
+ * from the list, relying on type-stability of the vmspace
+ * zone.
+ */
+ mtx_lock(&all_pmaps_lock);
+ TAILQ_INSERT_TAIL(&all_pmaps, pmap, pm_allpmaps);
+ mtx_unlock(&all_pmaps_lock);
+}
+
void
pmap_pinit0(pmap_t pmap)
{
@@ -4279,8 +4422,7 @@
}
static pml4_entry_t *
-pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
- bool addref)
+pmap_allocpte_getpml4(pmap_t pmap, PVLL **lockp, vm_offset_t va, bool addref)
{
vm_pindex_t pml5index;
pml5_entry_t *pml5;
@@ -4316,8 +4458,7 @@
}
static pdp_entry_t *
-pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
- bool addref)
+pmap_allocpte_getpdp(pmap_t pmap, PVLL **lockp, vm_offset_t va, bool addref)
{
vm_page_t pdppg;
pml4_entry_t *pml4;
@@ -4390,7 +4531,7 @@
* since it is statically allocated by pmap_pinit() and not by pmap_allocpte().
*/
static vm_page_t
-pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
+pmap_allocpte_nosleep(pmap_t pmap, vm_pindex_t ptepindex, PVLL **lockp,
vm_offset_t va)
{
vm_pindex_t pml5index, pml4index;
@@ -4524,7 +4665,7 @@
* which prevents the page from being freed under us.
*/
static vm_page_t
-pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
+pmap_allocpte_alloc(pmap_t pmap, vm_pindex_t ptepindex, PVLL **lockp,
vm_offset_t va)
{
vm_page_t m;
@@ -4541,8 +4682,7 @@
}
static pd_entry_t *
-pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
- struct rwlock **lockp)
+pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp, PVLL **lockp)
{
pdp_entry_t *pdpe, PG_V;
pd_entry_t *pde;
@@ -4581,7 +4721,7 @@
}
static vm_page_t
-pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
+pmap_allocpte(pmap_t pmap, vm_offset_t va, PVLL **lockp)
{
vm_pindex_t ptepindex;
pd_entry_t *pd, PG_V;
@@ -4832,10 +4972,6 @@
* page management routines.
***************************************************/
-CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
-CTASSERT(_NPCM == 3);
-CTASSERT(_NPCPV == 168);
-
static __inline struct pv_chunk *
pv_to_chunk(pv_entry_t pv)
{
@@ -4845,10 +4981,6 @@
#define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
-#define PC_FREE0 0xfffffffffffffffful
-#define PC_FREE1 0xfffffffffffffffful
-#define PC_FREE2 0x000000fffffffffful
-
static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
#ifdef PV_STATS
@@ -4876,129 +5008,32 @@
"Current number of spare pv entries");
#endif
-static void
-reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
-{
-
- if (pmap == NULL)
- return;
- pmap_invalidate_all(pmap);
- if (pmap != locked_pmap)
- PMAP_UNLOCK(pmap);
- if (start_di)
- pmap_delayed_invl_finish();
-}
-
-/*
- * We are in a serious low memory condition. Resort to
- * drastic measures to free some pages so we can allocate
- * another pv entry chunk.
- *
- * Returns NULL if PV entries were reclaimed from the specified pmap.
- *
- * We do not, however, unmap 2mpages because subsequent accesses will
- * allocate per-page pv entries until repromotion occurs, thereby
- * exacerbating the shortage of free pv entries.
- */
-static vm_page_t
-reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
+static bool
+reclaim_pv_chunk_handle_pmap(pmap_t pmap, pmap_t locked_pmap,
+ bool avoid_locked_pmap, PVLL **lockp, struct spglist *free)
{
- struct pv_chunks_list *pvc;
- struct pv_chunk *pc, *pc_marker, *pc_marker_end;
- struct pv_chunk_header pc_marker_b, pc_marker_end_b;
+ struct pv_chunk *pc, *pcn;
+ pv_entry_t pv;
+ vm_offset_t va;
+ vm_page_t m, m_pc;
struct md_page *pvh;
pd_entry_t *pde;
- pmap_t next_pmap, pmap;
pt_entry_t *pte, tpte;
pt_entry_t PG_G, PG_A, PG_M, PG_RW;
- pv_entry_t pv;
- vm_offset_t va;
- vm_page_t m, m_pc;
- struct spglist free;
uint64_t inuse;
int bit, field, freed;
- bool start_di, restart;
-
- PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
- KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
- pmap = NULL;
- m_pc = NULL;
- PG_G = PG_A = PG_M = PG_RW = 0;
- SLIST_INIT(&free);
- bzero(&pc_marker_b, sizeof(pc_marker_b));
- bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
- pc_marker = (struct pv_chunk *)&pc_marker_b;
- pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
-
- /*
- * A delayed invalidation block should already be active if
- * pmap_advise() or pmap_remove() called this function by way
- * of pmap_demote_pde_locked().
- */
- start_di = pmap_not_in_di();
+ bool ret;
- pvc = &pv_chunks[domain];
- mtx_lock(&pvc->pvc_lock);
- pvc->active_reclaims++;
- TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
- TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
- while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
- SLIST_EMPTY(&free)) {
- next_pmap = pc->pc_pmap;
- if (next_pmap == NULL) {
- /*
- * The next chunk is a marker. However, it is
- * not our marker, so active_reclaims must be
- * > 1. Consequently, the next_chunk code
- * will not rotate the pv_chunks list.
- */
- goto next_chunk;
- }
- mtx_unlock(&pvc->pvc_lock);
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
- /*
- * A pv_chunk can only be removed from the pc_lru list
- * when both pc_chunks_mutex is owned and the
- * corresponding pmap is locked.
- */
- if (pmap != next_pmap) {
- restart = false;
- reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
- start_di);
- pmap = next_pmap;
- /* Avoid deadlock and lock recursion. */
- if (pmap > locked_pmap) {
- RELEASE_PV_LIST_LOCK(lockp);
- PMAP_LOCK(pmap);
- if (start_di)
- pmap_delayed_invl_start();
- mtx_lock(&pvc->pvc_lock);
- restart = true;
- } else if (pmap != locked_pmap) {
- if (PMAP_TRYLOCK(pmap)) {
- if (start_di)
- pmap_delayed_invl_start();
- mtx_lock(&pvc->pvc_lock);
- restart = true;
- } else {
- pmap = NULL; /* pmap is not locked */
- mtx_lock(&pvc->pvc_lock);
- pc = TAILQ_NEXT(pc_marker, pc_lru);
- if (pc == NULL ||
- pc->pc_pmap != next_pmap)
- continue;
- goto next_chunk;
- }
- } else if (start_di)
- pmap_delayed_invl_start();
- PG_G = pmap_global_bit(pmap);
- PG_A = pmap_accessed_bit(pmap);
- PG_M = pmap_modified_bit(pmap);
- PG_RW = pmap_rw_bit(pmap);
- if (restart)
- continue;
- }
+ ret = false;
+ PG_G = pmap_global_bit(pmap);
+ PG_A = pmap_accessed_bit(pmap);
+ PG_M = pmap_modified_bit(pmap);
+ PG_RW = pmap_rw_bit(pmap);
+ TAILQ_FOREACH_REVERSE_SAFE(pc, &pmap->pm_pvchunk, pvchunks,
+ pc_list, pcn) {
/*
* Destroy every non-wired, 4 KB page mapping in the chunk.
*/
@@ -5036,84 +5071,156 @@
}
pmap_delayed_invl_page(m);
pc->pc_map[field] |= 1UL << bit;
- pmap_unuse_pt(pmap, va, *pde, &free);
+ pmap_unuse_pt(pmap, va, *pde, free);
freed++;
}
}
- if (freed == 0) {
- mtx_lock(&pvc->pvc_lock);
- goto next_chunk;
- }
+ if (freed == 0)
+ continue;
+
/* Every freed mapping is for a 4 KB page. */
pmap_resident_count_dec(pmap, freed);
PV_STAT(atomic_add_long(&pv_entry_frees, freed));
PV_STAT(atomic_add_int(&pv_entry_spare, freed));
PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
- TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
pc->pc_map[2] == PC_FREE2) {
- PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
- PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
- PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
- /* Entire chunk is free; return it. */
- m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
- dump_drop_page(m_pc->phys_addr);
- mtx_lock(&pvc->pvc_lock);
- TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
- break;
+ if (!avoid_locked_pmap || locked_pmap != pmap) {
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ PV_STAT(atomic_subtract_int(&pv_entry_spare,
+ _NPCPV));
+ PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
+ PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
+ /* Entire chunk is free; return it. */
+ m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
+ (vm_offset_t)pc));
+ dump_drop_page(m_pc->phys_addr);
+ m_pc->ref_count = 0;
+ SLIST_INSERT_HEAD(free, m_pc, plinks.s.ss);
+ break;
+ }
+ } else {
+ /*
+ * Re-insert at head because allocator bails
+ * out if it finds fully populated chunk.
+ */
+ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
+ TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
}
- TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
- mtx_lock(&pvc->pvc_lock);
/* One freed pv entry in locked_pmap is sufficient. */
- if (pmap == locked_pmap)
+ if (pmap == locked_pmap) {
+ ret = true;
break;
-next_chunk:
- TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
- TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
- if (pvc->active_reclaims == 1 && pmap != NULL) {
- /*
- * Rotate the pv chunks list so that we do not
- * scan the same pv chunks that could not be
- * freed (because they contained a wired
- * and/or superpage mapping) on every
- * invocation of reclaim_pv_chunk().
- */
- while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker) {
- MPASS(pc->pc_pmap != NULL);
- TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
- TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
- }
}
}
- TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
- TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
- pvc->active_reclaims--;
- mtx_unlock(&pvc->pvc_lock);
- reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
- if (m_pc == NULL && !SLIST_EMPTY(&free)) {
- m_pc = SLIST_FIRST(&free);
- SLIST_REMOVE_HEAD(&free, plinks.s.ss);
- /* Recycle a freed page table page. */
- m_pc->ref_count = 1;
- }
- vm_page_free_pages_toq(&free, true);
- return (m_pc);
+ return (ret);
}
+/*
+ * We are in a serious low memory condition. Resort to
+ * drastic measures to free some pages so we can allocate
+ * another pv entry chunk.
+ *
+ * Returns NULL if PV entries were reclaimed from the specified pmap;
+ * otherwise, returns a free page to be used for a PV chunk.
+ *
+ * If avoid_locked_pmap is true, chunks are not freed from the
+ * locked_pmap (but pv entries are).
+ *
+ * We do not, however, unmap 2mpages because subsequent accesses will
+ * allocate per-page pv entries until repromotion occurs, thereby
+ * exacerbating the shortage of free pv entries.
+ */
static vm_page_t
-reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
+reclaim_pv_chunk(pmap_t locked_pmap, PVLL **lockp, bool avoid_locked_pmap)
{
vm_page_t m;
- int i, domain;
+ pmap_t next_pmap, pmap;
+ struct spglist free;
+ bool res, start_di;
- domain = PCPU_GET(domain);
- for (i = 0; i < vm_ndomains; i++) {
- m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
- if (m != NULL)
+ PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
+ KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
+ pmap = NULL;
+ m = NULL;
+ res = false;
+ SLIST_INIT(&free);
+
+ /*
+ * A delayed invalidation block should already be active if
+ * pmap_advise() or pmap_remove() called this function by way
+ * of pmap_demote_pde_locked().
+ */
+ start_di = pmap_not_in_di();
+
+ for (;;) {
+ /*
+ * A parallel reclaim_pv_chunk() could move our cursor
+ * to the end of the list, which causes earlier
+ * termination of the loop. Since all callers are
+ * prepared for reclaim_pv_chunk() failure, it only
+ * means that callers retry with the page allocator
+ * before trying to reclaim one more time.
+ */
+ mtx_lock(&all_pmaps_lock);
+ next_pmap = pmap == NULL ? TAILQ_FIRST(&all_pmaps) :
+ TAILQ_NEXT(pmap, pm_allpmaps);
+ mtx_unlock(&all_pmaps_lock);
+ if (next_pmap == NULL)
break;
- domain = (domain + 1) % vm_ndomains;
- }
+ pmap = next_pmap;
+
+ /*
+ * This lockless check is fine; we would either
+ * process a pmap without any pv chunks or skip a
+ * potentially consumable pmap. But it is still
+ * useful to cheaply skip freed pmaps which are kept
+ * on the list due to type stability.
+ */
+ if (pmap->pm_stats.resident_count == 0)
+ continue;
+
+ /* Avoid deadlock and lock recursion. */
+ if (pmap > locked_pmap) {
+ RELEASE_PV_LIST_LOCK(lockp);
+ PMAP_LOCK(pmap);
+ if (start_di)
+ pmap_delayed_invl_start();
+ } else if (pmap != locked_pmap) {
+ if (PMAP_TRYLOCK(pmap)) {
+ if (start_di)
+ pmap_delayed_invl_start();
+ } else {
+ /* The pmap is not locked, skip it. */
+ continue;
+ }
+ } else if (start_di)
+ pmap_delayed_invl_start();
+ if (pmap->pm_stats.resident_count != 0) {
+ res = reclaim_pv_chunk_handle_pmap(pmap, locked_pmap,
+ avoid_locked_pmap, lockp, &free);
+ }
+ pmap_invalidate_all(pmap);
+ if (pmap != locked_pmap)
+ PMAP_UNLOCK(pmap);
+ if (start_di)
+ pmap_delayed_invl_finish();
+ if (res || !SLIST_EMPTY(&free)) {
+ mtx_lock(&all_pmaps_lock);
+ TAILQ_REMOVE(&all_pmaps, pmap, pm_allpmaps);
+ TAILQ_INSERT_TAIL(&all_pmaps, pmap, pm_allpmaps);
+ mtx_unlock(&all_pmaps_lock);
+ break;
+ }
+ }
+ if (!res && !SLIST_EMPTY(&free)) {
+ m = SLIST_FIRST(&free);
+ SLIST_REMOVE_HEAD(&free, plinks.s.ss);
+ /* Recycle a freed page table page. */
+ m->ref_count = 1;
+ }
+ vm_page_free_pages_toq(&free, true);
return (m);
}
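For reference, the caller-side protocol around reclaim_pv_chunk() keeps its old shape: a chunk page is first requested from the page allocator, and reclamation is attempted only on failure. A condensed sketch of the retry loop as it appears in get_pv_entry() and reserve_pv_entries() below (avoid_locked_pmap is false when a single freed PV entry in the locked pmap suffices, and true when whole chunks must not be stolen from it):

	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
	    VM_ALLOC_WIRED);
	if (m == NULL) {
		m = reclaim_pv_chunk(pmap, lockp, false);
		if (m == NULL)
			goto retry;	/* PV entries were freed in this pmap. */
	}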
@@ -5149,7 +5256,7 @@
}
static void
-free_pv_chunk_dequeued(struct pv_chunk *pc)
+free_pv_chunk(struct pv_chunk *pc)
{
vm_page_t m;
@@ -5163,40 +5270,13 @@
vm_page_free(m);
}
-static void
-free_pv_chunk(struct pv_chunk *pc)
-{
- struct pv_chunks_list *pvc;
-
- pvc = &pv_chunks[pc_to_domain(pc)];
- mtx_lock(&pvc->pvc_lock);
- TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
- mtx_unlock(&pvc->pvc_lock);
- free_pv_chunk_dequeued(pc);
-}
-
static void
free_pv_chunk_batch(struct pv_chunklist *batch)
{
- struct pv_chunks_list *pvc;
struct pv_chunk *pc, *npc;
- int i;
-
- for (i = 0; i < vm_ndomains; i++) {
- if (TAILQ_EMPTY(&batch[i]))
- continue;
- pvc = &pv_chunks[i];
- mtx_lock(&pvc->pvc_lock);
- TAILQ_FOREACH(pc, &batch[i], pc_list) {
- TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
- }
- mtx_unlock(&pvc->pvc_lock);
- }
- for (i = 0; i < vm_ndomains; i++) {
- TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
- free_pv_chunk_dequeued(pc);
- }
+ TAILQ_FOREACH_SAFE(pc, batch, pc_list, npc) {
+ free_pv_chunk(pc);
}
}
@@ -5209,9 +5289,8 @@
* The given PV list lock may be released.
*/
static pv_entry_t
-get_pv_entry(pmap_t pmap, struct rwlock **lockp)
+get_pv_entry(pmap_t pmap, PVLL **lockp)
{
- struct pv_chunks_list *pvc;
int bit, field;
pv_entry_t pv;
struct pv_chunk *pc;
@@ -5251,7 +5330,7 @@
PV_STAT(pc_chunk_tryfail++);
return (NULL);
}
- m = reclaim_pv_chunk(pmap, lockp);
+ m = reclaim_pv_chunk(pmap, lockp, false);
if (m == NULL)
goto retry;
}
@@ -5263,10 +5342,6 @@
pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */
pc->pc_map[1] = PC_FREE1;
pc->pc_map[2] = PC_FREE2;
- pvc = &pv_chunks[vm_page_domain(m)];
- mtx_lock(&pvc->pvc_lock);
- TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
- mtx_unlock(&pvc->pvc_lock);
pv = &pc->pc_pventry[0];
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
PV_STAT(atomic_add_long(&pv_entry_count, 1));
@@ -5310,26 +5385,16 @@
* The given PV list lock may be released.
*/
static void
-reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
+reserve_pv_entries(pmap_t pmap, int needed, PVLL **lockp)
{
- struct pv_chunks_list *pvc;
- struct pch new_tail[PMAP_MEMDOM];
struct pv_chunk *pc;
vm_page_t m;
- int avail, free, i;
+ int avail, free;
bool reclaimed;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
- /*
- * Newly allocated PV chunks must be stored in a private list until
- * the required number of PV chunks have been allocated. Otherwise,
- * reclaim_pv_chunk() could recycle one of these chunks. In
- * contrast, these chunks must be added to the pmap upon allocation.
- */
- for (i = 0; i < PMAP_MEMDOM; i++)
- TAILQ_INIT(&new_tail[i]);
retry:
avail = 0;
TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
@@ -5350,7 +5415,7 @@
m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED);
if (m == NULL) {
- m = reclaim_pv_chunk(pmap, lockp);
+ m = reclaim_pv_chunk(pmap, lockp, true);
if (m == NULL)
goto retry;
reclaimed = true;
@@ -5364,8 +5429,6 @@
pc->pc_map[1] = PC_FREE1;
pc->pc_map[2] = PC_FREE2;
TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
- TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
- PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
/*
* The reclaim might have freed a chunk from the current pmap.
@@ -5375,14 +5438,6 @@
if (reclaimed)
goto retry;
}
- for (i = 0; i < vm_ndomains; i++) {
- if (TAILQ_EMPTY(&new_tail[i]))
- continue;
- pvc = &pv_chunks[i];
- mtx_lock(&pvc->pvc_lock);
- TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
- mtx_unlock(&pvc->pvc_lock);
- }
}
/*
@@ -5412,8 +5467,7 @@
* entries for each of the 4KB page mappings.
*/
static void
-pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
- struct rwlock **lockp)
+pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
struct md_page *pvh;
struct pv_chunk *pc;
@@ -5425,7 +5479,6 @@
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
KASSERT((pa & PDRMASK) == 0,
("pmap_pv_demote_pde: pa is not 2mpage aligned"));
- CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
/*
* Transfer the 2mpage's pv entry for this mapping to the first
@@ -5481,8 +5534,7 @@
* for the 2MB page mapping.
*/
static void
-pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
- struct rwlock **lockp)
+pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
{
struct md_page *pvh;
pv_entry_t pv;
@@ -5491,7 +5543,6 @@
KASSERT((pa & PDRMASK) == 0,
("pmap_pv_promote_pde: pa is not 2mpage aligned"));
- CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
/*
* Transfer the first page's pv entry for this mapping to the 2mpage's
@@ -5538,7 +5589,7 @@
*/
static boolean_t
pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
- struct rwlock **lockp)
+ PVLL **lockp)
{
pv_entry_t pv;
@@ -5561,7 +5612,7 @@
*/
static bool
pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
- struct rwlock **lockp)
+ PVLL **lockp)
{
struct md_page *pvh;
pv_entry_t pv;
@@ -5602,13 +5653,13 @@
static boolean_t
pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
{
- struct rwlock *lock;
+ PVLL *lock;
boolean_t rv;
lock = NULL;
rv = pmap_demote_pde_locked(pmap, pde, va, &lock);
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
return (rv);
}
@@ -5641,7 +5692,7 @@
static void
pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
- pd_entry_t oldpde, struct rwlock **lockp)
+ pd_entry_t oldpde, PVLL **lockp)
{
struct spglist free;
vm_offset_t sva;
@@ -5658,7 +5709,7 @@
static boolean_t
pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
- struct rwlock **lockp)
+ PVLL **lockp)
{
pd_entry_t newpde, oldpde;
pt_entry_t *firstpte, newpte;
@@ -5792,8 +5843,11 @@
/*
* Demote the PV entry.
*/
- if ((oldpde & PG_MANAGED) != 0)
- pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
+ if ((oldpde & PG_MANAGED) != 0) {
+ pmap_pv_list_lock_pde(oldpde & PG_PS_FRAME, lockp);
+ pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
+ pmap_pv_list_unlock_pde(oldpde & PG_PS_FRAME, lockp);
+ }
atomic_add_long(&pmap_pde_demotions, 1);
CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p",
@@ -5846,7 +5900,7 @@
*/
static int
pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
- struct spglist *free, struct rwlock **lockp)
+ struct spglist *free, PVLL **lockp)
{
struct md_page *pvh;
pd_entry_t oldpde;
@@ -5875,6 +5929,7 @@
eva = sva + NBPDR;
for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
va < eva; va += PAGE_SIZE, m++) {
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
vm_page_dirty(m);
if (oldpde & PG_A)
@@ -5907,7 +5962,7 @@
*/
static int
pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
- pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
+ pd_entry_t ptepde, struct spglist *free, PVLL **lockp)
{
struct md_page *pvh;
pt_entry_t oldpte, PG_A, PG_M, PG_RW;
@@ -5948,7 +6003,7 @@
pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
struct spglist *free)
{
- struct rwlock *lock;
+ PVLL *lock;
pt_entry_t *pte, PG_V;
PG_V = pmap_valid_bit(pmap);
@@ -5961,7 +6016,7 @@
lock = NULL;
pmap_remove_pte(pmap, pte, va, *pde, free, &lock);
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
pmap_invalidate_page(pmap, va);
}
@@ -5970,7 +6025,7 @@
*/
static bool
pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
- pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
+ pd_entry_t *pde, struct spglist *free, PVLL **lockp)
{
pt_entry_t PG_G, *pte;
vm_offset_t va;
@@ -6012,7 +6067,7 @@
void
pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
- struct rwlock *lock;
+ PVLL *lock;
vm_page_t mt;
vm_offset_t va_next;
pml5_entry_t *pml5e;
@@ -6150,7 +6205,7 @@
anyvalid = 1;
}
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
out:
if (anyvalid)
pmap_invalidate_all(pmap);
@@ -6178,7 +6233,7 @@
struct md_page *pvh;
pv_entry_t pv;
pmap_t pmap;
- struct rwlock *lock;
+ PVLL *lock;
pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
pd_entry_t *pde;
vm_offset_t va;
@@ -6192,16 +6247,16 @@
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
pa_to_pvh(VM_PAGE_TO_PHYS(m));
retry:
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
if (pvh_gen != pvh->pv_gen) {
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_UNLOCK(pmap);
goto retry;
}
@@ -6210,17 +6265,19 @@
pde = pmap_pde(pmap, va);
(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
PMAP_UNLOCK(pmap);
+ if (lock != m)
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
}
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
md_gen = m->md.pv_gen;
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_UNLOCK(pmap);
goto retry;
}
@@ -6252,7 +6309,7 @@
PMAP_UNLOCK(pmap);
}
vm_page_aflag_clear(m, PGA_WRITEABLE);
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
pmap_delayed_invl_wait(m);
vm_page_free_pages_toq(&free, true);
}
@@ -6487,8 +6544,7 @@
* identical characteristics.
*/
static void
-pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
- struct rwlock **lockp)
+pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va, PVLL **lockp)
{
pd_entry_t newpde;
pt_entry_t *firstpte, oldpte, pa, *pte;
@@ -6590,8 +6646,11 @@
/*
* Promote the pv entries.
*/
- if ((newpde & PG_MANAGED) != 0)
- pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
+ if ((newpde & PG_MANAGED) != 0) {
+ pmap_pv_list_lock_pde(newpde & PG_PS_FRAME, lockp);
+ pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
+ pmap_pv_list_unlock_pde(newpde & PG_PS_FRAME, lockp);
+ }
/*
* Propagate the PAT index to its proper position.
@@ -6726,7 +6785,7 @@
pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
u_int flags, int8_t psind)
{
- struct rwlock *lock;
+ PVLL *lock;
pd_entry_t *pde;
pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
pt_entry_t newpte, origpte;
@@ -6939,7 +6998,7 @@
pv = get_pv_entry(pmap, &lock);
pv->pv_va = va;
}
- CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
if ((newpte & PG_RW) != 0)
@@ -6992,7 +7051,7 @@
rv = KERN_SUCCESS;
out:
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_UNLOCK(pmap);
return (rv);
}
@@ -7006,7 +7065,7 @@
*/
static bool
pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
- struct rwlock **lockp)
+ PVLL **lockp)
{
pd_entry_t newpde;
pt_entry_t PG_V;
@@ -7057,7 +7116,7 @@
*/
static int
pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
- vm_page_t m, struct rwlock **lockp)
+ vm_page_t m, PVLL **lockp)
{
struct spglist free;
pd_entry_t oldpde, *pde;
@@ -7205,7 +7264,7 @@
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
vm_page_t m_start, vm_prot_t prot)
{
- struct rwlock *lock;
+ PVLL *lock;
vm_offset_t va;
vm_page_t m, mpte;
vm_pindex_t diff, psize;
@@ -7230,7 +7289,7 @@
m = TAILQ_NEXT(m, listq);
}
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_UNLOCK(pmap);
}
@@ -7246,19 +7305,19 @@
void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
- struct rwlock *lock;
+ PVLL *lock;
lock = NULL;
PMAP_LOCK(pmap);
(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_UNLOCK(pmap);
}
static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
- vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
+ vm_prot_t prot, vm_page_t mpte, PVLL **lockp)
{
pt_entry_t newpte, *pte, PG_V;
@@ -7566,7 +7625,7 @@
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
vm_offset_t src_addr)
{
- struct rwlock *lock;
+ PVLL *lock;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t *pde, srcptepaddr;
@@ -7730,7 +7789,7 @@
}
out:
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_UNLOCK(src_pmap);
PMAP_UNLOCK(dst_pmap);
}
@@ -7845,7 +7904,7 @@
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
struct md_page *pvh;
- struct rwlock *lock;
+ PVLL *lock;
pv_entry_t pv;
int loops = 0;
boolean_t rv;
@@ -7854,7 +7913,7 @@
("pmap_page_exists_quick: page %p is not managed", m));
rv = FALSE;
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
- rw_rlock(lock);
+ pmap_pv_list_lock(lock);
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
if (PV_PMAP(pv) == pmap) {
rv = TRUE;
@@ -7876,7 +7935,7 @@
break;
}
}
- rw_runlock(lock);
+ pmap_pv_list_unlock(lock);
return (rv);
}
@@ -7889,7 +7948,7 @@
int
pmap_page_wired_mappings(vm_page_t m)
{
- struct rwlock *lock;
+ PVLL *lock;
struct md_page *pvh;
pmap_t pmap;
pt_entry_t *pte;
@@ -7899,16 +7958,16 @@
if ((m->oflags & VPO_UNMANAGED) != 0)
return (0);
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
- rw_rlock(lock);
+ pmap_pv_list_lock(lock);
restart:
count = 0;
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
- rw_runlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_rlock(lock);
+ pmap_pv_list_lock(lock);
if (md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
@@ -7926,9 +7985,9 @@
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
pvh_gen = pvh->pv_gen;
- rw_runlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_rlock(lock);
+ pmap_pv_list_lock(lock);
if (md_gen != m->md.pv_gen ||
pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
@@ -7941,7 +8000,7 @@
PMAP_UNLOCK(pmap);
}
}
- rw_runlock(lock);
+ pmap_pv_list_unlock(lock);
return (count);
}
@@ -7952,17 +8011,17 @@
boolean_t
pmap_page_is_mapped(vm_page_t m)
{
- struct rwlock *lock;
+ PVLL *lock;
boolean_t rv;
if ((m->oflags & VPO_UNMANAGED) != 0)
return (FALSE);
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
- rw_rlock(lock);
+ pmap_pv_list_lock(lock);
rv = !TAILQ_EMPTY(&m->md.pv_list) ||
((m->flags & PG_FICTITIOUS) == 0 &&
!TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
- rw_runlock(lock);
+ pmap_pv_list_unlock(lock);
return (rv);
}
@@ -7998,15 +8057,14 @@
pt_entry_t *pte, tpte;
pt_entry_t PG_M, PG_RW, PG_V;
struct spglist free;
- struct pv_chunklist free_chunks[PMAP_MEMDOM];
+ struct pv_chunklist free_chunks;
vm_page_t m, mpte, mt;
pv_entry_t pv;
struct md_page *pvh;
struct pv_chunk *pc, *npc;
- struct rwlock *lock;
int64_t bit;
uint64_t inuse, bitmask;
- int allfree, field, freed, i, idx;
+ int allfree, field, freed, idx;
boolean_t superpage;
vm_paddr_t pa;
@@ -8029,13 +8087,11 @@
}
#endif
- lock = NULL;
PG_M = pmap_modified_bit(pmap);
PG_V = pmap_valid_bit(pmap);
PG_RW = pmap_rw_bit(pmap);
- for (i = 0; i < PMAP_MEMDOM; i++)
- TAILQ_INIT(&free_chunks[i]);
+ TAILQ_INIT(&free_chunks);
SLIST_INIT(&free);
PMAP_LOCK(pmap);
TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
@@ -8117,12 +8173,11 @@
vm_page_dirty(m);
}
- CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
+ pmap_pv_list_lock(m);
/* Mark free */
pc->pc_map[field] |= bitmask;
if (superpage) {
- pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
pvh = pa_to_pvh(tpte & PG_PS_FRAME);
TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
pvh->pv_gen++;
@@ -8132,6 +8187,7 @@
TAILQ_EMPTY(&mt->md.pv_list))
vm_page_aflag_clear(mt, PGA_WRITEABLE);
}
+ pmap_pv_list_unlock(m);
mpte = pmap_remove_pt_page(pmap, pv->pv_va);
if (mpte != NULL) {
KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
@@ -8142,8 +8198,9 @@
mpte->ref_count = 0;
pmap_add_delayed_free_list(mpte, &free, FALSE);
}
+ pmap_resident_count_dec(pmap, NBPDR /
+ PAGE_SIZE);
} else {
- pmap_resident_count_dec(pmap, 1);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
m->md.pv_gen++;
if ((m->a.flags & PGA_WRITEABLE) != 0 &&
@@ -8153,6 +8210,8 @@
if (TAILQ_EMPTY(&pvh->pv_list))
vm_page_aflag_clear(m, PGA_WRITEABLE);
}
+ pmap_pv_list_unlock(m);
+ pmap_resident_count_dec(pmap, 1);
}
pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
freed++;
@@ -8163,11 +8222,9 @@
PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
if (allfree) {
TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
- TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc, pc_list);
+ TAILQ_INSERT_TAIL(&free_chunks, pc, pc_list);
}
}
- if (lock != NULL)
- rw_wunlock(lock);
pmap_invalidate_all(pmap);
pmap_pkru_deassign_all(pmap);
free_pv_chunk_batch((struct pv_chunklist *)&free_chunks);
@@ -8178,7 +8235,7 @@
static boolean_t
pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
{
- struct rwlock *lock;
+ PVLL *lock;
pv_entry_t pv;
struct md_page *pvh;
pt_entry_t *pte, mask;
@@ -8189,15 +8246,15 @@
rv = FALSE;
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
- rw_rlock(lock);
+ pmap_pv_list_lock(lock);
restart:
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
- rw_runlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_rlock(lock);
+ pmap_pv_list_lock(lock);
if (md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
@@ -8227,9 +8284,9 @@
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
pvh_gen = pvh->pv_gen;
- rw_runlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_rlock(lock);
+ pmap_pv_list_lock(lock);
if (md_gen != m->md.pv_gen ||
pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
@@ -8255,7 +8312,7 @@
}
}
out:
- rw_runlock(lock);
+ pmap_pv_list_unlock(lock);
return (rv);
}
@@ -8328,7 +8385,7 @@
{
struct md_page *pvh;
pmap_t pmap;
- struct rwlock *lock;
+ PVLL *lock;
pv_entry_t next_pv, pv;
pd_entry_t *pde;
pt_entry_t oldpte, *pte, PG_M, PG_RW;
@@ -8346,17 +8403,18 @@
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
pa_to_pvh(VM_PAGE_TO_PHYS(m));
retry_pv_loop:
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
+retry_pv_loop_locked:
+ pvh_gen = pvh->pv_gen;
TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
- pvh_gen = pvh->pv_gen;
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
if (pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
goto retry_pv_loop;
}
}
@@ -8365,23 +8423,24 @@
pde = pmap_pde(pmap, va);
if ((*pde & PG_RW) != 0)
(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
- KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
- ("inconsistent pv lock %p %p for page %p",
- lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
PMAP_UNLOCK(pmap);
+ if (lock != m || pvh_gen != pvh->pv_gen) {
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
+ goto retry_pv_loop_locked;
+ }
}
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
md_gen = m->md.pv_gen;
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
if (pvh_gen != pvh->pv_gen ||
md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
goto retry_pv_loop;
}
}
@@ -8404,7 +8463,7 @@
}
PMAP_UNLOCK(pmap);
}
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
vm_page_aflag_clear(m, PGA_WRITEABLE);
pmap_delayed_invl_wait(m);
}
@@ -8462,7 +8521,7 @@
struct md_page *pvh;
pv_entry_t pv, pvf;
pmap_t pmap;
- struct rwlock *lock;
+ PVLL *lock;
pd_entry_t oldpde, *pde;
pt_entry_t *pte, PG_A, PG_M, PG_RW;
vm_offset_t va;
@@ -8478,21 +8537,21 @@
pa = VM_PAGE_TO_PHYS(m);
lock = PHYS_TO_PV_LIST_LOCK(pa);
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
retry:
not_cleared = 0;
if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
goto small_mappings;
pv = pvf;
do {
+ pvh_gen = pvh->pv_gen;
if (pvf == NULL)
pvf = pv;
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
- pvh_gen = pvh->pv_gen;
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
if (pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
goto retry;
@@ -8557,7 +8616,6 @@
pmap_invalidate_page(pmap, va);
} else
demoted = TRUE;
-
if (demoted) {
/*
* The superpage mapping was removed
@@ -8569,9 +8627,10 @@
pv = NULL;
}
cleared++;
- KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
- ("inconsistent pv lock %p %p for page %p",
- lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
+ if (lock != m || pvh_gen != pvh->pv_gen) {
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
+ goto retry;
+ }
} else
not_cleared++;
}
@@ -8596,9 +8655,9 @@
if (!PMAP_TRYLOCK(pmap)) {
pvh_gen = pvh->pv_gen;
md_gen = m->md.pv_gen;
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto retry;
@@ -8649,7 +8708,7 @@
} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
not_cleared < PMAP_TS_REFERENCED_MAX);
out:
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
vm_page_free_pages_toq(&free, true);
return (cleared + not_cleared);
}
@@ -8662,7 +8721,7 @@
void
pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
{
- struct rwlock *lock;
+ PVLL *lock;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t oldpde, *pde;
@@ -8727,7 +8786,7 @@
lock = NULL;
if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
/*
* The large page mapping was destroyed.
@@ -8759,7 +8818,7 @@
anychanged = true;
}
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
}
if (va_next > eva)
va_next = eva;
@@ -8816,7 +8875,7 @@
pv_entry_t next_pv, pv;
pd_entry_t oldpde, *pde;
pt_entry_t *pte, PG_M, PG_RW;
- struct rwlock *lock;
+ PVLL *lock;
vm_offset_t va;
int md_gen, pvh_gen;
@@ -8829,15 +8888,15 @@
pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
pa_to_pvh(VM_PAGE_TO_PHYS(m));
lock = VM_PAGE_TO_PV_LIST_LOCK(m);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
restart:
+ pvh_gen = pvh->pv_gen;
TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
- pvh_gen = pvh->pv_gen;
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
if (pvh_gen != pvh->pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
@@ -8863,15 +8922,19 @@
pmap_invalidate_page(pmap, va);
}
PMAP_UNLOCK(pmap);
+ if (lock != m || pvh_gen != pvh->pv_gen) {
+ CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
+ goto restart;
+ }
}
TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
pmap = PV_PMAP(pv);
if (!PMAP_TRYLOCK(pmap)) {
md_gen = m->md.pv_gen;
pvh_gen = pvh->pv_gen;
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_LOCK(pmap);
- rw_wlock(lock);
+ pmap_pv_list_lock(lock);
if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
PMAP_UNLOCK(pmap);
goto restart;
@@ -8889,7 +8952,7 @@
}
PMAP_UNLOCK(pmap);
}
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
}
/*
@@ -9814,7 +9877,7 @@
pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
{
int rv;
- struct rwlock *lock;
+ PVLL *lock;
#if VM_NRESERVLEVEL > 0
vm_page_t m, mpte;
#endif
@@ -9901,7 +9964,7 @@
rv = 0; /* success */
done:
if (lock != NULL)
- rw_wunlock(lock);
+ pmap_pv_list_unlock(lock);
PMAP_UNLOCK(pmap);
return (rv);
}
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -268,6 +268,7 @@
#include <sys/_cpuset.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
+#include <sys/_obm.h>
#include <sys/_pctrie.h>
#include <sys/_rangeset.h>
#include <sys/_smr.h>
@@ -342,7 +343,9 @@
struct md_page {
TAILQ_HEAD(, pv_entry) pv_list; /* (p) */
int pv_gen; /* (p) */
- int pat_mode;
+ obm_lock_t pv_lock;
+ uint8_t pat_mode;
+ uint8_t pad0[2];
};
enum pmap_type {
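With obm_lock_t expected to be a single byte (sys/_obm.h is truncated in this diff, so that size is an assumption), the new pv_lock, the narrowed pat_mode, and the two bytes of explicit padding keep struct md_page at its previous 24-byte size: a 16-byte TAILQ_HEAD plus a 4-byte pv_gen plus 1 + 1 + 2 bytes. A hypothetical compile-time check, in the style this diff already uses for pmap_large_md_page, would be:

	_Static_assert(sizeof(struct md_page) == 24, "md_page size changed");

The diff itself does not add such an assert; it is shown here only to make the layout reasoning explicit.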
@@ -356,6 +359,8 @@
uint32_t pm_gen;
};
+TAILQ_HEAD(pvchunks, pv_chunk);
+
/*
* The kernel virtual address (KVA) of the level 4 page table page is always
* within the direct map (DMAP) region.
@@ -366,7 +371,7 @@
pml4_entry_t *pm_pmltopu; /* KVA of user top page table */
uint64_t pm_cr3;
uint64_t pm_ucr3;
- TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
+ struct pvchunks pm_pvchunk; /* list of mappings in pmap */
cpuset_t pm_active; /* active on cpus */
enum pmap_type pm_type; /* regular or nested tables */
struct pmap_statistics pm_stats; /* pmap statistics */
@@ -376,6 +381,7 @@
int pm_flags;
struct pmap_pcids pm_pcids[MAXCPU];
struct rangeset pm_pkru;
+ TAILQ_ENTRY(pmap) pm_allpmaps;
};
/* flags */
@@ -394,8 +400,7 @@
#define PMAP_LOCK_ASSERT(pmap, type) \
mtx_assert(&(pmap)->pm_mtx, (type))
#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx)
-#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \
- NULL, MTX_DEF | MTX_DUPOK)
+#define PMAP_LOCK_INIT(pmap) pmap_lock_init(pmap)
#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx)
#define PMAP_MTX(pmap) (&(pmap)->pm_mtx)
#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx)
@@ -423,8 +428,7 @@
#define PV_CHUNK_HEADER \
pmap_t pc_pmap; \
TAILQ_ENTRY(pv_chunk) pc_list; \
- uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ \
- TAILQ_ENTRY(pv_chunk) pc_lru;
+ uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */
struct pv_chunk_header {
PV_CHUNK_HEADER
@@ -469,6 +473,7 @@
int pmap_large_map(vm_paddr_t, vm_size_t, void **, vm_memattr_t);
void pmap_large_map_wb(void *sva, vm_size_t len);
void pmap_large_unmap(void *sva, vm_size_t len);
+void pmap_lock_init(pmap_t pmap);
void *pmap_mapbios(vm_paddr_t, vm_size_t);
void *pmap_mapdev(vm_paddr_t, vm_size_t);
void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int);
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3833,6 +3833,7 @@
kern/kern_mtxpool.c standard
kern/kern_mutex.c standard
kern/kern_ntptime.c standard
+kern/kern_obm.c standard
kern/kern_osd.c standard
kern/kern_physio.c standard
kern/kern_pmc.c standard
diff --git a/sys/kern/kern_obm.c b/sys/kern/kern_obm.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_obm.c
@@ -0,0 +1,161 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/obm.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/turnstile.h>
+#include <machine/atomic.h>
+
+void
+obm_init_lo(struct lock_object *lo, const char *name)
+{
+ bzero(lo, sizeof(*lo));
+ lo->lo_name = name;
+}
+
+void
+obm_init(obm_lock_t *obm)
+{
+ obm->lk = OBM_UNLOCKED;
+}
+
+static void __noinline
+obm_lock_hard(obm_lock_t *obm, struct lock_object *lo, uint8_t v LOCK_FILE_LINE_ARG_DEF)
+{
+ struct turnstile *ts;
+#ifdef LOCK_PROFILING
+ int contested = 0;
+ uint64_t waittime = 0;
+#endif
+
+#ifdef LOCK_PROFILING
+ lock_profile_obtain_lock_failed(lo, &contested, &waittime);
+#endif
+
+ for (;;) {
+ if (v == OBM_UNLOCKED) {
+ if (atomic_fcmpset_acq_char(&obm->lk, &v,
+ OBM_LOCKED) != 0)
+ break;
+ continue;
+ }
+
+ ts = turnstile_trywait(lo);
+ v = atomic_load_8(&obm->lk);
+retry_ts:
+ if (v == OBM_UNLOCKED) {
+ turnstile_cancel(ts);
+ continue;
+ }
+ if ((v & OBM_CONTESTED) == 0 &&
+ atomic_fcmpset_8(&obm->lk, &v, v | OBM_CONTESTED) == 0) {
+ goto retry_ts;
+ }
+ turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
+ v = atomic_load_8(&obm->lk);
+ }
+
+#ifdef LOCK_PROFILING
+ lock_profile_obtain_lock_success(lo, contested, waittime, file, line);
+#endif
+ TD_LOCKS_INC(curthread);
+}
+
+static void __noinline
+obm_unlock_hard(obm_lock_t *obm, struct lock_object *lo)
+{
+ struct turnstile *ts;
+
+ lock_profile_release_lock(lo);
+ turnstile_chain_lock(lo);
+ atomic_store_rel_8(&obm->lk, OBM_UNLOCKED);
+ ts = turnstile_lookup(lo);
+ if (ts != NULL) {
+ turnstile_broadcast(ts, TS_SHARED_QUEUE);
+ turnstile_unpend(ts);
+ }
+ turnstile_chain_unlock(lo);
+ TD_LOCKS_DEC(curthread);
+}
+
+bool
+_obm_trylock(obm_lock_t *obm, struct lock_object *lo LOCK_FILE_LINE_ARG_DEF)
+{
+ if (atomic_cmpset_acq_8(&obm->lk, OBM_UNLOCKED, OBM_LOCKED) != 0) {
+#ifdef LOCK_PROFILING
+ lock_profile_obtain_lock_success(lo, 0, 0, file, line);
+#endif
+ TD_LOCKS_INC(curthread);
+ return (true);
+ }
+ return (false);
+}
+
+void
+_obm_lock(obm_lock_t *obm, struct lock_object *lo LOCK_FILE_LINE_ARG_DEF)
+{
+ uint8_t v;
+
+ v = OBM_UNLOCKED;
+ if (__predict_true(atomic_fcmpset_acq_8(&obm->lk, &v, OBM_LOCKED))) {
+#ifdef LOCK_PROFILING
+ lock_profile_obtain_lock_success(lo, 0, 0, file, line);
+#endif
+ TD_LOCKS_INC(curthread);
+ } else {
+ MPASS(v == OBM_LOCKED || v == (OBM_LOCKED | OBM_CONTESTED) ||
+ v == OBM_UNLOCKED);
+ obm_lock_hard(obm, lo, v LOCK_FILE_LINE_ARG);
+ }
+}
+
+void
+_obm_unlock(obm_lock_t *obm, struct lock_object *lo LOCK_FILE_LINE_ARG_DEF)
+{
+ uint8_t v;
+
+ v = OBM_LOCKED;
+ if (atomic_fcmpset_rel_8(&obm->lk, &v, OBM_UNLOCKED)) {
+ lock_profile_release_lock(lo);
+ TD_LOCKS_DEC(curthread);
+ } else {
+ MPASS(v == OBM_LOCKED || v == (OBM_LOCKED | OBM_CONTESTED));
+ obm_unlock_hard(obm, lo);
+ }
+}
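The new file above carries the whole OBM implementation: the fast paths in _obm_lock()/_obm_unlock() are a single fcmpset on the lock byte, and the __noinline slow paths park waiters on a turnstile keyed by the caller-supplied lock_object. A minimal usage sketch follows, assuming a consumer that embeds an obm_lock_t in each object and shares one struct lock_object for turnstile and lock-profiling bookkeeping; the names foo, foo_lo, foo_sysinit, foo_init and foo_bump are hypothetical and not part of this diff.

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/obm.h>

struct foo {
	obm_lock_t	f_lock;		/* one byte, protects f_count */
	int		f_count;
};

/* One lock_object shared by every foo lock; used only for bookkeeping. */
static struct lock_object foo_lo;

static void
foo_sysinit(void)
{
	/* Run once, before the first foo lock operation. */
	obm_init_lo(&foo_lo, "foo obm");
}

static void
foo_init(struct foo *fp)
{
	obm_init(&fp->f_lock);
	fp->f_count = 0;
}

static void
foo_bump(struct foo *fp)
{
	obm_lock(&fp->f_lock, &foo_lo);	/* blocks on the turnstile if contended */
	fp->f_count++;
	obm_unlock(&fp->f_lock, &foo_lo);
}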
diff --git a/sys/sys/_obm.h b/sys/sys/_obm.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/_obm.h
@@ -0,0 +1,47 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS__OBM_H
+#define _SYS__OBM_H
+
+/* One-Byte Lock */
+
+#define OBM_UNLOCKED 0x00
+#define OBM_LOCKED 0x02
+#define OBM_CONTESTED 0x01
+
+typedef struct obm_lock_tag {
+ uint8_t lk;
+} obm_lock_t;
+
+#endif
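A short annotation of the lock-byte encoding defined above; this reading is derived from the constants and the MPASS() checks in kern_obm.c rather than stated explicitly in the diff:

/*
 * State space of obm_lock_t.lk:
 *   0x00  OBM_UNLOCKED                  free
 *   0x02  OBM_LOCKED                    held, no recorded waiters
 *   0x03  OBM_LOCKED | OBM_CONTESTED    held, waiters blocked on the turnstile
 *
 * 0x01 (OBM_CONTESTED alone) is never a steady state: obm_unlock_hard()
 * stores plain OBM_UNLOCKED after waking waiters, which is what the
 * assertions in _obm_lock() and _obm_unlock() rely on.
 */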
diff --git a/sys/sys/obm.h b/sys/sys/obm.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/obm.h
@@ -0,0 +1,88 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2020 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_OBM_H
+#define _SYS_OBM_H
+
+/* One-Byte Lock */
+
+#ifdef _KERNEL
+
+#include <sys/systm.h>
+#include <sys/_lock.h>
+#include <sys/_obm.h>
+#include <sys/proc.h>
+#include <machine/atomic.h>
+
+#ifndef LOCK_DEBUG
+#error "LOCK_DEBUG not defined, include <sys/lock.h> before <sys/obm.h>"
+#endif
+
+void obm_init_lo(struct lock_object *lo, const char *name);
+void obm_init(obm_lock_t *obm);
+bool _obm_trylock(obm_lock_t *obm, struct lock_object *lo
+ LOCK_FILE_LINE_ARG_DEF);
+void _obm_lock(obm_lock_t *obm, struct lock_object *lo
+ LOCK_FILE_LINE_ARG_DEF);
+void _obm_unlock(obm_lock_t *obm, struct lock_object *lo
+ LOCK_FILE_LINE_ARG_DEF);
+
+__used static void
+obm_assert_locked(obm_lock_t *obm)
+{
+#ifdef INVARIANTS
+ uint8_t v;
+
+ v = atomic_load_8(&obm->lk);
+ MPASS(v == OBM_LOCKED || v == (OBM_LOCKED | OBM_CONTESTED));
+#endif
+}
+
+#if (LOCK_DEBUG > 0)
+#define obm_trylock(obm, lo) \
+ _obm_trylock(obm, lo, __FILE__, __LINE__)
+#define obm_lock(obm, lo) \
+ _obm_lock(obm, lo, __FILE__, __LINE__)
+#define obm_unlock(obm, lo) \
+ _obm_unlock(obm, lo, __FILE__, __LINE__)
+#else
+#define obm_trylock(obm, lo) \
+ _obm_trylock(obm, lo)
+#define obm_lock(obm, lo) \
+ _obm_lock(obm, lo)
+#define obm_unlock(obm, lo) \
+ _obm_unlock(obm, lo)
+#endif
+
+#endif
+#endif
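Besides the blocking entry points, obm.h exposes obm_trylock() and the INVARIANTS-only obm_assert_locked(). A hedged sketch of the non-blocking pattern, again with hypothetical names (bar, bar_lo, bar_try_inc) that do not appear in this diff and a bar_lo assumed to have been set up once with obm_init_lo():

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/obm.h>

struct bar {
	obm_lock_t	b_lock;		/* protects b_count */
	u_int		b_count;
};

static struct lock_object bar_lo;	/* initialized once via obm_init_lo() */

static bool
bar_try_inc(struct bar *bp)
{
	/* Fail instead of sleeping on the turnstile when contended. */
	if (!obm_trylock(&bp->b_lock, &bar_lo))
		return (false);
	obm_assert_locked(&bp->b_lock);	/* no effect unless INVARIANTS is set */
	bp->b_count++;
	obm_unlock(&bp->b_lock, &bar_lo);
	return (true);
}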
