sys/amd64/amd64/pmap.c
… 310 lines elided …
#endif
#ifdef PV_STATS
#define PV_STAT(x) do { x ; } while (0)
#else
#define PV_STAT(x) do { } while (0)
#endif
#if VM_NRESERVLEVEL > 0
#define pa_index(pa) ((pa) >> PDRSHIFT)
#define pa_to_pmdp(pa) (&pv_table[pa_index(pa)])
kib: Can you add INVARIANTS versions of the macros which assert that we do not access beyond the array end? You would need to save the array size somewhere.
#define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page))
#define PHYS_TO_PV_LIST_LOCK(pa) \
	(&(pa_to_pmdp(pa)->pv_lock))
#else
#define pa_index(pa) ((pa) >> PDRSHIFT)
#define pa_to_pvh(pa) (&pv_table[pa_index(pa)])
#define NPV_LIST_LOCKS MAXCPU
jeff: I assume this define is dead.
mjg: No, but its user should probably be modified to use MAXCPU or similar:
	static u_long pv_invl_gen[NPV_LIST_LOCKS];
#define PHYS_TO_PV_LIST_LOCK(pa) \
	(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
#endif
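A minimal sketch of what kib's requested INVARIANTS variant could look like, assuming a hypothetical pv_npages global that records the table size (neither the global nor this checked macro is part of the patch):
```
#ifdef INVARIANTS
/* Hypothetical: pv_npages would be set where pv_table is sized. */
static u_long pv_npages;
#define	pa_to_pmdp(pa) __extension__ ({				\
	u_long _idx = pa_index(pa);				\
	KASSERT(_idx < pv_npages,				\
	    ("pa_to_pmdp: pa %#lx beyond pv_table end",		\
	    (u_long)(pa)));					\
	&pv_table[_idx];					\
})
#else
#define	pa_to_pmdp(pa)	(&pv_table[pa_index(pa)])
#endif
```
The statement-expression form keeps the check at each use site, where pv_table is already declared.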
#define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) do { \
	struct rwlock **_lockp = (lockp); \
	struct rwlock *_new_lock; \
\
	_new_lock = PHYS_TO_PV_LIST_LOCK(pa); \
	if (_new_lock != *_lockp) { \
		if (*_lockp != NULL) \
… 61 lines elided …

static struct pmap_preinit_mapping {
	vm_offset_t va;
	vm_size_t sz;
	int mode;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
static int pmap_initialized;
/*
 * Data for the pv entry allocation mechanism.
 * Updates to pv_invl_gen are protected by the pv list lock but reads are not.
 */
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
static struct mtx __exclusive_cache_line pv_chunks_mutex;
#if VM_NRESERVLEVEL > 0
struct pmap_large_md_page {
	struct rwlock pv_lock;
	struct md_page pv_page;
	u_long pv_invl_gen;
};
static struct pmap_large_md_page *pv_table;
#else
static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
static u_long pv_invl_gen[NPV_LIST_LOCKS];
static struct md_page *pv_table;
#endif
static struct md_page pv_dummy;
/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL;
caddr_t CADDR1 = 0;
static vm_offset_t qframe = 0;
… 494 lines elided …
static long invl_wait;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
    "Number of times DI invalidation blocked pmap_remove_all/write");
static long invl_wait_slow;
SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0,
    "Number of slow invalidation waits for lockless DI");
#endif
#if VM_NRESERVLEVEL > 0
static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{
	return (&pa_to_pmdp(VM_PAGE_TO_PHYS(m))->pv_invl_gen);
}
#else
static u_long *
pmap_delayed_invl_genp(vm_page_t m)
{
	return (&pv_invl_gen[pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS]);
}
#endif
static void
pmap_delayed_invl_callout_func(void *arg __unused)
{
	if (atomic_load_int(&pmap_invl_waiters) == 0)
		return;
	pmap_delayed_invl_finish_unblock(0);
… 863 lines elided …
void
pmap_page_init(vm_page_t m)
{
	TAILQ_INIT(&m->md.pv_list);
	m->md.pat_mode = PAT_WRITE_BACK;
}

#if VM_NRESERVLEVEL > 0
jeff: Do we need variants for reservation-enabled kernels and not? It seems like the same code should work for both.
mjg: It does, but the old one uses less memory and there is no difficulty keeping both.
static void
pmap_init_pv_table(void)
{
	struct pmap_large_md_page *pvd;
	vm_size_t s;
	int start, end, highest;
	int domain, i, j, pages, pv_npg;
kib: I believe j should be long as well. BTW, why not u_long?
mjg: int j gives coverage of 64 TB for one segment; I think that's good enough for a long time and is a little nicer next to i. A signed type allows the simpler loop with "highest = -1".
	/*
	 * Calculate the size of the array.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
kib (unsubmitted): Are you sure that int is enough there for all practical means? It seems that overflow happens at PB; I do not see why not use (u_)longs there.
	s = (vm_size_t)(pv_npg * sizeof(struct pmap_large_md_page));
kib (unsubmitted): I think it is better to cast pv_npg instead of casting the result, i.e. remove the () around the multiplication.
	s = round_page(s);
	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
	if (pv_table == NULL)
		panic("%s: kva_alloc failed\n", __func__);
jeff: I think you should fix this before this version goes in. There isn't any need for the double indirect. Just use kva_alloc() to allocate the memory. Then you can populate it yourself.
mjg: I noted this may be an easy option, but that turned out to be too naive. With the size of 48 bytes per object (as in: not a power of 2) we land in a situation where space for some superpages will end up overlapping with the previous domain and will therefore induce interconnect traffic. With the pointer array the waste is at 40 bytes (56 - 16 which are already used). Removing the array drops 8 bytes per entry, but does not help here. Thus the options which I see are as follows: […]
That said, I think the patch should go in with the current method unless someone is going to take care of the reservation stuff in the foreseeable future (in which case I'm happy to drop this patch).
jeff: Right now you're going to be touching remote memory (n-1)/n times anyway to follow your indirect table. You will only write to local memory, but you're still touching extra cache lines just to improve locality for a fraction of memory. For the pagetable pages I just accept that the unaligned area will be imperfect. If you allocate a page at a time you will have something like 4k / 64 bytes per-super * 2 MB super = 128 MB of memory backed by locks that are in the wrong domain. That seems like a small cost to pay vs a double indirection. Because the architecture actually has very strongly aligned physical memory, we could improve on this situation if we made tables that covered all of physical memory. For vm_page I believe the resulting alignment would work out to gigabyte boundaries. However, this would waste the space needed to represent a lot of inaccessible memory.
markj: The argument for strict affinity of the pv list heads and locks also presumes that accesses are likely to be local in the first place, which is not true in general when creating or destroying mappings of shared VM objects. For example, if the system libc.so .text is backed by a superpage, we are going to get many non-local accesses anyway.
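For reference, jeff's estimate worked out (the 64-byte entry size is his assumption; pages are 4 KB, superpages 2 MB):
```
4096 B/page / 64 B/entry    = 64 table entries per page of pv metadata
64 entries * 2 MB/superpage = 128 MB of physical memory per metadata page
```
So at each domain boundary, at most one metadata page, covering 128 MB, can end up in the wrong domain.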
	/*
	 * Iterate physical segments to allocate space for respective pages.
	 */
	highest = -1;
	s = 0;
	for (i = 0; i < vm_phys_nsegs; i++) {
		start = vm_phys_segs[i].start / NBPDR;
		end = vm_phys_segs[i].end / NBPDR;
		domain = vm_phys_segs[i].domain;
		if (highest >= end)
			continue;
		if (start < highest) {
			start = highest + 1;
			pvd = &pv_table[start];
		} else {
			/*
			 * The lowest address may land somewhere in the middle
			 * of our page. Simplify the code by pretending it is
			 * at the beginning.
			 */
			pvd = pa_to_pmdp(vm_phys_segs[i].start);
			pvd = (struct pmap_large_md_page *)trunc_page(pvd);
			start = pvd - pv_table;
		}
		pages = end - start + 1;
		s = round_page(pages * sizeof(*pvd));
		highest = start + (s / sizeof(*pvd)) - 1;
		for (j = 0; j < s; j += PAGE_SIZE) {
			vm_page_t m = vm_page_alloc_domain(NULL, 0,
jeff: I don't understand why kmem_back_domain did not work here. Maybe kib can comment about why pmap_qenter() works when pmap_enter() does not? It seems that it should.
mjg: phabricator hid one of my previous updates: the panic comes from vm_reserv_alloc_page:
```
	rv = vm_reserv_from_object(object, pindex, mpred, &msucc);
	if (rv != NULL) {
		KASSERT(object != kernel_object || rv->domain == domain,
		    ("vm_reserv_alloc_page: domain mismatch"));
```
It happens on the very first alloc from another domain. Looks like a bug in this code, as it is clearly not prepared to get a different domain. Worked around by allocating without a vm obj.
			    domain, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
			if (m == NULL)
				panic("vm_page_alloc_domain failed for %lx\n",
				    (vm_offset_t)pvd + j);
			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
		}
		for (j = 0; j < s / sizeof(*pvd); j++) {
			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
			TAILQ_INIT(&pvd->pv_page.pv_list);
			pvd->pv_page.pv_gen = 0;
			pvd->pv_page.pat_mode = 0;
			pvd->pv_invl_gen = 0;
			pvd++;
		}
	}
	TAILQ_INIT(&pv_dummy.pv_list);
}
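A worked example of the segment bookkeeping above, with invented segment boundaries and the 48-byte entry size mjg mentions:
```
segment 0: superpages 0-99    -> start = 0, pages = 100,
           s = round_page(100 * 48) = 8192, highest = 8192/48 - 1 = 169
segment 1: superpages 120-149 -> highest (169) >= end (149): skipped,
           its entries are already backed
segment 2: superpages 160-299 -> start (160) < highest (169): start = 170,
           so entries 160-169 are not backed twice
```
Page rounding makes each allocation back more entries than the segment strictly needs; highest records the last entry actually backed so later segments can resume past it.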
#else
static void
pmap_init_pv_table(void)
{
	vm_size_t s;
	int i, pv_npg;
kib: i should be long as well.
	/*
	 * Initialize the pool of pv list locks.
	 */
	for (i = 0; i < NPV_LIST_LOCKS; i++)
		rw_init(&pv_list_locks[i], "pmap pv list");
	/*
	 * Calculate the size of the pv head table for superpages.
	 */
	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
	/*
	 * Allocate memory for the pv head table for superpages.
	 */
	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
	s = round_page(s);
	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
	for (i = 0; i < pv_npg; i++)
		TAILQ_INIT(&pv_table[i].pv_list);
	TAILQ_INIT(&pv_dummy.pv_list);
}
#endif
/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 */
void
pmap_init(void)
{
	struct pmap_preinit_mapping *ppim;
	vm_page_t m, mpte;
	int error, i, ret, skz63;
	/* L1TF, reserve page @0 unconditionally */
	vm_page_blacklist_add(0, bootverbose);
	/* Detect bare-metal Skylake Server and Skylake-X. */
	if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
	    CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
		/*
… 71 lines elided …

	if (pg_ps_enabled) {
		pagesizes[1] = NBPDR;
	}
	/*
	 * Initialize the pv chunk list mutex.
	 */
	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
	pmap_init_pv_table();
kib: You only need to do this if pg_ps is enabled?
mjg: Both variants take these locks so I don't think it's avoidable.
	pmap_initialized = 1;
	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
		ppim = pmap_preinit_mapping + i;
		if (ppim->va == 0)
			continue;
		/* Make the direct map consistent */
		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
… 8,414 lines elided …