Changeset View
Standalone View
sys/amd64/amd64/pmap.c
- This file is larger than 256 KB, so syntax highlighting is disabled by default.
Show First 20 Lines • Show All 118 Lines • ▼ Show 20 Lines | |||||
#include <sys/ktr.h> | #include <sys/ktr.h> | ||||
#include <sys/lock.h> | #include <sys/lock.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/mman.h> | #include <sys/mman.h> | ||||
#include <sys/mutex.h> | #include <sys/mutex.h> | ||||
#include <sys/proc.h> | #include <sys/proc.h> | ||||
#include <sys/rangeset.h> | #include <sys/rangeset.h> | ||||
#include <sys/rwlock.h> | #include <sys/rwlock.h> | ||||
#include <sys/sbuf.h> | |||||
#include <sys/sx.h> | #include <sys/sx.h> | ||||
#include <sys/turnstile.h> | #include <sys/turnstile.h> | ||||
#include <sys/vmem.h> | #include <sys/vmem.h> | ||||
#include <sys/vmmeter.h> | #include <sys/vmmeter.h> | ||||
#include <sys/sched.h> | #include <sys/sched.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#include <sys/smp.h> | #include <sys/smp.h> | ||||
#ifdef DDB | #ifdef DDB | ||||
▲ Show 20 Lines • Show All 1,976 Lines • ▼ Show 20 Lines | case PT_EPT: | ||||
break; | break; | ||||
default: | default: | ||||
panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); | panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); | ||||
} | } | ||||
return (mask); | return (mask); | ||||
} | } | ||||
static int | |||||
pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) | |||||
{ | |||||
int pat_flag, pat_idx; | |||||
pat_idx = 0; | |||||
switch (pmap->pm_type) { | |||||
case PT_X86: | |||||
case PT_RVI: | |||||
/* The PAT bit is different for PTE's and PDE's. */ | |||||
pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; | |||||
if ((pte & pat_flag) != 0) | |||||
pat_idx |= 0x4; | |||||
if ((pte & PG_NC_PCD) != 0) | |||||
pat_idx |= 0x2; | |||||
if ((pte & PG_NC_PWT) != 0) | |||||
pat_idx |= 0x1; | |||||
break; | |||||
case PT_EPT: | |||||
if ((pte & EPT_PG_IGNORE_PAT) != 0) | |||||
panic("EPT PTE %#lx has no PAT memory type", pte); | |||||
pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; | |||||
break; | |||||
} | |||||
/* See pmap_init_pat(). */ | |||||
if (pat_idx == 4) | |||||
pat_idx = 0; | |||||
if (pat_idx == 7) | |||||
pat_idx = 3; | |||||
return (pat_idx); | |||||
} | |||||
bool | bool | ||||
pmap_ps_enabled(pmap_t pmap) | pmap_ps_enabled(pmap_t pmap) | ||||
{ | { | ||||
return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); | return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); | ||||
} | } | ||||
static void | static void | ||||
▲ Show 20 Lines • Show All 7,853 Lines • ▼ Show 20 Lines | if (error == 0) | ||||
pmap_pkru_update_range(pmap, sva, eva, 0); | pmap_pkru_update_range(pmap, sva, eva, 0); | ||||
PMAP_UNLOCK(pmap); | PMAP_UNLOCK(pmap); | ||||
if (error != ENOMEM) | if (error != ENOMEM) | ||||
break; | break; | ||||
vm_wait(NULL); | vm_wait(NULL); | ||||
} | } | ||||
return (error); | return (error); | ||||
} | } | ||||
/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;	/* starting VA of the current range */
	pt_entry_t attrs;	/* attribute bits shared by all mappings */
	int ptes;		/* count of 4KB (PTE) mappings seen */
	int pdes;		/* count of 2MB (PDE) mappings seen */
	int pdpes;		/* count of 1GB (PDPE) mappings seen */
};
static void | |||||
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, | |||||
vm_offset_t eva) | |||||
{ | |||||
const char *mode; | |||||
int i, pat_idx; | |||||
if (eva <= range->sva) | |||||
return; | |||||
pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); | |||||
for (i = 0; i < PAT_INDEX_SIZE; i++) | |||||
if (pat_index[i] == pat_idx) | |||||
break; | |||||
switch (i) { | |||||
case PAT_WRITE_BACK: | |||||
mode = "WB"; | |||||
break; | |||||
case PAT_WRITE_THROUGH: | |||||
mode = "WT"; | |||||
break; | |||||
case PAT_UNCACHEABLE: | |||||
mode = "UC"; | |||||
break; | |||||
case PAT_WRITE_PROTECTED: | |||||
mode = "WP"; | |||||
break; | |||||
case PAT_WRITE_COMBINING: | |||||
mode = "WC"; | |||||
break; | |||||
default: | |||||
printf("%s: unknown PAT mode %#x for range %#016lx-%#016lx\n", | |||||
kib: Can we print the raw pat bits in hex instead of panicing ? I feel it is too evil to panic in… | |||||
Done Inline ActionsSure. markj: Sure. | |||||
__func__, i, range->sva, eva); | |||||
mode = "??"; | |||||
break; | |||||
} | |||||
sbuf_printf(sb, "%#016lx-%#016lx r%c%c%c%c %s %d %d %d\n", | |||||
range->sva, eva, | |||||
(range->attrs & X86_PG_RW) != 0 ? 'w' : '-', | |||||
(range->attrs & pg_nx) != 0 ? '-' : 'x', | |||||
(range->attrs & X86_PG_U) != 0 ? 'u' : 's', | |||||
(range->attrs & X86_PG_G) != 0 ? 'g' : '-', | |||||
mode, range->pdpes, range->pdes, range->ptes); | |||||
/* Reset to sentinel value. */ | |||||
range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); | |||||
} | |||||
/* | |||||
* Determine whether the attributes specified by a page table entry match those | |||||
* being tracked by the current range. This is not quite as simple as a direct | |||||
* flag comparison since some PAT modes have multiple representations. | |||||
*/ | |||||
static bool | |||||
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) | |||||
{ | |||||
pt_entry_t diff, mask; | |||||
mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; | |||||
diff = (range->attrs ^ attrs) & mask; | |||||
if (diff == 0) | |||||
return (true); | |||||
if ((diff & ~X86_PG_PDE_PAT) == 0 && | |||||
pmap_pat_index(kernel_pmap, range->attrs, true) == | |||||
pmap_pat_index(kernel_pmap, attrs, true)) | |||||
return (true); | |||||
return (false); | |||||
} | |||||
static void | |||||
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, | |||||
pt_entry_t attrs) | |||||
{ | |||||
memset(range, 0, sizeof(*range)); | |||||
range->sva = va; | |||||
range->attrs = attrs; | |||||
} | |||||
/*
 * Given a leaf PTE, derive the mapping's attributes.  If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
    pt_entry_t pte)
{
	pt_entry_t attrs;

	/*
	 * RW and U are effective only when present at every level of the
	 * walk, so they are intersected; NX takes effect if set at any
	 * level, so it is accumulated with OR.
	 */
	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);

	attrs |= pdpe & pg_nx;
	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
	if ((pdpe & PG_PS) != 0) {
		/* Leaf PDPE: take global and cache bits from it. */
		attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pde != 0) {
		attrs |= pde & pg_nx;
		attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
	}

	if ((pde & PG_PS) != 0) {
		/* Leaf PDE: take global and cache bits from it. */
		attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
	} else if (pte != 0) {
		attrs |= pte & pg_nx;
		attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
		attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);

		/* Canonicalize by always using the PDE PAT bit. */
		if ((attrs & X86_PG_PTE_PAT) != 0)
			attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
	}

	/*
	 * Flush and restart the run if the attributes changed, or if the
	 * range still holds the sentinel (sva beyond any valid VA).
	 */
	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}
/*
 * Walk the kernel pmap's page tables and emit, via sysctl, one line per
 * contiguous run of mappings with identical attributes.
 */
static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pml4_entry_t pml4e;
	pdp_entry_t *pdp, pdpe;
	pd_entry_t *pd, pde;
	pt_entry_t *pt, pte;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);

	PMAP_LOCK(kernel_pmap);
	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
		/* Label well-known regions of the address space. */
		switch (i) {
		case PML4PML4I:
			sbuf_printf(sb, "\nRecursive map:\n");
			break;
		case DMPML4I:
			sbuf_printf(sb, "\nDirect map:\n");
			break;
		case KPML4BASE:
			sbuf_printf(sb, "\nKernel map:\n");
			break;
		case LMSPML4I:
			sbuf_printf(sb, "\nLarge map:\n");
			break;
		}

		/* Convert to canonical form. */
		if (sva == 1ul << 47)
			sva |= -1ul << 48;

		pml4e = kernel_pmap->pm_pml4[i];
		if ((pml4e & X86_PG_V) == 0) {
			sysctl_kmaps_dump(sb, &range, sva);
			sva += NBPML4;
			continue;
		}
		pa = pml4e & PG_FRAME;
		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
			pdpe = pdp[j];
			if ((pdpe & X86_PG_V) == 0) {
				sysctl_kmaps_dump(sb, &range, sva);
				sva += NBPDP;
				continue;
			}
			if ((pdpe & PG_PS) != 0) {
				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
				    0, 0);
				range.pdpes++;
				sva += NBPDP;
				continue;
			}
			pa = pdpe & PG_FRAME;
			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);

			/*
			 * NOTE(review): this walk currently holds the kernel
			 * pmap lock throughout.  Review discussion suggests an
			 * unlocked walk may be feasible except for the large
			 * map, where pmap_large_unmap() can free page table
			 * pages -- TODO: revisit locking here (e.g. lock only
			 * for the large-map PML4 slots).
			 */
			for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
				pde = pd[k];
				if ((pde & X86_PG_V) == 0) {
					sysctl_kmaps_dump(sb, &range, sva);
					sva += NBPDR;
					continue;
				}
				if ((pde & PG_PS) != 0) {
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, 0);
					range.pdes++;
					sva += NBPDR;
					continue;
				}
				pa = pde & PG_FRAME;
				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_pte_index(sva); l < NPTEPG; l++,
				    sva += PAGE_SIZE) {
					pte = pt[l];
					if ((pte & X86_PG_V) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    pml4e, pdpe, pde, pte);
					range.ptes++;
				}
			}
		}
	}
	PMAP_UNLOCK(kernel_pmap);

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");
#ifdef DDB | #ifdef DDB | ||||
DB_SHOW_COMMAND(pte, pmap_print_pte) | DB_SHOW_COMMAND(pte, pmap_print_pte) | ||||
{ | { | ||||
pmap_t pmap; | pmap_t pmap; | ||||
pml4_entry_t *pml4; | pml4_entry_t *pml4; | ||||
pdp_entry_t *pdp; | pdp_entry_t *pdp; | ||||
pd_entry_t *pde; | pd_entry_t *pde; | ||||
▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines |
Can we print the raw PAT bits in hex instead of panicking? I feel it is too evil to panic in informational code.