Changeset View
Standalone View
sys/amd64/amd64/pmap.c
- This file is larger than 256 KB, so syntax highlighting is disabled by default.
Show First 20 Lines • Show All 118 Lines • ▼ Show 20 Lines | |||||
#include <sys/ktr.h> | #include <sys/ktr.h> | ||||
#include <sys/lock.h> | #include <sys/lock.h> | ||||
#include <sys/malloc.h> | #include <sys/malloc.h> | ||||
#include <sys/mman.h> | #include <sys/mman.h> | ||||
#include <sys/mutex.h> | #include <sys/mutex.h> | ||||
#include <sys/proc.h> | #include <sys/proc.h> | ||||
#include <sys/rangeset.h> | #include <sys/rangeset.h> | ||||
#include <sys/rwlock.h> | #include <sys/rwlock.h> | ||||
#include <sys/sbuf.h> | |||||
#include <sys/sx.h> | #include <sys/sx.h> | ||||
#include <sys/turnstile.h> | #include <sys/turnstile.h> | ||||
#include <sys/vmem.h> | #include <sys/vmem.h> | ||||
#include <sys/vmmeter.h> | #include <sys/vmmeter.h> | ||||
#include <sys/sched.h> | #include <sys/sched.h> | ||||
#include <sys/sysctl.h> | #include <sys/sysctl.h> | ||||
#include <sys/smp.h> | #include <sys/smp.h> | ||||
#ifdef DDB | #ifdef DDB | ||||
▲ Show 20 Lines • Show All 1,957 Lines • ▼ Show 20 Lines | case PT_EPT: | ||||
break; | break; | ||||
default: | default: | ||||
panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); | panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type); | ||||
} | } | ||||
return (mask); | return (mask); | ||||
} | } | ||||
static int | |||||
pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) | |||||
{ | |||||
int pat_flag, pat_idx; | |||||
pat_idx = 0; | |||||
switch (pmap->pm_type) { | |||||
case PT_X86: | |||||
case PT_RVI: | |||||
/* The PAT bit is different for PTE's and PDE's. */ | |||||
pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; | |||||
if ((pte & pat_flag) != 0) | |||||
pat_idx |= 0x4; | |||||
if ((pte & PG_NC_PCD) != 0) | |||||
pat_idx |= 0x2; | |||||
if ((pte & PG_NC_PWT) != 0) | |||||
pat_idx |= 0x1; | |||||
break; | |||||
case PT_EPT: | |||||
if ((pte & EPT_PG_IGNORE_PAT) != 0) | |||||
panic("EPT PTE %#lx has no PAT memory type", pte); | |||||
pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; | |||||
break; | |||||
} | |||||
/* See pmap_init_pat(). */ | |||||
if (pat_idx == 4) | |||||
pat_idx = 0; | |||||
if (pat_idx == 7) | |||||
pat_idx = 3; | |||||
return (pat_idx); | |||||
} | |||||
bool | bool | ||||
pmap_ps_enabled(pmap_t pmap) | pmap_ps_enabled(pmap_t pmap) | ||||
{ | { | ||||
return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); | return (pg_ps_enabled && (pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0); | ||||
} | } | ||||
static void | static void | ||||
▲ Show 20 Lines • Show All 7,853 Lines • ▼ Show 20 Lines | if (error == 0) | ||||
pmap_pkru_update_range(pmap, sva, eva, 0); | pmap_pkru_update_range(pmap, sva, eva, 0); | ||||
PMAP_UNLOCK(pmap); | PMAP_UNLOCK(pmap); | ||||
if (error != ENOMEM) | if (error != ENOMEM) | ||||
break; | break; | ||||
vm_wait(NULL); | vm_wait(NULL); | ||||
} | } | ||||
return (error); | return (error); | ||||
} | } | ||||
/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;	/* start of the current range */
	pt_entry_t attrs;	/* protection, cache and global bits (PDE format) */
	int ptes;		/* 4KB page mappings in the range */
	int pdes;		/* 2MB (PG_PS PDE) mappings in the range */
	int pdpes;		/* 1GB (PG_PS PDPE) mappings in the range */
};
static void | |||||
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, | |||||
vm_offset_t eva) | |||||
{ | |||||
const char *mode; | |||||
int i, pat_idx; | |||||
if (eva <= range->sva) | |||||
return; | |||||
pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); | |||||
for (i = 0; i < PAT_INDEX_SIZE; i++) | |||||
if (pat_index[i] == pat_idx) | |||||
break; | |||||
switch (i) { | |||||
case PAT_WRITE_BACK: | |||||
mode = "WB"; | |||||
break; | |||||
case PAT_WRITE_THROUGH: | |||||
mode = "WT"; | |||||
break; | |||||
case PAT_UNCACHEABLE: | |||||
mode = "UC"; | |||||
break; | |||||
case PAT_WRITE_PROTECTED: | |||||
mode = "WP"; | |||||
break; | |||||
case PAT_WRITE_COMBINING: | |||||
mode = "WC"; | |||||
break; | |||||
default: | |||||
panic("sysctl_kmaps_dump: unknown PAT mode %d", i); | |||||
kib: Can we print the raw pat bits in hex instead of panicing ? I feel it is too evil to panic in… | |||||
markjAuthorUnsubmitted Done Inline ActionsSure. markj: Sure. | |||||
} | |||||
sbuf_printf(sb, "%#016lx-%#016lx r%c%c%c%c %s %d %d %d\n", | |||||
range->sva, eva, | |||||
(range->attrs & X86_PG_RW) != 0 ? 'w' : '-', | |||||
(range->attrs & pg_nx) != 0 ? '-' : 'x', | |||||
(range->attrs & X86_PG_U) != 0 ? 'u' : 's', | |||||
(range->attrs & X86_PG_G) != 0 ? 'g' : '-', | |||||
mode, range->pdpes, range->pdes, range->ptes); | |||||
/* Reset to sentinel value. */ | |||||
range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); | |||||
} | |||||
/* | |||||
* Determine whether the attributes specified by a page table entry match those | |||||
* being tracked by the current range. This is not quite as simple as a direct | |||||
* flag comparison since some PAT modes have multiple representations. | |||||
*/ | |||||
static bool | |||||
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs, | |||||
pt_entry_t mask) | |||||
{ | |||||
pt_entry_t diff; | |||||
diff = (range->attrs ^ attrs) & mask; | |||||
if (diff == 0) | |||||
return (true); | |||||
if ((diff & ~X86_PG_PDE_PAT) == 0 && | |||||
pmap_pat_index(kernel_pmap, range->attrs, true) == | |||||
pmap_pat_index(kernel_pmap, attrs, true)) | |||||
return (true); | |||||
return (false); | |||||
} | |||||
static void | |||||
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, | |||||
pt_entry_t attrs) | |||||
{ | |||||
memset(range, 0, sizeof(*range)); | |||||
range->sva = va; | |||||
range->attrs = attrs; | |||||
} | |||||
/*
 * Called for every kernel page table entry. Maintain a range of kernel virtual
 * addresses that is contiguous with respect to access permissions and cache
 * mode.
 *
 * Returns false when the scan should not descend below the deepest level
 * passed in, either because that level's entry is invalid or because it is a
 * leaf (a PG_PS superpage or a 4KB PTE); returns true when the caller should
 * continue scanning the next paging level.
 */
static bool
sysctl_kmaps_update(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pml4_entry_t *pml4, pdp_entry_t *pdp, pd_entry_t *pd,
    pt_entry_t *pt)
{
	pt_entry_t attrs, mask;

	if ((*pml4 & X86_PG_V) == 0) {
		/* Nothing is mapped here; flush any pending range. */
		sysctl_kmaps_dump(sb, range, va);
		return (false);
	}
	attrs = *pml4 & (X86_PG_RW | X86_PG_U | pg_nx);
	mask = X86_PG_RW | X86_PG_U | pg_nx;

	if (pdp == NULL)
		return (true);

	if ((*pdp & X86_PG_V) == 0) {
		sysctl_kmaps_dump(sb, range, va);
		return (false);
	}
	/*
	 * NX is sticky: set at any level makes the mapping non-executable,
	 * while RW and U must be granted by every level to take effect.
	 */
	attrs |= *pdp & pg_nx;
	attrs &= pg_nx | (*pdp & (X86_PG_RW | X86_PG_U));
	if ((*pdp & X86_PG_PS) != 0) {
		/* 1GB leaf mapping: cache/global bits live in the PDPE. */
		attrs |= *pdp & (X86_PG_G | X86_PG_PDE_CACHE);
		mask |= X86_PG_G | X86_PG_PDE_CACHE;
		if (range->sva > va ||
		    !sysctl_kmaps_match(range, attrs, mask)) {
			sysctl_kmaps_dump(sb, range, va);
			sysctl_kmaps_reinit(range, va, attrs);
		}
		range->pdpes++;
		return (false);
	}

	if (pd == NULL)
		return (true);

	if ((*pd & X86_PG_V) == 0) {
		sysctl_kmaps_dump(sb, range, va);
		return (false);
	}
	attrs |= *pd & pg_nx;
	attrs &= pg_nx | (*pd & (X86_PG_RW | X86_PG_U));
	if ((*pd & X86_PG_PS) != 0) {
		/* 2MB leaf mapping. */
		attrs |= *pd & (X86_PG_G | X86_PG_PDE_CACHE);
		mask |= X86_PG_G | X86_PG_PDE_CACHE;
		if (range->sva > va ||
		    !sysctl_kmaps_match(range, attrs, mask)) {
			sysctl_kmaps_dump(sb, range, va);
			sysctl_kmaps_reinit(range, va, attrs);
		}
		range->pdes++;
		return (false);
	}

	if (pt == NULL)
		return (true);

	if ((*pt & X86_PG_V) == 0) {
		sysctl_kmaps_dump(sb, range, va);
		return (false);
	}
	attrs |= *pt & pg_nx;
	attrs &= pg_nx | (*pt & (X86_PG_RW | X86_PG_U));
	attrs |= (*pt & (X86_PG_G | X86_PG_PTE_CACHE));

	/*
	 * PTEs use different bits for the PAT index; canonicalize by using the
	 * PDE format.
	 */
	if ((*pt & X86_PG_PTE_PAT) != 0) {
		attrs &= ~X86_PG_PTE_PAT;
		attrs |= X86_PG_PDE_PAT;
	}
	mask |= X86_PG_G | X86_PG_PDE_CACHE;
	if (range->sva > va || !sysctl_kmaps_match(range, attrs, mask)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
	range->ptes++;
	return (true);
}
/*
 * Handler for the vm.pmap.kernel_maps sysctl: walk the kernel's page tables
 * and emit one line per maximal range of KVA that is contiguous in access
 * permissions, cache mode and mapping granularity.
 */
static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pml4_entry_t *pml4;
	pdp_entry_t *pdp;
	pd_entry_t *pd;
	pt_entry_t *pt;
	vm_offset_t va;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel: the highest KVA, so the first entry always starts a range. */
	range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);

	/*
	 * NOTE(review): the kernel pmap lock is held across the entire walk.
	 * Per the review discussion it may suffice to hold it only while
	 * scanning the large map, where page table pages can be freed by
	 * pmap_large_unmap() — confirm before relaxing the locking.
	 */
	PMAP_LOCK(kernel_pmap);
	for (i = 0; i < NPML4EPG; i++) {
		switch (i) {
		case PML4PML4I:
			sbuf_printf(sb, "\nRecursive map:\n");
			break;
		case DMPML4I:
			sbuf_printf(sb, "\nDirect map:\n");
			break;
		case KPML4BASE:
			sbuf_printf(sb, "\nKernel map:\n");
			break;
		case LMSPML4I:
			sbuf_printf(sb, "\nLarge map:\n");
			break;
		}

		/* Descend level by level; stop early at invalid/leaf entries. */
		va = KVADDR(i, 0, 0, 0);
		pml4 = &kernel_pmap->pm_pml4[i];
		if (!sysctl_kmaps_update(sb, &range, va, pml4, NULL, NULL,
		    NULL))
			continue;
		for (j = 0; j < NPDPEPG; j++) {
			va = KVADDR(i, j, 0, 0);
			pdp = pmap_pml4e_to_pdpe(pml4, va);
			if (!sysctl_kmaps_update(sb, &range, va, pml4, pdp,
			    NULL, NULL))
				continue;
			for (k = 0; k < NPDEPG; k++) {
				va = KVADDR(i, j, k, 0);
				pd = pmap_pdpe_to_pde(pdp, va);
				if (!sysctl_kmaps_update(sb, &range, va, pml4,
				    pdp, pd, NULL))
					continue;
				for (l = 0; l < NPTEPG; l++) {
					va = KVADDR(i, j, k, l);
					pt = pmap_pde_to_pte(pd, va);
					(void)sysctl_kmaps_update(sb, &range,
					    va, pml4, pdp, pd, pt);
				}
			}
		}
	}
	/*
	 * NOTE(review): a range still open when the walk finishes is never
	 * flushed — confirm whether a final sysctl_kmaps_dump() is needed
	 * here to report mappings at the very top of KVA.
	 */
	PMAP_UNLOCK(kernel_pmap);

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");
#ifdef DDB | #ifdef DDB | ||||
DB_SHOW_COMMAND(pte, pmap_print_pte) | DB_SHOW_COMMAND(pte, pmap_print_pte) | ||||
{ | { | ||||
pmap_t pmap; | pmap_t pmap; | ||||
pml4_entry_t *pml4; | pml4_entry_t *pml4; | ||||
pdp_entry_t *pdp; | pdp_entry_t *pdp; | ||||
pd_entry_t *pde; | pd_entry_t *pde; | ||||
▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines |
Can we print the raw pat bits in hex instead of panicing ? I feel it is too evil to panic in informational code.