Changeset View
Standalone View
sys/amd64/amd64/pmap.c
Show First 20 Lines • Show All 391 Lines • ▼ Show 20 Lines | |||||
*/ | */ | ||||
static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); | static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks); | ||||
static struct mtx __exclusive_cache_line pv_chunks_mutex; | static struct mtx __exclusive_cache_line pv_chunks_mutex; | ||||
static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; | static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS]; | ||||
static u_long pv_invl_gen[NPV_LIST_LOCKS]; | static u_long pv_invl_gen[NPV_LIST_LOCKS]; | ||||
static struct md_page *pv_table; | static struct md_page *pv_table; | ||||
static struct md_page pv_dummy; | static struct md_page pv_dummy; | ||||
/*
 * Controls whether the read-only portions of the kernel are mapped with
 * read-only (and non-executable, for rodata) permissions; defaults to on.
 * CTLFLAG_NOFETCH: the tunable is not fetched automatically at sysctl
 * registration time; create_pagetables() fetches it explicitly with
 * TUNABLE_INT_FETCH() before the kernel page tables are built.
 */
static int pmap_kernelro = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, kernelro, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
    &pmap_kernelro, 0,
    "Map the read-only portions of the kernel with read-only permissions");
/* | /* | ||||
kib: kernelprot seems to be too generic. Might be kernelnx?
Done Inline Actions: I think kernelnx is too specific, since it also changes the way the RW bits are set. I agree that kernelprot is too generic, but I struggled to come up with a better name. I'm open to suggestions. (Other similar suggestions could be kernelrwx or kernelnxrw.) I'm really open to something better. jtl: I think kernelnx is too specific, since it also changes the way the RW bits are set. I agree… | |||||
* All those kernel PT submaps that BSD is so fond of | * All those kernel PT submaps that BSD is so fond of | ||||
*/ | */ | ||||
Done Inline Actions: Tweaking the description here (it's not just read-only) might also help with the tunable name. Maybe "Map the kernel with fine-grained permissions rather than RWX." You could in fact invert the option to be 'vm.pmap.kernel_rwx' where a 1 setting is the legacy behavior and 0 is the default which is the new behavior. If you go that route then the description would be "Map the entire kernel RWX rather than using fine-grained permissions." jhb: Tweaking the description here (it's not just read-only) might also help with the tunable name. | |||||
pt_entry_t *CMAP1 = NULL; | pt_entry_t *CMAP1 = NULL; | ||||
caddr_t CADDR1 = 0; | caddr_t CADDR1 = 0; | ||||
static vm_offset_t qframe = 0; | static vm_offset_t qframe = 0; | ||||
static struct mtx qframe_mtx; | static struct mtx qframe_mtx; | ||||
static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ | static int pmap_flags = PMAP_PDE_SUPERPAGE; /* flags for x86 pmaps */ | ||||
int pmap_pcid_enabled = 1; | int pmap_pcid_enabled = 1; | ||||
▲ Show 20 Lines • Show All 471 Lines • ▼ Show 20 Lines | #else | ||||
* before vm_init() and pmap_init(). 20MB for a frame buffer is | * before vm_init() and pmap_init(). 20MB for a frame buffer is | ||||
* not uncommon. | * not uncommon. | ||||
*/ | */ | ||||
pt_pages += 32; /* 64MB additional slop. */ | pt_pages += 32; /* 64MB additional slop. */ | ||||
#endif | #endif | ||||
nkpt = pt_pages; | nkpt = pt_pages; | ||||
} | } | ||||
/* | |||||
* Returns the proper write/execute permission for a physical page that is | |||||
* part of the initial boot allocations. | |||||
* | |||||
* If the page has kernel text, it is marked as read-only. If the page has | |||||
* kernel read-only data, it is marked as read-only/not-executable. If the | |||||
* page has only read-write data, it is marked as read-write/not-executable. | |||||
* If the page is below/above the kernel range, it is marked as read-write. | |||||
* | |||||
* This function operates on 2M pages, since we map the kernel space that | |||||
* way. | |||||
* | |||||
* Note that this doesn't currently provide any protection for modules. | |||||
*/ | |||||
static inline pt_entry_t | |||||
bootaddr_rwx(vm_paddr_t pa) | |||||
{ | |||||
/* | |||||
* Everything in the same 2M page as the start of the kernel | |||||
* should be static. On the other hand, things in the same 2M | |||||
* page as the end of the kernel could be read-write/executable, | |||||
* as the kernel image is not guaranteed to end on a 2M boundary. | |||||
*/ | |||||
if (pa < trunc_2mpage(btext - KERNBASE) || | |||||
pa >= trunc_2mpage(_end - KERNBASE)) | |||||
return (X86_PG_RW); | |||||
/* | |||||
* The linker should ensure that the read-only and read-write | |||||
* portions don't share the same 2M page, so this shouldn't | |||||
* impact read-only data. However, in any case, any page with | |||||
* read-write data needs to be read-write. | |||||
*/ | |||||
if (pa >= trunc_2mpage(brwsection - KERNBASE)) | |||||
return (X86_PG_RW | pg_nx); | |||||
/* | |||||
* Mark any 2M page containing kernel text as read-only. Mark | |||||
* other pages with read-only data as read-only and not executable. | |||||
* (It is likely a small portion of the read-only data section will | |||||
* be marked as read-only, but executable. This should be acceptable | |||||
* since the read-only protection will keep the data from changing.) | |||||
*/ | |||||
if (pa < round_2mpage(etext - KERNBASE)) | |||||
return (pmap_kernelro ? 0 : X86_PG_RW); | |||||
return (pg_nx | (pmap_kernelro ? 0 : X86_PG_RW)); | |||||
Done Inline ActionsLook at the fpuinit_bsp1, particularly CPUID_EXTSTATE_XSAVEOPT block. kib: Look at the fpuinit_bsp1, particularly CPUID_EXTSTATE_XSAVEOPT block. | |||||
Done Inline ActionsThanks for pointing that out! I didn't know we actually had self-modifying code in the kernel. Hmmm... My CPU supports XSAVE, but didn't fault on that. I'll need to dig deeper into why not. jtl: Thanks for pointing that out! I didn't know we actually had self-modifying code in the kernel. | |||||
Done Inline ActionsIt probably has XSAVE, but not XSAVEOPT. kib: It probably has XSAVE, but not XSAVEOPT. | |||||
Done Inline ActionsThis works because we don't set cr0.wp until a little bit later in the boot process. So, we aren't enforcing write protection at this point. jtl: This works because we don't set cr0.wp until a little bit later in the boot process. So, we… | |||||
Done Inline ActionsI think you should perhaps extend the comment above this range with a note that writes to perform fixups in .text are still permitted until CR0.WP is set. jhb: I think you should perhaps extend the comment above this range with a note that writes to… | |||||
Done Inline ActionsThanks! I changed it. jtl: Thanks! I changed it. | |||||
Done Inline ActionsDoes that imply we should defer the removal of write from kernel text until after doing any fixups? We could use a second pass to walk the page tables and clear W from executable pages? jhb: Does that imply we should defer the removal of write from kernel text until after doing any… | |||||
} | |||||
static void | static void | ||||
create_pagetables(vm_paddr_t *firstaddr) | create_pagetables(vm_paddr_t *firstaddr) | ||||
{ | { | ||||
int i, j, ndm1g, nkpdpe; | int i, j, ndm1g, nkpdpe, nkdmpde; | ||||
pt_entry_t *pt_p; | pt_entry_t *pt_p; | ||||
pd_entry_t *pd_p; | pd_entry_t *pd_p; | ||||
pdp_entry_t *pdp_p; | pdp_entry_t *pdp_p; | ||||
pml4_entry_t *p4_p; | pml4_entry_t *p4_p; | ||||
uint64_t DMPDkernphys; | |||||
/* | |||||
* Determine if we are marking the read-only portion with read-only | |||||
* permissions. | |||||
*/ | |||||
TUNABLE_INT_FETCH("vm.pmap.kernelro", &pmap_kernelro); | |||||
/* Allocate page table pages for the direct map */ | /* Allocate page table pages for the direct map */ | ||||
ndmpdp = howmany(ptoa(Maxmem), NBPDP); | ndmpdp = howmany(ptoa(Maxmem), NBPDP); | ||||
if (ndmpdp < 4) /* Minimum 4GB of dirmap */ | if (ndmpdp < 4) /* Minimum 4GB of dirmap */ | ||||
ndmpdp = 4; | ndmpdp = 4; | ||||
ndmpdpphys = howmany(ndmpdp, NPDPEPG); | ndmpdpphys = howmany(ndmpdp, NPDPEPG); | ||||
if (ndmpdpphys > NDMPML4E) { | if (ndmpdpphys > NDMPML4E) { | ||||
/* | /* | ||||
* Each NDMPML4E allows 512 GB, so limit to that, | * Each NDMPML4E allows 512 GB, so limit to that, | ||||
* and then readjust ndmpdp and ndmpdpphys. | * and then readjust ndmpdp and ndmpdpphys. | ||||
*/ | */ | ||||
printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); | printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512); | ||||
Maxmem = atop(NDMPML4E * NBPML4); | Maxmem = atop(NDMPML4E * NBPML4); | ||||
ndmpdpphys = NDMPML4E; | ndmpdpphys = NDMPML4E; | ||||
ndmpdp = NDMPML4E * NPDEPG; | ndmpdp = NDMPML4E * NPDEPG; | ||||
} | } | ||||
DMPDPphys = allocpages(firstaddr, ndmpdpphys); | DMPDPphys = allocpages(firstaddr, ndmpdpphys); | ||||
ndm1g = 0; | ndm1g = 0; | ||||
if ((amd_feature & AMDID_PAGE1GB) != 0) | if ((amd_feature & AMDID_PAGE1GB) != 0) { | ||||
ndm1g = ptoa(Maxmem) >> PDPSHIFT; | ndm1g = ptoa(Maxmem) >> PDPSHIFT; | ||||
if (pmap_kernelro) { | |||||
nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), | |||||
NBPDP); | |||||
DMPDkernphys = allocpages(firstaddr, nkdmpde); | |||||
} | |||||
} | |||||
if (ndm1g < ndmpdp) | if (ndm1g < ndmpdp) | ||||
DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); | DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g); | ||||
dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; | dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; | ||||
/* Allocate pages */ | /* Allocate pages */ | ||||
KPML4phys = allocpages(firstaddr, 1); | KPML4phys = allocpages(firstaddr, 1); | ||||
KPDPphys = allocpages(firstaddr, NKPML4E); | KPDPphys = allocpages(firstaddr, NKPML4E); | ||||
Show All 9 Lines | create_pagetables(vm_paddr_t *firstaddr) | ||||
*/ | */ | ||||
nkpt_init(*firstaddr); | nkpt_init(*firstaddr); | ||||
nkpdpe = NKPDPE(nkpt); | nkpdpe = NKPDPE(nkpt); | ||||
KPTphys = allocpages(firstaddr, nkpt); | KPTphys = allocpages(firstaddr, nkpt); | ||||
KPDphys = allocpages(firstaddr, nkpdpe); | KPDphys = allocpages(firstaddr, nkpdpe); | ||||
/* Fill in the underlying page table pages */ | /* Fill in the underlying page table pages */ | ||||
/* Nominally read-only (but really R/W) from zero to physfree */ | |||||
/* XXX not fully used, underneath 2M pages */ | /* XXX not fully used, underneath 2M pages */ | ||||
pt_p = (pt_entry_t *)KPTphys; | pt_p = (pt_entry_t *)KPTphys; | ||||
for (i = 0; ptoa(i) < *firstaddr; i++) | for (i = 0; ptoa(i) < *firstaddr; i++) | ||||
pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g; | pt_p[i] = ptoa(i) | X86_PG_V | pg_g | bootaddr_rwx(ptoa(i)); | ||||
/* Now map the page tables at their location within PTmap */ | /* Now map the page tables at their location within PTmap */ | ||||
pd_p = (pd_entry_t *)KPDphys; | pd_p = (pd_entry_t *)KPDphys; | ||||
for (i = 0; i < nkpt; i++) | for (i = 0; i < nkpt; i++) | ||||
pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; | pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; | ||||
Done Inline ActionsI believe page tables never need exec perms. kib: I believe page tables never need exec perms. | |||||
Done Inline ActionsIf I read the Intel documentation correctly, setting NX on a PDE propagates the NX behavior to all pages mapped by PTEs referenced by the PDE. I don't think that's what we want. jtl: If I read the Intel documentation correctly, setting NX on a PDE propagates the NX behavior to… | |||||
Done Inline ActionsThis is mapping of the page tables itself, not the introduction of PDEs into the pt hierarchy. The KPTmap does not map kernel objects, but kernel *page tables*. kib: This is mapping of the page tables itself, not the introduction of PDEs into the pt hierarchy. | |||||
Done Inline ActionsI'm fairly certain I'm interpreting this correctly; however, I might not have stated it well. So, let me try again. (I know you know what is below. I'm merely restating it to pass on my understanding of what this particular structure does. This will let you identify the element of my thinking that is incorrect.) The kernel page table is rooted at KPML4phys, which contains the PML4Es. The PDPEs are in KPDPphys. The code puts entries for the PDPEs in KPML4phys starting at KPML4BASE. The PDEs are in KPDphys. The code puts entries for the PDEs into KPDPphys. Some of the PDEs end up mapping 2M pages. Some of the PDEs point to PTEs. The PTEs are in KPTphys. The code puts entries for the PTEs into KPDphys. (However, some get over-written with 2M pages.) According to the Intel SDM (Vol. 3a, section 5.13.2, beginning on page 5-30), if "the execute-disable bit is set in any of the paging-structure entries used to map [a] page", execution will be denied. Table 5-5 illustrates this. So, my understanding is that if I set the NX bit on entries in KPDphys, the processor will disable execution for any PTEs mapped using that PDE. (Note that this is distinct from the recursive mapping found my mapping KPML4phys[PML4PML4I] to itself, which vtopte() and vtopde() follow. For those, I agree that we could set the NX bit; however, we could do that at the top level. I'll open a separate review for that, since that is a bit different from what I'm trying to do here.) jtl: I'm fairly certain I'm interpreting this correctly; however, I might not have stated it well. | |||||
Done Inline ActionsNevermind, I misread the fragment, thinking that it is creation of PTmap. Ignore me. kib: Nevermind, I misread the fragment, thinking that it is creation of PTmap. Ignore me. | |||||
Done Inline ActionsNo problem. It took me multiple times through this code to fully grok it. :-) jtl: No problem. It took me multiple times through this code to fully grok it. :-) | |||||
/* Map from zero to end of allocations under 2M pages */ | /* Map from zero to end of allocations under 2M pages */ | ||||
/* This replaces some of the KPTphys entries above */ | /* This replaces some of the KPTphys entries above */ | ||||
for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) | for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) | ||||
pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS | | pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | | ||||
pg_g; | bootaddr_rwx(i << PDRSHIFT); | ||||
/* | /* | ||||
* Because we map the physical blocks in 2M pages, adjust firstaddr | * Because we map the physical blocks in 2M pages, adjust firstaddr | ||||
* to record the physical blocks we've actually mapped into kernel | * to record the physical blocks we've actually mapped into kernel | ||||
* virtual address space. | * virtual address space. | ||||
*/ | */ | ||||
*firstaddr = round_2mpage(*firstaddr); | *firstaddr = round_2mpage(*firstaddr); | ||||
Show All 23 Lines | for (i = 0; i < ndm1g; i++) { | ||||
pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; | pdp_p[i] = (vm_paddr_t)i << PDPSHIFT; | ||||
/* Preset PG_M and PG_A because demotion expects it. */ | /* Preset PG_M and PG_A because demotion expects it. */ | ||||
pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | | pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g | | ||||
X86_PG_M | X86_PG_A | pg_nx; | X86_PG_M | X86_PG_A | pg_nx; | ||||
} | } | ||||
for (j = 0; i < ndmpdp; i++, j++) { | for (j = 0; i < ndmpdp; i++, j++) { | ||||
pdp_p[i] = DMPDphys + ptoa(j); | pdp_p[i] = DMPDphys + ptoa(j); | ||||
pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U; | pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U; | ||||
} | |||||
/* | |||||
* Instead of using a 1G page for the memory containing the kernel, | |||||
* use 2M pages with appropriate permissions. (If using 1G pages, | |||||
* this will partially overwrite the PDPEs above.) | |||||
*/ | |||||
if (ndm1g && pmap_kernelro) { | |||||
pd_p = (pd_entry_t *)DMPDkernphys; | |||||
for (i = 0; i < NPDPEPG; i++) | |||||
pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | | |||||
X86_PG_M | X86_PG_A | pg_nx | | |||||
bootaddr_rwx(i << PDRSHIFT); | |||||
for (i = 0; i < nkdmpde; i++) | |||||
pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | | |||||
X86_PG_V | PG_U; | |||||
} | } | ||||
/* And recursively map PML4 to itself in order to get PTmap */ | /* And recursively map PML4 to itself in order to get PTmap */ | ||||
p4_p = (pml4_entry_t *)KPML4phys; | p4_p = (pml4_entry_t *)KPML4phys; | ||||
p4_p[PML4PML4I] = KPML4phys; | p4_p[PML4PML4I] = KPML4phys; | ||||
p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U; | p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | PG_U; | ||||
/* Connect the Direct Map slot(s) up to the PML4. */ | /* Connect the Direct Map slot(s) up to the PML4. */ | ||||
▲ Show 20 Lines • Show All 7,038 Lines • Show Last 20 Lines |
kernelprot seems to be too generic. Might be kernelnx?