diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1590,7 +1590,10 @@
 	int gsel_tss, x;
 	struct pcpu *pc;
 	struct xstate_hdr *xhdr;
-	u_int64_t rsp0;
+	uint64_t cr3, rsp0;
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t *pde;
 	char *env;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor r_gdt;
@@ -1599,6 +1602,35 @@
 
 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
 
+	/*
+	 * Calculate kernphys by inspecting page table created by loader.
+	 * The assumptions:
+	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
+	 *   aligned at 2M, below 4G (the latter is important for AP startup)
+	 * - there is a 2M hole at KERNBASE
+	 * - kernel is mapped with 2M superpages
+	 * - all participating memory, i.e. kernel, modules, metadata,
+	 *   page table is accessible by pre-created 1:1 mapping
+	 *   (right now loader creates 1:1 mapping for lower 4G, and all
+	 *   memory is from there)
+	 * - there is a usable memory block right after the end of the
+	 *   mapped kernel and all modules/metadata, pointed to by
+	 *   physfree, for early allocations
+	 */
+	cr3 = rcr3();
+	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
+	    (vm_offset_t)hammer_time);
+	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
+	    (vm_offset_t)hammer_time);
+	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
+	    (vm_offset_t)hammer_time);
+	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
+	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
+
+	/* Fix-up for 2M hole */
+	physfree += kernphys;
+	kernphys += NBPDR;
+
 	kmdp = init_ops.parse_preload_data(modulep);
 
 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
@@ -1644,7 +1676,7 @@
 	/* Init basic tunables, hz etc */
 	init_param1();
 
-	thread0.td_kstack = physfree + KERNBASE;
+	thread0.td_kstack = physfree - kernphys + KERNSTART;
 	thread0.td_kstack_pages = kstack_pages;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	bzero((void *)thread0.td_kstack, kstack0_sz);
@@ -1681,7 +1713,7 @@
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
-	dpcpu_init((void *)(physfree + KERNBASE), 0);
+	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
 	physfree += DPCPU_SIZE;
 	amd64_bsp_pcpu_init1(pc);
 	/* Non-late cninit() and printf() can be moved up to here. */
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -436,7 +436,8 @@
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 static int		ndmpdpphys;	/* number of DMPDPphys pages */
 
-static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */
+vm_paddr_t		kernphys;	/* phys addr of start of bootstrap data */
+vm_paddr_t		KERNend;	/* and the end */
 
 /*
  * pmap_mapdev support pre initialization (i.e. console)
@@ -1554,7 +1555,7 @@
 #ifdef NKPT
 	pt_pages = NKPT;
 #else
-	pt_pages = howmany(addr, NBPDR);
+	pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */
 	pt_pages += NKPDPE(pt_pages);
 
 	/*
@@ -1594,7 +1595,6 @@
 static inline pt_entry_t
 bootaddr_rwx(vm_paddr_t pa)
 {
-
 	/*
 	 * The kernel is loaded at a 2MB-aligned address, and memory below that
	 * need not be executable.  The .bss section is padded to a 2MB
@@ -1602,8 +1602,8 @@
 	 * either.  Preloaded kernel modules have their mapping permissions
 	 * fixed up by the linker.
 	 */
-	if (pa < trunc_2mpage(btext - KERNBASE) ||
-	    pa >= trunc_2mpage(_end - KERNBASE))
+	if (pa < trunc_2mpage(kernphys + btext - KERNSTART) ||
+	    pa >= trunc_2mpage(kernphys + _end - KERNSTART))
 		return (X86_PG_RW | pg_nx);
 
 	/*
@@ -1612,7 +1612,7 @@
 	 * impact read-only data.  However, in any case, any page with
 	 * read-write data needs to be read-write.
 	 */
-	if (pa >= trunc_2mpage(brwsection - KERNBASE))
+	if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART))
 		return (X86_PG_RW | pg_nx);
 
 	/*
@@ -1624,7 +1624,7 @@
 	 * Note that fixups to the .text section will still work until we
 	 * set CR0.WP.
 	 */
-	if (pa < round_2mpage(etext - KERNBASE))
+	if (pa < round_2mpage(kernphys + etext - KERNSTART))
 		return (0);
 	return (pg_nx);
 }
@@ -1636,6 +1636,7 @@
 	pdp_entry_t *pdp_p;
 	pml4_entry_t *p4_p;
 	uint64_t DMPDkernphys;
+	vm_paddr_t pax;
 #ifdef KASAN
 	pt_entry_t *pt_p;
 	uint64_t KASANPDphys, KASANPTphys, KASANphys;
@@ -1670,9 +1671,11 @@
 
 		/*
 		 * Allocate 2M pages for the kernel. These will be used in
-		 * place of the first one or more 1G pages from ndm1g.
+		 * place of the one or more 1G pages from ndm1g that maps
+		 * kernel memory into DMAP.
 		 */
-		nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
+		nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
+		    kernphys - rounddown2(kernphys, NBPDP), NBPDP);
 		DMPDkernphys = allocpages(firstaddr, nkdmpde);
 	}
 	if (ndm1g < ndmpdp)
@@ -1719,14 +1722,18 @@
 		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 
 	/*
-	 * Map from physical address zero to the end of loader preallocated
-	 * memory using 2MB pages.  This replaces some of the PD entries
-	 * created above.
+	 * Map from start of the kernel in physical memory (staging
+	 * area) to the end of loader preallocated memory using 2MB
+	 * pages.  This replaces some of the PD entries created above.
+	 * For compatibility, identity map 2M at the start.
 	 */
-	for (i = 0; (i << PDRSHIFT) < KERNend; i++)
+	pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
+	    X86_PG_RW | pg_nx;
+	for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
 		/* Preset PG_M and PG_A because demotion expects it. */
-		pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
-		    X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
+		pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+		    X86_PG_A | bootaddr_rwx(pax);
+	}
 
 	/*
 	 * Because we map the physical blocks in 2M pages, adjust firstaddr
@@ -1792,15 +1799,18 @@
 	 * use 2M pages with read-only and no-execute permissions.  (If using 1G
 	 * pages, this will partially overwrite the PDPEs above.)
 	 */
-	if (ndm1g) {
+	if (ndm1g > 0) {
 		pd_p = (pd_entry_t *)DMPDkernphys;
-		for (i = 0; i < (NPDEPG * nkdmpde); i++)
-			pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
-			    X86_PG_M | X86_PG_A | pg_nx |
-			    bootaddr_rwx(i << PDRSHIFT);
-		for (i = 0; i < nkdmpde; i++)
-			pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
-			    X86_PG_V | pg_nx;
+		for (i = 0, pax = rounddown2(kernphys, NBPDP);
+		    i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
+			pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+			    X86_PG_A | pg_nx | bootaddr_rwx(pax);
+		}
+		j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
+		for (i = 0; i < nkdmpde; i++) {
+			pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
+			    X86_PG_RW | X86_PG_V | pg_nx;
+		}
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
@@ -1876,7 +1886,8 @@
 	/*
 	 * Account for the virtual addresses mapped by create_pagetables().
 	 */
-	virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
+	virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
+	    (vm_paddr_t)kernphys);
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
@@ -2414,7 +2425,8 @@
 		 * Collect the page table pages that were replaced by a 2MB
 		 * page in create_pagetables().  They are zero filled.
 		 */
-		if ((vm_paddr_t)i << PDRSHIFT < KERNend &&
+		if ((i == 0 ||
+		    kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
 		    pmap_insert_pt_page(kernel_pmap, mpte, false))
 			panic("pmap_init: pmap_insert_pt_page failed");
 	}
@@ -6681,7 +6693,9 @@
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_pde: page table page is out of range"));
 	KASSERT(mpte->pindex == pmap_pde_pindex(va),
-	    ("pmap_promote_pde: page table page's pindex is wrong"));
+	    ("pmap_promote_pde: page table page's pindex is wrong "
+	    "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
+	    mpte, mpte->pindex, va, pmap_pde_pindex(va)));
 	if (pmap_insert_pt_page(pmap, mpte, true)) {
 		counter_u64_add(pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP,
@@ -10751,8 +10765,8 @@
 		va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
 	}
-	pmap_pti_add_kva_locked((vm_offset_t)KERNBASE + NBPDR,
-	    (vm_offset_t)etext, true);
+	pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
+	    true);
 	pti_finalized = true;
 	VM_OBJECT_WUNLOCK(pti_obj);
 }
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -49,11 +49,8 @@
 
 extern int	la57;
 
-/*
- * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its
- * value is the physical address at which the kernel is loaded.
- */
-extern char	kernphys[];
+extern vm_paddr_t kernphys;
+extern vm_paddr_t KERNend;
 
 extern bool	efi_boot;
 
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -151,8 +151,10 @@
 #endif
 
 /*
- * Kernel physical load address.  Needs to be aligned at 2MB superpage
- * boundary.
+ * Kernel physical load address for non-UEFI boot and for legacy UEFI loader.
+ * Newer UEFI loader loads kernel anywhere below 4G, with memory allocated
+ * by boot services.
+ * Needs to be aligned at 2MB superpage boundary.
  */
 #ifndef KERNLOAD
 #define	KERNLOAD	0x200000
@@ -192,7 +194,17 @@
 #define	LARGEMAP_MIN_ADDRESS	KV4ADDR(LMSPML4I, 0, 0, 0)
 #define	LARGEMAP_MAX_ADDRESS	KV4ADDR(LMEPML4I + 1, 0, 0, 0)
 
+/*
+ * Formally kernel mapping starts at KERNBASE, but kernel linker
+ * script leaves first PDE reserved.  For legacy BIOS boot, kernel is
+ * loaded at KERNLOAD = 2M, and initial kernel page table maps
+ * physical memory from zero to KERNend starting at KERNBASE.
+ *
+ * KERNSTART is where the first actual kernel page is mapped, after
+ * the compatibility mapping.
+ */
 #define	KERNBASE		KV4ADDR(KPML4I, KPDPI, 0, 0)
+#define	KERNSTART		(KERNBASE + NBPDR)
 
 #define	UPT_MAX_ADDRESS		KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
 #define	UPT_MIN_ADDRESS		KV4ADDR(PML4PML4I, 0, 0, 0)
diff --git a/sys/conf/ldscript.amd64 b/sys/conf/ldscript.amd64
--- a/sys/conf/ldscript.amd64
+++ b/sys/conf/ldscript.amd64
@@ -5,15 +5,14 @@
 SEARCH_DIR("/usr/lib");
 SECTIONS
 {
-  kernphys = kernload;
   /* Read-only sections, merged into text segment: */
-  . = kernbase + kernphys + SIZEOF_HEADERS;
+  . = kernbase + kernload + SIZEOF_HEADERS;
   /*
    * Use the AT keyword in order to set the right LMA that contains
    * the physical address where the section should be loaded. This is
    * needed for the Xen loader which honours the LMA.
    */
-  .interp     : AT (kernphys + SIZEOF_HEADERS) { *(.interp) }
+  .interp     : AT (kernload + SIZEOF_HEADERS) { *(.interp) }
   .hash          : { *(.hash) }
   .gnu.hash      : { *(.gnu.hash) }
   .dynsym        : { *(.dynsym) }
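
The machdep.c hunk above derives kernphys at runtime by walking the loader-built page table for the virtual address of hammer_time(). The stand-alone sketch below models that walk with plain integer arithmetic; it is illustrative only, not kernel code. The helper names are simplified equivalents of the kernel's pmap_pml4e_index()/pmap_pdpe_index()/pmap_pde_index(), and the direct dereference of physical addresses is valid only under the loader's pre-created 1:1 mapping of low memory, exactly the assumption spelled out in the hunk's comment.

#include <stdint.h>

#define PAGE_MASK	0xfffULL
#define PDRMASK		0x1fffffULL		/* 2M - 1 */

/* 9-bit index of a VA at each paging level (4-level, 2M superpages). */
static inline uint64_t pml4_idx(uint64_t va) { return ((va >> 39) & 0x1ff); }
static inline uint64_t pdpt_idx(uint64_t va) { return ((va >> 30) & 0x1ff); }
static inline uint64_t pd_idx(uint64_t va)   { return ((va >> 21) & 0x1ff); }

/*
 * Read a page-table entry through the loader's 1:1 mapping of the low
 * 4G, so a physical address can be dereferenced directly.
 */
static inline uint64_t
pt_load(uint64_t pa)
{
	return (*(volatile uint64_t *)(uintptr_t)pa);
}

/*
 * Find the physical address that backs KERNBASE: locate the 2M PDE
 * covering 'va' (in the hunk, the address of hammer_time), take its
 * frame, and subtract the 2M-aligned offset of 'va' within the image.
 * The real code then adds NBPDR (2M) because the first PDE slot at
 * KERNBASE is a hole and the kernel proper starts at KERNSTART.
 */
uint64_t
find_kernphys(uint64_t cr3, uint64_t va, uint64_t kernbase)
{
	uint64_t pml4e, pdpe, pde;

	pml4e = pt_load((cr3 & ~PAGE_MASK) + 8 * pml4_idx(va));
	pdpe = pt_load((pml4e & ~PAGE_MASK) + 8 * pdpt_idx(va));
	pde = pt_load((pdpe & ~PAGE_MASK) + 8 * pd_idx(va));

	/* Frame of the 2M superpage minus the image offset of that page. */
	return ((pde & ~PDRMASK) - ((va - kernbase) & ~PDRMASK));
}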
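
bootaddr_rwx() now rebases its comparisons on kernphys: a physical address inside the staged kernel is compared against the 2M-rounded section boundaries translated as kernphys + section - KERNSTART. Below is a stand-alone model of that classification under hypothetical section offsets; the three-way result stands in for the X86_PG_RW/pg_nx combinations the real function returns, and none of the offsets correspond to a real kernel build.

#include <stdint.h>
#include <stdio.h>

#define NBPDR		0x200000ULL
#define trunc_2mpage(x)	((x) & ~(NBPDR - 1))
#define round_2mpage(x)	(((x) + NBPDR - 1) & ~(NBPDR - 1))

/* Hypothetical link-time offsets of the section markers from KERNSTART. */
static const uint64_t btext_off = 0x0;
static const uint64_t etext_off = 0x00e00000;
static const uint64_t brwsection_off = 0x01200000;
static const uint64_t end_off = 0x01a00000;

enum prot { PROT_RW_NX, PROT_RX, PROT_RO_NX };

static enum prot
bootaddr_prot(uint64_t pa, uint64_t kernphys)
{
	/* Below the kernel text or past its image: writable, never executed. */
	if (pa < trunc_2mpage(kernphys + btext_off) ||
	    pa >= trunc_2mpage(kernphys + end_off))
		return (PROT_RW_NX);
	/* Read-write data, and anything sharing its 2M pages. */
	if (pa >= trunc_2mpage(kernphys + brwsection_off))
		return (PROT_RW_NX);
	/* Text: executable, effectively read-only once CR0.WP is set. */
	if (pa < round_2mpage(kernphys + etext_off))
		return (PROT_RX);
	/* Read-only data. */
	return (PROT_RO_NX);
}

int
main(void)
{
	uint64_t kernphys = 0x76400000;	/* hypothetical staging address */

	/* First text page: readable and executable (prints 1). */
	printf("%d\n", bootaddr_prot(kernphys + btext_off, kernphys));
	/* Page beyond the image: writable, non-executable (prints 0). */
	printf("%d\n", bootaddr_prot(kernphys + end_off + NBPDR, kernphys));
	return (0);
}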
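
The rewritten PD loop in create_pagetables() replaces the old fixed relation VA = KERNBASE + PA with VA = KERNSTART + (PA - kernphys), while pd_p[0] keeps one compatibility 2M mapping at KERNBASE pointing at physical zero. A minimal runnable sketch of that relation follows; the kernphys values passed in are made up, the second one standing in for a UEFI loader that staged the kernel at an arbitrary sub-4G address.

#include <stdint.h>
#include <stdio.h>

#define NBPDR		0x200000ULL
#define KERNBASE	0xffffffff80000000ULL
#define KERNSTART	(KERNBASE + NBPDR)

static uint64_t
kva_to_pa(uint64_t kva, uint64_t kernphys)
{
	if (kva < KERNSTART)		/* compatibility 2M at KERNBASE */
		return (kva - KERNBASE);
	return (kernphys + (kva - KERNSTART));
}

int
main(void)
{
	/* Legacy BIOS boot: kernel staged at 2M, same numbers as before. */
	printf("%#jx\n", (uintmax_t)kva_to_pa(KERNSTART, 0x200000));
	/* UEFI boot: kernel staged wherever the loader allocated it. */
	printf("%#jx\n", (uintmax_t)kva_to_pa(KERNSTART, 0x76400000));
	/* Compatibility mapping still resolves to low physical memory. */
	printf("%#jx\n", (uintmax_t)kva_to_pa(KERNBASE + 0x1000, 0x76400000));
	return (0);
}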
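
The DMAP overlay in create_pagetables() must now begin at the 1G boundary containing kernphys rather than at physical zero: nkdmpde counts how many pages of 2M PDEs (one per 1G slot) are needed to cover the image up to its read-write section, and j is the first DMAP PDP index those replacement PDs are installed at, hence pdp_p[i + j]. A small runnable check of that arithmetic with made-up figures for the staging address and image size:

#include <stdint.h>
#include <stdio.h>

#define NBPDR		0x200000ULL	/* 2M */
#define NBPDP		0x40000000ULL	/* 1G */
#define PDPSHIFT	30
#define howmany(x, y)	(((x) + (y) - 1) / (y))
#define rounddown2(x, y) ((x) & ~((y) - 1))

int
main(void)
{
	uint64_t kernphys = 0x76400000;		/* hypothetical staging addr */
	uint64_t brw_off = 0x01c00000;		/* brwsection - KERNSTART, made up */
	uint64_t nkdmpde, j;

	/* Pages of 2M PDEs needed from the 1G boundary below kernphys. */
	nkdmpde = howmany(brw_off + kernphys -
	    rounddown2(kernphys, NBPDP), NBPDP);
	/* First DMAP PDP index overwritten with the replacement PDs. */
	j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
	printf("nkdmpde=%ju j=%ju\n", (uintmax_t)nkdmpde, (uintmax_t)j);
	return (0);
}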