diff --git a/TODO b/TODO new file mode 100644 --- /dev/null +++ b/TODO @@ -0,0 +1,5 @@ +- Xen + +checks: +non-EFI boot +non-amd64 EFI diff --git a/stand/common/bootstrap.h b/stand/common/bootstrap.h --- a/stand/common/bootstrap.h +++ b/stand/common/bootstrap.h @@ -228,6 +228,9 @@ size_t f_size; /* file size */ struct kernel_module *f_modules; /* list of modules if any */ struct preloaded_file *f_next; /* next file */ +#ifdef __amd64__ + bool f_kernphys_relocatable; +#endif }; struct file_format diff --git a/stand/common/load_elf.c b/stand/common/load_elf.c --- a/stand/common/load_elf.c +++ b/stand/common/load_elf.c @@ -207,6 +207,18 @@ #undef CONVERT_SWITCH #undef CONVERT_FIELD + +#ifdef __amd64__ +static bool +is_kernphys_relocatable(elf_file_t ef) +{ + Elf_Sym sym; + + return (__elfN(lookup_symbol)(ef, "kernphys", &sym, STT_OBJECT) == 0 && + sym.st_size == 8); +} +#endif + static int __elfN(load_elf_header)(char *filename, elf_file_t ef) { @@ -434,6 +446,9 @@ /* Load OK, return module pointer */ *result = (struct preloaded_file *)fp; err = 0; +#ifdef __amd64__ + fp->f_kernphys_relocatable = is_kernphys_relocatable(&ef); +#endif goto out; ioerr: diff --git a/stand/efi/loader/arch/amd64/elf64_freebsd.c b/stand/efi/loader/arch/amd64/elf64_freebsd.c --- a/stand/efi/loader/arch/amd64/elf64_freebsd.c +++ b/stand/efi/loader/arch/amd64/elf64_freebsd.c @@ -82,7 +82,11 @@ static pml4_entry_t *PT4; static pdp_entry_t *PT3; +static pdp_entry_t *PT3_l, *PT3_u; static pd_entry_t *PT2; +static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1; + +extern EFI_PHYSICAL_ADDRESS staging; static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend, uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry); @@ -105,6 +109,12 @@ ACPI_TABLE_RSDP *rsdp; char buf[24]; int revision; + bool copy_auto; + + copy_auto = copy_staging == COPY_STAGING_AUTO; + if (copy_auto) + copy_staging = fp->f_kernphys_relocatable ? + COPY_STAGING_DISABLE : COPY_STAGING_ENABLE; /* * Report the RSDP to the kernel. While this can be found with @@ -151,57 +161,133 @@ } if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL) - return(EFTYPE); + return (EFTYPE); ehdr = (Elf_Ehdr *)&(md->md_data); - trampcode = (vm_offset_t)0x0000000040000000; + trampcode = copy_staging == COPY_STAGING_ENABLE ? 
+ (vm_offset_t)0x0000000040000000 /* 1G */ : + (vm_offset_t)0x0000000100000000; /* 4G */; err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1, (EFI_PHYSICAL_ADDRESS *)&trampcode); + if (EFI_ERROR(err)) { + printf("Unable to allocate trampoline\n"); + if (copy_auto) + copy_staging = COPY_STAGING_AUTO; + return (ENOMEM); + } bzero((void *)trampcode, EFI_PAGE_SIZE); trampstack = trampcode + EFI_PAGE_SIZE - 8; bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size); trampoline = (void *)trampcode; - PT4 = (pml4_entry_t *)0x0000000040000000; - err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, - (EFI_PHYSICAL_ADDRESS *)&PT4); - bzero(PT4, 3 * EFI_PAGE_SIZE); + if (copy_staging == COPY_STAGING_ENABLE) { + PT4 = (pml4_entry_t *)0x0000000040000000; + err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, + (EFI_PHYSICAL_ADDRESS *)&PT4); + if (EFI_ERROR(err)) { + printf("Unable to allocate trampoline page table\n"); + BS->FreePages(trampcode, 1); + if (copy_auto) + copy_staging = COPY_STAGING_AUTO; + return (ENOMEM); + } + bzero(PT4, 3 * EFI_PAGE_SIZE); + PT3 = &PT4[512]; + PT2 = &PT3[512]; + + /* + * This is kinda brutal, but every single 1GB VM + * memory segment points to the same first 1GB of + * physical memory. But it is more than adequate. + */ + for (i = 0; i < NPTEPG; i++) { + /* + * Each slot of the L4 pages points to the + * same L3 page. + */ + PT4[i] = (pml4_entry_t)PT3; + PT4[i] |= PG_V | PG_RW; + + /* + * Each slot of the L3 pages points to the + * same L2 page. + */ + PT3[i] = (pdp_entry_t)PT2; + PT3[i] |= PG_V | PG_RW; + + /* + * The L2 page slots are mapped with 2MB pages for 1GB. + */ + PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024); + PT2[i] |= PG_V | PG_RW | PG_PS; + } + } else { + PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */ + err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9, + (EFI_PHYSICAL_ADDRESS *)&PT4); + if (EFI_ERROR(err)) { + printf("Unable to allocate trampoline page table\n"); + BS->FreePages(trampcode, 9); + if (copy_auto) + copy_staging = COPY_STAGING_AUTO; + return (ENOMEM); + } - PT3 = &PT4[512]; - PT2 = &PT3[512]; + bzero(PT4, 9 * EFI_PAGE_SIZE); + + PT3_l = &PT4[NPML4EPG * 1]; + PT3_u = &PT4[NPML4EPG * 2]; + PT2_l0 = &PT4[NPML4EPG * 3]; + PT2_l1 = &PT4[NPML4EPG * 4]; + PT2_l2 = &PT4[NPML4EPG * 5]; + PT2_l3 = &PT4[NPML4EPG * 6]; + PT2_u0 = &PT4[NPML4EPG * 7]; + PT2_u1 = &PT4[NPML4EPG * 8]; + + /* 1:1 mapping of lower 4G */ + PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW; + PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW; + PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW; + PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW; + PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW; + for (i = 0; i < 4 * NPDEPG; i++) { + PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | + PG_RW | PG_PS; + } - /* - * This is kinda brutal, but every single 1GB VM memory segment points - * to the same first 1GB of physical memory. But it is more than - * adequate. - */ - for (i = 0; i < 512; i++) { - /* Each slot of the L4 pages points to the same L3 page. */ - PT4[i] = (pml4_entry_t)PT3; - PT4[i] |= PG_V | PG_RW; - - /* Each slot of the L3 pages points to the same L2 page. */ - PT3[i] = (pdp_entry_t)PT2; - PT3[i] |= PG_V | PG_RW; - - /* The L2 page slots are mapped with 2MB pages for 1GB. 
*/ - PT2[i] = i * (2 * 1024 * 1024); - PT2[i] |= PG_V | PG_RW | PG_PS; + /* mapping of kernel 2G below top */ + PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW; + PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW; + PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW; + /* compat mapping of phys @0 */ + PT2_u0[0] = PG_PS | PG_V | PG_RW; + /* this maps past staging area */ + for (i = 1; i < 2 * NPDEPG; i++) { + PT2_u0[i] = ((pd_entry_t)staging + + ((pd_entry_t)i - 1) * NBPDR) | + PG_V | PG_RW | PG_PS; + } } + printf("staging %#lx (%scoping) tramp %p PT4 %p\n", + staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ", + trampoline, PT4); printf("Start @ 0x%lx ...\n", ehdr->e_entry); efi_time_fini(); err = bi_load(fp->f_args, &modulep, &kernend, true); if (err != 0) { efi_time_init(); - return(err); + if (copy_auto) + copy_staging = COPY_STAGING_AUTO; + return (err); } dev_cleanup(); - trampoline(trampstack, efi_copy_finish, kernend, modulep, PT4, - ehdr->e_entry); + trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ? + efi_copy_finish : efi_copy_finish_nop, kernend, modulep, + PT4, ehdr->e_entry); panic("exec returned"); } diff --git a/stand/efi/loader/bootinfo.c b/stand/efi/loader/bootinfo.c --- a/stand/efi/loader/bootinfo.c +++ b/stand/efi/loader/bootinfo.c @@ -65,6 +65,8 @@ extern EFI_SYSTEM_TABLE *ST; +int boot_services_gone; + static int bi_getboothowto(char *kargs) { @@ -396,8 +398,10 @@ if (!exit_bs) break; status = BS->ExitBootServices(IH, efi_mapkey); - if (!EFI_ERROR(status)) + if (!EFI_ERROR(status)) { + boot_services_gone = 1; break; + } } if (retry == 0) { diff --git a/stand/efi/loader/copy.c b/stand/efi/loader/copy.c --- a/stand/efi/loader/copy.c +++ b/stand/efi/loader/copy.c @@ -39,6 +39,11 @@ #include "loader_efi.h" +#define M(x) ((x) * 1024 * 1024) +#define G(x) (1UL * (x) * 1024 * 1024 * 1024) + +extern int boot_services_gone; + #if defined(__i386__) || defined(__amd64__) #include #include @@ -175,24 +180,142 @@ #ifndef EFI_STAGING_SIZE #if defined(__arm__) -#define EFI_STAGING_SIZE 32 +#define EFI_STAGING_SIZE M(32) +#else +#define EFI_STAGING_SIZE M(64) +#endif +#endif + +#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ + defined(__riscv) +#define EFI_STAGING_2M_ALIGN 1 #else -#define EFI_STAGING_SIZE 64 +#define EFI_STAGING_2M_ALIGN 0 #endif + +#if defined(__amd64__) +#define EFI_STAGING_SLOP M(8) +#else +#define EFI_STAGING_SLOP 0 #endif +static u_long staging_slop = EFI_STAGING_SLOP; + EFI_PHYSICAL_ADDRESS staging, staging_end, staging_base; int stage_offset_set = 0; ssize_t stage_offset; +static void +efi_copy_free(void) +{ + BS->FreePages(staging_base, (staging_end - staging_base) / + EFI_PAGE_SIZE); + stage_offset_set = 0; + stage_offset = 0; +} + +#ifdef __amd64__ +int copy_staging = COPY_STAGING_AUTO; + +static int +command_copy_staging(int argc, char *argv[]) +{ + static const char *const mode[3] = { + [COPY_STAGING_ENABLE] = "enable", + [COPY_STAGING_DISABLE] = "disable", + [COPY_STAGING_AUTO] = "auto", + }; + int prev, res; + + res = CMD_OK; + if (argc > 2) { + res = CMD_ERROR; + } else if (argc == 2) { + prev = copy_staging; + if (strcmp(argv[1], "enable") == 0) + copy_staging = COPY_STAGING_ENABLE; + else if (strcmp(argv[1], "disable") == 0) + copy_staging = COPY_STAGING_DISABLE; + else if (strcmp(argv[1], "auto") == 0) + copy_staging = COPY_STAGING_AUTO; + else { + printf("usage: copy_staging enable|disable|auto\n"); + res = CMD_ERROR; + } + if (res == CMD_OK && prev != copy_staging) { + printf("changed 
copy_staging, unloading kernel\n"); + unload(); + efi_copy_free(); + efi_copy_init(); + } + } else { + printf("copy staging: %s\n", mode[copy_staging]); + } + return (res); +} +COMMAND_SET(copy_staging, "copy_staging", "copy staging", command_copy_staging); +#endif + +static int +command_staging_slop(int argc, char *argv[]) +{ + char *endp; + u_long new, prev; + int res; + + res = CMD_OK; + if (argc > 2) { + res = CMD_ERROR; + } else if (argc == 2) { + new = strtoul(argv[1], &endp, 0); + if (*endp != '\0') { + printf("invalid slop value\n"); + res = CMD_ERROR; + } + if (res == CMD_OK && staging_slop != new) { + printf("changed slop, unloading kernel\n"); + unload(); + efi_copy_free(); + efi_copy_init(); + } + } else { + printf("staging slop %#lx\n", staging_slop); + } + return (res); +} +COMMAND_SET(staging_slop, "staging_slop", "set staging slop", + command_staging_slop); + +#if defined(__i386__) || defined(__amd64__) +/* + * The staging area must reside in the the first 1GB or 4GB physical + * memory: see elf64_exec() in + * boot/efi/loader/arch/amd64/elf64_freebsd.c. + */ +static EFI_PHYSICAL_ADDRESS +get_staging_max(void) +{ + EFI_PHYSICAL_ADDRESS res; + +#if defined(__i386__) + res = G(1); +#elif defined(__amd64__) + res = copy_staging == COPY_STAGING_ENABLE ? G(1) : G(4); +#endif + return (res); +} +#define EFI_ALLOC_METHOD AllocateMaxAddress +#else +#define EFI_ALLOC_METHOD AllocateAnyPages +#endif + int efi_copy_init(void) { EFI_STATUS status; - unsigned long nr_pages; - nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE) * 1024 * 1024); + nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE)); #if defined(__i386__) || defined(__amd64__) /* @@ -203,18 +326,10 @@ if (running_on_hyperv()) efi_verify_staging_size(&nr_pages); - /* - * The staging area must reside in the the first 1GB physical - * memory: see elf64_exec() in - * boot/efi/loader/arch/amd64/elf64_freebsd.c. - */ - staging = 1024*1024*1024; - status = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, - nr_pages, &staging); -#else - status = BS->AllocatePages(AllocateAnyPages, EfiLoaderData, - nr_pages, &staging); + staging = get_staging_max(); #endif + status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData, + nr_pages, &staging); if (EFI_ERROR(status)) { printf("failed to allocate staging area: %lu\n", EFI_ERROR_CODE(status)); @@ -223,7 +338,7 @@ staging_base = staging; staging_end = staging + nr_pages * EFI_PAGE_SIZE; -#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +#if EFI_STAGING_2M_ALIGN /* * Round the kernel load address to a 2MiB value. This is needed * because the kernel builds a page table based on where it has @@ -231,7 +346,7 @@ * either a 1MiB or 2MiB page for this we need to make sure it * is correctly aligned for both cases. */ - staging = roundup2(staging, 2 * 1024 * 1024); + staging = roundup2(staging, M(2)); #endif return (0); @@ -240,20 +355,42 @@ static bool efi_check_space(vm_offset_t end) { - EFI_PHYSICAL_ADDRESS addr; + EFI_PHYSICAL_ADDRESS addr, new_base, new_staging; EFI_STATUS status; unsigned long nr_pages; + end = roundup2(end, EFI_PAGE_SIZE); + /* There is already enough space */ - if (end <= staging_end) + if (end + staging_slop <= staging_end) return (true); - end = roundup2(end, EFI_PAGE_SIZE); - nr_pages = EFI_SIZE_TO_PAGES(end - staging_end); + if (boot_services_gone) { + if (end <= staging_end) + return (true); + panic("efi_check_space: cannot expand staging area " + "after boot services were exited\n"); + } + + /* + * Add slop at the end: + * 1. 
The amd64 kernel expects to do some very early allocations
+	 *    by carving out memory after kernend.  The slop guarantees
+	 *    that these allocations do not overwrite anything useful.
+	 * 2. The initial calculation of the staging size can turn out
+	 *    somewhat smaller than the amount that is actually copied in
+	 *    after boot services are exited.  The slop avoids calling
+	 *    BS->AllocatePages() when it can no longer work.
+	 */
+	end += staging_slop;
+	nr_pages = EFI_SIZE_TO_PAGES(end - staging_end);
 #if defined(__i386__) || defined(__amd64__)
-	/* X86 needs all memory to be allocated under the 1G boundary */
-	if (end > 1024*1024*1024)
+	/*
+	 * i386 needs all memory to be allocated under the 1G boundary.
+	 * amd64 needs all memory to be allocated under the 1G or 4G boundary.
+	 */
+	if (end > get_staging_max())
 		goto before_staging;
 #endif
@@ -268,14 +405,12 @@
 before_staging:
 	/* Try allocating space before the previous allocation */
-	if (staging < nr_pages * EFI_PAGE_SIZE) {
-		printf("Not enough space before allocation\n");
-		return (false);
-	}
+	if (staging < nr_pages * EFI_PAGE_SIZE)
+		goto expand;
 	addr = staging - nr_pages * EFI_PAGE_SIZE;
-#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+#if EFI_STAGING_2M_ALIGN
 	/* See efi_copy_init for why this is needed */
-	addr = rounddown2(addr, 2 * 1024 * 1024);
+	addr = rounddown2(addr, M(2));
 #endif
 	nr_pages = EFI_SIZE_TO_PAGES(staging_base - addr);
 	status = BS->AllocatePages(AllocateAddress, EfiLoaderData, nr_pages,
@@ -288,11 +423,42 @@
 		staging_base = addr;
 		memmove((void *)(uintptr_t)staging_base,
 		    (void *)(uintptr_t)staging, staging_end - staging);
-		stage_offset -= (staging - staging_base);
+		stage_offset -= staging - staging_base;
 		staging = staging_base;
 		return (true);
 	}
+expand:
+	nr_pages = EFI_SIZE_TO_PAGES(end - (vm_offset_t)staging);
+#if EFI_STAGING_2M_ALIGN
+	nr_pages += M(2) / EFI_PAGE_SIZE;
+#endif
+#if defined(__i386__) || defined(__amd64__)
+	new_base = get_staging_max();
+#endif
+	status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData,
+	    nr_pages, &new_base);
+	if (!EFI_ERROR(status)) {
+#if EFI_STAGING_2M_ALIGN
+		new_staging = roundup2(new_base, M(2));
+#else
+		new_staging = new_base;
+#endif
+		/*
+		 * Move the old allocation and update the state so
+		 * translation still works.
+ */ + memcpy((void *)(uintptr_t)new_staging, + (void *)(uintptr_t)staging, staging_end - staging); + BS->FreePages(staging_base, (staging_end - staging_base) / + EFI_PAGE_SIZE); + stage_offset -= staging - new_staging; + staging = new_staging; + staging_end = new_base + nr_pages * EFI_PAGE_SIZE; + staging_base = new_base; + return (true); + } + printf("efi_check_space: Unable to expand staging area\n"); return (false); } @@ -335,7 +501,6 @@ return (len); } - ssize_t efi_readin(readin_handle_t fd, vm_offset_t dest, const size_t len) { @@ -364,3 +529,8 @@ while (src < last) *dst++ = *src++; } + +void +efi_copy_finish_nop(void) +{ +} diff --git a/stand/efi/loader/loader_efi.h b/stand/efi/loader/loader_efi.h --- a/stand/efi/loader/loader_efi.h +++ b/stand/efi/loader/loader_efi.h @@ -34,6 +34,15 @@ #include #include +#ifdef __amd64__ +enum { + COPY_STAGING_ENABLE, + COPY_STAGING_DISABLE, + COPY_STAGING_AUTO, +}; +extern int copy_staging; +#endif + int efi_autoload(void); int efi_copy_init(void); @@ -44,5 +53,6 @@ void * efi_translate(vm_offset_t ptr); void efi_copy_finish(void); +void efi_copy_finish_nop(void); #endif /* _LOADER_EFI_COPY_H_ */ diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -222,6 +222,8 @@ void (*vmm_resume_p)(void); +bool efi_boot; + static void cpu_startup(dummy) void *dummy; @@ -1277,7 +1279,7 @@ * in real mode mode (e.g. SMP bare metal). */ #ifdef SMP - mp_bootaddress(physmap, &physmap_idx); + alloc_ap_trampoline(physmap, &physmap_idx); #endif /* call pmap initialization to make new kernel address space */ @@ -1598,16 +1600,47 @@ int gsel_tss, x; struct pcpu *pc; struct xstate_hdr *xhdr; - u_int64_t rsp0; + uint64_t cr3, rsp0; + pml4_entry_t *pml4e; + pdp_entry_t *pdpe; + pd_entry_t *pde; char *env; struct user_segment_descriptor *gdt; struct region_descriptor r_gdt; size_t kstack0_sz; int late_console; - bool efi_boot; TSRAW(&thread0, TS_ENTER, __func__, NULL); + /* + * Calculate kernphys by inspecting page table created by loader. + * The assumptions: + * - kernel is mapped at KERNBASE, backed by contiguous phys memory + * aligned at 2M, below 4G (the latter is important for AP startup) + * - there is a 2M hole at KERNBASE + * - kernel is mapped with 2M superpages + * - all participating memory, i.e. 
kernel, modules, metadata, + * page table is accessible by pre-created 1:1 mapping + * (right now loader creates 1:1 mapping for lower 4G, and all + * memory is from there) + * - there is a usable memory block right after the end of the + * mapped kernel and all modules/metadata, pointed to by + * physfree, for early allocations + */ + cr3 = rcr3(); + pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index( + (vm_offset_t)hammer_time); + pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index( + (vm_offset_t)hammer_time); + pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index( + (vm_offset_t)hammer_time); + kernphys = (vm_paddr_t)(*pde & ~PDRMASK) - + (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK); + + /* Fix-up for 2M hole */ + physfree += kernphys; + kernphys += NBPDR; + kmdp = init_ops.parse_preload_data(modulep); efi_boot = preload_search_info(kmdp, MODINFO_METADATA | @@ -1653,7 +1686,7 @@ /* Init basic tunables, hz etc */ init_param1(); - thread0.td_kstack = physfree + KERNBASE; + thread0.td_kstack = physfree - kernphys + KERNSTART; thread0.td_kstack_pages = kstack_pages; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); @@ -1690,7 +1723,7 @@ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ - dpcpu_init((void *)(physfree + KERNBASE), 0); + dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0); physfree += DPCPU_SIZE; amd64_bsp_pcpu_init1(pc); /* Non-late cninit() and printf() can be moved up to here. */ diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -105,6 +105,7 @@ static char *dbg_stack; extern u_int mptramp_la57; +extern u_int mptramp_nx; /* * Local data and functions. @@ -112,86 +113,6 @@ static int start_ap(int apic_id); -static bool -is_kernel_paddr(vm_paddr_t pa) -{ - - return (pa >= trunc_2mpage(btext - KERNBASE) && - pa < round_page(_end - KERNBASE)); -} - -static bool -is_mpboot_good(vm_paddr_t start, vm_paddr_t end) -{ - - return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem); -} - -/* - * Calculate usable address in base memory for AP trampoline code. - */ -void -mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx) -{ - vm_paddr_t start, end; - unsigned int i; - bool allocated; - - alloc_ap_trampoline(physmap, physmap_idx); - - /* - * Find a memory region big enough below the 4GB boundary to - * store the initial page tables. Region must be mapped by - * the direct map. - * - * Note that it needs to be aligned to a page boundary. - */ - allocated = false; - for (i = *physmap_idx; i <= *physmap_idx; i -= 2) { - /* - * First, try to chomp at the start of the physmap region. - * Kernel binary might claim it already. - */ - start = round_page(physmap[i]); - end = start + AP_BOOTPT_SZ; - if (start < end && end <= physmap[i + 1] && - is_mpboot_good(start, end) && - !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) { - allocated = true; - physmap[i] = end; - break; - } - - /* - * Second, try to chomp at the end. Again, check - * against kernel. 
- */ - end = trunc_page(physmap[i + 1]); - start = end - AP_BOOTPT_SZ; - if (start < end && start >= physmap[i] && - is_mpboot_good(start, end) && - !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) { - allocated = true; - physmap[i + 1] = start; - break; - } - } - if (allocated) { - mptramp_pagetables = start; - if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) { - memmove(&physmap[i], &physmap[i + 2], - sizeof(*physmap) * (*physmap_idx - i + 2)); - *physmap_idx -= 2; - } - } else { - mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ; - if (bootverbose) - printf( -"Cannot find enough space for the initial AP page tables, placing them at %#x", - mptramp_pagetables); - } -} - /* * Initialize the IPI handlers and start up the AP's. */ @@ -243,6 +164,9 @@ assign_cpu_ids(); mptramp_la57 = la57; + mptramp_nx = pg_nx != 0; + MPASS(kernel_pmap->pm_cr3 < (1UL << 32)); + mptramp_pagetables = kernel_pmap->pm_cr3; /* Start each Application Processor */ start_all_aps(); @@ -399,64 +323,79 @@ int start_all_aps(void) { - u_int64_t *pt5, *pt4, *pt3, *pt2; + vm_page_t m_pml4, m_pdp, m_pd[4]; + pml5_entry_t old_pml45; + pml4_entry_t *v_pml4; + pdp_entry_t *v_pdp; + pd_entry_t *v_pd; u_int32_t mpbioswarmvec; - int apic_id, cpu, domain, i, xo; + int apic_id, cpu, domain, i; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); - /* copy the AP 1st level boot code */ - bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); - - /* Locate the page tables, they'll be below the trampoline */ + /* Create a transient 1:1 mapping of low 4G */ if (la57) { - pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); - xo = 1; + m_pml4 = pmap_page_alloc_below_4g(true); + v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); } else { - xo = 0; + v_pml4 = &kernel_pmap->pm_pmltop[0]; } - pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE); - pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); - pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); - - /* Create the initial 1GB replicated page tables */ - for (i = 0; i < 512; i++) { - if (la57) { - pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + - PAGE_SIZE); - pt5[i] |= PG_V | PG_RW | PG_U; - } - - /* - * Each slot of the level 4 pages points to the same - * level 3 page. - */ - pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + - (xo + 1) * PAGE_SIZE); - pt4[i] |= PG_V | PG_RW | PG_U; - - /* - * Each slot of the level 3 pages points to the same - * level 2 page. - */ - pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + - ((xo + 2) * PAGE_SIZE)); - pt3[i] |= PG_V | PG_RW | PG_U; - - /* The level 2 page slots are mapped with 2MB pages for 1GB. 
*/ - pt2[i] = i * (2 * 1024 * 1024); - pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; + m_pdp = pmap_page_alloc_below_4g(true); + v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); + m_pd[0] = pmap_page_alloc_below_4g(false); + v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0])); + for (i = 0; i < NPDEPG; i++) + v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M | PG_PS; + m_pd[1] = pmap_page_alloc_below_4g(false); + v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1])); + for (i = 0; i < NPDEPG; i++) + v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW | + X86_PG_A | X86_PG_M | PG_PS; + m_pd[2] = pmap_page_alloc_below_4g(false); + v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2])); + for (i = 0; i < NPDEPG; i++) + v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS; + m_pd[3] = pmap_page_alloc_below_4g(false); + v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3])); + for (i = 0; i < NPDEPG; i++) + v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS; + v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + old_pml45 = kernel_pmap->pm_pmltop[0]; + if (la57) { + kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) | + X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; } + v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + pmap_invalidate_all(kernel_pmap); + + /* copy the AP 1st level boot code */ + bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); + if (bootverbose) + printf("AP boot address %#x\n", boot_address); /* save the current value of the warm-start vector */ - mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); + if (!efi_boot) + mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); outb(CMOS_REG, BIOS_RESET); mpbiosreason = inb(CMOS_DATA); /* setup a vector to our boot code */ - *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; - *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4); + if (!efi_boot) { + *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET; + *((volatile u_short *)WARMBOOT_SEG) = (boot_address >> 4); + } outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ @@ -512,6 +451,17 @@ outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); + /* Destroy transient 1:1 mapping */ + kernel_pmap->pm_pmltop[0] = old_pml45; + invlpg(0); + if (la57) + vm_page_free(m_pml4); + vm_page_free(m_pd[3]); + vm_page_free(m_pd[2]); + vm_page_free(m_pd[1]); + vm_page_free(m_pd[0]); + vm_page_free(m_pdp); + /* number of APs actually started */ return (mp_naps); } diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S --- a/sys/amd64/amd64/mpboot.S +++ b/sys/amd64/amd64/mpboot.S @@ -95,12 +95,25 @@ * is later enabled. */ mov %cr4, %eax - orl $CR4_PAE, %eax + orl $(CR4_PAE | CR4_PGE), %eax cmpb $0, mptramp_la57-mptramp_start(%ebx) je 1f orl $CR4_LA57, %eax 1: mov %eax, %cr4 + /* + * If the BSP reported NXE support, enable EFER.NXE for all APs + * prior to loading %cr3. This avoids page faults if the AP + * encounters memory marked with the NX bit prior to detecting and + * enabling NXE support. 
+ */ + cmpb $0,mptramp_nx-mptramp_start(%ebx) + je 2f + movl $MSR_EFER, %ecx + rdmsr + orl $EFER_NXE, %eax + wrmsr +2: /* * Enable EFER.LME so that we get long mode when all the prereqs are * in place. In this case, it turns on when CR0_PG is finally enabled. @@ -112,12 +125,13 @@ wrmsr /* - * Point to the embedded page tables for startup. Note that this - * only gets accessed after we're actually in 64 bit mode, however - * we can only set the bottom 32 bits of %cr3 in this state. This - * means we are required to use a temporary page table that is below - * the 4GB limit. %ebx is still our relocation base. We could just - * subtract 3 * PAGE_SIZE, but that would be too easy. + * Load kernel page table pointer into %cr3. + * %ebx is still our relocation base. + * + * Note that this only gets accessed after we're actually in 64 bit + * mode, however we can only set the bottom 32 bits of %cr3 in this + * state. This means we depend on the kernel page table being + * allocated from the low 4G. */ leal mptramp_pagetables-mptramp_start(%ebx),%eax movl (%eax), %eax @@ -155,10 +169,8 @@ /* * Yeehar! We're running in 64 bit mode! We can mostly ignore our * segment registers, and get on with it. - * Note that we are running at the correct virtual address, but with - * a 1:1 1GB mirrored mapping over entire address space. We had better - * switch to a real %cr3 promptly so that we can get to the direct map - * space. Remember that jmp is relative and that we've been relocated, + * We are running at the correct virtual address space. + * Note that the jmp is relative and that we've been relocated, * so use an indirect jump. */ .code64 @@ -220,6 +232,10 @@ mptramp_la57: .long 0 + .globl mptramp_nx +mptramp_nx: + .long 0 + /* * The pseudo descriptor for lgdt to use. */ @@ -243,31 +259,5 @@ .code64 .p2align 4,0 entry_64: - /* - * If the BSP reported NXE support, enable EFER.NXE for all APs - * prior to loading %cr3. This avoids page faults if the AP - * encounters memory marked with the NX bit prior to detecting and - * enabling NXE support. - */ - movq pg_nx, %rbx - testq %rbx, %rbx - je 1f - movl $MSR_EFER, %ecx - rdmsr - orl $EFER_NXE, %eax - wrmsr - -1: - /* - * Load a real %cr3 that has all the direct map stuff and switches - * off the 1GB replicated mirror. Load a stack pointer and jump - * into AP startup code in C. - */ - cmpl $0, la57 - jne 2f - movq KPML4phys, %rax - jmp 3f -2: movq KPML5phys, %rax -3: movq %rax, %cr3 movq bootSTK, %rsp jmp init_secondary diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -436,7 +436,8 @@ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ -static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ +vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ +vm_paddr_t KERNend; /* and the end */ /* * pmap_mapdev support pre initialization (i.e. console) @@ -1554,7 +1555,7 @@ #ifdef NKPT pt_pages = NKPT; #else - pt_pages = howmany(addr, NBPDR); + pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ pt_pages += NKPDPE(pt_pages); /* @@ -1594,7 +1595,6 @@ static inline pt_entry_t bootaddr_rwx(vm_paddr_t pa) { - /* * The kernel is loaded at a 2MB-aligned address, and memory below that * need not be executable. The .bss section is padded to a 2MB @@ -1602,8 +1602,8 @@ * either. Preloaded kernel modules have their mapping permissions * fixed up by the linker. 
*/ - if (pa < trunc_2mpage(btext - KERNBASE) || - pa >= trunc_2mpage(_end - KERNBASE)) + if (pa < trunc_2mpage(kernphys + btext - KERNSTART) || + pa >= trunc_2mpage(kernphys + _end - KERNSTART)) return (X86_PG_RW | pg_nx); /* @@ -1612,7 +1612,7 @@ * impact read-only data. However, in any case, any page with * read-write data needs to be read-write. */ - if (pa >= trunc_2mpage(brwsection - KERNBASE)) + if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART)) return (X86_PG_RW | pg_nx); /* @@ -1624,7 +1624,7 @@ * Note that fixups to the .text section will still work until we * set CR0.WP. */ - if (pa < round_2mpage(etext - KERNBASE)) + if (pa < round_2mpage(kernphys + etext - KERNSTART)) return (0); return (pg_nx); } @@ -1636,6 +1636,7 @@ pdp_entry_t *pdp_p; pml4_entry_t *p4_p; uint64_t DMPDkernphys; + vm_paddr_t pax; #ifdef KASAN pt_entry_t *pt_p; uint64_t KASANPDphys, KASANPTphys, KASANphys; @@ -1670,9 +1671,11 @@ /* * Allocate 2M pages for the kernel. These will be used in - * place of the first one or more 1G pages from ndm1g. + * place of the one or more 1G pages from ndm1g that maps + * kernel memory into DMAP. */ - nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP); + nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART + + kernphys - rounddown2(kernphys, NBPDP), NBPDP); DMPDkernphys = allocpages(firstaddr, nkdmpde); } if (ndm1g < ndmpdp) @@ -1719,14 +1722,18 @@ pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* - * Map from physical address zero to the end of loader preallocated - * memory using 2MB pages. This replaces some of the PD entries - * created above. + * Map from start of the kernel in physical memory (staging + * area) to the end of loader preallocated memory using 2MB + * pages. This replaces some of the PD entries created above. + * For compatibility, identity map 2M at the start. */ - for (i = 0; (i << PDRSHIFT) < KERNend; i++) + pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | + X86_PG_RW | pg_nx; + for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) { /* Preset PG_M and PG_A because demotion expects it. */ - pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | - X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT); + pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | + X86_PG_A | bootaddr_rwx(pax); + } /* * Because we map the physical blocks in 2M pages, adjust firstaddr @@ -1792,15 +1799,18 @@ * use 2M pages with read-only and no-execute permissions. (If using 1G * pages, this will partially overwrite the PDPEs above.) */ - if (ndm1g) { + if (ndm1g > 0) { pd_p = (pd_entry_t *)DMPDkernphys; - for (i = 0; i < (NPDEPG * nkdmpde); i++) - pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | - X86_PG_M | X86_PG_A | pg_nx | - bootaddr_rwx(i << PDRSHIFT); - for (i = 0; i < nkdmpde; i++) - pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | - X86_PG_V | pg_nx; + for (i = 0, pax = rounddown2(kernphys, NBPDP); + i < NPDEPG * nkdmpde; i++, pax += NBPDR) { + pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | + X86_PG_A | pg_nx | bootaddr_rwx(pax); + } + j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; + for (i = 0; i < nkdmpde; i++) { + pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | + X86_PG_RW | X86_PG_V | pg_nx; + } } /* And recursively map PML4 to itself in order to get PTmap */ @@ -1876,7 +1886,8 @@ /* * Account for the virtual addresses mapped by create_pagetables(). 
*/ - virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend); + virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend - + (vm_paddr_t)kernphys); virtual_end = VM_MAX_KERNEL_ADDRESS; /* @@ -2062,6 +2073,19 @@ load_cr4(cr4); } +vm_page_t +pmap_page_alloc_below_4g(bool zeroed) +{ + vm_page_t m; + + m = vm_page_alloc_contig(NULL, 0, (zeroed ? VM_ALLOC_ZERO : 0) | + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if (m != NULL && zeroed && (m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + return (m); +} + extern const char la57_trampoline[], la57_trampoline_gdt_desc[], la57_trampoline_gdt[], la57_trampoline_end[]; @@ -2087,42 +2111,18 @@ r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; r_gdt.rd_base = (long)__pcpu[0].pc_gdt; - m_code = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_code->flags & PG_ZERO) == 0) - pmap_zero_page(m_code); + m_code = pmap_page_alloc_below_4g(true); v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); - m_pml5 = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pml5->flags & PG_ZERO) == 0) - pmap_zero_page(m_pml5); + m_pml5 = pmap_page_alloc_below_4g(true); KPML5phys = VM_PAGE_TO_PHYS(m_pml5); v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); - m_pml4 = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pml4->flags & PG_ZERO) == 0) - pmap_zero_page(m_pml4); + m_pml4 = pmap_page_alloc_below_4g(true); v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); - m_pdp = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pdp->flags & PG_ZERO) == 0) - pmap_zero_page(m_pdp); + m_pdp = pmap_page_alloc_below_4g(true); v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); - m_pd = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pd->flags & PG_ZERO) == 0) - pmap_zero_page(m_pd); + m_pd = pmap_page_alloc_below_4g(true); v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); - m_pt = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pt->flags & PG_ZERO) == 0) - pmap_zero_page(m_pt); + m_pt = pmap_page_alloc_below_4g(true); v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); /* @@ -2425,7 +2425,8 @@ * Collect the page table pages that were replaced by a 2MB * page in create_pagetables(). They are zero filled. 
*/ - if ((vm_paddr_t)i << PDRSHIFT < KERNend && + if ((i == 0 || + kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) && pmap_insert_pt_page(kernel_pmap, mpte, false)) panic("pmap_init: pmap_insert_pt_page failed"); } @@ -6692,7 +6693,9 @@ mpte < &vm_page_array[vm_page_array_size], ("pmap_promote_pde: page table page is out of range")); KASSERT(mpte->pindex == pmap_pde_pindex(va), - ("pmap_promote_pde: page table page's pindex is wrong")); + ("pmap_promote_pde: page table page's pindex is wrong " + "mpte %p pidx %#lx va %#lx va pde pidx %#lx", + mpte, mpte->pindex, va, pmap_pde_pindex(va))); if (pmap_insert_pt_page(pmap, mpte, true)) { counter_u64_add(pmap_pde_p_failures, 1); CTR2(KTR_PMAP, @@ -10763,8 +10766,8 @@ va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); } - pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, - (vm_offset_t)etext, true); + pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext, + true); pti_finalized = true; VM_OBJECT_WUNLOCK(pti_obj); } diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -49,11 +49,10 @@ extern int la57; -/* - * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its - * value is the physical address at which the kernel is loaded. - */ -extern char kernphys[]; +extern vm_paddr_t kernphys; +extern vm_paddr_t KERNend; + +extern bool efi_boot; struct savefpu; struct sysentvec; diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -456,6 +456,10 @@ #define pmap_page_is_write_mapped(m) (((m)->a.flags & PGA_WRITEABLE) != 0) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) +#define pmap_vm_page_alloc_check(m) \ + KASSERT(m->phys_addr < kernphys || m->phys_addr >= KERNend, \ + ("allocating kernel page %p", m)); + struct thread; void pmap_activate_boot(pmap_t pmap); @@ -509,6 +513,7 @@ void pmap_thread_init_invl_gen(struct thread *td); int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap); void pmap_page_array_startup(long count); +vm_page_t pmap_page_alloc_below_4g(bool zeroed); #ifdef KASAN void pmap_kasan_enter(vm_offset_t); diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -39,7 +39,6 @@ void invlop_handler(void); int start_all_aps(void); -void mp_bootaddress(vm_paddr_t *, unsigned int *); #endif /* !LOCORE */ #endif /* SMP */ diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h --- a/sys/amd64/include/vmparam.h +++ b/sys/amd64/include/vmparam.h @@ -151,8 +151,10 @@ #endif /* - * Kernel physical load address. Needs to be aligned at 2MB superpage - * boundary. + * Kernel physical load address for non-UEFI boot and for legacy UEFI loader. + * Newer UEFI loader loads kernel anywhere below 4G, with memory allocated + * by boot services. + * Needs to be aligned at 2MB superpage boundary. */ #ifndef KERNLOAD #define KERNLOAD 0x200000 @@ -192,7 +194,17 @@ #define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0) #define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0) +/* + * Formally kernel mapping starts at KERNBASE, but kernel linker + * script leaves first PDE reserved. For legacy BIOS boot, kernel is + * loaded at KERNLOAD = 2M, and initial kernel page table maps + * physical memory from zero to KERNend starting at KERNBASE. 
+ * + * KERNSTART is where the first actual kernel page is mapped, after + * the compatibility mapping. + */ #define KERNBASE KV4ADDR(KPML4I, KPDPI, 0, 0) +#define KERNSTART (KERNBASE + NBPDR) #define UPT_MAX_ADDRESS KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I) #define UPT_MIN_ADDRESS KV4ADDR(PML4PML4I, 0, 0, 0) diff --git a/sys/conf/ldscript.amd64 b/sys/conf/ldscript.amd64 --- a/sys/conf/ldscript.amd64 +++ b/sys/conf/ldscript.amd64 @@ -5,15 +5,14 @@ SEARCH_DIR("/usr/lib"); SECTIONS { - kernphys = kernload; /* Read-only sections, merged into text segment: */ - . = kernbase + kernphys + SIZEOF_HEADERS; + . = kernbase + kernload + SIZEOF_HEADERS; /* * Use the AT keyword in order to set the right LMA that contains * the physical address where the section should be loaded. This is * needed for the Xen loader which honours the LMA. */ - .interp : AT (kernphys + SIZEOF_HEADERS) { *(.interp) } + .interp : AT (kernload + SIZEOF_HEADERS) { *(.interp) } .hash : { *(.hash) } .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -2416,6 +2416,7 @@ ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); + pmap_vm_page_alloc_check(m); } /* diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c --- a/sys/x86/x86/mp_x86.c +++ b/sys/x86/x86/mp_x86.c @@ -1065,11 +1065,6 @@ } #ifdef __amd64__ - /* - * Enable global pages TLB extension - * This also implicitly flushes the TLB - */ - load_cr4(rcr4() | CR4_PGE); if (pmap_pcid_enabled) load_cr4(rcr4() | CR4_PCIDE); load_ds(_udatasel);
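
A quick usage sketch (not part of the diff itself) for the two loader(8) commands this patch adds in stand/efi/loader/copy.c; the slop value below is an arbitrary example. Run without an argument, each command reports its current setting (the "copy staging: auto" line is the output printed by command_copy_staging() for the default COPY_STAGING_AUTO mode); changing a setting after a kernel has been loaded takes the unload()/efi_copy_free()/efi_copy_init() path above so the staging area is rebuilt under the new constraints.

    OK copy_staging
    copy staging: auto
    OK copy_staging disable
    OK staging_slop 0x1000000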