diff --git a/TODO b/TODO new file mode 100644 --- /dev/null +++ b/TODO @@ -0,0 +1,6 @@ +- Xen +- mark kernel for accepting any kernphys below 4G + +checks: +non-EFI boot +non-amd64 EFI diff --git a/stand/common/bootstrap.h b/stand/common/bootstrap.h --- a/stand/common/bootstrap.h +++ b/stand/common/bootstrap.h @@ -228,6 +228,9 @@ size_t f_size; /* file size */ struct kernel_module *f_modules; /* list of modules if any */ struct preloaded_file *f_next; /* next file */ +#ifdef __amd64__ + bool f_kernphys_relocatable; +#endif }; struct file_format diff --git a/stand/common/load_elf.c b/stand/common/load_elf.c --- a/stand/common/load_elf.c +++ b/stand/common/load_elf.c @@ -207,6 +207,18 @@ #undef CONVERT_SWITCH #undef CONVERT_FIELD + +#ifdef __amd64__ +static bool +is_kernphys_relocatable(elf_file_t ef) +{ + Elf_Sym sym; + + return (__elfN(lookup_symbol)(ef, "kernphys", &sym, STT_OBJECT) == 0 && + sym.st_size == 8); +} +#endif + static int __elfN(load_elf_header)(char *filename, elf_file_t ef) { @@ -434,6 +446,9 @@ /* Load OK, return module pointer */ *result = (struct preloaded_file *)fp; err = 0; +#ifdef __amd64__ + fp->f_kernphys_relocatable = is_kernphys_relocatable(&ef); +#endif goto out; ioerr: diff --git a/stand/efi/loader/arch/amd64/elf64_freebsd.c b/stand/efi/loader/arch/amd64/elf64_freebsd.c --- a/stand/efi/loader/arch/amd64/elf64_freebsd.c +++ b/stand/efi/loader/arch/amd64/elf64_freebsd.c @@ -82,7 +82,12 @@ static pml4_entry_t *PT4; static pdp_entry_t *PT3; +static pdp_entry_t *PT3_l, *PT3_u; static pd_entry_t *PT2; +static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1; + +extern EFI_PHYSICAL_ADDRESS staging; +extern int copy_staging; static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend, uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry); @@ -105,6 +110,14 @@ ACPI_TABLE_RSDP *rsdp; char buf[24]; int revision; + bool copy_auto; + + if (copy_staging == 2) { + copy_auto = true; + copy_staging = fp->f_kernphys_relocatable ? 0 : 1; + } else { + copy_auto = false; + } /* * Report the RSDP to the kernel. While this can be found with @@ -151,57 +164,130 @@ } if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL) - return(EFTYPE); + return (EFTYPE); ehdr = (Elf_Ehdr *)&(md->md_data); - trampcode = (vm_offset_t)0x0000000040000000; + trampcode = copy_staging ? 
(vm_offset_t)0x0000000040000000 /* 1G */ : + (vm_offset_t)0x0000000100000000; /* 4G */; err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1, (EFI_PHYSICAL_ADDRESS *)&trampcode); + if (EFI_ERROR(err)) { + printf("Unable to allocate trampoline\n"); + if (copy_auto) + copy_staging = 2; + return (ENOMEM); + } bzero((void *)trampcode, EFI_PAGE_SIZE); trampstack = trampcode + EFI_PAGE_SIZE - 8; bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size); trampoline = (void *)trampcode; - PT4 = (pml4_entry_t *)0x0000000040000000; - err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, - (EFI_PHYSICAL_ADDRESS *)&PT4); - bzero(PT4, 3 * EFI_PAGE_SIZE); + if (copy_staging) { + PT4 = (pml4_entry_t *)0x0000000040000000; + err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, + (EFI_PHYSICAL_ADDRESS *)&PT4); + if (EFI_ERROR(err)) { + printf("Unable to allocate trampoline page table\n"); + BS->FreePages(trampcode, 1); + if (copy_auto) + copy_staging = 2; + return (ENOMEM); + } + bzero(PT4, 3 * EFI_PAGE_SIZE); + PT3 = &PT4[512]; + PT2 = &PT3[512]; + + /* + * This is kinda brutal, but every single 1GB VM + * memory segment points to the same first 1GB of + * physical memory. But it is more than adequate. + */ + for (i = 0; i < NPTEPG; i++) { + /* + * Each slot of the L4 pages points to the + * same L3 page. + */ + PT4[i] = (pml4_entry_t)PT3; + PT4[i] |= PG_V | PG_RW; + + /* + * Each slot of the L3 pages points to the + * same L2 page. + */ + PT3[i] = (pdp_entry_t)PT2; + PT3[i] |= PG_V | PG_RW; + + /* + * The L2 page slots are mapped with 2MB pages for 1GB. + */ + PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024); + PT2[i] |= PG_V | PG_RW | PG_PS; + } + } else { + PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */ + err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9, + (EFI_PHYSICAL_ADDRESS *)&PT4); + if (EFI_ERROR(err)) { + printf("Unable to allocate trampoline page table\n"); + BS->FreePages(trampcode, 9); + if (copy_auto) + copy_staging = 2; + return (ENOMEM); + } - PT3 = &PT4[512]; - PT2 = &PT3[512]; + bzero(PT4, 9 * EFI_PAGE_SIZE); + + PT3_l = &PT4[NPML4EPG * 1]; + PT3_u = &PT4[NPML4EPG * 2]; + PT2_l0 = &PT4[NPML4EPG * 3]; + PT2_l1 = &PT4[NPML4EPG * 4]; + PT2_l2 = &PT4[NPML4EPG * 5]; + PT2_l3 = &PT4[NPML4EPG * 6]; + PT2_u0 = &PT4[NPML4EPG * 7]; + PT2_u1 = &PT4[NPML4EPG * 8]; + + /* 1:1 mapping of lower 4G */ + PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW; + PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW; + PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW; + PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW; + PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW; + for (i = 0; i < 4 * NPDEPG; i++) { + PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | + PG_RW | PG_PS; + } - /* - * This is kinda brutal, but every single 1GB VM memory segment points - * to the same first 1GB of physical memory. But it is more than - * adequate. - */ - for (i = 0; i < 512; i++) { - /* Each slot of the L4 pages points to the same L3 page. */ - PT4[i] = (pml4_entry_t)PT3; - PT4[i] |= PG_V | PG_RW; - - /* Each slot of the L3 pages points to the same L2 page. */ - PT3[i] = (pdp_entry_t)PT2; - PT3[i] |= PG_V | PG_RW; - - /* The L2 page slots are mapped with 2MB pages for 1GB. 
*/ - PT2[i] = i * (2 * 1024 * 1024); - PT2[i] |= PG_V | PG_RW | PG_PS; + /* mapping of kernel 2G below top */ + PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW; + PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW; + PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW; + /* compat mapping of phys @0 */ + PT2_u0[0] = PG_PS | PG_V | PG_RW; + /* this maps past staging area */ + for (i = 1; i < 2 * NPDEPG; i++) { + PT2_u0[i] = ((pd_entry_t)staging + + ((pd_entry_t)i - 1) * NBPDR) | + PG_V | PG_RW | PG_PS; + } } + printf("staging %#lx (%scoping) tramp %p PT4 %p\n", + staging, copy_staging ? "" : "not ", trampoline, PT4); printf("Start @ 0x%lx ...\n", ehdr->e_entry); efi_time_fini(); err = bi_load(fp->f_args, &modulep, &kernend, true); if (err != 0) { efi_time_init(); - return(err); + if (copy_auto) + copy_staging = 2; + return (err); } dev_cleanup(); - trampoline(trampstack, efi_copy_finish, kernend, modulep, PT4, - ehdr->e_entry); + trampoline(trampstack, copy_staging ? efi_copy_finish : + efi_copy_finish_nop, kernend, modulep, PT4, ehdr->e_entry); panic("exec returned"); } diff --git a/stand/efi/loader/bootinfo.c b/stand/efi/loader/bootinfo.c --- a/stand/efi/loader/bootinfo.c +++ b/stand/efi/loader/bootinfo.c @@ -65,6 +65,8 @@ extern EFI_SYSTEM_TABLE *ST; +int boot_services_gone; + static int bi_getboothowto(char *kargs) { @@ -396,8 +398,10 @@ if (!exit_bs) break; status = BS->ExitBootServices(IH, efi_mapkey); - if (!EFI_ERROR(status)) + if (!EFI_ERROR(status)) { + boot_services_gone = 1; break; + } } if (retry == 0) { diff --git a/stand/efi/loader/copy.c b/stand/efi/loader/copy.c --- a/stand/efi/loader/copy.c +++ b/stand/efi/loader/copy.c @@ -39,6 +39,12 @@ #include "loader_efi.h" +#define M2 (2 * 1024 * 1024) +#define G1 (1 * 1024 * 1024 * 1024) +#define G4 (4UL * G1) + +extern int boot_services_gone; + #if defined(__i386__) || defined(__amd64__) #include #include @@ -181,15 +187,93 @@ #endif #endif +#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \ + defined(__riscv) +#define EFI_STAGING_2M_ALIGN 1 +#else +#define EFI_STAGING_2M_ALIGN 0 +#endif + EFI_PHYSICAL_ADDRESS staging, staging_end, staging_base; int stage_offset_set = 0; ssize_t stage_offset; +#ifdef __amd64__ +int copy_staging = 2; + +static void +efi_copy_free(void) +{ + BS->FreePages(staging_base, (staging_end - staging_base) / + EFI_PAGE_SIZE); + stage_offset_set = 0; + stage_offset = 0; +} + +static int +command_copy_staging(int argc, char *argv[]) +{ + static const char *const mode[3] = { + [0] = "enable", + [1] = "disable", + [2] = "auto", + }; + int prev, res; + + res = CMD_OK; + if (argc > 2) { + res = CMD_ERROR; + } else if (argc == 2) { + prev = copy_staging; + if (strcmp(argv[1], "enable") == 0) + copy_staging = 1; + else if (strcmp(argv[1], "disable") == 0) + copy_staging = 0; + else if (strcmp(argv[1], "auto") == 0) + copy_staging = 2; + else { + printf("usage: copy_staging enable|disable\n"); + res = CMD_ERROR; + } + if (res == CMD_OK && prev != copy_staging) { + unload(); + efi_copy_free(); + efi_copy_init(); + } + } else { + printf("copy staging: %s\n", mode[copy_staging]); + } + return (res); +} +COMMAND_SET(copy_staging, "copy_staging", "copy staging", command_copy_staging); +#endif + +#if defined(__i386__) || defined(__amd64__) +/* + * The staging area must reside in the the first 1GB or 4GB physical + * memory: see elf64_exec() in + * boot/efi/loader/arch/amd64/elf64_freebsd.c. 
+ */ +static EFI_PHYSICAL_ADDRESS +get_staging_max(void) +{ + EFI_PHYSICAL_ADDRESS res; + +#if defined(__i386__) + res = G1; +#elif defined(__amd64__) + res = copy_staging ? G1 : G4; +#endif + return (res); +} +#define EFI_ALLOC_METHOD AllocateMaxAddress +#else +#define EFI_ALLOC_METHOD AllocateAnyPages +#endif int efi_copy_init(void) { EFI_STATUS status; - unsigned long nr_pages; nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE) * 1024 * 1024); @@ -203,18 +287,10 @@ if (running_on_hyperv()) efi_verify_staging_size(&nr_pages); - /* - * The staging area must reside in the the first 1GB physical - * memory: see elf64_exec() in - * boot/efi/loader/arch/amd64/elf64_freebsd.c. - */ - staging = 1024*1024*1024; - status = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, - nr_pages, &staging); -#else - status = BS->AllocatePages(AllocateAnyPages, EfiLoaderData, - nr_pages, &staging); + staging = get_staging_max(); #endif + status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData, + nr_pages, &staging); if (EFI_ERROR(status)) { printf("failed to allocate staging area: %lu\n", EFI_ERROR_CODE(status)); @@ -223,7 +299,7 @@ staging_base = staging; staging_end = staging + nr_pages * EFI_PAGE_SIZE; -#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +#if EFI_STAGING_2M_ALIGN /* * Round the kernel load address to a 2MiB value. This is needed * because the kernel builds a page table based on where it has @@ -231,7 +307,7 @@ * either a 1MiB or 2MiB page for this we need to make sure it * is correctly aligned for both cases. */ - staging = roundup2(staging, 2 * 1024 * 1024); + staging = roundup2(staging, M2); #endif return (0); @@ -240,20 +316,42 @@ static bool efi_check_space(vm_offset_t end) { - EFI_PHYSICAL_ADDRESS addr; + EFI_PHYSICAL_ADDRESS addr, new_base, new_staging; EFI_STATUS status; unsigned long nr_pages; + end = roundup2(end, EFI_PAGE_SIZE); + /* There is already enough space */ - if (end <= staging_end) + if (end + M2 <= staging_end) return (true); - end = roundup2(end, EFI_PAGE_SIZE); - nr_pages = EFI_SIZE_TO_PAGES(end - staging_end); + if (boot_services_gone) { + if (end <= staging_end) + return (true); + panic("efi_check_space: cannot expand staging area " + "after boot services were exited\n"); + } + + /* + * Add a slop at the end: + * 1. amd64 kernel expects to do some very early allocations + * by carving out memory after kernend. Slop guarantees + * that it does not override anything useful. + * 2. It seems that initial calculation of the staging size + * could be somewhat smaller than actually copying in after + * boot services are exited. Slop avoids calling + * BS->AllocatePages() when it cannot work. + */ + end += M2; + nr_pages = EFI_SIZE_TO_PAGES(end - staging_end); #if defined(__i386__) || defined(__amd64__) - /* X86 needs all memory to be allocated under the 1G boundary */ - if (end > 1024*1024*1024) + /* + * i386 needs all memory to be allocated under the 1G boundary. + * amd64 needs all memory to be allocated under the 1G or 4G boundary. 
+ */ + if (end > get_staging_max()) goto before_staging; #endif @@ -268,14 +366,12 @@ before_staging: /* Try allocating space before the previous allocation */ - if (staging < nr_pages * EFI_PAGE_SIZE) { - printf("Not enough space before allocation\n"); - return (false); - } + if (staging < nr_pages * EFI_PAGE_SIZE) + goto expand; addr = staging - nr_pages * EFI_PAGE_SIZE; -#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +#if EFI_STAGING_2M_ALIGN /* See efi_copy_init for why this is needed */ - addr = rounddown2(addr, 2 * 1024 * 1024); + addr = rounddown2(addr, M2); #endif nr_pages = EFI_SIZE_TO_PAGES(staging_base - addr); status = BS->AllocatePages(AllocateAddress, EfiLoaderData, nr_pages, @@ -288,11 +384,42 @@ staging_base = addr; memmove((void *)(uintptr_t)staging_base, (void *)(uintptr_t)staging, staging_end - staging); - stage_offset -= (staging - staging_base); + stage_offset -= staging - staging_base; staging = staging_base; return (true); } +expand: + nr_pages = EFI_SIZE_TO_PAGES(end - (vm_offset_t)staging); +#if EFI_STAGING_2M_ALIGN + nr_pages += M2 / EFI_PAGE_SIZE; +#endif +#if defined(__i386__) || defined(__amd64__) + new_base = get_staging_max(); +#endif + status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData, + nr_pages, &new_base); + if (!EFI_ERROR(status)) { +#if EFI_STAGING_2M_ALIGN + new_staging = roundup2(new_base, M2); +#else + new_staging = new_base; +#endif + /* + * Move the old allocation and update the state so + * translation still works. + */ + memcpy((void *)(uintptr_t)new_staging, + (void *)(uintptr_t)staging, staging_end - staging); + BS->FreePages(staging_base, (staging_end - staging_base) / + EFI_PAGE_SIZE); + stage_offset -= staging - new_staging; + staging = new_staging; + staging_end = new_base + nr_pages * EFI_PAGE_SIZE; + staging_base = new_base; + return (true); + } + printf("efi_check_space: Unable to expand staging area\n"); return (false); } @@ -335,7 +462,6 @@ return (len); } - ssize_t efi_readin(readin_handle_t fd, vm_offset_t dest, const size_t len) { @@ -364,3 +490,8 @@ while (src < last) *dst++ = *src++; } + +void +efi_copy_finish_nop(void) +{ +} diff --git a/stand/efi/loader/loader_efi.h b/stand/efi/loader/loader_efi.h --- a/stand/efi/loader/loader_efi.h +++ b/stand/efi/loader/loader_efi.h @@ -44,5 +44,6 @@ void * efi_translate(vm_offset_t ptr); void efi_copy_finish(void); +void efi_copy_finish_nop(void); #endif /* _LOADER_EFI_COPY_H_ */ diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1277,7 +1277,7 @@ * in real mode mode (e.g. SMP bare metal). */ #ifdef SMP - mp_bootaddress(physmap, &physmap_idx); + alloc_ap_trampoline(physmap, &physmap_idx); #endif /* call pmap initialization to make new kernel address space */ @@ -1598,7 +1598,10 @@ int gsel_tss, x; struct pcpu *pc; struct xstate_hdr *xhdr; - u_int64_t rsp0; + uint64_t cr3, rsp0; + pml4_entry_t *pml4e; + pdp_entry_t *pdpe; + pd_entry_t *pde; char *env; struct user_segment_descriptor *gdt; struct region_descriptor r_gdt; @@ -1608,6 +1611,35 @@ TSRAW(&thread0, TS_ENTER, __func__, NULL); + /* + * Calculate kernphys by inspecting page table created by loader. + * The assumptions: + * - kernel is mapped at KERNBASE, backed by contiguous phys memory + * aligned at 2M, below 4G (the latter is important for AP startup) + * - there is a 2M hole at KERNBASE + * - kernel is mapped with 2M superpages + * - all participating memory, i.e. 
kernel, modules, metadata, + * page table is accessible by pre-created 1:1 mapping + * (right now loader creates 1:1 mapping for lower 4G, and all + * memory is from there) + * - there is a usable memory block right after the end of the + * mapped kernel and all modules/metadata, pointed to by + * physfree, for early allocations + */ + cr3 = rcr3(); + pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index( + (vm_offset_t)hammer_time); + pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index( + (vm_offset_t)hammer_time); + pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index( + (vm_offset_t)hammer_time); + kernphys = (vm_paddr_t)(*pde & ~PDRMASK) - + (((vm_paddr_t)hammer_time - KERNBASE) & ~PDRMASK); + + /* Fix-up for 2M hole */ + physfree += kernphys; + kernphys += NBPDR; + kmdp = init_ops.parse_preload_data(modulep); efi_boot = preload_search_info(kmdp, MODINFO_METADATA | @@ -1653,7 +1685,7 @@ /* Init basic tunables, hz etc */ init_param1(); - thread0.td_kstack = physfree + KERNBASE; + thread0.td_kstack = physfree - kernphys + KERNBASE + NBPDR; thread0.td_kstack_pages = kstack_pages; kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE; bzero((void *)thread0.td_kstack, kstack0_sz); @@ -1690,7 +1722,7 @@ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ - dpcpu_init((void *)(physfree + KERNBASE), 0); + dpcpu_init((void *)(physfree - kernphys + KERNBASE + NBPDR), 0); physfree += DPCPU_SIZE; amd64_bsp_pcpu_init1(pc); /* Non-late cninit() and printf() can be moved up to here. */ diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c --- a/sys/amd64/amd64/mp_machdep.c +++ b/sys/amd64/amd64/mp_machdep.c @@ -105,6 +105,7 @@ static char *dbg_stack; extern u_int mptramp_la57; +extern u_int mptramp_nx; /* * Local data and functions. @@ -112,86 +113,6 @@ static int start_ap(int apic_id); -static bool -is_kernel_paddr(vm_paddr_t pa) -{ - - return (pa >= trunc_2mpage(btext - KERNBASE) && - pa < round_page(_end - KERNBASE)); -} - -static bool -is_mpboot_good(vm_paddr_t start, vm_paddr_t end) -{ - - return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem); -} - -/* - * Calculate usable address in base memory for AP trampoline code. - */ -void -mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx) -{ - vm_paddr_t start, end; - unsigned int i; - bool allocated; - - alloc_ap_trampoline(physmap, physmap_idx); - - /* - * Find a memory region big enough below the 4GB boundary to - * store the initial page tables. Region must be mapped by - * the direct map. - * - * Note that it needs to be aligned to a page boundary. - */ - allocated = false; - for (i = *physmap_idx; i <= *physmap_idx; i -= 2) { - /* - * First, try to chomp at the start of the physmap region. - * Kernel binary might claim it already. - */ - start = round_page(physmap[i]); - end = start + AP_BOOTPT_SZ; - if (start < end && end <= physmap[i + 1] && - is_mpboot_good(start, end) && - !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) { - allocated = true; - physmap[i] = end; - break; - } - - /* - * Second, try to chomp at the end. Again, check - * against kernel. 
- */ - end = trunc_page(physmap[i + 1]); - start = end - AP_BOOTPT_SZ; - if (start < end && start >= physmap[i] && - is_mpboot_good(start, end) && - !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) { - allocated = true; - physmap[i + 1] = start; - break; - } - } - if (allocated) { - mptramp_pagetables = start; - if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) { - memmove(&physmap[i], &physmap[i + 2], - sizeof(*physmap) * (*physmap_idx - i + 2)); - *physmap_idx -= 2; - } - } else { - mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ; - if (bootverbose) - printf( -"Cannot find enough space for the initial AP page tables, placing them at %#x", - mptramp_pagetables); - } -} - /* * Initialize the IPI handlers and start up the AP's. */ @@ -243,6 +164,9 @@ assign_cpu_ids(); mptramp_la57 = la57; + mptramp_nx = pg_nx != 0; + MPASS(kernel_pmap->pm_cr3 < (1UL << 32)); + mptramp_pagetables = kernel_pmap->pm_cr3; /* Start each Application Processor */ start_all_aps(); @@ -399,55 +323,67 @@ int start_all_aps(void) { - u_int64_t *pt5, *pt4, *pt3, *pt2; + vm_page_t m_pml4, m_pdp, m_pd[4]; + pml5_entry_t old_pml45; + pml4_entry_t *v_pml4; + pdp_entry_t *v_pdp; + pd_entry_t *v_pd; u_int32_t mpbioswarmvec; - int apic_id, cpu, domain, i, xo; + int apic_id, cpu, domain, i; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); - /* copy the AP 1st level boot code */ - bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); - - /* Locate the page tables, they'll be below the trampoline */ + /* Create a transient 1:1 mapping of low 4G */ if (la57) { - pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); - xo = 1; + m_pml4 = pmap_page_alloc_below_4g(true); + v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); } else { - xo = 0; + v_pml4 = &kernel_pmap->pm_pmltop[0]; } - pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE); - pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); - pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); - - /* Create the initial 1GB replicated page tables */ - for (i = 0; i < 512; i++) { - if (la57) { - pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + - PAGE_SIZE); - pt5[i] |= PG_V | PG_RW | PG_U; - } - - /* - * Each slot of the level 4 pages points to the same - * level 3 page. - */ - pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + - (xo + 1) * PAGE_SIZE); - pt4[i] |= PG_V | PG_RW | PG_U; - - /* - * Each slot of the level 3 pages points to the same - * level 2 page. - */ - pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + - ((xo + 2) * PAGE_SIZE)); - pt3[i] |= PG_V | PG_RW | PG_U; - - /* The level 2 page slots are mapped with 2MB pages for 1GB. 
*/ - pt2[i] = i * (2 * 1024 * 1024); - pt2[i] |= PG_V | PG_RW | PG_PS | PG_U; + m_pdp = pmap_page_alloc_below_4g(true); + v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); + m_pd[0] = pmap_page_alloc_below_4g(false); + v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0])); + for (i = 0; i < NPDEPG; i++) + v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A | + X86_PG_M | PG_PS; + m_pd[1] = pmap_page_alloc_below_4g(false); + v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1])); + for (i = 0; i < NPDEPG; i++) + v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW | + X86_PG_A | X86_PG_M | PG_PS; + m_pd[2] = pmap_page_alloc_below_4g(false); + v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2])); + for (i = 0; i < NPDEPG; i++) + v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS; + m_pd[3] = pmap_page_alloc_below_4g(false); + v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3])); + for (i = 0; i < NPDEPG; i++) + v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS; + v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + old_pml45 = kernel_pmap->pm_pmltop[0]; + if (la57) { + kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) | + X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M; } + v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M; + pmap_invalidate_all(kernel_pmap); + + /* copy the AP 1st level boot code */ + bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); + if (bootverbose) + printf("AP boot address %#x\n", boot_address); /* save the current value of the warm-start vector */ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF); @@ -512,6 +448,17 @@ outb(CMOS_REG, BIOS_RESET); outb(CMOS_DATA, mpbiosreason); + /* Destroy transient 1:1 mapping */ + kernel_pmap->pm_pmltop[0] = old_pml45; + invlpg(0); + if (la57) + vm_page_free(m_pml4); + vm_page_free(m_pd[3]); + vm_page_free(m_pd[2]); + vm_page_free(m_pd[1]); + vm_page_free(m_pd[0]); + vm_page_free(m_pdp); + /* number of APs actually started */ return (mp_naps); } diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S --- a/sys/amd64/amd64/mpboot.S +++ b/sys/amd64/amd64/mpboot.S @@ -95,12 +95,25 @@ * is later enabled. */ mov %cr4, %eax - orl $CR4_PAE, %eax + orl $(CR4_PAE | CR4_PGE), %eax cmpb $0, mptramp_la57-mptramp_start(%ebx) je 1f orl $CR4_LA57, %eax 1: mov %eax, %cr4 + /* + * If the BSP reported NXE support, enable EFER.NXE for all APs + * prior to loading %cr3. This avoids page faults if the AP + * encounters memory marked with the NX bit prior to detecting and + * enabling NXE support. + */ + cmpb $0,mptramp_nx-mptramp_start(%ebx) + je 2f + movl $MSR_EFER, %ecx + rdmsr + orl $EFER_NXE, %eax + wrmsr +2: /* * Enable EFER.LME so that we get long mode when all the prereqs are * in place. In this case, it turns on when CR0_PG is finally enabled. @@ -112,12 +125,13 @@ wrmsr /* - * Point to the embedded page tables for startup. Note that this - * only gets accessed after we're actually in 64 bit mode, however - * we can only set the bottom 32 bits of %cr3 in this state. This - * means we are required to use a temporary page table that is below - * the 4GB limit. 
%ebx is still our relocation base. We could just - * subtract 3 * PAGE_SIZE, but that would be too easy. + * Load kernel page table pointer into %cr3. + * %ebx is still our relocation base. + * + * Note that this only gets accessed after we're actually in 64 bit + * mode, however we can only set the bottom 32 bits of %cr3 in this + * state. This means we depend on the kernel page table being + * allocated from the low 4G. */ leal mptramp_pagetables-mptramp_start(%ebx),%eax movl (%eax), %eax @@ -155,10 +169,8 @@ /* * Yeehar! We're running in 64 bit mode! We can mostly ignore our * segment registers, and get on with it. - * Note that we are running at the correct virtual address, but with - * a 1:1 1GB mirrored mapping over entire address space. We had better - * switch to a real %cr3 promptly so that we can get to the direct map - * space. Remember that jmp is relative and that we've been relocated, + * We are running at the correct virtual address space. + * Note that the jmp is relative and that we've been relocated, * so use an indirect jump. */ .code64 @@ -220,6 +232,10 @@ mptramp_la57: .long 0 + .globl mptramp_nx +mptramp_nx: + .long 0 + /* * The pseudo descriptor for lgdt to use. */ @@ -243,31 +259,5 @@ .code64 .p2align 4,0 entry_64: - /* - * If the BSP reported NXE support, enable EFER.NXE for all APs - * prior to loading %cr3. This avoids page faults if the AP - * encounters memory marked with the NX bit prior to detecting and - * enabling NXE support. - */ - movq pg_nx, %rbx - testq %rbx, %rbx - je 1f - movl $MSR_EFER, %ecx - rdmsr - orl $EFER_NXE, %eax - wrmsr - -1: - /* - * Load a real %cr3 that has all the direct map stuff and switches - * off the 1GB replicated mirror. Load a stack pointer and jump - * into AP startup code in C. - */ - cmpl $0, la57 - jne 2f - movq KPML4phys, %rax - jmp 3f -2: movq KPML5phys, %rax -3: movq %rax, %cr3 movq bootSTK, %rsp jmp init_secondary diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -436,7 +436,8 @@ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ static int ndmpdpphys; /* number of DMPDPphys pages */ -static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */ +vm_paddr_t kernphys; /* phys addr of start of bootstrap data */ +vm_paddr_t KERNend; /* and the end */ /* * pmap_mapdev support pre initialization (i.e. console) @@ -1554,7 +1555,7 @@ #ifdef NKPT pt_pages = NKPT; #else - pt_pages = howmany(addr, NBPDR); + pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */ pt_pages += NKPDPE(pt_pages); /* @@ -1594,6 +1595,9 @@ static inline pt_entry_t bootaddr_rwx(vm_paddr_t pa) { + vm_paddr_t fixed_kernphys; + + fixed_kernphys = kernphys - NBPDR; /* KERNBASE is off by NBPDR */ /* * The kernel is loaded at a 2MB-aligned address, and memory below that @@ -1602,8 +1606,8 @@ * either. Preloaded kernel modules have their mapping permissions * fixed up by the linker. */ - if (pa < trunc_2mpage(btext - KERNBASE) || - pa >= trunc_2mpage(_end - KERNBASE)) + if (pa < trunc_2mpage(fixed_kernphys + btext - KERNBASE) || + pa >= trunc_2mpage(fixed_kernphys + _end - KERNBASE)) return (X86_PG_RW | pg_nx); /* @@ -1612,7 +1616,7 @@ * impact read-only data. However, in any case, any page with * read-write data needs to be read-write. 
*/ - if (pa >= trunc_2mpage(brwsection - KERNBASE)) + if (pa >= trunc_2mpage(fixed_kernphys + brwsection - KERNBASE)) return (X86_PG_RW | pg_nx); /* @@ -1624,7 +1628,7 @@ * Note that fixups to the .text section will still work until we * set CR0.WP. */ - if (pa < round_2mpage(etext - KERNBASE)) + if (pa < round_2mpage(fixed_kernphys + etext - KERNBASE)) return (0); return (pg_nx); } @@ -1636,6 +1640,7 @@ pdp_entry_t *pdp_p; pml4_entry_t *p4_p; uint64_t DMPDkernphys; + vm_paddr_t pax; #ifdef KASAN pt_entry_t *pt_p; uint64_t KASANPDphys, KASANPTphys, KASANphys; @@ -1670,9 +1675,11 @@ /* * Allocate 2M pages for the kernel. These will be used in - * place of the first one or more 1G pages from ndm1g. + * place of the one or more 1G pages from ndm1g that maps + * kernel memory into DMAP. */ - nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP); + nkdmpde = howmany((vm_offset_t)brwsection - KERNBASE - NBPDR + + kernphys - rounddown2(kernphys, NBPDP), NBPDP); DMPDkernphys = allocpages(firstaddr, nkdmpde); } if (ndm1g < ndmpdp) @@ -1719,14 +1726,18 @@ pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V; /* - * Map from physical address zero to the end of loader preallocated - * memory using 2MB pages. This replaces some of the PD entries - * created above. + * Map from start of the kernel in physical memory (staging + * area) to the end of loader preallocated memory using 2MB + * pages. This replaces some of the PD entries created above. + * For compatibility, identity map 2M at the start. */ - for (i = 0; (i << PDRSHIFT) < KERNend; i++) + pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A | + X86_PG_RW | pg_nx; + for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) { /* Preset PG_M and PG_A because demotion expects it. */ - pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | - X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT); + pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | + X86_PG_A | bootaddr_rwx(pax); + } /* * Because we map the physical blocks in 2M pages, adjust firstaddr @@ -1792,15 +1803,18 @@ * use 2M pages with read-only and no-execute permissions. (If using 1G * pages, this will partially overwrite the PDPEs above.) */ - if (ndm1g) { + if (ndm1g > 0) { pd_p = (pd_entry_t *)DMPDkernphys; - for (i = 0; i < (NPDEPG * nkdmpde); i++) - pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g | - X86_PG_M | X86_PG_A | pg_nx | - bootaddr_rwx(i << PDRSHIFT); - for (i = 0; i < nkdmpde; i++) - pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW | - X86_PG_V | pg_nx; + for (i = 0, pax = rounddown2(kernphys, NBPDP); + i < NPDEPG * nkdmpde; i++, pax += NBPDR) { + pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M | + X86_PG_A | pg_nx | bootaddr_rwx(pax); + } + j = rounddown2(kernphys, NBPDP) >> PDPSHIFT; + for (i = 0; i < nkdmpde; i++) { + pdp_p[i + j] = (DMPDkernphys + ptoa(i)) | + X86_PG_RW | X86_PG_V | pg_nx; + } } /* And recursively map PML4 to itself in order to get PTmap */ @@ -1876,7 +1890,8 @@ /* * Account for the virtual addresses mapped by create_pagetables(). */ - virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend); + virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend - + (vm_paddr_t)kernphys + NBPDR); virtual_end = VM_MAX_KERNEL_ADDRESS; /* @@ -2062,6 +2077,19 @@ load_cr4(cr4); } +vm_page_t +pmap_page_alloc_below_4g(bool zeroed) +{ + vm_page_t m; + + m = vm_page_alloc_contig(NULL, 0, (zeroed ? 
VM_ALLOC_ZERO : 0) | + VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ, + 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); + if (m != NULL && (m->flags & PG_ZERO) == 0) + pmap_zero_page(m); + return (m); +} + extern const char la57_trampoline[], la57_trampoline_gdt_desc[], la57_trampoline_gdt[], la57_trampoline_end[]; @@ -2087,42 +2115,18 @@ r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1; r_gdt.rd_base = (long)__pcpu[0].pc_gdt; - m_code = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_code->flags & PG_ZERO) == 0) - pmap_zero_page(m_code); + m_code = pmap_page_alloc_below_4g(true); v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code)); - m_pml5 = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pml5->flags & PG_ZERO) == 0) - pmap_zero_page(m_pml5); + m_pml5 = pmap_page_alloc_below_4g(true); KPML5phys = VM_PAGE_TO_PHYS(m_pml5); v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys); - m_pml4 = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pml4->flags & PG_ZERO) == 0) - pmap_zero_page(m_pml4); + m_pml4 = pmap_page_alloc_below_4g(true); v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4)); - m_pdp = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pdp->flags & PG_ZERO) == 0) - pmap_zero_page(m_pdp); + m_pdp = pmap_page_alloc_below_4g(true); v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp)); - m_pd = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pd->flags & PG_ZERO) == 0) - pmap_zero_page(m_pd); + m_pd = pmap_page_alloc_below_4g(true); v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd)); - m_pt = vm_page_alloc_contig(NULL, 0, - VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ, - 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT); - if ((m_pt->flags & PG_ZERO) == 0) - pmap_zero_page(m_pt); + m_pt = pmap_page_alloc_below_4g(true); v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt)); /* @@ -10763,7 +10767,7 @@ va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu); pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false); } - pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE, + pmap_pti_add_kva_locked((vm_offset_t)KERNBASE + NBPDR, (vm_offset_t)etext, true); pti_finalized = true; VM_OBJECT_WUNLOCK(pti_obj); diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h --- a/sys/amd64/include/md_var.h +++ b/sys/amd64/include/md_var.h @@ -49,11 +49,8 @@ extern int la57; -/* - * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its - * value is the physical address at which the kernel is loaded. 
- */ -extern char kernphys[]; +extern vm_paddr_t kernphys; +extern vm_paddr_t KERNend; struct savefpu; struct sysentvec; diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h --- a/sys/amd64/include/pmap.h +++ b/sys/amd64/include/pmap.h @@ -456,6 +456,10 @@ #define pmap_page_is_write_mapped(m) (((m)->a.flags & PGA_WRITEABLE) != 0) #define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) +#define pmap_vm_page_alloc_check(m) \ + KASSERT(m->phys_addr < kernphys || m->phys_addr >= KERNend, \ + ("allocating kernel page %p", m)); + struct thread; void pmap_activate_boot(pmap_t pmap); @@ -509,6 +513,7 @@ void pmap_thread_init_invl_gen(struct thread *td); int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap); void pmap_page_array_startup(long count); +vm_page_t pmap_page_alloc_below_4g(bool zeroed); #ifdef KASAN void pmap_kasan_enter(vm_offset_t); diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h --- a/sys/amd64/include/smp.h +++ b/sys/amd64/include/smp.h @@ -39,7 +39,6 @@ void invlop_handler(void); int start_all_aps(void); -void mp_bootaddress(vm_paddr_t *, unsigned int *); #endif /* !LOCORE */ #endif /* SMP */ diff --git a/sys/conf/ldscript.amd64 b/sys/conf/ldscript.amd64 --- a/sys/conf/ldscript.amd64 +++ b/sys/conf/ldscript.amd64 @@ -5,15 +5,14 @@ SEARCH_DIR("/usr/lib"); SECTIONS { - kernphys = kernload; /* Read-only sections, merged into text segment: */ - . = kernbase + kernphys + SIZEOF_HEADERS; + . = kernbase + kernload + SIZEOF_HEADERS; /* * Use the AT keyword in order to set the right LMA that contains * the physical address where the section should be loaded. This is * needed for the Xen loader which honours the LMA. */ - .interp : AT (kernphys + SIZEOF_HEADERS) { *(.interp) } + .interp : AT (kernload + SIZEOF_HEADERS) { *(.interp) } .hash : { *(.hash) } .gnu.hash : { *(.gnu.hash) } .dynsym : { *(.dynsym) } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -2416,6 +2416,7 @@ ("page %p has unexpected memattr %d", m, pmap_page_get_memattr(m))); KASSERT(m->valid == 0, ("free page %p is valid", m)); + pmap_vm_page_alloc_check(m); } /*
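Notes:

The loader decides whether a kernel may be left in place by looking at its
"kernphys" symbol: the patched kernel defines kernphys as a real 8-byte
vm_paddr_t object in pmap.c, while older kernels only get the symbol
assigned in ldscript.amd64, which carries no object size, so the
STT_OBJECT / st_size == 8 test in is_kernphys_relocatable() cleanly
separates the two. A minimal sketch of the convention the check relies on;
only the declarations below come from this patch, the rest is commentary:

    /* New kernels (sys/amd64/amd64/pmap.c): a real 8-byte object. */
    vm_paddr_t kernphys;        /* phys addr of start of bootstrap data */

    /*
     * Old kernels only had the linker-script assignment
     *      kernphys = kernload;
     * which the ELF symbol table reports with st_size == 0, so the
     * loader's check fails and it falls back to the copying
     * (copy_staging enable) behaviour.
     */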
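copy_staging is a new loader(8) command wired to the same decision: "auto"
(the default) lets elf64_exec() choose based on whether the loaded kernel
advertises a relocatable kernphys, "enable" forces the historical behaviour
(staging area kept below 1G and copied down by efi_copy_finish() from the
trampoline), and "disable" runs the kernel straight out of a staging area
allocated below 4G. Switching modes unloads anything already loaded and
reallocates the staging area, because the allocation ceiling changes. A
hypothetical session at the loader prompt:

    OK copy_staging
    copy staging: auto
    OK copy_staging disable
    OK load /boot/kernel/kernel
    OK boot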
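When the kernel is left in place, elf64_exec() now builds nine page-table
pages below 4G instead of the old three. Roughly, under the assumptions the
patch states (2M mappings, staging area 2M-aligned and below 4G), the
trampoline's address space looks like this sketch:

    virtual                         physical
    0 .. 4G                         0 .. 4G             1:1 (PT3_l, PT2_l0..PT2_l3)
    KERNBASE .. KERNBASE + 2M       0 .. 2M             compat mapping (PT2_u0[0])
    KERNBASE + 2M .. end of VA      staging ..          kernel image and metadata
                                    staging + 2G - 2M   (PT2_u0[1..], PT2_u1)

This is what the kernel-side comment in hammer_time() means by "a 2M hole
at KERNBASE": the virtual 2M page at KERNBASE is the compat mapping of
physical 0, and the kernel text proper starts at KERNBASE + 2M, backed by
the staging area wherever the firmware happened to place it.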
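A worked example of the early page-table walk in hammer_time(), with a
hypothetical load address; the point is that the PDE backing any kernel
text address records where the loader staged the kernel, and the 2M compat
hole is then added back:

    /*
     * Say the staging area ended up at physical 0x80200000 (2M-aligned,
     * below 4G), so VA KERNBASE + 2M maps to 0x80200000.  For hammer_time
     * at, e.g., KERNBASE + 0x201234 (any kernel text address works, the
     * 2M-aligned parts cancel):
     *
     *      *pde & ~PDRMASK                      == 0x80200000
     *      (hammer_time - KERNBASE) & ~PDRMASK  == 0x00200000
     *      kernphys  = 0x80200000 - 0x00200000  == 0x80000000
     *
     * and after the 2M-hole fix-up (kernphys += NBPDR) the final value is
     * 0x80200000, the physical start of the kernel image.  From then on a
     * kernel physical address pa corresponds to the virtual address
     * pa - kernphys + KERNBASE + NBPDR, which is the expression now used
     * for thread0.td_kstack and dpcpu_init().
     */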
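AP startup follows the same theme: mptramp_pagetables now points at
kernel_pmap->pm_cr3, which mpboot.S loads while the AP is still in 32-bit
mode, so the top-level kernel page-table page must sit below 4G (hence the
added MPASS and pmap_page_alloc_below_4g()). Right after paging is switched
on, the AP is still executing the trampoline copy in low memory, so
start_all_aps() temporarily installs a 1:1 mapping of the low 4G in
PML4/PML5 slot 0 and tears it down once the APs are up. The pages for that
transient mapping come from pmap_page_alloc_below_4g(), which can in
principle return NULL when no free page exists under 4G; a defensive caller
sketch (hypothetical -- the patch's boot-time callers do not bother):

    vm_page_t m;
    pd_entry_t *v_pd;

    m = pmap_page_alloc_below_4g(true);     /* zeroed page */
    if (m == NULL)
            panic("no memory below 4G for AP page tables");
    v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));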