amd64 UEFI loader: optional no-copy staging ("copy_staging") and staging slop

Adds a copy_staging mode (enable/disable/auto) for amd64, a staging_slop
setting, larger trampoline page tables for the no-copy case, and lets
efi_check_space() relocate the staging area when it cannot grow in place.

Changes made while reviewing this patch:
 * elf64_exec(): the 4G-path error branch freed 9 pages at trampcode,
   but trampcode is a 1-page allocation; free 1 page.
 * elf64_exec(): dropped a stray ';' after the 4G trampcode initializer
   and fixed the "coping" typo in the staging printf.
 * command_staging_slop(): the parsed value was never stored, so the
   command could not change staging_slop; store it, reject an empty
   argument, and drop the unused 'prev' local.
 * Comment fixes: "the the", "ovewrite", and the stale boot/efi/... path.
 * Added brief comments on the new helper functions.

NOTE(review): this patch text was rebuilt from a copy whose line breaks,
indentation and <...> include names had been lost.  Hunk line counts were
re-derived, but tabs/alignment in context lines and the header names in
the two '#include <...>' context pairs are best-effort and must be checked
against the tree before applying.

diff --git a/stand/common/bootstrap.h b/stand/common/bootstrap.h
--- a/stand/common/bootstrap.h
+++ b/stand/common/bootstrap.h
@@ -228,6 +228,9 @@
 	size_t			f_size;		/* file size */
 	struct kernel_module	*f_modules;	/* list of modules if any */
 	struct preloaded_file	*f_next;	/* next file */
+#ifdef __amd64__
+	bool			f_kernphys_relocatable;
+#endif
 };
 
 struct file_format
diff --git a/stand/common/load_elf.c b/stand/common/load_elf.c
--- a/stand/common/load_elf.c
+++ b/stand/common/load_elf.c
@@ -207,6 +207,23 @@
 #undef CONVERT_SWITCH
 #undef CONVERT_FIELD
 
+
+#ifdef __amd64__
+/*
+ * Report whether the loaded kernel exports an 8-byte "kernphys" object
+ * symbol.  The EFI loader's elf64_exec() uses the result to pick a
+ * copy_staging mode when that setting is "auto".
+ */
+static bool
+is_kernphys_relocatable(elf_file_t ef)
+{
+	Elf_Sym sym;
+
+	return (__elfN(lookup_symbol)(ef, "kernphys", &sym, STT_OBJECT) == 0 &&
+	    sym.st_size == 8);
+}
+#endif
+
 static int
 __elfN(load_elf_header)(char *filename, elf_file_t ef)
 {
@@ -434,6 +451,9 @@
 	/* Load OK, return module pointer */
 	*result = (struct preloaded_file *)fp;
 	err = 0;
+#ifdef __amd64__
+	fp->f_kernphys_relocatable = is_kernphys_relocatable(&ef);
+#endif
 	goto out;
 
 ioerr:
diff --git a/stand/efi/loader/arch/amd64/elf64_freebsd.c b/stand/efi/loader/arch/amd64/elf64_freebsd.c
--- a/stand/efi/loader/arch/amd64/elf64_freebsd.c
+++ b/stand/efi/loader/arch/amd64/elf64_freebsd.c
@@ -82,7 +82,11 @@
 
 static pml4_entry_t *PT4;
 static pdp_entry_t *PT3;
+static pdp_entry_t *PT3_l, *PT3_u;
 static pd_entry_t *PT2;
+static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1;
+
+extern EFI_PHYSICAL_ADDRESS staging;
 
 static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend,
     uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry);
@@ -105,6 +109,12 @@
 	ACPI_TABLE_RSDP		*rsdp;
 	char			buf[24];
 	int			revision;
+	bool			copy_auto;
+
+	copy_auto = copy_staging == COPY_STAGING_AUTO;
+	if (copy_auto)
+		copy_staging = fp->f_kernphys_relocatable ?
+		    COPY_STAGING_DISABLE : COPY_STAGING_ENABLE;
 
 	/*
 	 * Report the RSDP to the kernel. While this can be found with
@@ -151,57 +161,133 @@
 	}
 
 	if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL)
-		return(EFTYPE);
+		return (EFTYPE);
 	ehdr = (Elf_Ehdr *)&(md->md_data);
 
-	trampcode = (vm_offset_t)0x0000000040000000;
+	trampcode = copy_staging == COPY_STAGING_ENABLE ?
+	    (vm_offset_t)0x0000000040000000 /* 1G */ :
+	    (vm_offset_t)0x0000000100000000; /* 4G */
 	err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1,
 	    (EFI_PHYSICAL_ADDRESS *)&trampcode);
+	if (EFI_ERROR(err)) {
+		printf("Unable to allocate trampoline\n");
+		if (copy_auto)
+			copy_staging = COPY_STAGING_AUTO;
+		return (ENOMEM);
+	}
 	bzero((void *)trampcode, EFI_PAGE_SIZE);
 	trampstack = trampcode + EFI_PAGE_SIZE - 8;
 	bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size);
 	trampoline = (void *)trampcode;
 
-	PT4 = (pml4_entry_t *)0x0000000040000000;
-	err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
-	    (EFI_PHYSICAL_ADDRESS *)&PT4);
-	bzero(PT4, 3 * EFI_PAGE_SIZE);
+	if (copy_staging == COPY_STAGING_ENABLE) {
+		PT4 = (pml4_entry_t *)0x0000000040000000;
+		err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
+		    (EFI_PHYSICAL_ADDRESS *)&PT4);
+		if (EFI_ERROR(err)) {
+			printf("Unable to allocate trampoline page table\n");
+			BS->FreePages(trampcode, 1);
+			if (copy_auto)
+				copy_staging = COPY_STAGING_AUTO;
+			return (ENOMEM);
+		}
+		bzero(PT4, 3 * EFI_PAGE_SIZE);
+		PT3 = &PT4[512];
+		PT2 = &PT3[512];
+
+		/*
+		 * This is kinda brutal, but every single 1GB VM
+		 * memory segment points to the same first 1GB of
+		 * physical memory.  But it is more than adequate.
+		 */
+		for (i = 0; i < NPTEPG; i++) {
+			/*
+			 * Each slot of the L4 pages points to the
+			 * same L3 page.
+			 */
+			PT4[i] = (pml4_entry_t)PT3;
+			PT4[i] |= PG_V | PG_RW;
+
+			/*
+			 * Each slot of the L3 pages points to the
+			 * same L2 page.
+			 */
+			PT3[i] = (pdp_entry_t)PT2;
+			PT3[i] |= PG_V | PG_RW;
+
+			/*
+			 * The L2 page slots are mapped with 2MB pages for 1GB.
+			 */
+			PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024);
+			PT2[i] |= PG_V | PG_RW | PG_PS;
+		}
+	} else {
+		PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */
+		err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9,
+		    (EFI_PHYSICAL_ADDRESS *)&PT4);
+		if (EFI_ERROR(err)) {
+			printf("Unable to allocate trampoline page table\n");
+			BS->FreePages(trampcode, 1);
+			if (copy_auto)
+				copy_staging = COPY_STAGING_AUTO;
+			return (ENOMEM);
+		}
 
-	PT3 = &PT4[512];
-	PT2 = &PT3[512];
+		bzero(PT4, 9 * EFI_PAGE_SIZE);
+
+		PT3_l = &PT4[NPML4EPG * 1];
+		PT3_u = &PT4[NPML4EPG * 2];
+		PT2_l0 = &PT4[NPML4EPG * 3];
+		PT2_l1 = &PT4[NPML4EPG * 4];
+		PT2_l2 = &PT4[NPML4EPG * 5];
+		PT2_l3 = &PT4[NPML4EPG * 6];
+		PT2_u0 = &PT4[NPML4EPG * 7];
+		PT2_u1 = &PT4[NPML4EPG * 8];
+
+		/* 1:1 mapping of lower 4G */
+		PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW;
+		PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW;
+		PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW;
+		PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW;
+		PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW;
+		for (i = 0; i < 4 * NPDEPG; i++) {
+			PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
+			    PG_RW | PG_PS;
+		}
 
-	/*
-	 * This is kinda brutal, but every single 1GB VM memory segment points
-	 * to the same first 1GB of physical memory.  But it is more than
-	 * adequate.
-	 */
-	for (i = 0; i < 512; i++) {
-		/* Each slot of the L4 pages points to the same L3 page. */
-		PT4[i] = (pml4_entry_t)PT3;
-		PT4[i] |= PG_V | PG_RW;
-
-		/* Each slot of the L3 pages points to the same L2 page. */
-		PT3[i] = (pdp_entry_t)PT2;
-		PT3[i] |= PG_V | PG_RW;
-
-		/* The L2 page slots are mapped with 2MB pages for 1GB. */
-		PT2[i] = i * (2 * 1024 * 1024);
-		PT2[i] |= PG_V | PG_RW | PG_PS;
+		/* mapping of kernel 2G below top */
+		PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW;
+		PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW;
+		PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW;
+		/* compat mapping of phys @0 */
+		PT2_u0[0] = PG_PS | PG_V | PG_RW;
+		/* this maps past staging area */
+		for (i = 1; i < 2 * NPDEPG; i++) {
+			PT2_u0[i] = ((pd_entry_t)staging +
+			    ((pd_entry_t)i - 1) * NBPDR) |
+			    PG_V | PG_RW | PG_PS;
+		}
 	}
 
+	printf("staging %#lx (%scopying) tramp %p PT4 %p\n",
+	    staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ",
+	    trampoline, PT4);
 	printf("Start @ 0x%lx ...\n", ehdr->e_entry);
 
 	efi_time_fini();
 	err = bi_load(fp->f_args, &modulep, &kernend, true);
 	if (err != 0) {
 		efi_time_init();
-		return(err);
+		if (copy_auto)
+			copy_staging = COPY_STAGING_AUTO;
+		return (err);
 	}
 
 	dev_cleanup();
 
-	trampoline(trampstack, efi_copy_finish, kernend, modulep, PT4,
-	    ehdr->e_entry);
+	trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ?
+	    efi_copy_finish : efi_copy_finish_nop, kernend, modulep,
+	    PT4, ehdr->e_entry);
 
 	panic("exec returned");
 }
diff --git a/stand/efi/loader/bootinfo.c b/stand/efi/loader/bootinfo.c
--- a/stand/efi/loader/bootinfo.c
+++ b/stand/efi/loader/bootinfo.c
@@ -65,6 +65,8 @@
 
 extern EFI_SYSTEM_TABLE	*ST;
 
+int boot_services_gone;
+
 static int
 bi_getboothowto(char *kargs)
 {
@@ -396,8 +398,10 @@
 		if (!exit_bs)
 			break;
 		status = BS->ExitBootServices(IH, efi_mapkey);
-		if (!EFI_ERROR(status))
+		if (!EFI_ERROR(status)) {
+			boot_services_gone = 1;
 			break;
+		}
 	}
 
 	if (retry == 0) {
diff --git a/stand/efi/loader/copy.c b/stand/efi/loader/copy.c
--- a/stand/efi/loader/copy.c
+++ b/stand/efi/loader/copy.c
@@ -39,6 +39,11 @@
 
 #include "loader_efi.h"
 
+#define	M(x)	((x) * 1024 * 1024)
+#define	G(x)	(1UL * (x) * 1024 * 1024 * 1024)
+
+extern int boot_services_gone;
+
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/cpufunc.h>
 #include <machine/specialreg.h>
@@ -175,24 +180,157 @@
 
 #ifndef EFI_STAGING_SIZE
 #if defined(__arm__)
-#define	EFI_STAGING_SIZE	32
+#define	EFI_STAGING_SIZE	M(32)
+#else
+#define	EFI_STAGING_SIZE	M(64)
+#endif
+#endif
+
+#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
+    defined(__riscv)
+#define	EFI_STAGING_2M_ALIGN	1
 #else
-#define	EFI_STAGING_SIZE	64
+#define	EFI_STAGING_2M_ALIGN	0
 #endif
+
+#if defined(__amd64__)
+#define	EFI_STAGING_SLOP	M(8)
+#else
+#define	EFI_STAGING_SLOP	0
 #endif
 
+static u_long staging_slop = EFI_STAGING_SLOP;
+
 EFI_PHYSICAL_ADDRESS	staging, staging_end, staging_base;
 int			stage_offset_set = 0;
 ssize_t			stage_offset;
 
+/*
+ * Release the whole staging allocation and reset the offset bookkeeping.
+ * Callers re-create the staging area with efi_copy_init() afterwards.
+ */
+static void
+efi_copy_free(void)
+{
+	BS->FreePages(staging_base, (staging_end - staging_base) /
+	    EFI_PAGE_SIZE);
+	stage_offset_set = 0;
+	stage_offset = 0;
+}
+
+#ifdef __amd64__
+int copy_staging = COPY_STAGING_ENABLE;
+
+/*
+ * "copy_staging [enable|disable|auto]": show or change copy_staging.
+ * A change unloads the kernel and reallocates the staging area, since
+ * get_staging_max() depends on the mode.
+ */
+static int
+command_copy_staging(int argc, char *argv[])
+{
+	static const char *const mode[3] = {
+		[COPY_STAGING_ENABLE] = "enable",
+		[COPY_STAGING_DISABLE] = "disable",
+		[COPY_STAGING_AUTO] = "auto",
+	};
+	int prev, res;
+
+	res = CMD_OK;
+	if (argc > 2) {
+		res = CMD_ERROR;
+	} else if (argc == 2) {
+		prev = copy_staging;
+		if (strcmp(argv[1], "enable") == 0)
+			copy_staging = COPY_STAGING_ENABLE;
+		else if (strcmp(argv[1], "disable") == 0)
+			copy_staging = COPY_STAGING_DISABLE;
+		else if (strcmp(argv[1], "auto") == 0)
+			copy_staging = COPY_STAGING_AUTO;
+		else {
+			printf("usage: copy_staging enable|disable|auto\n");
+			res = CMD_ERROR;
+		}
+		if (res == CMD_OK && prev != copy_staging) {
+			printf("changed copy_staging, unloading kernel\n");
+			unload();
+			efi_copy_free();
+			efi_copy_init();
+		}
+	} else {
+		printf("copy staging: %s\n", mode[copy_staging]);
+	}
+	return (res);
+}
+COMMAND_SET(copy_staging, "copy_staging", "copy staging", command_copy_staging);
+#endif
+
+/*
+ * "staging_slop [bytes]": show or change staging_slop, the amount of
+ * space efi_check_space() keeps free past the end of the staged data.
+ * A change unloads the kernel and reallocates the staging area.
+ */
+static int
+command_staging_slop(int argc, char *argv[])
+{
+	char *endp;
+	u_long new;
+	int res;
+
+	res = CMD_OK;
+	if (argc > 2) {
+		res = CMD_ERROR;
+	} else if (argc == 2) {
+		new = strtoul(argv[1], &endp, 0);
+		if (endp == argv[1] || *endp != '\0') {
+			printf("invalid slop value\n");
+			res = CMD_ERROR;
+		}
+		if (res == CMD_OK && staging_slop != new) {
+			printf("changed slop, unloading kernel\n");
+			staging_slop = new;
+			unload();
+			efi_copy_free();
+			efi_copy_init();
+		}
+	} else {
+		printf("staging slop %#lx\n", staging_slop);
+	}
+	return (res);
+}
+COMMAND_SET(staging_slop, "staging_slop", "set staging slop",
+    command_staging_slop);
+
+#if defined(__i386__) || defined(__amd64__)
+/*
+ * The staging area must reside in the first 1GB or 4GB physical
+ * memory: see elf64_exec() in
+ * stand/efi/loader/arch/amd64/elf64_freebsd.c.
+ */
+static EFI_PHYSICAL_ADDRESS
+get_staging_max(void)
+{
+	EFI_PHYSICAL_ADDRESS res;
+
+#if defined(__i386__)
+	res = G(1);
+#elif defined(__amd64__)
+	res = copy_staging == COPY_STAGING_ENABLE ? G(1) : G(4);
+#endif
+	return (res);
+}
+#define	EFI_ALLOC_METHOD	AllocateMaxAddress
+#else
+#define	EFI_ALLOC_METHOD	AllocateAnyPages
+#endif
+
 int
 efi_copy_init(void)
 {
 	EFI_STATUS	status;
-
 	unsigned long nr_pages;
 
-	nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE) * 1024 * 1024);
+	nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE));
 
 #if defined(__i386__) || defined(__amd64__)
 	/*
@@ -203,18 +341,10 @@
 	if (running_on_hyperv())
 		efi_verify_staging_size(&nr_pages);
 
-	/*
-	 * The staging area must reside in the the first 1GB physical
-	 * memory: see elf64_exec() in
-	 * boot/efi/loader/arch/amd64/elf64_freebsd.c.
-	 */
-	staging = 1024*1024*1024;
-	status = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData,
-	    nr_pages, &staging);
-#else
-	status = BS->AllocatePages(AllocateAnyPages, EfiLoaderData,
-	    nr_pages, &staging);
+	staging = get_staging_max();
 #endif
+	status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData,
+	    nr_pages, &staging);
 	if (EFI_ERROR(status)) {
 		printf("failed to allocate staging area: %lu\n",
 		    EFI_ERROR_CODE(status));
@@ -223,7 +353,7 @@
 	staging_base = staging;
 	staging_end = staging + nr_pages * EFI_PAGE_SIZE;
 
-#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+#if EFI_STAGING_2M_ALIGN
 	/*
 	 * Round the kernel load address to a 2MiB value. This is needed
 	 * because the kernel builds a page table based on where it has
@@ -231,7 +361,7 @@
 	 * either a 1MiB or 2MiB page for this we need to make sure it
 	 * is correctly aligned for both cases.
 	 */
-	staging = roundup2(staging, 2 * 1024 * 1024);
+	staging = roundup2(staging, M(2));
 #endif
 
 	return (0);
@@ -240,20 +370,42 @@
 static bool
 efi_check_space(vm_offset_t end)
 {
-	EFI_PHYSICAL_ADDRESS addr;
+	EFI_PHYSICAL_ADDRESS addr, new_base, new_staging;
 	EFI_STATUS status;
 	unsigned long nr_pages;
 
+	end = roundup2(end, EFI_PAGE_SIZE);
+
 	/* There is already enough space */
-	if (end <= staging_end)
+	if (end + staging_slop <= staging_end)
 		return (true);
 
-	end = roundup2(end, EFI_PAGE_SIZE);
-	nr_pages = EFI_SIZE_TO_PAGES(end - staging_end);
+	if (boot_services_gone) {
+		if (end <= staging_end)
+			return (true);
+		panic("efi_check_space: cannot expand staging area "
+		    "after boot services were exited\n");
+	}
+
+	/*
+	 * Add slop at the end:
+	 * 1. amd64 kernel expects to do some very early allocations
+	 *    by carving out memory after kernend.  Slop guarantees
+	 *    that it does not overwrite anything useful.
+	 * 2. It seems that initial calculation of the staging size
+	 *    could be somewhat smaller than actually copying in after
+	 *    boot services are exited.  Slop avoids calling
+	 *    BS->AllocatePages() when it cannot work.
+	 */
+	end += staging_slop;
 
+	nr_pages = EFI_SIZE_TO_PAGES(end - staging_end);
 #if defined(__i386__) || defined(__amd64__)
-	/* X86 needs all memory to be allocated under the 1G boundary */
-	if (end > 1024*1024*1024)
+	/*
+	 * i386 needs all memory to be allocated under the 1G boundary.
+	 * amd64 needs all memory to be allocated under the 1G or 4G boundary.
+	 */
+	if (end > get_staging_max())
 		goto before_staging;
 #endif
 
@@ -268,14 +420,12 @@
 
 before_staging:
 	/* Try allocating space before the previous allocation */
-	if (staging < nr_pages * EFI_PAGE_SIZE) {
-		printf("Not enough space before allocation\n");
-		return (false);
-	}
+	if (staging < nr_pages * EFI_PAGE_SIZE)
+		goto expand;
 	addr = staging - nr_pages * EFI_PAGE_SIZE;
-#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+#if EFI_STAGING_2M_ALIGN
 	/* See efi_copy_init for why this is needed */
-	addr = rounddown2(addr, 2 * 1024 * 1024);
+	addr = rounddown2(addr, M(2));
 #endif
 	nr_pages = EFI_SIZE_TO_PAGES(staging_base - addr);
 	status = BS->AllocatePages(AllocateAddress, EfiLoaderData, nr_pages,
@@ -288,11 +438,42 @@
 		staging_base = addr;
 		memmove((void *)(uintptr_t)staging_base,
 		    (void *)(uintptr_t)staging, staging_end - staging);
-		stage_offset -= (staging - staging_base);
+		stage_offset -= staging - staging_base;
 		staging = staging_base;
 		return (true);
 	}
 
+expand:
+	nr_pages = EFI_SIZE_TO_PAGES(end - (vm_offset_t)staging);
+#if EFI_STAGING_2M_ALIGN
+	nr_pages += M(2) / EFI_PAGE_SIZE;
+#endif
+#if defined(__i386__) || defined(__amd64__)
+	new_base = get_staging_max();
+#endif
+	status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData,
+	    nr_pages, &new_base);
+	if (!EFI_ERROR(status)) {
+#if EFI_STAGING_2M_ALIGN
+		new_staging = roundup2(new_base, M(2));
+#else
+		new_staging = new_base;
+#endif
+		/*
+		 * Move the old allocation and update the state so
+		 * translation still works.
+		 */
+		memcpy((void *)(uintptr_t)new_staging,
+		    (void *)(uintptr_t)staging, staging_end - staging);
+		BS->FreePages(staging_base, (staging_end - staging_base) /
+		    EFI_PAGE_SIZE);
+		stage_offset -= staging - new_staging;
+		staging = new_staging;
+		staging_end = new_base + nr_pages * EFI_PAGE_SIZE;
+		staging_base = new_base;
+		return (true);
+	}
+
 	printf("efi_check_space: Unable to expand staging area\n");
 	return (false);
 }
@@ -335,7 +516,6 @@
 	return (len);
 }
 
-
 ssize_t
 efi_readin(readin_handle_t fd, vm_offset_t dest, const size_t len)
 {
@@ -364,3 +544,12 @@
 	while (src < last)
 		*dst++ = *src++;
 }
+
+/*
+ * Do-nothing replacement for efi_copy_finish(); elf64_exec() hands it to
+ * the trampoline when copy_staging is not enabled.
+ */
+void
+efi_copy_finish_nop(void)
+{
+}
diff --git a/stand/efi/loader/loader_efi.h b/stand/efi/loader/loader_efi.h
--- a/stand/efi/loader/loader_efi.h
+++ b/stand/efi/loader/loader_efi.h
@@ -34,6 +34,15 @@
 #include <stand.h>
 #include <readin.h>
 
+#ifdef __amd64__
+enum {
+	COPY_STAGING_ENABLE,
+	COPY_STAGING_DISABLE,
+	COPY_STAGING_AUTO,
+};
+extern int copy_staging;
+#endif
+
 int	efi_autoload(void);
 
 int	efi_copy_init(void);
@@ -44,5 +53,6 @@
 void * efi_translate(vm_offset_t ptr);
 
 void	efi_copy_finish(void);
+void	efi_copy_finish_nop(void);
 
 #endif	/* _LOADER_EFI_COPY_H_ */