diff --git a/stand/kboot/arch/amd64/amd64_tramp.S b/stand/kboot/arch/amd64/amd64_tramp.S --- a/stand/kboot/arch/amd64/amd64_tramp.S +++ b/stand/kboot/arch/amd64/amd64_tramp.S @@ -1,9 +1,6 @@ /*- - * Copyright (c) 2013 The FreeBSD Foundation - * All rights reserved. + * Copyright (c) 2022 Netflix, Inc * - * This software was developed by Benno Rice under sponsorship from - * the FreeBSD Foundation. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -24,53 +21,87 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. - * - * $FreeBSD$ */ -#include - -#define ASM_FILE -#include "multiboot2.h" +/* + * This is the trampoline that starts the FreeBSD kernel. Since the Linux kernel + * calls this routine with no args, and has a different environment than the + * boot loader provides and that the kernel expects, this code is responsible + * for setting all that up and calling the normal kernel entry point. It's + * analogous to the "purgatory" code in the linux kernel. Details about these + * operations are contained in comments below. On amd64, the kernel starts all + * the APs so we don't have to worry about them here. + */ +/* + * Keep in sync with elf64_freebsd.c. Kexec starts tramp w/o any parameters, so + * store them here. 
This is constructed to be a useful stack: + * + * struct trampoline_data { + * uint64_t pt4; // Page table address to pop + * uint64_t entry; // return address to jump to kernel + * uint32_t fill1; // 0 + * uint32_t modulep; // 4 module metadata + * uint32_t kernend; // 8 kernel end + * uint32_t fill2; // 12 + * }; + * + * loader.kboot will construct a stack that btext expects, which is arguments on + * the stack, not in registers, and these args are 32-bit not 64 + * + * Processor is already in long mode when we're called, paging is enabled and + * boot loader loads things such that: + * - kernel mapped at KERNBASE, aligned to 2MB, below 4GB, contiguous memory + * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M) + * - kernel is mapped with 2M superpages + * - The kernel, modules and metadata are in the first 4GB which is unity mapped + * - There's additional memory after loader provided data for early allocations + * + * Unlike EFI, we don't support copying the staging area. We tell Linux to land + * the kernel in its final location with the needed alignment, etc. We copy the + * trampoline code to 1MB offset above KERNBASE since that memory is otherwise + * free and safely above the lower 1MB swamp we inherited from IBM PC, though + * this code makes no assumptions about where that might be. + * + * Thus, the trampoline just needs to set %rsp to that stack, pop the %cr3 value, + * set it and then retq to jump to the kernel with its stack args filled in. + * Since the handoff to this code used to be from 32-bit code, it uses the i386 + * calling conventions which put the arguments on the stack. The kernel's btext + * routine expects this setup. + */ .text - .globl amd64_tramp - + .globl tramp +tramp: + cli /* Make sure we don't get interrupted. 
*/ + leaq tramp_pt4(%rip), %rsp /* Setup our pre-filled-in stack */ + popq %rax /* Pop off the PT4 ptr for %cr3 */ + movq %rax, %cr3 /* set the page table */ + retq /* Return addr and args already on stack */ /* - * void amd64_tramp(uint64_t stack, void *copy_finish, uint64_t kernend, - * uint64_t modulep, uint64_t pagetable, uint64_t entry) + * The following is the stack for the above code. The stack will increase in + * address as things are popped off of it, so we start with the stack pointing + * to tramp_pt4. */ -amd64_tramp: - cli /* Make sure we don't get interrupted. */ - movq %rdi,%rsp /* Switch to our temporary stack. */ - - movq %rdx,%r12 /* Stash the kernel values for later. */ - movq %rcx,%r13 - movq %r8,%r14 - movq %r9,%r15 - - callq *%rsi /* Call copy_finish so we're all ready to go. */ - - pushq %r12 /* Push kernend. */ - salq $32,%r13 /* Shift modulep and push it. */ - pushq %r13 - pushq %r15 /* Push the entry address. */ - movq %r14,%cr3 /* Switch page tables. */ - ret /* "Return" to kernel entry. 
*/ - - ALIGN_TEXT -amd64_tramp_end: - -/* void multiboot2_exec(uint64_t entry, uint64_t multiboot_info, uint64_t stack) */ - .globl multiboot2_exec -multiboot2_exec: - movq %rdx,%rsp - pushq %rdi - movq %rsi,%rbx - movq $MULTIBOOT2_BOOTLOADER_MAGIC,%rax - ret + .p2align 3 /* Stack has to be 8 byte aligned */ +trampoline_data: +tramp_pt4: .quad 0 /* New %cr3 value */ +tramp_entry: .quad 0 /* Entry to kernel (btext) */ + /* %rsp points here on entry to amd64 kernel's btext */ + .long 0 /* 0 filler, ignored (current loaders set to 0) */ +tramp_modulep: .long 0 /* 4 modulep */ +tramp_kernend: .long 0 /* 8 kernend */ + .long 0 /* 12 alignment filler (also 0) */ +tramp_end: .data - .globl amd64_tramp_size -amd64_tramp_size: - .long amd64_tramp_end-amd64_tramp + .type tramp_size,@object + .globl tramp_size +tramp_size: + .long tramp_end-tramp + .size tramp_size, 4 + + .type tramp_data_offset,@object + .globl tramp_data_offset +tramp_data_offset: + .long trampoline_data-tramp + .size tramp_data_offset, 4 diff --git a/stand/kboot/arch/amd64/elf64_freebsd.c b/stand/kboot/arch/amd64/elf64_freebsd.c --- a/stand/kboot/arch/amd64/elf64_freebsd.c +++ b/stand/kboot/arch/amd64/elf64_freebsd.c @@ -41,9 +41,12 @@ #ifdef EFI #include #include +#else +#include "host_syscall.h" #endif #include "bootstrap.h" +#include "kboot.h" #include "platform/acfreebsd.h" #include "acconfig.h" @@ -53,9 +56,7 @@ #ifdef EFI #include "loader_efi.h" -#endif -#ifdef EFI static EFI_GUID acpi_guid = ACPI_TABLE_GUID; static EFI_GUID acpi20_guid = ACPI_20_TABLE_GUID; #endif @@ -63,9 +64,11 @@ #ifdef EFI #define LOADER_PAGE_SIZE EFI_PAGE_SIZE #else -#define LOADER_PAGE_SIZE 8192 +#define LOADER_PAGE_SIZE PAGE_SIZE #endif +extern vm_offset_t kboot_get_phys_load_segment(void); + extern int bi_load(char *args, vm_offset_t *modulep, vm_offset_t *kernendp, bool exit_bs); @@ -81,13 +84,13 @@ .l_exec = elf64_obj_exec, }; -#if 0 +#ifdef EFI extern struct file_format multiboot2; extern struct file_format 
multiboot2_obj; #endif struct file_format *file_formats[] = { -#if 0 +#ifdef EFI &multiboot2, &multiboot2_obj, #endif @@ -96,21 +99,44 @@ NULL }; -#ifdef EFI +#ifndef EFI +/* + * We create the stack that we want. We have the address of the page tables + * we make on top (so we pop that off and set %cr3). We have the entry point + * to the kernel (which retq pops off). This leaves the stack that the btext + * wants: offset 4 is modulep and offset 8 is kernend, with the filler bytes + * to keep this aligned. This makes the trampoline very simple. + */ +struct trampoline_data { + uint64_t pt4; // Page table address to pop + uint64_t entry; // return address to jump to kernel + uint32_t fill1; // 0 + uint32_t modulep; // 4 module metadata + uint32_t kernend; // 8 kernel end + uint32_t fill2; // 12 +}; +_Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data"); +#endif + static pml4_entry_t *PT4; -static pdp_entry_t *PT3; static pdp_entry_t *PT3_l, *PT3_u; -static pd_entry_t *PT2; static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1; +#ifdef EFI +static pdp_entry_t *PT3; +static pd_entry_t *PT2; + extern EFI_PHYSICAL_ADDRESS staging; static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend, uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry); #endif -extern uintptr_t amd64_tramp; -extern uint32_t amd64_tramp_size; +extern uintptr_t tramp; +extern uint32_t tramp_size; +#ifndef EFI +extern uint32_t tramp_data_offset; +#endif /* * There is an ELF kernel and one or more ELF modules loaded. 
@@ -120,15 +146,27 @@ static int elf64_exec(struct preloaded_file *fp) { -#ifdef EFI struct file_metadata *md; Elf_Ehdr *ehdr; - vm_offset_t modulep, kernend, trampcode, trampstack; + vm_offset_t modulep, kernend; int err, i; - ACPI_TABLE_RSDP *rsdp; char buf[24]; +#ifdef EFI + ACPI_TABLE_RSDP *rsdp = NULL; int revision; - bool copy_auto; + int copy_auto; + vm_offset_t trampstack, trampcode; +#else + vm_offset_t rsdp = 0; + void *trampcode; + int nseg; + void *kseg; + vm_offset_t trampolinebase; + uint64_t *trampoline; + struct trampoline_data *trampoline_data; + vm_offset_t staging; + int error; +#endif #ifdef EFI copy_auto = copy_staging == COPY_STAGING_AUTO; @@ -136,66 +174,49 @@ copy_staging = fp->f_kernphys_relocatable ? COPY_STAGING_DISABLE : COPY_STAGING_ENABLE; #else - copy_auto = COPY_STAGING_DISABLE; /* XXX */ + /* + * Figure out where to put it. + * + * Linux does not allow kexec_load into any part of memory. Ask + * arch_loadaddr to resolve the first available chunk of physical memory + * where loading is possible (load_addr). + * + * The kernel is loaded at the 'base' address in contiguous physical + * pages (using 2MB super pages). The first such page is unused by the + * kernel and serves as a good place to put not only the trampoline, but + * the page table pages that the trampoline needs to setup the proper + * kernel starting environment. + */ + staging = trampolinebase = kboot_get_phys_load_segment(); + trampolinebase += 1ULL << 20; /* Copy trampoline to base + 1MB, kernel will wind up at 2MB */ + printf("Load address at %#jx\n", (uintmax_t)trampolinebase); + printf("Relocation offset is %#jx\n", (uintmax_t)elf64_relocation_offset); #endif /* * Report the RSDP to the kernel. While this can be found with * a BIOS boot, the RSDP may be elsewhere when booted from UEFI. - * The old code used the 'hints' method to communite this to - * the kernel. 
However, while convenient, the 'hints' method - * is fragile and does not work when static hints are compiled - * into the kernel. Instead, move to setting different tunables - * that start with acpi. The old 'hints' can be removed before - * we branch for FreeBSD 12. */ - #ifdef EFI rsdp = efi_get_table(&acpi20_guid); if (rsdp == NULL) { rsdp = efi_get_table(&acpi_guid); } #else - rsdp = NULL; -#warning "write me" + rsdp = acpi_rsdp(); #endif - if (rsdp != NULL) { + if (rsdp != 0) { sprintf(buf, "0x%016llx", (unsigned long long)rsdp); - setenv("hint.acpi.0.rsdp", buf, 1); setenv("acpi.rsdp", buf, 1); - revision = rsdp->Revision; - if (revision == 0) - revision = 1; - sprintf(buf, "%d", revision); - setenv("hint.acpi.0.revision", buf, 1); - setenv("acpi.revision", buf, 1); - strncpy(buf, rsdp->OemId, sizeof(rsdp->OemId)); - buf[sizeof(rsdp->OemId)] = '\0'; - setenv("hint.acpi.0.oem", buf, 1); - setenv("acpi.oem", buf, 1); - sprintf(buf, "0x%016x", rsdp->RsdtPhysicalAddress); - setenv("hint.acpi.0.rsdt", buf, 1); - setenv("acpi.rsdt", buf, 1); - if (revision >= 2) { - /* XXX extended checksum? */ - sprintf(buf, "0x%016llx", - (unsigned long long)rsdp->XsdtPhysicalAddress); - setenv("hint.acpi.0.xsdt", buf, 1); - setenv("acpi.xsdt", buf, 1); - sprintf(buf, "%d", rsdp->Length); - setenv("hint.acpi.0.xsdt_length", buf, 1); - setenv("acpi.xsdt_length", buf, 1); - } } - if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL) return (EFTYPE); ehdr = (Elf_Ehdr *)&(md->md_data); +#ifdef EFI trampcode = copy_staging == COPY_STAGING_ENABLE ? (vm_offset_t)0x0000000040000000 /* 1G */ : (vm_offset_t)0x0000000100000000; /* 4G */; -#ifdef EFI err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1, (EFI_PHYSICAL_ADDRESS *)&trampcode); if (EFI_ERROR(err)) { @@ -204,17 +225,22 @@ copy_staging = COPY_STAGING_AUTO; return (ENOMEM); } + trampstack = trampcode + LOADER_PAGE_SIZE - 8; #else -#warning "Write me" + // XXX Question: why not just use malloc? 
+ trampcode = host_getmem(LOADER_PAGE_SIZE); + if (trampcode == NULL) { + printf("Unable to allocate trampoline\n"); + return (ENOMEM); + } #endif bzero((void *)trampcode, LOADER_PAGE_SIZE); - trampstack = trampcode + LOADER_PAGE_SIZE - 8; - bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size); + bcopy((void *)&tramp, (void *)trampcode, tramp_size); trampoline = (void *)trampcode; +#ifdef EFI if (copy_staging == COPY_STAGING_ENABLE) { PT4 = (pml4_entry_t *)0x0000000040000000; -#ifdef EFI err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3, (EFI_PHYSICAL_ADDRESS *)&PT4); if (EFI_ERROR(err)) { @@ -224,9 +250,6 @@ copy_staging = COPY_STAGING_AUTO; return (ENOMEM); } -#else -#warning "Write me" -#endif bzero(PT4, 3 * LOADER_PAGE_SIZE); PT3 = &PT4[512]; PT2 = &PT3[512]; @@ -259,7 +282,6 @@ } } else { PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */ -#ifdef EFI err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9, (EFI_PHYSICAL_ADDRESS *)&PT4); if (EFI_ERROR(err)) { @@ -269,10 +291,6 @@ copy_staging = COPY_STAGING_AUTO; return (ENOMEM); } -#else -#warning "Write me" -#endif - bzero(PT4, 9 * LOADER_PAGE_SIZE); PT3_l = &PT4[NPML4EPG * 1]; @@ -308,10 +326,84 @@ PG_V | PG_RW | PG_PS; } } +#else + { + vm_offset_t pabase, pa_pt3_l, pa_pt3_u, pa_pt2_l0, pa_pt2_l1, pa_pt2_l2, pa_pt2_l3, pa_pt2_u0, pa_pt2_u1; + /* We'll find a place for these later */ + PT4 = (pml4_entry_t *)host_getmem(9 * LOADER_PAGE_SIZE); + bzero(PT4, 9 * LOADER_PAGE_SIZE); + + PT3_l = &PT4[NPML4EPG * 1]; + PT3_u = &PT4[NPML4EPG * 2]; + PT2_l0 = &PT4[NPML4EPG * 3]; + PT2_l1 = &PT4[NPML4EPG * 4]; + PT2_l2 = &PT4[NPML4EPG * 5]; + PT2_l3 = &PT4[NPML4EPG * 6]; + PT2_u0 = &PT4[NPML4EPG * 7]; + PT2_u1 = &PT4[NPML4EPG * 8]; + + pabase = trampolinebase + LOADER_PAGE_SIZE; + pa_pt3_l = pabase + LOADER_PAGE_SIZE * 1; + pa_pt3_u = pabase + LOADER_PAGE_SIZE * 2; + pa_pt2_l0 = pabase + LOADER_PAGE_SIZE * 3; + pa_pt2_l1 = pabase + LOADER_PAGE_SIZE * 4; + pa_pt2_l2 = pabase + LOADER_PAGE_SIZE 
* 5; + pa_pt2_l3 = pabase + LOADER_PAGE_SIZE * 6; + pa_pt2_u0 = pabase + LOADER_PAGE_SIZE * 7; + pa_pt2_u1 = pabase + LOADER_PAGE_SIZE * 8; + + /* 1:1 mapping of lower 4G */ + PT4[0] = (pml4_entry_t)pa_pt3_l | PG_V | PG_RW; + PT3_l[0] = (pdp_entry_t)pa_pt2_l0 | PG_V | PG_RW; + PT3_l[1] = (pdp_entry_t)pa_pt2_l1 | PG_V | PG_RW; + PT3_l[2] = (pdp_entry_t)pa_pt2_l2 | PG_V | PG_RW; + PT3_l[3] = (pdp_entry_t)pa_pt2_l3 | PG_V | PG_RW; + for (i = 0; i < 4 * NPDEPG; i++) { /* we overflow PT2_l0 into _l1, etc */ + PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V | + PG_RW | PG_PS; + } + + /* mapping of kernel 2G below top */ + PT4[NPML4EPG - 1] = (pml4_entry_t)pa_pt3_u | PG_V | PG_RW; + PT3_u[NPDPEPG - 2] = (pdp_entry_t)pa_pt2_u0 | PG_V | PG_RW; + PT3_u[NPDPEPG - 1] = (pdp_entry_t)pa_pt2_u1 | PG_V | PG_RW; + /* compat mapping of phys @0 */ + PT2_u0[0] = PG_PS | PG_V | PG_RW; + /* this maps past staging area */ + /* + * Kernel uses the KERNSTART (== KERNBASE + 2MB) entry to figure + * out where we loaded the kernel. This is PT2_u0[1] (since + * these map 2MB pages. So the PA that this maps has to be + * kboot's staging + 2MB. For UEFI we do 'i - 1' since we load + * the kernel right at staging (and assume the first address we + * load is 2MB in efi_copyin). However for kboot, staging + 1 * + * NBPDR == staging + 2MB which is where the kernel starts. Our + * trampoline need not be mapped into the kernel space since we + * execute PA==VA for that, and the trampoline can just go away + * once the kernel is called. + * + * Staging should likely be as low as possible, though, because + * all the 'early' allocations are at kernend (which the kernel + * calls physfree). 
+ */ + for (i = 1; i < 2 * NPDEPG; i++) { /* we overflow PT2_u0 into _u1 */ + PT2_u0[i] = ((pd_entry_t)staging + + ((pd_entry_t)i) * NBPDR) | + PG_V | PG_RW | PG_PS; + if (i < 10) printf("Mapping %d to %#lx staging %#lx\n", i, PT2_u0[i], staging); + } + } +#endif + +#ifdef EFI printf("staging %#lx (%scopying) tramp %p PT4 %p\n", staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ", trampoline, PT4); +#else + printf("staging %#lx tramp %p PT4 %p\n", staging, (void *)trampolinebase, + (void *)trampolinebase + LOADER_PAGE_SIZE); +#endif printf("Start @ 0x%lx ...\n", ehdr->e_entry); #ifdef EFI @@ -321,17 +413,46 @@ if (err != 0) { #ifdef EFI efi_time_init(); -#endif if (copy_auto) copy_staging = COPY_STAGING_AUTO; +#endif return (err); } dev_cleanup(); +#ifdef EFI trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ? efi_copy_finish : efi_copy_finish_nop, kernend, modulep, PT4, ehdr->e_entry); +#else + trampoline_data = (void *)trampoline + tramp_data_offset; + trampoline_data->entry = ehdr->e_entry; + trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE; + /* + * So we compute the VA of the module data by modulep + KERNBASE.... + * need to make sure that that address is mapped right. We calculate + * the start of available memory to allocate via kernend (which is + * calculated with a phyaddr of "kernend + PA(PT_u0[1])"), so we better + * make sure we're not overwriting the last 2MB of the kernel :). 
+ */ + trampoline_data->modulep = modulep; /* Offset from KERNBASE */ + trampoline_data->kernend = kernend; /* Offset from the load address */ + trampoline_data->fill1 = trampoline_data->fill2 = 0; + printf("Modulep = %lx kernend %lx\n", modulep, kernend); + /* NOTE: when copying in, it's relative to the start of our 'area' not an abs addr */ + /* Copy the trampoline to the ksegs */ + archsw.arch_copyin((void *)trampcode, trampolinebase - staging, tramp_size); + /* Copy the page table to the ksegs */ + archsw.arch_copyin(PT4, trampoline_data->pt4 - staging, 9 * LOADER_PAGE_SIZE); + + if (archsw.arch_kexec_kseg_get == NULL) + panic("architecture did not provide kexec segment mapping"); + archsw.arch_kexec_kseg_get(&nseg, &kseg); + error = host_kexec_load(trampolinebase, nseg, kseg, HOST_KEXEC_ARCH_X86_64); + if (error != 0) + panic("kexec_load returned error: %d", error); + host_reboot(HOST_REBOOT_MAGIC1, HOST_REBOOT_MAGIC2, HOST_REBOOT_CMD_KEXEC, 0); #endif panic("exec returned");