diff --git a/stand/kboot/kboot/arch/amd64/amd64_tramp.S b/stand/kboot/kboot/arch/amd64/amd64_tramp.S --- a/stand/kboot/kboot/arch/amd64/amd64_tramp.S +++ b/stand/kboot/kboot/arch/amd64/amd64_tramp.S @@ -38,6 +38,10 @@ * store them here. This is constructed to be a useful stack: * * struct trampoline_data { + * // %rsp points here on start and we pop the args off and then retq to 'entry' + * uint64_t memmap_src; // Linux-provided memory map PA + * uint64_t memmap_dst; // Module data copy PA + * uint64_t memmap_len; // Length to copy * uint64_t pt4; // Page table address to pop * uint64_t entry; // return address to jump to kernel * uint32_t fill1; // 0 @@ -47,33 +51,52 @@ * }; * * loader.kboot will construct a stack that btext expects, which is arguments on - * the stack, not in registers, and these args are 32-bit not 64 + * the stack, not in registers, and these args are 32-bit not 64. The extra stuff + * is data the trampoline code consumes. * * Processor is already in long mode when we're called, paging is enabled and * boot loader loads things such that: * - kernel mapped at KERNBASE, aligned to 2MB, below 4GB, contiguous memory + * - %cr3 tells us our PA later in boot, so we install it before jumping + * to the kernel. * - there is a 2M hole at KERNBASE (KERNSTART = KERNBASE + 2M) * - kernel is mapped with 2M superpages * - The kernel, modules and metadata is in first 4GB which is unity mapped * - There's additional memory after loader provided data for early allocations * - * Unlike EFI, we don't support copying the staging area. We tell Linux to land - * the kernel in its final location with the needed alignment, etc. We copy the - * trampoline code to 1MB offset above KERNBASE since that memory is otherwise - * free and safely above the lower 1MB swamp we inherited from IBM PC, though - * this code makes no assumptions about where that might. + * Unlike coming directly from loader.efi, we don't support copying the staging + * area. 
We tell Linux to land the kernel in its final location with the needed + * alignment, etc. We copy the trampoline code to 1MB offset above KERNBASE + * since that memory is otherwise free and safely above the lower 1MB swamp we + * inherited from IBM PC, though this code makes no assumptions about where that + * might be. * - * Thus, the trampoline just needs to set %rsp to that stack pop the %cr3 value, - * set it and then retq to jump to the kernel with its stack args filled in. - * Since the handoff to this code used to be from 32-bit code, it uses the i386 - * calling conventions which put the arguments on the stack. The kernel's btext - * routine expects this setup. + * Thus, the trampoline just needs to set %rsp to that stack, pop the memory + * map copy args, pop the %cr3 value, set it and then retq to jump to the kernel + * with its stack args filled in. Since the handoff to this code used to be + * from 32-bit code, it uses the i386 calling conventions which put the + * arguments on the stack. The kernel's btext routine expects this setup. */ + .text .globl tramp tramp: cli /* Make sure we don't get interrupted. */ - leaq tramp_pt4(%rip), %rsp /* Setup our pre-filled-in stack */ + cld /* Copy in a sane direction */ + leaq stack_start(%rip), %rsp /* Setup our pre-filled-in stack */ + + /* + * If we have an EFI memory map, copy it over. These data are always + * on the stack, so we pop them all off before testing to skip the copy. + */ + popq %rsi /* memmap_src */ + popq %rdi /* memmap_dst */ + popq %rcx /* memmap_len */ + testq %rsi, %rsi + je no_map_copy + rep movsb /* Make the copy */ + +no_map_copy: popq %rax /* Pop off the PT4 ptr for %cr3 */ movq %rax, %cr3 /* set the page table */ retq /* Return addr and args already on stack */ @@ -84,6 +107,13 @@ */ .p2align 3 /* Stack has to be 8 byte aligned */ trampoline_data: +stack_start: /* %rsp at start. 
*/ +tramp_memmap_src: + .quad 0 /* SRC PA (data from Linux) */ +tramp_memmap_dst: + .quad 0 /* DST PA (data to FreeBSD's metadata) */ +tramp_memmap_len: + .quad 0 /* Length */ tramp_pt4: .quad 0 /* New %cr3 value */ tramp_entry: .quad 0 /* Entry to kernel (btext) */ /* %rsp points here on entry to amd64 kernel's btext */ diff --git a/stand/kboot/kboot/arch/amd64/elf64_freebsd.c b/stand/kboot/kboot/arch/amd64/elf64_freebsd.c --- a/stand/kboot/kboot/arch/amd64/elf64_freebsd.c +++ b/stand/kboot/kboot/arch/amd64/elf64_freebsd.c @@ -44,6 +44,7 @@ #include "bootstrap.h" #include "kboot.h" +#include "efi.h" #include "platform/acfreebsd.h" #include "acconfig.h" @@ -97,13 +98,19 @@ #ifndef EFI /* - * We create the stack that we want. We have the address of the page tables - * we make on top (so we pop that off and set %cr3). We have the entry point - * to the kernel (which retq pops off) This leaves the stack that the btext - * wants: offset 4 is modulep and offset8 is kernend, with the filler bytes - * to keep this aligned. This makes the trampoline very simple. + * We create the stack that we want. We store any memory map table that we have + * to copy (the metadata has already been filled in). We pop these args off and + * copy if need be. Then, we have the address of the page tables we make on top + * (so we pop that off and set %cr3). We have the entry point to the kernel + * (which retq pops off) This leaves the stack that the btext wants: offset 4 is + * modulep and offset 8 is kernend, with the filler bytes to keep this + * aligned. This also makes the trampoline very simple: pop some args, maybe + * copy, pop the page table and then return into btext as defined in the kernel. 
*/ struct trampoline_data { + uint64_t memmap_src; // Linux-provided memory map PA + uint64_t memmap_dst; // Module data copy PA + uint64_t memmap_len; // Length to copy uint64_t pt4; // Page table address to pop uint64_t entry; // return address to jump to kernel uint32_t fill1; // 0 @@ -111,7 +118,7 @@ uint32_t kernend; // 8 kernel end uint32_t fill2; // 12 }; -_Static_assert(sizeof(struct trampoline_data) == 32, "Bad size for trampoline data"); +_Static_assert(sizeof(struct trampoline_data) == 56, "Bad size for trampoline data"); #endif static pml4_entry_t *PT4; @@ -420,8 +427,24 @@ PT4, ehdr->e_entry); #else trampoline_data = (void *)trampoline + tramp_data_offset; - trampoline_data->entry = ehdr->e_entry; - trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE; + trampoline_data->entry = ehdr->e_entry; /* VA since we start MMU with KERNBASE, etc */ + if (efi_map_phys_src != 0) { + md = file_findmetadata(fp, MODINFOMD_EFI_MAP); + if (md == NULL || md->md_addr == 0) { + printf("Need to copy EFI MAP, but EFI MAP not found. %p\n", md); + } else { + printf("Metadata EFI map loaded at VA %lx\n", md->md_addr); + efi_map_phys_dst = md->md_addr + staging + /* md_addr is staging relative */ + roundup2(sizeof(struct efi_map_header), 16); /* Skip header */ + trampoline_data->memmap_src = efi_map_phys_src; + trampoline_data->memmap_dst = efi_map_phys_dst; + trampoline_data->memmap_len = efi_map_size - roundup2(sizeof(struct efi_map_header), 16); + printf("Copying UEFI Memory Map data from %#lx to %#lx %ld bytes\n", + trampoline_data->memmap_src, + trampoline_data->memmap_dst, + trampoline_data->memmap_len); + } + } /* * So we compute the VA of the module data by modulep + KERNBASE.... * need to make sure that that address is mapped right. We calculate @@ -429,6 +452,7 @@ * calculated with a phyaddr of "kernend + PA(PT_u0[1])"), so we better * make sure we're not overwriting the last 2MB of the kernel :). 
*/ + trampoline_data->pt4 = trampolinebase + LOADER_PAGE_SIZE; trampoline_data->modulep = modulep; /* Offset from KERNBASE */ trampoline_data->kernend = kernend; /* Offset from the load address */ trampoline_data->fill1 = trampoline_data->fill2 = 0; diff --git a/stand/kboot/kboot/arch/amd64/load_addr.c b/stand/kboot/kboot/arch/amd64/load_addr.c --- a/stand/kboot/kboot/arch/amd64/load_addr.c +++ b/stand/kboot/kboot/arch/amd64/load_addr.c @@ -33,12 +33,78 @@ #include "kboot.h" #include "bootstrap.h" +/* + * Abbreviated x86 Linux struct boot_params for the so-called zero-page. + * We have to use this to get systab and memmap since neither of those + * are exposed in a sane way. We only define what we need and pad for + * everything else to minimize cross-coupling. + * + * Transcribed in FreeBSD-ese from Linux's asm/bootparam.h for x86 as of + * 6.15, but these details haven't changed in a long time. + */ + +struct linux_efi_info { + uint32_t efi_loader_signature; /* 0x00 */ + uint32_t efi_systab; /* 0x04 */ + uint32_t efi_memdesc_size; /* 0x08 */ + uint32_t efi_memdesc_version; /* 0x0c */ + uint32_t efi_memmap; /* 0x10 */ + uint32_t efi_memmap_size; /* 0x14 */ + uint32_t efi_systab_hi; /* 0x18 */ + uint32_t efi_memmap_hi; /* 0x1c */ +} __packed; + +struct linux_boot_params { + uint8_t _pad1[0x1c0]; /* 0x000 */ + struct linux_efi_info efi_info; /* 0x1c0 */ + uint8_t _pad2[0x1000 - 0x1c0 - sizeof(struct linux_efi_info)]; /* 0x1e0 */ +} __packed; /* Total size 4k, the page size on x86 */ + bool enumerate_memory_arch(void) { - efi_read_from_sysfs(); - if (!populate_avail_from_iomem()) + struct linux_boot_params bp; + + /* + * Sadly, there's no properly exported data for the EFI memory map nor + * the system table. systab is passed in from the original boot loader. + * memmap is obtained from boot time services (which are long gone) and + * then modified and passed to SetVirtualAddressMap. 
Even though the + * latter is in runtime services, it can only be called once and Linux + * has already called it. So unless we can dig all this out from the + * Linux kernel, there's no other way to get it. A proper way would be to + * publish these in /sys/firmware/efi, but that's not done yet. We can + * only get the runtime subset and can't get systbl at all from today's + * (6.15) Linux kernel. Linux's pandora boot loader will copy this same + * information when it calls the new kernel, but since we don't use the + * bzImage kexec vector, we have to harvest it here. + */ + if (data_from_kernel("boot_params", &bp, sizeof(bp))) { + uint64_t systbl, memmap; + + systbl = (uint64_t)bp.efi_info.efi_systab_hi << 32 | + bp.efi_info.efi_systab; + memmap = (uint64_t)bp.efi_info.efi_memmap_hi << 32 | + bp.efi_info.efi_memmap; + + efi_set_systbl(systbl); + efi_read_from_pa(memmap, bp.efi_info.efi_memmap_size, + bp.efi_info.efi_memdesc_size, bp.efi_info.efi_memdesc_version); + printf("UEFI SYSTAB PA: %#lx\n", systbl); + printf("UEFI MMAP: Ver %d Ent Size %d Tot Size %d PA %#lx\n", + bp.efi_info.efi_memdesc_version, bp.efi_info.efi_memdesc_size, + bp.efi_info.efi_memmap_size, memmap); + } + /* + * So, we can't use the EFI map for this, so we have to fall back to + * the proc iomem stuff to at least get started... + */ + if (!populate_avail_from_iomem()) { + printf("Populate from avail also failed.\n"); return (false); + } else { + printf("Populate worked...\n"); + } print_avail(); return (true); }