D31121: amd64 UEFI boot: stop copying staging area to 2M phys
D31121.id92658.diff (42 KB)

diff --git a/TODO b/TODO
new file mode 100644
--- /dev/null
+++ b/TODO
@@ -0,0 +1,5 @@
+- Xen
+
+checks:
+non-EFI boot
+non-amd64 EFI
diff --git a/stand/common/bootstrap.h b/stand/common/bootstrap.h
--- a/stand/common/bootstrap.h
+++ b/stand/common/bootstrap.h
@@ -228,6 +228,9 @@
size_t f_size; /* file size */
struct kernel_module *f_modules; /* list of modules if any */
struct preloaded_file *f_next; /* next file */
+#ifdef __amd64__
+ bool f_kernphys_relocatable;
+#endif
};
struct file_format
diff --git a/stand/common/load_elf.c b/stand/common/load_elf.c
--- a/stand/common/load_elf.c
+++ b/stand/common/load_elf.c
@@ -207,6 +207,18 @@
#undef CONVERT_SWITCH
#undef CONVERT_FIELD
+
+#ifdef __amd64__
+static bool
+is_kernphys_relocatable(elf_file_t ef)
+{
+ Elf_Sym sym;
+
+ return (__elfN(lookup_symbol)(ef, "kernphys", &sym, STT_OBJECT) == 0 &&
+ sym.st_size == 8);
+}
+#endif
+
static int
__elfN(load_elf_header)(char *filename, elf_file_t ef)
{
@@ -434,6 +446,9 @@
/* Load OK, return module pointer */
*result = (struct preloaded_file *)fp;
err = 0;
+#ifdef __amd64__
+ fp->f_kernphys_relocatable = is_kernphys_relocatable(&ef);
+#endif
goto out;
ioerr:
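
The check added in load_elf.c keys off the shape of the kernel's "kernphys" symbol: a kernel built with the updated amd64 linker script exports kernphys as an 8-byte data object, while an older kernel defines it as a size-zero linker-script absolute, so only the former can be started in place without the copy to 2M. As a rough stand-alone illustration of the same test, here is a sketch written against libelf instead of the loader's __elfN(lookup_symbol) helper; the function name and the use of libelf are assumptions made for the example, not part of this patch.

#include <sys/types.h>
#include <fcntl.h>
#include <gelf.h>
#include <libelf.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

/*
 * Return true if "kernphys" in the kernel image at path is an 8-byte
 * STT_OBJECT, i.e. the kernel carries kernphys as a real variable and
 * can be started from wherever it was staged, with no copy to 2M.
 */
static bool
kernphys_is_relocatable(const char *path)
{
	Elf *e;
	Elf_Scn *scn;
	Elf_Data *data;
	GElf_Shdr shdr;
	GElf_Sym sym;
	const char *name;
	int fd, i, nsyms;
	bool res;

	res = false;
	if (elf_version(EV_CURRENT) == EV_NONE)
		return (false);
	if ((fd = open(path, O_RDONLY)) == -1)
		return (false);
	if ((e = elf_begin(fd, ELF_C_READ, NULL)) == NULL)
		goto out;
	for (scn = NULL; (scn = elf_nextscn(e, scn)) != NULL;) {
		if (gelf_getshdr(scn, &shdr) == NULL ||
		    (shdr.sh_type != SHT_SYMTAB &&
		    shdr.sh_type != SHT_DYNSYM))
			continue;
		if ((data = elf_getdata(scn, NULL)) == NULL)
			continue;
		nsyms = shdr.sh_size / shdr.sh_entsize;
		for (i = 0; i < nsyms; i++) {
			if (gelf_getsym(data, i, &sym) == NULL)
				continue;
			name = elf_strptr(e, shdr.sh_link, sym.st_name);
			if (name == NULL || strcmp(name, "kernphys") != 0)
				continue;
			/* The same condition the loader tests. */
			res = GELF_ST_TYPE(sym.st_info) == STT_OBJECT &&
			    sym.st_size == 8;
			goto done;
		}
	}
done:
	elf_end(e);
out:
	close(fd);
	return (res);
}

Calling kernphys_is_relocatable("/boot/kernel/kernel") on a kernel built after this change should report true, and false for an older kernel.
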
diff --git a/stand/efi/loader/arch/amd64/elf64_freebsd.c b/stand/efi/loader/arch/amd64/elf64_freebsd.c
--- a/stand/efi/loader/arch/amd64/elf64_freebsd.c
+++ b/stand/efi/loader/arch/amd64/elf64_freebsd.c
@@ -82,7 +82,11 @@
static pml4_entry_t *PT4;
static pdp_entry_t *PT3;
+static pdp_entry_t *PT3_l, *PT3_u;
static pd_entry_t *PT2;
+static pd_entry_t *PT2_l0, *PT2_l1, *PT2_l2, *PT2_l3, *PT2_u0, *PT2_u1;
+
+extern EFI_PHYSICAL_ADDRESS staging;
static void (*trampoline)(uint64_t stack, void *copy_finish, uint64_t kernend,
uint64_t modulep, pml4_entry_t *pagetable, uint64_t entry);
@@ -105,6 +109,12 @@
ACPI_TABLE_RSDP *rsdp;
char buf[24];
int revision;
+ bool copy_auto;
+
+ copy_auto = copy_staging == COPY_STAGING_AUTO;
+ if (copy_auto)
+ copy_staging = fp->f_kernphys_relocatable ?
+ COPY_STAGING_DISABLE : COPY_STAGING_ENABLE;
/*
* Report the RSDP to the kernel. While this can be found with
@@ -151,57 +161,133 @@
}
if ((md = file_findmetadata(fp, MODINFOMD_ELFHDR)) == NULL)
- return(EFTYPE);
+ return (EFTYPE);
ehdr = (Elf_Ehdr *)&(md->md_data);
- trampcode = (vm_offset_t)0x0000000040000000;
+ trampcode = copy_staging == COPY_STAGING_ENABLE ?
+ (vm_offset_t)0x0000000040000000 /* 1G */ :
+ (vm_offset_t)0x0000000100000000; /* 4G */
err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 1,
(EFI_PHYSICAL_ADDRESS *)&trampcode);
+ if (EFI_ERROR(err)) {
+ printf("Unable to allocate trampoline\n");
+ if (copy_auto)
+ copy_staging = COPY_STAGING_AUTO;
+ return (ENOMEM);
+ }
bzero((void *)trampcode, EFI_PAGE_SIZE);
trampstack = trampcode + EFI_PAGE_SIZE - 8;
bcopy((void *)&amd64_tramp, (void *)trampcode, amd64_tramp_size);
trampoline = (void *)trampcode;
- PT4 = (pml4_entry_t *)0x0000000040000000;
- err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
- (EFI_PHYSICAL_ADDRESS *)&PT4);
- bzero(PT4, 3 * EFI_PAGE_SIZE);
+ if (copy_staging == COPY_STAGING_ENABLE) {
+ PT4 = (pml4_entry_t *)0x0000000040000000;
+ err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 3,
+ (EFI_PHYSICAL_ADDRESS *)&PT4);
+ if (EFI_ERROR(err)) {
+ printf("Unable to allocate trampoline page table\n");
+ BS->FreePages(trampcode, 1);
+ if (copy_auto)
+ copy_staging = COPY_STAGING_AUTO;
+ return (ENOMEM);
+ }
+ bzero(PT4, 3 * EFI_PAGE_SIZE);
+ PT3 = &PT4[512];
+ PT2 = &PT3[512];
+
+ /*
+ * This is kinda brutal, but every single 1GB VM
+ * memory segment points to the same first 1GB of
+ * physical memory. But it is more than adequate.
+ */
+ for (i = 0; i < NPTEPG; i++) {
+ /*
+ * Each slot of the L4 pages points to the
+ * same L3 page.
+ */
+ PT4[i] = (pml4_entry_t)PT3;
+ PT4[i] |= PG_V | PG_RW;
+
+ /*
+ * Each slot of the L3 pages points to the
+ * same L2 page.
+ */
+ PT3[i] = (pdp_entry_t)PT2;
+ PT3[i] |= PG_V | PG_RW;
+
+ /*
+ * The L2 page slots are mapped with 2MB pages for 1GB.
+ */
+ PT2[i] = (pd_entry_t)i * (2 * 1024 * 1024);
+ PT2[i] |= PG_V | PG_RW | PG_PS;
+ }
+ } else {
+ PT4 = (pml4_entry_t *)0x0000000100000000; /* 4G */
+ err = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData, 9,
+ (EFI_PHYSICAL_ADDRESS *)&PT4);
+ if (EFI_ERROR(err)) {
+ printf("Unable to allocate trampoline page table\n");
+ BS->FreePages(trampcode, 9);
+ if (copy_auto)
+ copy_staging = COPY_STAGING_AUTO;
+ return (ENOMEM);
+ }
- PT3 = &PT4[512];
- PT2 = &PT3[512];
+ bzero(PT4, 9 * EFI_PAGE_SIZE);
+
+ PT3_l = &PT4[NPML4EPG * 1];
+ PT3_u = &PT4[NPML4EPG * 2];
+ PT2_l0 = &PT4[NPML4EPG * 3];
+ PT2_l1 = &PT4[NPML4EPG * 4];
+ PT2_l2 = &PT4[NPML4EPG * 5];
+ PT2_l3 = &PT4[NPML4EPG * 6];
+ PT2_u0 = &PT4[NPML4EPG * 7];
+ PT2_u1 = &PT4[NPML4EPG * 8];
+
+ /* 1:1 mapping of lower 4G */
+ PT4[0] = (pml4_entry_t)PT3_l | PG_V | PG_RW;
+ PT3_l[0] = (pdp_entry_t)PT2_l0 | PG_V | PG_RW;
+ PT3_l[1] = (pdp_entry_t)PT2_l1 | PG_V | PG_RW;
+ PT3_l[2] = (pdp_entry_t)PT2_l2 | PG_V | PG_RW;
+ PT3_l[3] = (pdp_entry_t)PT2_l3 | PG_V | PG_RW;
+ for (i = 0; i < 4 * NPDEPG; i++) {
+ PT2_l0[i] = ((pd_entry_t)i << PDRSHIFT) | PG_V |
+ PG_RW | PG_PS;
+ }
- /*
- * This is kinda brutal, but every single 1GB VM memory segment points
- * to the same first 1GB of physical memory. But it is more than
- * adequate.
- */
- for (i = 0; i < 512; i++) {
- /* Each slot of the L4 pages points to the same L3 page. */
- PT4[i] = (pml4_entry_t)PT3;
- PT4[i] |= PG_V | PG_RW;
-
- /* Each slot of the L3 pages points to the same L2 page. */
- PT3[i] = (pdp_entry_t)PT2;
- PT3[i] |= PG_V | PG_RW;
-
- /* The L2 page slots are mapped with 2MB pages for 1GB. */
- PT2[i] = i * (2 * 1024 * 1024);
- PT2[i] |= PG_V | PG_RW | PG_PS;
+ /* mapping of kernel 2G below top */
+ PT4[NPML4EPG - 1] = (pml4_entry_t)PT3_u | PG_V | PG_RW;
+ PT3_u[NPDPEPG - 2] = (pdp_entry_t)PT2_u0 | PG_V | PG_RW;
+ PT3_u[NPDPEPG - 1] = (pdp_entry_t)PT2_u1 | PG_V | PG_RW;
+ /* compat mapping of phys @0 */
+ PT2_u0[0] = PG_PS | PG_V | PG_RW;
+ /* this maps past staging area */
+ for (i = 1; i < 2 * NPDEPG; i++) {
+ PT2_u0[i] = ((pd_entry_t)staging +
+ ((pd_entry_t)i - 1) * NBPDR) |
+ PG_V | PG_RW | PG_PS;
+ }
}
+ printf("staging %#lx (%scoping) tramp %p PT4 %p\n",
+ staging, copy_staging == COPY_STAGING_ENABLE ? "" : "not ",
+ trampoline, PT4);
printf("Start @ 0x%lx ...\n", ehdr->e_entry);
efi_time_fini();
err = bi_load(fp->f_args, &modulep, &kernend, true);
if (err != 0) {
efi_time_init();
- return(err);
+ if (copy_auto)
+ copy_staging = COPY_STAGING_AUTO;
+ return (err);
}
dev_cleanup();
- trampoline(trampstack, efi_copy_finish, kernend, modulep, PT4,
- ehdr->e_entry);
+ trampoline(trampstack, copy_staging == COPY_STAGING_ENABLE ?
+ efi_copy_finish : efi_copy_finish_nop, kernend, modulep,
+ PT4, ehdr->e_entry);
panic("exec returned");
}
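
In the non-copying branch above, the loader builds two trees under a single PML4 page: PT3_l with PT2_l0..PT2_l3 give a 1:1 mapping of the low 4G so the trampoline, staging area and UEFI data stay reachable, while PT3_u with PT2_u0/PT2_u1 map the staged kernel into the top 2G where it is linked. The slot numbers follow from the paging-structure indices of KERNBASE; the stand-alone sketch below (not loader code, constants restated locally) shows the arithmetic.

#include <stdint.h>
#include <stdio.h>

#define NPTEPG		512
#define PDRSHIFT	21	/* 2M pages at the PD level */
#define PDPSHIFT	30
#define PML4SHIFT	39

static unsigned int
ptidx(uint64_t va, int shift)
{
	return ((va >> shift) & (NPTEPG - 1));
}

int
main(void)
{
	uint64_t kernbase = 0xffffffff80000000ULL;	/* amd64 KERNBASE */

	/*
	 * Prints 511, 510, 0: hence PT4[NPML4EPG - 1] points to PT3_u,
	 * and PT3_u[NPDPEPG - 2]/[NPDPEPG - 1] cover the top 2G where
	 * the kernel is linked.
	 */
	printf("PML4 %u PDP %u PD %u\n", ptidx(kernbase, PML4SHIFT),
	    ptidx(kernbase, PDPSHIFT), ptidx(kernbase, PDRSHIFT));
	return (0);
}

PT2_u0[0] is left pointing at physical 0, the "compat mapping of phys @0" noted in the code, which corresponds to the reserved 2M hole below KERNSTART that hammer_time() later compensates for.
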
diff --git a/stand/efi/loader/bootinfo.c b/stand/efi/loader/bootinfo.c
--- a/stand/efi/loader/bootinfo.c
+++ b/stand/efi/loader/bootinfo.c
@@ -65,6 +65,8 @@
extern EFI_SYSTEM_TABLE *ST;
+int boot_services_gone;
+
static int
bi_getboothowto(char *kargs)
{
@@ -396,8 +398,10 @@
if (!exit_bs)
break;
status = BS->ExitBootServices(IH, efi_mapkey);
- if (!EFI_ERROR(status))
+ if (!EFI_ERROR(status)) {
+ boot_services_gone = 1;
break;
+ }
}
if (retry == 0) {
diff --git a/stand/efi/loader/copy.c b/stand/efi/loader/copy.c
--- a/stand/efi/loader/copy.c
+++ b/stand/efi/loader/copy.c
@@ -39,6 +39,11 @@
#include "loader_efi.h"
+#define M(x) ((x) * 1024 * 1024)
+#define G(x) (1UL * (x) * 1024 * 1024 * 1024)
+
+extern int boot_services_gone;
+
#if defined(__i386__) || defined(__amd64__)
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
@@ -175,24 +180,142 @@
#ifndef EFI_STAGING_SIZE
#if defined(__arm__)
-#define EFI_STAGING_SIZE 32
+#define EFI_STAGING_SIZE M(32)
+#else
+#define EFI_STAGING_SIZE M(64)
+#endif
+#endif
+
+#if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
+ defined(__riscv)
+#define EFI_STAGING_2M_ALIGN 1
#else
-#define EFI_STAGING_SIZE 64
+#define EFI_STAGING_2M_ALIGN 0
#endif
+
+#if defined(__amd64__)
+#define EFI_STAGING_SLOP M(8)
+#else
+#define EFI_STAGING_SLOP 0
#endif
+static u_long staging_slop = EFI_STAGING_SLOP;
+
EFI_PHYSICAL_ADDRESS staging, staging_end, staging_base;
int stage_offset_set = 0;
ssize_t stage_offset;
+static void
+efi_copy_free(void)
+{
+ BS->FreePages(staging_base, (staging_end - staging_base) /
+ EFI_PAGE_SIZE);
+ stage_offset_set = 0;
+ stage_offset = 0;
+}
+
+#ifdef __amd64__
+int copy_staging = COPY_STAGING_AUTO;
+
+static int
+command_copy_staging(int argc, char *argv[])
+{
+ static const char *const mode[3] = {
+ [COPY_STAGING_ENABLE] = "enable",
+ [COPY_STAGING_DISABLE] = "disable",
+ [COPY_STAGING_AUTO] = "auto",
+ };
+ int prev, res;
+
+ res = CMD_OK;
+ if (argc > 2) {
+ res = CMD_ERROR;
+ } else if (argc == 2) {
+ prev = copy_staging;
+ if (strcmp(argv[1], "enable") == 0)
+ copy_staging = COPY_STAGING_ENABLE;
+ else if (strcmp(argv[1], "disable") == 0)
+ copy_staging = COPY_STAGING_DISABLE;
+ else if (strcmp(argv[1], "auto") == 0)
+ copy_staging = COPY_STAGING_AUTO;
+ else {
+ printf("usage: copy_staging enable|disable|auto\n");
+ res = CMD_ERROR;
+ }
+ if (res == CMD_OK && prev != copy_staging) {
+ printf("changed copy_staging, unloading kernel\n");
+ unload();
+ efi_copy_free();
+ efi_copy_init();
+ }
+ } else {
+ printf("copy staging: %s\n", mode[copy_staging]);
+ }
+ return (res);
+}
+COMMAND_SET(copy_staging, "copy_staging", "copy staging", command_copy_staging);
+#endif
+
+static int
+command_staging_slop(int argc, char *argv[])
+{
+ char *endp;
+ u_long new, prev;
+ int res;
+
+ res = CMD_OK;
+ if (argc > 2) {
+ res = CMD_ERROR;
+ } else if (argc == 2) {
+ new = strtoul(argv[1], &endp, 0);
+ if (*endp != '\0') {
+ printf("invalid slop value\n");
+ res = CMD_ERROR;
+ }
+ if (res == CMD_OK && staging_slop != new) {
+ printf("changed slop, unloading kernel\n");
+ unload();
+ efi_copy_free();
+ efi_copy_init();
+ }
+ } else {
+ printf("staging slop %#lx\n", staging_slop);
+ }
+ return (res);
+}
+COMMAND_SET(staging_slop, "staging_slop", "set staging slop",
+ command_staging_slop);
+
+#if defined(__i386__) || defined(__amd64__)
+/*
+ * The staging area must reside in the first 1GB or 4GB physical
+ * memory: see elf64_exec() in
+ * boot/efi/loader/arch/amd64/elf64_freebsd.c.
+ */
+static EFI_PHYSICAL_ADDRESS
+get_staging_max(void)
+{
+ EFI_PHYSICAL_ADDRESS res;
+
+#if defined(__i386__)
+ res = G(1);
+#elif defined(__amd64__)
+ res = copy_staging == COPY_STAGING_ENABLE ? G(1) : G(4);
+#endif
+ return (res);
+}
+#define EFI_ALLOC_METHOD AllocateMaxAddress
+#else
+#define EFI_ALLOC_METHOD AllocateAnyPages
+#endif
+
int
efi_copy_init(void)
{
EFI_STATUS status;
-
unsigned long nr_pages;
- nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE) * 1024 * 1024);
+ nr_pages = EFI_SIZE_TO_PAGES((EFI_STAGING_SIZE));
#if defined(__i386__) || defined(__amd64__)
/*
@@ -203,18 +326,10 @@
if (running_on_hyperv())
efi_verify_staging_size(&nr_pages);
- /*
- * The staging area must reside in the the first 1GB physical
- * memory: see elf64_exec() in
- * boot/efi/loader/arch/amd64/elf64_freebsd.c.
- */
- staging = 1024*1024*1024;
- status = BS->AllocatePages(AllocateMaxAddress, EfiLoaderData,
- nr_pages, &staging);
-#else
- status = BS->AllocatePages(AllocateAnyPages, EfiLoaderData,
- nr_pages, &staging);
+ staging = get_staging_max();
#endif
+ status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData,
+ nr_pages, &staging);
if (EFI_ERROR(status)) {
printf("failed to allocate staging area: %lu\n",
EFI_ERROR_CODE(status));
@@ -223,7 +338,7 @@
staging_base = staging;
staging_end = staging + nr_pages * EFI_PAGE_SIZE;
-#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+#if EFI_STAGING_2M_ALIGN
/*
* Round the kernel load address to a 2MiB value. This is needed
* because the kernel builds a page table based on where it has
@@ -231,7 +346,7 @@
* either a 1MiB or 2MiB page for this we need to make sure it
* is correctly aligned for both cases.
*/
- staging = roundup2(staging, 2 * 1024 * 1024);
+ staging = roundup2(staging, M(2));
#endif
return (0);
@@ -240,20 +355,42 @@
static bool
efi_check_space(vm_offset_t end)
{
- EFI_PHYSICAL_ADDRESS addr;
+ EFI_PHYSICAL_ADDRESS addr, new_base, new_staging;
EFI_STATUS status;
unsigned long nr_pages;
+ end = roundup2(end, EFI_PAGE_SIZE);
+
/* There is already enough space */
- if (end <= staging_end)
+ if (end + staging_slop <= staging_end)
return (true);
- end = roundup2(end, EFI_PAGE_SIZE);
- nr_pages = EFI_SIZE_TO_PAGES(end - staging_end);
+ if (boot_services_gone) {
+ if (end <= staging_end)
+ return (true);
+ panic("efi_check_space: cannot expand staging area "
+ "after boot services were exited\n");
+ }
+
+ /*
+ * Add slop at the end:
+ * 1. The amd64 kernel expects to do some very early allocations
+ * by carving out memory after kernend. The slop guarantees
+ * that it does not overwrite anything useful.
+ * 2. The initial calculation of the staging size can come out
+ * somewhat smaller than what actually needs to be copied in
+ * after boot services are exited. The slop avoids calling
+ * BS->AllocatePages() when it cannot work.
+ */
+ end += staging_slop;
+ nr_pages = EFI_SIZE_TO_PAGES(end - staging_end);
#if defined(__i386__) || defined(__amd64__)
- /* X86 needs all memory to be allocated under the 1G boundary */
- if (end > 1024*1024*1024)
+ /*
+ * i386 needs all memory to be allocated under the 1G boundary.
+ * amd64 needs all memory to be allocated under the 1G or 4G boundary.
+ */
+ if (end > get_staging_max())
goto before_staging;
#endif
@@ -268,14 +405,12 @@
before_staging:
/* Try allocating space before the previous allocation */
- if (staging < nr_pages * EFI_PAGE_SIZE) {
- printf("Not enough space before allocation\n");
- return (false);
- }
+ if (staging < nr_pages * EFI_PAGE_SIZE)
+ goto expand;
addr = staging - nr_pages * EFI_PAGE_SIZE;
-#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
+#if EFI_STAGING_2M_ALIGN
/* See efi_copy_init for why this is needed */
- addr = rounddown2(addr, 2 * 1024 * 1024);
+ addr = rounddown2(addr, M(2));
#endif
nr_pages = EFI_SIZE_TO_PAGES(staging_base - addr);
status = BS->AllocatePages(AllocateAddress, EfiLoaderData, nr_pages,
@@ -288,11 +423,42 @@
staging_base = addr;
memmove((void *)(uintptr_t)staging_base,
(void *)(uintptr_t)staging, staging_end - staging);
- stage_offset -= (staging - staging_base);
+ stage_offset -= staging - staging_base;
staging = staging_base;
return (true);
}
+expand:
+ nr_pages = EFI_SIZE_TO_PAGES(end - (vm_offset_t)staging);
+#if EFI_STAGING_2M_ALIGN
+ nr_pages += M(2) / EFI_PAGE_SIZE;
+#endif
+#if defined(__i386__) || defined(__amd64__)
+ new_base = get_staging_max();
+#endif
+ status = BS->AllocatePages(EFI_ALLOC_METHOD, EfiLoaderData,
+ nr_pages, &new_base);
+ if (!EFI_ERROR(status)) {
+#if EFI_STAGING_2M_ALIGN
+ new_staging = roundup2(new_base, M(2));
+#else
+ new_staging = new_base;
+#endif
+ /*
+ * Move the old allocation and update the state so
+ * translation still works.
+ */
+ memcpy((void *)(uintptr_t)new_staging,
+ (void *)(uintptr_t)staging, staging_end - staging);
+ BS->FreePages(staging_base, (staging_end - staging_base) /
+ EFI_PAGE_SIZE);
+ stage_offset -= staging - new_staging;
+ staging = new_staging;
+ staging_end = new_base + nr_pages * EFI_PAGE_SIZE;
+ staging_base = new_base;
+ return (true);
+ }
+
printf("efi_check_space: Unable to expand staging area\n");
return (false);
}
@@ -335,7 +501,6 @@
return (len);
}
-
ssize_t
efi_readin(readin_handle_t fd, vm_offset_t dest, const size_t len)
{
@@ -364,3 +529,8 @@
while (src < last)
*dst++ = *src++;
}
+
+void
+efi_copy_finish_nop(void)
+{
+}
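
efi_check_space() can now grow the staging area three ways: take more pages right after it, take pages in front of it, or allocate a fresh block and move everything. In every case the invariant is that a kernel physical destination address plus stage_offset lands on the staged copy of those bytes, which is why a move only has to adjust stage_offset by the distance the buffer travelled. A minimal userland model of that bookkeeping, with hypothetical addresses and static buffers standing in for EFI allocations (not loader code), is sketched below.

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main(void)
{
	static char old_buf[4096], new_buf[4096];
	uintptr_t staging, new_staging, dest;
	intptr_t stage_offset;

	/*
	 * efi_copyin() sets stage_offset on first use so that
	 * dest + stage_offset points into the staging buffer.
	 */
	staging = (uintptr_t)old_buf;
	dest = 0x200000;		/* hypothetical kernel phys address */
	stage_offset = (intptr_t)(staging - dest);
	memcpy((char *)(dest + stage_offset), "kernel", 6);

	/* Relocate the staging area, as the expand path may do. */
	new_staging = (uintptr_t)new_buf;
	memcpy(new_buf, old_buf, sizeof(old_buf));
	stage_offset -= (intptr_t)(staging - new_staging);
	staging = new_staging;

	/* The same destination still resolves to the same bytes. */
	assert(memcmp((char *)(dest + stage_offset), "kernel", 6) == 0);
	return (0);
}
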
diff --git a/stand/efi/loader/loader_efi.h b/stand/efi/loader/loader_efi.h
--- a/stand/efi/loader/loader_efi.h
+++ b/stand/efi/loader/loader_efi.h
@@ -34,6 +34,15 @@
#include <stand.h>
#include <readin.h>
+#ifdef __amd64__
+enum {
+ COPY_STAGING_ENABLE,
+ COPY_STAGING_DISABLE,
+ COPY_STAGING_AUTO,
+};
+extern int copy_staging;
+#endif
+
int efi_autoload(void);
int efi_copy_init(void);
@@ -44,5 +53,6 @@
void * efi_translate(vm_offset_t ptr);
void efi_copy_finish(void);
+void efi_copy_finish_nop(void);
#endif /* _LOADER_EFI_COPY_H_ */
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -222,6 +222,8 @@
void (*vmm_resume_p)(void);
+bool efi_boot;
+
static void
cpu_startup(dummy)
void *dummy;
@@ -1277,7 +1279,7 @@
* in real mode mode (e.g. SMP bare metal).
*/
#ifdef SMP
- mp_bootaddress(physmap, &physmap_idx);
+ alloc_ap_trampoline(physmap, &physmap_idx);
#endif
/* call pmap initialization to make new kernel address space */
@@ -1598,16 +1600,47 @@
int gsel_tss, x;
struct pcpu *pc;
struct xstate_hdr *xhdr;
- u_int64_t rsp0;
+ uint64_t cr3, rsp0;
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ pd_entry_t *pde;
char *env;
struct user_segment_descriptor *gdt;
struct region_descriptor r_gdt;
size_t kstack0_sz;
int late_console;
- bool efi_boot;
TSRAW(&thread0, TS_ENTER, __func__, NULL);
+ /*
+ * Calculate kernphys by inspecting page table created by loader.
+ * The assumptions:
+ * - kernel is mapped at KERNBASE, backed by contiguous phys memory
+ * aligned at 2M, below 4G (the latter is important for AP startup)
+ * - there is a 2M hole at KERNBASE
+ * - kernel is mapped with 2M superpages
+ * - all participating memory, i.e. kernel, modules, metadata,
+ * page table is accessible by pre-created 1:1 mapping
+ * (right now loader creates 1:1 mapping for lower 4G, and all
+ * memory is from there)
+ * - there is a usable memory block right after the end of the
+ * mapped kernel and all modules/metadata, pointed to by
+ * physfree, for early allocations
+ */
+ cr3 = rcr3();
+ pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
+ (vm_offset_t)hammer_time);
+ pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
+ (vm_offset_t)hammer_time);
+ pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
+ (vm_offset_t)hammer_time);
+ kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
+ (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
+
+ /* Fix-up for 2M hole */
+ physfree += kernphys;
+ kernphys += NBPDR;
+
kmdp = init_ops.parse_preload_data(modulep);
efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
@@ -1653,7 +1686,7 @@
/* Init basic tunables, hz etc */
init_param1();
- thread0.td_kstack = physfree + KERNBASE;
+ thread0.td_kstack = physfree - kernphys + KERNSTART;
thread0.td_kstack_pages = kstack_pages;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
bzero((void *)thread0.td_kstack, kstack0_sz);
@@ -1690,7 +1723,7 @@
wrmsr(MSR_GSBASE, (u_int64_t)pc);
wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
- dpcpu_init((void *)(physfree + KERNBASE), 0);
+ dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
physfree += DPCPU_SIZE;
amd64_bsp_pcpu_init1(pc);
/* Non-late cninit() and printf() can be moved up to here. */
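
The kernphys recovery above walks the loader-built page table at the virtual address of hammer_time itself. A small worked model with hypothetical numbers (not kernel code, constants restated locally): the loader maps VA KERNSTART + off to phys staging + off with staging 2M-aligned, so the 2M frame found in the PDE, minus hammer_time's 2M-truncated offset from KERNBASE, yields staging - NBPDR, and adding NBPDR for the reserved 2M hole recovers the staging address.

#include <stdint.h>
#include <stdio.h>

#define NBPDR		(1ULL << 21)
#define PDRMASK		(NBPDR - 1)

int
main(void)
{
	uint64_t kernbase = 0xffffffff80000000ULL;
	uint64_t kernstart = kernbase + NBPDR;
	uint64_t staging = 0x76400000;		/* hypothetical load phys */
	uint64_t off = 0x3a1000;		/* hammer_time - KERNSTART */
	uint64_t va, pde_frame, kernphys;

	va = kernstart + off;
	/* The 2M frame the PDE walk would find for hammer_time. */
	pde_frame = (staging + off) & ~PDRMASK;
	kernphys = pde_frame - ((va - kernbase) & ~PDRMASK);
	kernphys += NBPDR;			/* fix-up for the 2M hole */
	/* Prints the same value twice: 0x76400000. */
	printf("kernphys %#jx (staging %#jx)\n", (uintmax_t)kernphys,
	    (uintmax_t)staging);
	return (0);
}
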
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -105,6 +105,7 @@
static char *dbg_stack;
extern u_int mptramp_la57;
+extern u_int mptramp_nx;
/*
* Local data and functions.
@@ -112,86 +113,6 @@
static int start_ap(int apic_id);
-static bool
-is_kernel_paddr(vm_paddr_t pa)
-{
-
- return (pa >= trunc_2mpage(btext - KERNBASE) &&
- pa < round_page(_end - KERNBASE));
-}
-
-static bool
-is_mpboot_good(vm_paddr_t start, vm_paddr_t end)
-{
-
- return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
-}
-
-/*
- * Calculate usable address in base memory for AP trampoline code.
- */
-void
-mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx)
-{
- vm_paddr_t start, end;
- unsigned int i;
- bool allocated;
-
- alloc_ap_trampoline(physmap, physmap_idx);
-
- /*
- * Find a memory region big enough below the 4GB boundary to
- * store the initial page tables. Region must be mapped by
- * the direct map.
- *
- * Note that it needs to be aligned to a page boundary.
- */
- allocated = false;
- for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
- /*
- * First, try to chomp at the start of the physmap region.
- * Kernel binary might claim it already.
- */
- start = round_page(physmap[i]);
- end = start + AP_BOOTPT_SZ;
- if (start < end && end <= physmap[i + 1] &&
- is_mpboot_good(start, end) &&
- !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
- allocated = true;
- physmap[i] = end;
- break;
- }
-
- /*
- * Second, try to chomp at the end. Again, check
- * against kernel.
- */
- end = trunc_page(physmap[i + 1]);
- start = end - AP_BOOTPT_SZ;
- if (start < end && start >= physmap[i] &&
- is_mpboot_good(start, end) &&
- !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
- allocated = true;
- physmap[i + 1] = start;
- break;
- }
- }
- if (allocated) {
- mptramp_pagetables = start;
- if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
- memmove(&physmap[i], &physmap[i + 2],
- sizeof(*physmap) * (*physmap_idx - i + 2));
- *physmap_idx -= 2;
- }
- } else {
- mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ;
- if (bootverbose)
- printf(
-"Cannot find enough space for the initial AP page tables, placing them at %#x",
- mptramp_pagetables);
- }
-}
-
/*
* Initialize the IPI handlers and start up the AP's.
*/
@@ -243,6 +164,9 @@
assign_cpu_ids();
mptramp_la57 = la57;
+ mptramp_nx = pg_nx != 0;
+ MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
+ mptramp_pagetables = kernel_pmap->pm_cr3;
/* Start each Application Processor */
start_all_aps();
@@ -399,64 +323,79 @@
int
start_all_aps(void)
{
- u_int64_t *pt5, *pt4, *pt3, *pt2;
+ vm_page_t m_pml4, m_pdp, m_pd[4];
+ pml5_entry_t old_pml45;
+ pml4_entry_t *v_pml4;
+ pdp_entry_t *v_pdp;
+ pd_entry_t *v_pd;
u_int32_t mpbioswarmvec;
- int apic_id, cpu, domain, i, xo;
+ int apic_id, cpu, domain, i;
u_char mpbiosreason;
mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
- /* copy the AP 1st level boot code */
- bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
-
- /* Locate the page tables, they'll be below the trampoline */
+ /* Create a transient 1:1 mapping of low 4G */
if (la57) {
- pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
- xo = 1;
+ m_pml4 = pmap_page_alloc_below_4g(true);
+ v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
} else {
- xo = 0;
+ v_pml4 = &kernel_pmap->pm_pmltop[0];
}
- pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
- pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
- pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
-
- /* Create the initial 1GB replicated page tables */
- for (i = 0; i < 512; i++) {
- if (la57) {
- pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
- PAGE_SIZE);
- pt5[i] |= PG_V | PG_RW | PG_U;
- }
-
- /*
- * Each slot of the level 4 pages points to the same
- * level 3 page.
- */
- pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
- (xo + 1) * PAGE_SIZE);
- pt4[i] |= PG_V | PG_RW | PG_U;
-
- /*
- * Each slot of the level 3 pages points to the same
- * level 2 page.
- */
- pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
- ((xo + 2) * PAGE_SIZE));
- pt3[i] |= PG_V | PG_RW | PG_U;
-
- /* The level 2 page slots are mapped with 2MB pages for 1GB. */
- pt2[i] = i * (2 * 1024 * 1024);
- pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+ m_pdp = pmap_page_alloc_below_4g(true);
+ v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+ m_pd[0] = pmap_page_alloc_below_4g(false);
+ v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0]));
+ for (i = 0; i < NPDEPG; i++)
+ v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M | PG_PS;
+ m_pd[1] = pmap_page_alloc_below_4g(false);
+ v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1]));
+ for (i = 0; i < NPDEPG; i++)
+ v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW |
+ X86_PG_A | X86_PG_M | PG_PS;
+ m_pd[2] = pmap_page_alloc_below_4g(false);
+ v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2]));
+ for (i = 0; i < NPDEPG; i++)
+ v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+ m_pd[3] = pmap_page_alloc_below_4g(false);
+ v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3]));
+ for (i = 0; i < NPDEPG; i++)
+ v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+ v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ old_pml45 = kernel_pmap->pm_pmltop[0];
+ if (la57) {
+ kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) |
+ X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
}
+ v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M;
+ pmap_invalidate_all(kernel_pmap);
+
+ /* copy the AP 1st level boot code */
+ bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
+ if (bootverbose)
+ printf("AP boot address %#x\n", boot_address);
/* save the current value of the warm-start vector */
- mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
+ if (!efi_boot)
+ mpbioswarmvec = *((u_int32_t *) WARMBOOT_OFF);
outb(CMOS_REG, BIOS_RESET);
mpbiosreason = inb(CMOS_DATA);
/* setup a vector to our boot code */
- *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET;
- *((volatile u_short *) WARMBOOT_SEG) = (boot_address >> 4);
+ if (!efi_boot) {
+ *((volatile u_short *)WARMBOOT_OFF) = WARMBOOT_TARGET;
+ *((volatile u_short *)WARMBOOT_SEG) = (boot_address >> 4);
+ }
outb(CMOS_REG, BIOS_RESET);
outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */
@@ -512,6 +451,17 @@
outb(CMOS_REG, BIOS_RESET);
outb(CMOS_DATA, mpbiosreason);
+ /* Destroy transient 1:1 mapping */
+ kernel_pmap->pm_pmltop[0] = old_pml45;
+ invlpg(0);
+ if (la57)
+ vm_page_free(m_pml4);
+ vm_page_free(m_pd[3]);
+ vm_page_free(m_pd[2]);
+ vm_page_free(m_pd[1]);
+ vm_page_free(m_pd[0]);
+ vm_page_free(m_pdp);
+
/* number of APs actually started */
return (mp_naps);
}
diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S
--- a/sys/amd64/amd64/mpboot.S
+++ b/sys/amd64/amd64/mpboot.S
@@ -95,12 +95,25 @@
* is later enabled.
*/
mov %cr4, %eax
- orl $CR4_PAE, %eax
+ orl $(CR4_PAE | CR4_PGE), %eax
cmpb $0, mptramp_la57-mptramp_start(%ebx)
je 1f
orl $CR4_LA57, %eax
1: mov %eax, %cr4
+ /*
+ * If the BSP reported NXE support, enable EFER.NXE for all APs
+ * prior to loading %cr3. This avoids page faults if the AP
+ * encounters memory marked with the NX bit prior to detecting and
+ * enabling NXE support.
+ */
+ cmpb $0,mptramp_nx-mptramp_start(%ebx)
+ je 2f
+ movl $MSR_EFER, %ecx
+ rdmsr
+ orl $EFER_NXE, %eax
+ wrmsr
+2:
/*
* Enable EFER.LME so that we get long mode when all the prereqs are
* in place. In this case, it turns on when CR0_PG is finally enabled.
@@ -112,12 +125,13 @@
wrmsr
/*
- * Point to the embedded page tables for startup. Note that this
- * only gets accessed after we're actually in 64 bit mode, however
- * we can only set the bottom 32 bits of %cr3 in this state. This
- * means we are required to use a temporary page table that is below
- * the 4GB limit. %ebx is still our relocation base. We could just
- * subtract 3 * PAGE_SIZE, but that would be too easy.
+ * Load kernel page table pointer into %cr3.
+ * %ebx is still our relocation base.
+ *
+ * Note that this only gets accessed after we're actually in 64 bit
+ * mode, however we can only set the bottom 32 bits of %cr3 in this
+ * state. This means we depend on the kernel page table being
+ * allocated from the low 4G.
*/
leal mptramp_pagetables-mptramp_start(%ebx),%eax
movl (%eax), %eax
@@ -155,10 +169,8 @@
/*
* Yeehar! We're running in 64 bit mode! We can mostly ignore our
* segment registers, and get on with it.
- * Note that we are running at the correct virtual address, but with
- * a 1:1 1GB mirrored mapping over entire address space. We had better
- * switch to a real %cr3 promptly so that we can get to the direct map
- * space. Remember that jmp is relative and that we've been relocated,
+ * We are running at the correct virtual address space.
+ * Note that the jmp is relative and that we've been relocated,
* so use an indirect jump.
*/
.code64
@@ -220,6 +232,10 @@
mptramp_la57:
.long 0
+ .globl mptramp_nx
+mptramp_nx:
+ .long 0
+
/*
* The pseudo descriptor for lgdt to use.
*/
@@ -243,31 +259,5 @@
.code64
.p2align 4,0
entry_64:
- /*
- * If the BSP reported NXE support, enable EFER.NXE for all APs
- * prior to loading %cr3. This avoids page faults if the AP
- * encounters memory marked with the NX bit prior to detecting and
- * enabling NXE support.
- */
- movq pg_nx, %rbx
- testq %rbx, %rbx
- je 1f
- movl $MSR_EFER, %ecx
- rdmsr
- orl $EFER_NXE, %eax
- wrmsr
-
-1:
- /*
- * Load a real %cr3 that has all the direct map stuff and switches
- * off the 1GB replicated mirror. Load a stack pointer and jump
- * into AP startup code in C.
- */
- cmpl $0, la57
- jne 2f
- movq KPML4phys, %rax
- jmp 3f
-2: movq KPML5phys, %rax
-3: movq %rax, %cr3
movq bootSTK, %rsp
jmp init_secondary
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -436,7 +436,8 @@
static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
static int ndmpdpphys; /* number of DMPDPphys pages */
-static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */
+vm_paddr_t kernphys; /* phys addr of start of bootstrap data */
+vm_paddr_t KERNend; /* and the end */
/*
* pmap_mapdev support pre initialization (i.e. console)
@@ -1554,7 +1555,7 @@
#ifdef NKPT
pt_pages = NKPT;
#else
- pt_pages = howmany(addr, NBPDR);
+ pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */
pt_pages += NKPDPE(pt_pages);
/*
@@ -1594,7 +1595,6 @@
static inline pt_entry_t
bootaddr_rwx(vm_paddr_t pa)
{
-
/*
* The kernel is loaded at a 2MB-aligned address, and memory below that
* need not be executable. The .bss section is padded to a 2MB
@@ -1602,8 +1602,8 @@
* either. Preloaded kernel modules have their mapping permissions
* fixed up by the linker.
*/
- if (pa < trunc_2mpage(btext - KERNBASE) ||
- pa >= trunc_2mpage(_end - KERNBASE))
+ if (pa < trunc_2mpage(kernphys + btext - KERNSTART) ||
+ pa >= trunc_2mpage(kernphys + _end - KERNSTART))
return (X86_PG_RW | pg_nx);
/*
@@ -1612,7 +1612,7 @@
* impact read-only data. However, in any case, any page with
* read-write data needs to be read-write.
*/
- if (pa >= trunc_2mpage(brwsection - KERNBASE))
+ if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART))
return (X86_PG_RW | pg_nx);
/*
@@ -1624,7 +1624,7 @@
* Note that fixups to the .text section will still work until we
* set CR0.WP.
*/
- if (pa < round_2mpage(etext - KERNBASE))
+ if (pa < round_2mpage(kernphys + etext - KERNSTART))
return (0);
return (pg_nx);
}
@@ -1636,6 +1636,7 @@
pdp_entry_t *pdp_p;
pml4_entry_t *p4_p;
uint64_t DMPDkernphys;
+ vm_paddr_t pax;
#ifdef KASAN
pt_entry_t *pt_p;
uint64_t KASANPDphys, KASANPTphys, KASANphys;
@@ -1670,9 +1671,11 @@
/*
* Allocate 2M pages for the kernel. These will be used in
- * place of the first one or more 1G pages from ndm1g.
+ * place of the one or more 1G pages from ndm1g that maps
+ * kernel memory into DMAP.
*/
- nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
+ nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
+ kernphys - rounddown2(kernphys, NBPDP), NBPDP);
DMPDkernphys = allocpages(firstaddr, nkdmpde);
}
if (ndm1g < ndmpdp)
@@ -1719,14 +1722,18 @@
pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
/*
- * Map from physical address zero to the end of loader preallocated
- * memory using 2MB pages. This replaces some of the PD entries
- * created above.
+ * Map from start of the kernel in physical memory (staging
+ * area) to the end of loader preallocated memory using 2MB
+ * pages. This replaces some of the PD entries created above.
+ * For compatibility, identity map 2M at the start.
*/
- for (i = 0; (i << PDRSHIFT) < KERNend; i++)
+ pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
+ X86_PG_RW | pg_nx;
+ for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
/* Preset PG_M and PG_A because demotion expects it. */
- pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
- X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
+ pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+ X86_PG_A | bootaddr_rwx(pax);
+ }
/*
* Because we map the physical blocks in 2M pages, adjust firstaddr
@@ -1792,15 +1799,18 @@
* use 2M pages with read-only and no-execute permissions. (If using 1G
* pages, this will partially overwrite the PDPEs above.)
*/
- if (ndm1g) {
+ if (ndm1g > 0) {
pd_p = (pd_entry_t *)DMPDkernphys;
- for (i = 0; i < (NPDEPG * nkdmpde); i++)
- pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
- X86_PG_M | X86_PG_A | pg_nx |
- bootaddr_rwx(i << PDRSHIFT);
- for (i = 0; i < nkdmpde; i++)
- pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
- X86_PG_V | pg_nx;
+ for (i = 0, pax = rounddown2(kernphys, NBPDP);
+ i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
+ pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+ X86_PG_A | pg_nx | bootaddr_rwx(pax);
+ }
+ j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
+ for (i = 0; i < nkdmpde; i++) {
+ pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
+ X86_PG_RW | X86_PG_V | pg_nx;
+ }
}
/* And recursively map PML4 to itself in order to get PTmap */
@@ -1876,7 +1886,8 @@
/*
* Account for the virtual addresses mapped by create_pagetables().
*/
- virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
+ virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
+ (vm_paddr_t)kernphys);
virtual_end = VM_MAX_KERNEL_ADDRESS;
/*
@@ -2062,6 +2073,19 @@
load_cr4(cr4);
}
+vm_page_t
+pmap_page_alloc_below_4g(bool zeroed)
+{
+ vm_page_t m;
+
+ m = vm_page_alloc_contig(NULL, 0, (zeroed ? VM_ALLOC_ZERO : 0) |
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if (m != NULL && zeroed && (m->flags & PG_ZERO) == 0)
+ pmap_zero_page(m);
+ return (m);
+}
+
extern const char la57_trampoline[], la57_trampoline_gdt_desc[],
la57_trampoline_gdt[], la57_trampoline_end[];
@@ -2087,42 +2111,18 @@
r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
- m_code = vm_page_alloc_contig(NULL, 0,
- VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
- 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
- if ((m_code->flags & PG_ZERO) == 0)
- pmap_zero_page(m_code);
+ m_code = pmap_page_alloc_below_4g(true);
v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code));
- m_pml5 = vm_page_alloc_contig(NULL, 0,
- VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
- 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
- if ((m_pml5->flags & PG_ZERO) == 0)
- pmap_zero_page(m_pml5);
+ m_pml5 = pmap_page_alloc_below_4g(true);
KPML5phys = VM_PAGE_TO_PHYS(m_pml5);
v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys);
- m_pml4 = vm_page_alloc_contig(NULL, 0,
- VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
- 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
- if ((m_pml4->flags & PG_ZERO) == 0)
- pmap_zero_page(m_pml4);
+ m_pml4 = pmap_page_alloc_below_4g(true);
v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
- m_pdp = vm_page_alloc_contig(NULL, 0,
- VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
- 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
- if ((m_pdp->flags & PG_ZERO) == 0)
- pmap_zero_page(m_pdp);
+ m_pdp = pmap_page_alloc_below_4g(true);
v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
- m_pd = vm_page_alloc_contig(NULL, 0,
- VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
- 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
- if ((m_pd->flags & PG_ZERO) == 0)
- pmap_zero_page(m_pd);
+ m_pd = pmap_page_alloc_below_4g(true);
v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd));
- m_pt = vm_page_alloc_contig(NULL, 0,
- VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
- 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
- if ((m_pt->flags & PG_ZERO) == 0)
- pmap_zero_page(m_pt);
+ m_pt = pmap_page_alloc_below_4g(true);
v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt));
/*
@@ -2425,7 +2425,8 @@
* Collect the page table pages that were replaced by a 2MB
* page in create_pagetables(). They are zero filled.
*/
- if ((vm_paddr_t)i << PDRSHIFT < KERNend &&
+ if ((i == 0 ||
+ kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
pmap_insert_pt_page(kernel_pmap, mpte, false))
panic("pmap_init: pmap_insert_pt_page failed");
}
@@ -6692,7 +6693,9 @@
mpte < &vm_page_array[vm_page_array_size],
("pmap_promote_pde: page table page is out of range"));
KASSERT(mpte->pindex == pmap_pde_pindex(va),
- ("pmap_promote_pde: page table page's pindex is wrong"));
+ ("pmap_promote_pde: page table page's pindex is wrong "
+ "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
+ mpte, mpte->pindex, va, pmap_pde_pindex(va)));
if (pmap_insert_pt_page(pmap, mpte, true)) {
counter_u64_add(pmap_pde_p_failures, 1);
CTR2(KTR_PMAP,
@@ -10763,8 +10766,8 @@
va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
}
- pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
- (vm_offset_t)etext, true);
+ pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
+ true);
pti_finalized = true;
VM_OBJECT_WUNLOCK(pti_obj);
}
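
Because kernphys is no longer a constant 2M, the DMAP fix-up in create_pagetables() has to start its 2M re-mapping at the 1G slot that contains kernphys instead of at physical zero, and nkdmpde counts how many 1G slots that takes. A quick worked example with a hypothetical load address (constants restated locally, not kernel code):

#include <stdint.h>
#include <stdio.h>

#define NBPDP			(1ULL << 30)	/* bytes per PDP entry */
#define howmany(x, y)		(((x) + ((y) - 1)) / (y))
#define rounddown2(x, y)	((x) & ~((y) - 1))

int
main(void)
{
	/*
	 * Hypothetical values: kernel staged just under 3G, 36M of
	 * image up to the start of the read-write section.
	 */
	uint64_t kernphys = 0xbfe00000;
	uint64_t brw_off = 0x02400000;		/* brwsection - KERNSTART */
	uint64_t nkdmpde;

	nkdmpde = howmany(brw_off + kernphys - rounddown2(kernphys, NBPDP),
	    NBPDP);
	/*
	 * Prints 2: the 2G-3G and 3G-4G DMAP slots are replaced by 2M
	 * mappings, because the image straddles the 3G boundary.
	 */
	printf("nkdmpde = %ju\n", (uintmax_t)nkdmpde);
	return (0);
}
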
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -49,11 +49,10 @@
extern int la57;
-/*
- * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its
- * value is the physical address at which the kernel is loaded.
- */
-extern char kernphys[];
+extern vm_paddr_t kernphys;
+extern vm_paddr_t KERNend;
+
+extern bool efi_boot;
struct savefpu;
struct sysentvec;
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -456,6 +456,10 @@
#define pmap_page_is_write_mapped(m) (((m)->a.flags & PGA_WRITEABLE) != 0)
#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz))
+#define pmap_vm_page_alloc_check(m) \
+ KASSERT(m->phys_addr < kernphys || m->phys_addr >= KERNend, \
+ ("allocating kernel page %p", m));
+
struct thread;
void pmap_activate_boot(pmap_t pmap);
@@ -509,6 +513,7 @@
void pmap_thread_init_invl_gen(struct thread *td);
int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
void pmap_page_array_startup(long count);
+vm_page_t pmap_page_alloc_below_4g(bool zeroed);
#ifdef KASAN
void pmap_kasan_enter(vm_offset_t);
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -39,7 +39,6 @@
void invlop_handler(void);
int start_all_aps(void);
-void mp_bootaddress(vm_paddr_t *, unsigned int *);
#endif /* !LOCORE */
#endif /* SMP */
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -151,8 +151,10 @@
#endif
/*
- * Kernel physical load address. Needs to be aligned at 2MB superpage
- * boundary.
+ * Kernel physical load address for non-UEFI boot and for legacy UEFI loader.
+ * Newer UEFI loader loads kernel anywhere below 4G, with memory allocated
+ * by boot services.
+ * Needs to be aligned at 2MB superpage boundary.
*/
#ifndef KERNLOAD
#define KERNLOAD 0x200000
@@ -192,7 +194,17 @@
#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0)
#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0)
+/*
+ * Formally kernel mapping starts at KERNBASE, but kernel linker
+ * script leaves first PDE reserved. For legacy BIOS boot, kernel is
+ * loaded at KERNLOAD = 2M, and initial kernel page table maps
+ * physical memory from zero to KERNend starting at KERNBASE.
+ *
+ * KERNSTART is where the first actual kernel page is mapped, after
+ * the compatibility mapping.
+ */
#define KERNBASE KV4ADDR(KPML4I, KPDPI, 0, 0)
+#define KERNSTART (KERNBASE + NBPDR)
#define UPT_MAX_ADDRESS KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
#define UPT_MIN_ADDRESS KV4ADDR(PML4PML4I, 0, 0, 0)
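
For reference, the new KERNSTART is simply KERNBASE advanced by one 2M page-directory entry. A tiny sketch that evaluates the constants (indices and shifts copied by hand from the amd64 headers, so treat it as illustrative only):

#include <stdint.h>
#include <stdio.h>

#define NBPDR		(1ULL << 21)		/* 2M */
#define KV4ADDR(l4, l3, l2, l1)						\
	(((uint64_t)-1 << 47) | ((uint64_t)(l4) << 39) |		\
	 ((uint64_t)(l3) << 30) | ((uint64_t)(l2) << 21) |		\
	 ((uint64_t)(l1) << 12))

int
main(void)
{
	/* KPML4I = 511, KPDPI = 510 on amd64. */
	uint64_t kernbase = KV4ADDR(511, 510, 0, 0);

	/* Prints 0xffffffff80000000 and 0xffffffff80200000. */
	printf("KERNBASE  %#jx\nKERNSTART %#jx\n", (uintmax_t)kernbase,
	    (uintmax_t)(kernbase + NBPDR));
	return (0);
}
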
diff --git a/sys/conf/ldscript.amd64 b/sys/conf/ldscript.amd64
--- a/sys/conf/ldscript.amd64
+++ b/sys/conf/ldscript.amd64
@@ -5,15 +5,14 @@
SEARCH_DIR("/usr/lib");
SECTIONS
{
- kernphys = kernload;
/* Read-only sections, merged into text segment: */
- . = kernbase + kernphys + SIZEOF_HEADERS;
+ . = kernbase + kernload + SIZEOF_HEADERS;
/*
* Use the AT keyword in order to set the right LMA that contains
* the physical address where the section should be loaded. This is
* needed for the Xen loader which honours the LMA.
*/
- .interp : AT (kernphys + SIZEOF_HEADERS) { *(.interp) }
+ .interp : AT (kernload + SIZEOF_HEADERS) { *(.interp) }
.hash : { *(.hash) }
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -2416,6 +2416,7 @@
("page %p has unexpected memattr %d",
m, pmap_page_get_memattr(m)));
KASSERT(m->valid == 0, ("free page %p is valid", m));
+ pmap_vm_page_alloc_check(m);
}
/*
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1065,11 +1065,6 @@
}
#ifdef __amd64__
- /*
- * Enable global pages TLB extension
- * This also implicitly flushes the TLB
- */
- load_cr4(rcr4() | CR4_PGE);
if (pmap_pcid_enabled)
load_cr4(rcr4() | CR4_PCIDE);
load_ds(_udatasel);