D25273.id74807.diff

Index: sys/amd64/acpica/acpi_wakecode.S
===================================================================
--- sys/amd64/acpica/acpi_wakecode.S
+++ sys/amd64/acpica/acpi_wakecode.S
@@ -148,10 +148,18 @@
mov $bootdata32 - bootgdt, %eax
mov %ax, %ds
- /* Turn on the PAE bit for when paging is enabled */
+ /*
+ * Turn on the PAE and optionally LA57 bits for when paging
+ * is enabled.
+ */
mov %cr4, %eax
orl $CR4_PAE, %eax
- mov %eax, %cr4
+ leal wakeup_pagetables - wakeup_start(%ebx), %ecx
+ movl (%ecx), %ecx
+ testl $0x1, %ecx
+ je 1f
+ orl $CR4_LA57, %eax
+1: mov %eax, %cr4
/*
* Enable EFER.LME so that we get long mode when all the prereqs are
@@ -174,6 +182,7 @@
*/
leal wakeup_pagetables - wakeup_start(%ebx), %eax
movl (%eax), %eax
+ andl $~0x1, %eax
mov %eax, %cr3
/*
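The resume path above smuggles the LA57 decision through bit 0 of the wakeup_pagetables word: the bit selects CR4.LA57 and is masked off before the word is loaded into %cr3. A minimal standalone sketch of that convention (the 0x7fe000 address is hypothetical):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LA57_FLAG	0x1ULL

/* Pack the LA57 flag into bit 0 of a page-aligned page-table address. */
static uint64_t
pack_wakeup_pt(uint64_t pt_phys, int la57)
{
	assert((pt_phys & 0xfff) == 0);		/* must be page-aligned */
	return (pt_phys | (la57 ? LA57_FLAG : 0));
}

int
main(void)
{
	uint64_t w = pack_wakeup_pt(0x7fe000, 1);

	if ((w & LA57_FLAG) != 0)		/* testl $0x1, %ecx */
		printf("would set CR4.LA57\n");
	w &= ~LA57_FLAG;			/* andl $~0x1, %eax */
	printf("cr3 <- %#jx\n", (uintmax_t)w);
	return (0);
}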
Index: sys/amd64/amd64/cpu_switch.S
===================================================================
--- sys/amd64/amd64/cpu_switch.S
+++ sys/amd64/amd64/cpu_switch.S
@@ -382,8 +382,11 @@
* Resuming processor state from pcb.
*/
ENTRY(resumectx)
- /* Switch to KPML4phys. */
+ /* Switch to KPML5/4phys. */
movq KPML4phys,%rax
+ movq KPML5phys,%rcx
+ cmpl $0, la57
+ cmovne %rcx, %rax
movq %rax,%cr3
/* Force kernel segment registers. */
Index: sys/amd64/amd64/efirt_machdep.c
===================================================================
--- sys/amd64/amd64/efirt_machdep.c
+++ sys/amd64/amd64/efirt_machdep.c
@@ -61,9 +61,10 @@
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
+static pml5_entry_t *efi_pml5;
static pml4_entry_t *efi_pml4;
static vm_object_t obj_1t1_pt;
-static vm_page_t efi_pml4_page;
+static vm_page_t efi_pmltop_page;
static vm_pindex_t efi_1t1_idx;
void
@@ -82,7 +83,8 @@
obj_1t1_pt = NULL;
efi_pml4 = NULL;
- efi_pml4_page = NULL;
+ efi_pml5 = NULL;
+ efi_pmltop_page = NULL;
}
/*
@@ -109,22 +111,38 @@
static pt_entry_t *
efi_1t1_pte(vm_offset_t va)
{
+ pml5_entry_t *pml5e;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t *pde;
pt_entry_t *pte;
vm_page_t m;
- vm_pindex_t pml4_idx, pdp_idx, pd_idx;
+ vm_pindex_t pml5_idx, pml4_idx, pdp_idx, pd_idx;
vm_paddr_t mphys;
pml4_idx = pmap_pml4e_index(va);
- pml4e = &efi_pml4[pml4_idx];
+ if (la57) {
+ pml5_idx = pmap_pml5e_index(va);
+ pml5e = &efi_pml5[pml5_idx];
+ if (*pml5e == 0) {
+ m = efi_1t1_page();
+ mphys = VM_PAGE_TO_PHYS(m);
+ *pml5e = mphys | X86_PG_RW | X86_PG_V;
+ } else {
+ mphys = *pml5e & PG_FRAME;
+ }
+ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(mphys);
+ pml4e = &pml4e[pml4_idx];
+ } else {
+ pml4e = &efi_pml4[pml4_idx];
+ }
+
if (*pml4e == 0) {
m = efi_1t1_page();
mphys = VM_PAGE_TO_PHYS(m);
*pml4e = mphys | X86_PG_RW | X86_PG_V;
} else {
- mphys = *pml4e & ~PAGE_MASK;
+ mphys = *pml4e & PG_FRAME;
}
pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys);
@@ -135,7 +153,7 @@
mphys = VM_PAGE_TO_PHYS(m);
*pdpe = mphys | X86_PG_RW | X86_PG_V;
} else {
- mphys = *pdpe & ~PAGE_MASK;
+ mphys = *pdpe & PG_FRAME;
}
pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
@@ -146,7 +164,7 @@
mphys = VM_PAGE_TO_PHYS(m);
*pde = mphys | X86_PG_RW | X86_PG_V;
} else {
- mphys = *pde & ~PAGE_MASK;
+ mphys = *pde & PG_FRAME;
}
pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
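The three hunks above switch the next-level address extraction from ~PAGE_MASK to PG_FRAME. The distinction matters because ~PAGE_MASK only clears the low 12 bits, so control bits stored above the address field, such as the NX bit 63, would leak into the physical address. A standalone sketch, with constants mirroring sys/amd64/include/pmap.h:

#include <stdint.h>
#include <stdio.h>

#define PAGE_MASK	0xfffUL
#define PG_FRAME	0x000ffffffffff000UL	/* address bits 12..51 */
#define X86_PG_NX	(1UL << 63)
#define X86_PG_V	0x001UL
#define X86_PG_RW	0x002UL

int
main(void)
{
	uint64_t pte = 0x123456000UL | X86_PG_V | X86_PG_RW | X86_PG_NX;

	printf("~PAGE_MASK: %#jx (NX bit leaks into the address)\n",
	    (uintmax_t)(pte & ~PAGE_MASK));
	printf("PG_FRAME:   %#jx\n", (uintmax_t)(pte & PG_FRAME));
	return (0);
}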
@@ -161,6 +179,7 @@
{
struct efi_md *p;
pt_entry_t *pte;
+ void *pml;
vm_offset_t va;
uint64_t idx;
int bits, i, mode;
@@ -170,10 +189,16 @@
VM_PROT_ALL, 0, NULL);
efi_1t1_idx = 0;
VM_OBJECT_WLOCK(obj_1t1_pt);
- efi_pml4_page = efi_1t1_page();
+ efi_pmltop_page = efi_1t1_page();
VM_OBJECT_WUNLOCK(obj_1t1_pt);
- efi_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(efi_pml4_page));
- pmap_pinit_pml4(efi_pml4_page);
+ pml = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(efi_pmltop_page));
+ if (la57) {
+ efi_pml5 = pml;
+ pmap_pinit_pml5(efi_pmltop_page);
+ } else {
+ efi_pml4 = pml;
+ pmap_pinit_pml4(efi_pmltop_page);
+ }
for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p,
descsz)) {
@@ -279,7 +304,7 @@
if (pmap_pcid_enabled && !invpcid_works)
PCPU_SET(curpmap, NULL);
- load_cr3(VM_PAGE_TO_PHYS(efi_pml4_page) | (pmap_pcid_enabled ?
+ load_cr3(VM_PAGE_TO_PHYS(efi_pmltop_page) | (pmap_pcid_enabled ?
curpmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid : 0));
/*
* If PCID is enabled, the clear CR3_PCID_SAVE bit in the loaded %cr3
Index: sys/amd64/amd64/elf_machdep.c
===================================================================
--- sys/amd64/amd64/elf_machdep.c
+++ sys/amd64/amd64/elf_machdep.c
@@ -49,7 +49,7 @@
#include <machine/fpu.h>
#include <machine/md_var.h>
-struct sysentvec elf64_freebsd_sysvec = {
+struct sysentvec elf64_freebsd_sysvec_la48 = {
.sv_size = SYS_MAXSYSCALL,
.sv_table = sysent,
.sv_errsize = 0,
@@ -64,9 +64,9 @@
.sv_imgact_try = NULL,
.sv_minsigstksz = MINSIGSTKSZ,
.sv_minuser = VM_MIN_ADDRESS,
- .sv_maxuser = VM_MAXUSER_ADDRESS,
- .sv_usrstack = USRSTACK,
- .sv_psstrings = PS_STRINGS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS_LA48,
+ .sv_usrstack = USRSTACK_LA48,
+ .sv_psstrings = PS_STRINGS_LA48,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs),
.sv_copyout_strings = exec_copyout_strings,
@@ -78,14 +78,64 @@
.sv_set_syscall_retval = cpu_set_syscall_retval,
.sv_fetch_syscall_args = cpu_fetch_syscall_args,
.sv_syscallnames = syscallnames,
- .sv_shared_page_base = SHAREDPAGE,
+ .sv_shared_page_base = SHAREDPAGE_LA48,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = NULL,
.sv_thread_detach = NULL,
.sv_trap = NULL,
.sv_stackgap = elf64_stackgap,
};
-INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec);
+
+struct sysentvec elf64_freebsd_sysvec_la57 = {
+ .sv_size = SYS_MAXSYSCALL,
+ .sv_table = sysent,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = __elfN(freebsd_fixup),
+ .sv_sendsig = sendsig,
+ .sv_sigcode = sigcode,
+ .sv_szsigcode = &szsigcode,
+ .sv_name = "FreeBSD ELF64",
+ .sv_coredump = __elfN(coredump),
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = MINSIGSTKSZ,
+ .sv_minuser = VM_MIN_ADDRESS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS_LA57,
+ .sv_usrstack = USRSTACK_LA57,
+ .sv_psstrings = PS_STRINGS_LA57,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs),
+ .sv_copyout_strings = exec_copyout_strings,
+ .sv_setregs = exec_setregs,
+ .sv_fixlimit = NULL,
+ .sv_maxssiz = NULL,
+ .sv_flags = SV_ABI_FREEBSD | SV_ASLR | SV_LP64 | SV_SHP |
+ SV_TIMEKEEP,
+ .sv_set_syscall_retval = cpu_set_syscall_retval,
+ .sv_fetch_syscall_args = cpu_fetch_syscall_args,
+ .sv_syscallnames = syscallnames,
+ .sv_shared_page_base = SHAREDPAGE_LA57,
+ .sv_shared_page_len = PAGE_SIZE,
+ .sv_schedtail = NULL,
+ .sv_thread_detach = NULL,
+ .sv_trap = NULL,
+ .sv_stackgap = elf64_stackgap,
+};
+
+static void
+amd64_init_sysvecs(void *arg)
+{
+ amd64_lower_shared_page(&elf64_freebsd_sysvec_la48);
+ if (la57) {
+ exec_sysvec_init(&elf64_freebsd_sysvec_la57);
+ exec_sysvec_init_secondary(&elf64_freebsd_sysvec_la57,
+ &elf64_freebsd_sysvec_la48);
+ } else {
+ exec_sysvec_init(&elf64_freebsd_sysvec_la48);
+ }
+}
+SYSINIT(elf64_sysvec, SI_SUB_EXEC, SI_ORDER_ANY, amd64_init_sysvecs, NULL);
void
amd64_lower_shared_page(struct sysentvec *sv)
@@ -98,29 +148,57 @@
}
}
-/*
- * Do this fixup before INIT_SYSENTVEC (SI_ORDER_ANY) because the latter
- * uses the value of sv_shared_page_base.
- */
-SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST,
- (sysinit_cfunc_t) amd64_lower_shared_page,
- &elf64_freebsd_sysvec);
+static boolean_t
+freebsd_brand_info_la57_img_compat(struct image_params *imgp,
+ int32_t *osrel __unused, uint32_t *fctl0)
+{
+ if ((imgp->proc->p_md.md_flags & P_MD_LA57) != 0)
+ return (TRUE);
+ if (fctl0 == NULL || (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0)
+ return (FALSE);
+ if ((imgp->proc->p_md.md_flags & P_MD_LA48) != 0)
+ return (FALSE);
+ return (TRUE);
+}
-static Elf64_Brandinfo freebsd_brand_info = {
+static Elf64_Brandinfo freebsd_brand_info_la48 = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/libexec/ld-elf.so.1",
- .sysvec = &elf64_freebsd_sysvec,
+ .sysvec = &elf64_freebsd_sysvec_la48,
.interp_newpath = NULL,
.brand_note = &elf64_freebsd_brandnote,
- .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
+};
+
+static Elf64_Brandinfo freebsd_brand_info_la57 = {
+ .brand = ELFOSABI_FREEBSD,
+ .machine = EM_X86_64,
+ .compat_3_brand = "FreeBSD",
+ .emul_path = NULL,
+ .interp_path = "/libexec/ld-elf.so.1",
+ .sysvec = &elf64_freebsd_sysvec_la57,
+ .interp_newpath = NULL,
+ .brand_note = &elf64_freebsd_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
+ .header_supported = freebsd_brand_info_la57_img_compat,
};
+static void
+sysinit_register_elf64_brand_entries(void *arg __unused)
+{
+ /*
+ * _57 must go first so it can either claim the image, or hand
+ * it to _48.
+ */
+ if (la57)
+ elf64_insert_brand_entry(&freebsd_brand_info_la57);
+ elf64_insert_brand_entry(&freebsd_brand_info_la48);
+}
SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST,
- (sysinit_cfunc_t) elf64_insert_brand_entry,
- &freebsd_brand_info);
+ sysinit_register_elf64_brand_entries, NULL);
static Elf64_Brandinfo freebsd_brand_oinfo = {
.brand = ELFOSABI_FREEBSD,
@@ -128,15 +206,14 @@
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/usr/libexec/ld-elf.so.1",
- .sysvec = &elf64_freebsd_sysvec,
+ .sysvec = &elf64_freebsd_sysvec_la48,
.interp_newpath = NULL,
.brand_note = &elf64_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY,
- (sysinit_cfunc_t) elf64_insert_brand_entry,
- &freebsd_brand_oinfo);
+ (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_oinfo);
static Elf64_Brandinfo kfreebsd_brand_info = {
.brand = ELFOSABI_FREEBSD,
@@ -144,15 +221,14 @@
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/lib/ld-kfreebsd-x86-64.so.1",
- .sysvec = &elf64_freebsd_sysvec,
+ .sysvec = &elf64_freebsd_sysvec_la48,
.interp_newpath = NULL,
.brand_note = &elf64_kfreebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
};
SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
- (sysinit_cfunc_t) elf64_insert_brand_entry,
- &kfreebsd_brand_info);
+ (sysinit_cfunc_t) elf64_insert_brand_entry, &kfreebsd_brand_info);
void
elf64_dump_thread(struct thread *td, void *dst, size_t *off)
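For reading along, here is the brand-selection precedence that freebsd_brand_info_la57_img_compat() implements, restated as a userland truth table: a P_MD_LA57 override wins, an NT_FREEBSD_FCTL_LA48 feature-control note or a P_MD_LA48 override forces LA48, and a binary without any feature-control note also falls back to LA48 in this revision. The P_MD_* values come from the sys/amd64/include/proc.h hunk below; the NT_FREEBSD_FCTL_LA48 value is an assumption, since the elf_common.h change is not part of this diff:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define P_MD_LA48		0x02	/* from sys/amd64/include/proc.h */
#define P_MD_LA57		0x04
#define NT_FREEBSD_FCTL_LA48	0x00000010	/* assumed flag value */

/* Mirrors freebsd_brand_info_la57_img_compat() from the hunk above. */
static bool
claims_la57(uint32_t md_flags, const uint32_t *fctl0)
{
	if ((md_flags & P_MD_LA57) != 0)
		return (true);
	if (fctl0 == NULL || (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0)
		return (false);
	if ((md_flags & P_MD_LA48) != 0)
		return (false);
	return (true);
}

int
main(void)
{
	uint32_t fctl_la48 = NT_FREEBSD_FCTL_LA48, fctl_none = 0;

	printf("binary without fctl note: %d\n", claims_la57(0, NULL));
	printf("note without LA48 flag:   %d\n", claims_la57(0, &fctl_none));
	printf("note with LA48 flag:      %d\n", claims_la57(0, &fctl_la48));
	printf("procctl LA57 override:    %d\n", claims_la57(P_MD_LA57, NULL));
	return (0);
}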
Index: sys/amd64/amd64/genassym.c
===================================================================
--- sys/amd64/amd64/genassym.c
+++ sys/amd64/amd64/genassym.c
@@ -99,11 +99,10 @@
ASSYM(PAGE_SIZE, PAGE_SIZE);
ASSYM(NPTEPG, NPTEPG);
ASSYM(NPDEPG, NPDEPG);
-ASSYM(addr_PTmap, addr_PTmap);
-ASSYM(addr_PDmap, addr_PDmap);
-ASSYM(addr_PDPmap, addr_PDPmap);
-ASSYM(addr_PML4map, addr_PML4map);
-ASSYM(addr_PML4pml4e, addr_PML4pml4e);
+ASSYM(addr_P4Tmap, addr_P4Tmap);
+ASSYM(addr_P4Dmap, addr_P4Dmap);
+ASSYM(addr_P5Tmap, addr_P5Tmap);
+ASSYM(addr_P5Dmap, addr_P5Dmap);
ASSYM(PDESIZE, sizeof(pd_entry_t));
ASSYM(PTESIZE, sizeof(pt_entry_t));
ASSYM(PAGE_SHIFT, PAGE_SHIFT);
Index: sys/amd64/amd64/locore.S
===================================================================
--- sys/amd64/amd64/locore.S
+++ sys/amd64/amd64/locore.S
@@ -36,13 +36,8 @@
/*
* Compiled KERNBASE location
*/
- .globl kernbase,loc_PTmap,loc_PDmap,loc_PDPmap,loc_PML4map,loc_PML4pml4e,dmapbase,dmapend
+ .globl kernbase, loc_PTmap, loc_PDmap, loc_PDPmap, dmapbase, dmapend
.set kernbase,KERNBASE
- .set loc_PTmap,addr_PTmap
- .set loc_PDmap,addr_PDmap
- .set loc_PDPmap,addr_PDPmap
- .set loc_PML4map,addr_PML4map
- .set loc_PML4pml4e,addr_PML4pml4e
.set dmapbase,DMAP_MIN_ADDRESS
.set dmapend,DMAP_MAX_ADDRESS
@@ -82,6 +77,62 @@
0: hlt
jmp 0b
+/* la57_trampoline(%rdi pml5) */
+NON_GPROF_ENTRY(la57_trampoline)
+ movq %rsp,%r11
+ movq %rbx,%r10
+ leaq la57_trampoline_end(%rip),%rsp
+
+ movq %cr0,%rdx
+ lgdtq la57_trampoline_gdt_desc(%rip)
+
+ pushq $(2<<3)
+ leaq l1(%rip),%rax
+ leaq l2(%rip),%rbx
+
+ pushq %rax
+ lretq
+ .code32
+
+l1: movl $(3<<3),%eax
+ movl %eax,%ss
+
+ movl %edx,%eax
+ andl $~CR0_PG,%eax
+ movl %eax,%cr0
+
+ movl %cr4,%eax
+ orl $CR4_LA57,%eax
+ movl %eax,%cr4
+
+ movl %edi,%cr3
+ movl %edx,%cr0
+
+ pushl $(1<<3)
+ pushl %ebx
+ lretl
+ .code64
+
+l2: movq %r11,%rsp
+ movq %r10,%rbx
+ retq
+ .p2align 4,0
+NON_GPROF_ENTRY(la57_trampoline_gdt_desc)
+ .word la57_trampoline_end - la57_trampoline_gdt
+ .long 0 /* filled by pmap_bootstrap_la57 */
+ .p2align 4,0
+NON_GPROF_ENTRY(la57_trampoline_gdt)
+ .long 0x00000000 /* null desc */
+ .long 0x00000000
+ .long 0x00000000 /* 64bit code */
+ .long 0x00209800
+ .long 0x0000ffff /* 32bit code */
+ .long 0x00cf9b00
+ .long 0x0000ffff /* universal data */
+ .long 0x00cf9300
+ .dcb.l 16,0
+NON_GPROF_ENTRY(la57_trampoline_end)
+
.bss
ALIGN_DATA /* just to be sure */
.globl bootstack
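The magic .long pairs in la57_trampoline_gdt encode three flat segment descriptors. A sketch that rebuilds them from their fields per the Intel SDM layout (access byte at bits 40..47, flags nibble at bits 52..55), so the constants can be checked:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assemble a segment descriptor from its fields. */
static uint64_t
mkdesc(uint32_t base, uint32_t limit, uint8_t access, uint8_t flags)
{
	uint64_t d;

	d = (uint64_t)(limit & 0xffff);
	d |= (uint64_t)(base & 0xffffff) << 16;
	d |= (uint64_t)access << 40;
	d |= (uint64_t)((limit >> 16) & 0xf) << 48;
	d |= (uint64_t)(flags & 0xf) << 52;
	d |= (uint64_t)((base >> 24) & 0xff) << 56;
	return (d);
}

int
main(void)
{
	/* 64-bit code: present, code, L=1 -> .long 0x00000000, 0x00209800 */
	assert(mkdesc(0, 0, 0x98, 0x2) == 0x0020980000000000ULL);
	/* 32-bit code: code|read|accessed, G=1 D=1 -> 0x0000ffff, 0x00cf9b00 */
	assert(mkdesc(0, 0xfffff, 0x9b, 0xc) == 0x00cf9b000000ffffULL);
	/* universal data: data|write, G=1 D=1 -> 0x0000ffff, 0x00cf9300 */
	assert(mkdesc(0, 0xfffff, 0x93, 0xc) == 0x00cf93000000ffffULL);
	printf("descriptors match la57_trampoline_gdt\n");
	return (0);
}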
Index: sys/amd64/amd64/mp_machdep.c
===================================================================
--- sys/amd64/amd64/mp_machdep.c
+++ sys/amd64/amd64/mp_machdep.c
@@ -96,7 +96,7 @@
#define GiB(v) (v ## ULL << 30)
-#define AP_BOOTPT_SZ (PAGE_SIZE * 3)
+#define AP_BOOTPT_SZ (PAGE_SIZE * 4)
/* Temporary variables for init_secondary() */
char *doublefault_stack;
@@ -104,6 +104,8 @@
char *nmi_stack;
char *dbg_stack;
+extern u_int mptramp_la57;
+
/*
* Local data and functions.
*/
@@ -236,6 +238,8 @@
assign_cpu_ids();
+ mptramp_la57 = la57;
+
/* Start each Application Processor */
init_ops.start_all_aps();
@@ -391,9 +395,9 @@
int
native_start_all_aps(void)
{
- u_int64_t *pt4, *pt3, *pt2;
+ u_int64_t *pt5, *pt4, *pt3, *pt2;
u_int32_t mpbioswarmvec;
- int apic_id, cpu, domain, i;
+ int apic_id, cpu, domain, i, xo;
u_char mpbiosreason;
mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
@@ -402,18 +406,38 @@
bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
/* Locate the page tables, they'll be below the trampoline */
- pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
+ if (la57) {
+ pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
+ xo = 1;
+ } else {
+ xo = 0;
+ }
+ pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
/* Create the initial 1GB replicated page tables */
for (i = 0; i < 512; i++) {
- /* Each slot of the level 4 pages points to the same level 3 page */
- pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
+ if (la57) {
+ pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+ PAGE_SIZE);
+ pt5[i] |= PG_V | PG_RW | PG_U;
+ }
+
+ /*
+ * Each slot of the level 4 pages points to the same
+ * level 3 page.
+ */
+ pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+ (xo + 1) * PAGE_SIZE);
pt4[i] |= PG_V | PG_RW | PG_U;
- /* Each slot of the level 3 pages points to the same level 2 page */
- pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
+ /*
+ * Each slot of the level 3 pages points to the same
+ * level 2 page.
+ */
+ pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+ ((xo + 2) * PAGE_SIZE));
pt3[i] |= PG_V | PG_RW | PG_U;
/* The level 2 page slots are mapped with 2MB pages for 1GB. */
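With LA57 the AP bootstrap block grows from three to four pages and every lower level shifts up by one page, which is what the xo offset expresses. A sketch of the two resulting layouts (the 0x9000 base is hypothetical):

#include <stdio.h>

#define AP_PAGE_SIZE	4096u

int
main(void)
{
	unsigned base = 0x9000;		/* hypothetical mptramp_pagetables */

	for (int la57 = 0; la57 <= 1; la57++) {
		int xo = la57 ? 1 : 0;	/* extra page for the PML5 level */

		printf("la57=%d:\n", la57);
		if (la57)
			printf("  pt5 at %#x\n", base);
		printf("  pt4 at %#x\n", base + xo * AP_PAGE_SIZE);
		printf("  pt3 at %#x\n", base + (xo + 1) * AP_PAGE_SIZE);
		printf("  pt2 at %#x\n", base + (xo + 2) * AP_PAGE_SIZE);
	}
	return (0);
}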
Index: sys/amd64/amd64/mpboot.S
===================================================================
--- sys/amd64/amd64/mpboot.S
+++ sys/amd64/amd64/mpboot.S
@@ -90,10 +90,13 @@
mov $bootdata-gdt, %eax
mov %ax, %ds
- /* Turn on the PAE bit for when paging is enabled */
+ /* Turn on the PAE and, optionally, the LA57 bit for when paging is enabled */
mov %cr4, %eax
orl $CR4_PAE, %eax
- mov %eax, %cr4
+ cmpb $0, mptramp_la57-mptramp_start(%ebx)
+ je 1f
+ orl $CR4_LA57, %eax
+1: mov %eax, %cr4
/*
* Enable EFER.LME so that we get long mode when all the prereqs are
@@ -132,7 +135,7 @@
/*
* At this point paging is enabled, and we are in "compatibility" mode.
* We do another far jump to reload %cs with the 64 bit selector.
- * %cr3 points to a 4-level page table page.
+ * %cr3 points to a 4- or 5-level page table page.
* We cannot yet jump all the way to the kernel because we can only
* specify a 32 bit linear address. So, yet another trampoline.
*
@@ -209,6 +212,11 @@
mptramp_pagetables:
.long 0
+ /* 5-level paging? */
+ .globl mptramp_la57
+mptramp_la57:
+ .long 0
+
/*
* The pseudo descriptor for lgdt to use.
*/
@@ -251,8 +259,12 @@
* Load a real %cr3 that has all the direct map stuff and switches
* off the 1GB replicated mirror. Load a stack pointer and jump
* into AP startup code in C.
- */
+ */
+ cmpl $0, la57
+ jne 2f
movq KPML4phys, %rax
- movq %rax, %cr3
+ jmp 3f
+2: movq KPML5phys, %rax
+3: movq %rax, %cr3
movq bootSTK, %rsp
jmp init_secondary
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -398,6 +398,19 @@
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&pg_ps_enabled, 0, "Are large page mappings enabled?");
+int __read_frequently la57 = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+ &la57, 0,
+ "5-level paging for host is enabled");
+
+static bool
+pmap_is_la57(pmap_t pmap)
+{
+ if (pmap->pm_type == PT_X86)
+ return (la57);
+ return (false); /* XXXKIB handle EPT */
+}
+
#define PAT_INDEX_SIZE 8
static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
@@ -405,7 +418,10 @@
static u_int64_t KPDphys; /* phys addr of kernel level 2 */
u_int64_t KPDPphys; /* phys addr of kernel level 3 */
u_int64_t KPML4phys; /* phys addr of kernel level 4 */
+u_int64_t KPML5phys; /* phys addr of kernel level 5,
+ if supported */
+static pml4_entry_t *kernel_pml4;
static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
static int ndmpdpphys; /* number of DMPDPphys pages */
@@ -1257,7 +1273,7 @@
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
- struct rwlock **lockp);
+ struct rwlock **lockp, vm_offset_t va);
static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
@@ -1271,20 +1287,85 @@
/* Inline functions */
/********************/
-/* Return a non-clipped PD index for a given VA */
+/*
+ * Return non-clipped indexes for a given VA, i.e. the page table
+ * page indexes at the corresponding levels.
+ */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
return (va >> PDRSHIFT);
}
+static __inline vm_pindex_t
+pmap_pdpe_pindex(vm_offset_t va)
+{
+ return (NUPDE + (va >> PDPSHIFT));
+}
+
+static __inline vm_pindex_t
+pmap_pml4e_pindex(vm_offset_t va)
+{
+ return (NUPDE + NUPDPE + (va >> PML4SHIFT));
+}
+
+static __inline vm_pindex_t
+pmap_pml5e_pindex(vm_offset_t va)
+{
+ return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e(pmap_t pmap, vm_offset_t va)
+{
+
+ MPASS(pmap_is_la57(pmap));
+ return (&pmap->pm_pmltop[pmap_pml5e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e_u(pmap_t pmap, vm_offset_t va)
+{
+
+ MPASS(pmap_is_la57(pmap));
+ return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va)
+{
+ pml4_entry_t *pml4e;
+
+ /* XXX MPASS(pmap_is_la57(pmap)); */
+ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
+ return (&pml4e[pmap_pml4e_index(va)]);
+}
/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
+ pml5_entry_t *pml5e;
+ pml4_entry_t *pml4e;
+ pt_entry_t PG_V;
- return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
+ if (pmap_is_la57(pmap)) {
+ pml5e = pmap_pml5e(pmap, va);
+ PG_V = pmap_valid_bit(pmap);
+ if ((*pml5e & PG_V) == 0)
+ return (NULL);
+ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
+ } else {
+ pml4e = pmap->pm_pmltop;
+ }
+ return (&pml4e[pmap_pml4e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml4e_u(pmap_t pmap, vm_offset_t va)
+{
+ MPASS(!pmap_is_la57(pmap));
+ return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]);
}
/* Return a pointer to the PDP slot that corresponds to a VA */
@@ -1306,7 +1387,7 @@
PG_V = pmap_valid_bit(pmap);
pml4e = pmap_pml4e(pmap, va);
- if ((*pml4e & PG_V) == 0)
+ if (pml4e == NULL || (*pml4e & PG_V) == 0)
return (NULL);
return (pmap_pml4e_to_pdpe(pml4e, va));
}
@@ -1387,21 +1468,37 @@
PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
- u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ u_int64_t mask;
KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
- return (PTmap + ((va >> PAGE_SHIFT) & mask));
+ if (la57) {
+ mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1);
+ return (P5Tmap + ((va >> PAGE_SHIFT) & mask));
+ } else {
+ mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT)) - 1);
+ return (P4Tmap + ((va >> PAGE_SHIFT) & mask));
+ }
}
static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
- u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ u_int64_t mask;
KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
- return (PDmap + ((va >> PDRSHIFT) & mask));
+ if (la57) {
+ mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1);
+ return (P5Dmap + ((va >> PDRSHIFT) & mask));
+ } else {
+ mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT)) - 1);
+ return (P4Dmap + ((va >> PDRSHIFT) & mask));
+ }
}
static u_int64_t
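vtopte() and vtopde() rely on the recursive page-table mapping: the self-referencing PML4 (or PML5) slot turns page-table lookups into plain address arithmetic. A standalone sketch of the LA48 case, using PML4PML4I = 256 as in the headers below:

#include <stdint.h>
#include <stdio.h>

#define PML4PML4I	256UL
#define KV4ADDR(l4, l3, l2, l1) (((unsigned long)-1 << 47) |	\
	((unsigned long)(l4) << 39) | ((unsigned long)(l3) << 30) | \
	((unsigned long)(l2) << 21) | ((unsigned long)(l1) << 12))

int
main(void)
{
	uint64_t p4tmap = KV4ADDR(PML4PML4I, 0, 0, 0);
	uint64_t mask = (1UL << (9 + 9 + 9 + 9)) - 1;	/* 36 index bits */
	uint64_t va = 0xffffffff80200000UL;	/* sample kernel text VA */

	/* vtopte(): each PTE is 8 bytes inside the recursive window. */
	uint64_t pte_va = p4tmap + ((va >> 12) & mask) * 8;

	printf("P4Tmap = %#jx\n", (uintmax_t)p4tmap);
	printf("PTE VA = %#jx\n", (uintmax_t)pte_va);
	return (0);
}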
@@ -1658,6 +1755,8 @@
p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
}
+
+ kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
}
/*
@@ -1730,7 +1829,7 @@
* later unmapped (using pmap_remove()) and freed.
*/
PMAP_LOCK_INIT(kernel_pmap);
- kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
+ kernel_pmap->pm_pmltop = kernel_pml4;
kernel_pmap->pm_cr3 = KPML4phys;
kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
@@ -1891,6 +1990,148 @@
load_cr4(cr4);
}
+extern const char la57_trampoline[], la57_trampoline_gdt_desc[],
+ la57_trampoline_gdt[], la57_trampoline_end[];
+
+static void
+pmap_bootstrap_la57(void *arg __unused)
+{
+ char *v_code;
+ pml5_entry_t *v_pml5;
+ pml4_entry_t *v_pml4;
+ pdp_entry_t *v_pdp;
+ pd_entry_t *v_pd;
+ pt_entry_t *v_pt;
+ vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5;
+ void (*la57_tramp)(uint64_t pml5);
+ struct region_descriptor r_gdt;
+
+ if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0)
+ return;
+ if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57))
+ la57 = 1;
+ if (!la57)
+ return;
+
+ r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
+ r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
+
+ m_code = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_code->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_code);
+ v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code));
+ m_pml5 = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pml5->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pml5);
+ KPML5phys = VM_PAGE_TO_PHYS(m_pml5);
+ v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys);
+ m_pml4 = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pml4->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pml4);
+ v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
+ m_pdp = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pdp->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pdp);
+ v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+ m_pd = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pd->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pd);
+ v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd));
+ m_pt = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pt->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pt);
+ v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt));
+
+ /*
+ * Map m_code 1:1; it appears below 4G in KVA because its physical
+ * address is below 4G. Since kernel KVA is in the upper half,
+ * the pml4e should be zero and free for temporary use.
+ */
+ kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+
+ /*
+ * Add a pml5 entry at the top of KVA pointing to the existing pml4
+ * table, entering all existing kernel mappings into the level 5 table.
+ */
+ v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | pg_g;
+
+ /*
+ * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on.
+ */
+ v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+
+ /*
+ * Copy and call the 48->57 trampoline, hope we return there, alive.
+ */
+ bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline);
+ *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) =
+ la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code);
+ la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code);
+ la57_tramp(KPML5phys);
+
+ /*
+ * The gdt was necessarily reset by the trampoline; switch back to our gdt.
+ */
+ lgdt(&r_gdt);
+ wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
+ load_ds(_udatasel);
+ load_es(_udatasel);
+ load_fs(_ufssel);
+ ssdtosyssd(&gdt_segs[GPROC0_SEL],
+ (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
+ ltr(GSEL(GPROC0_SEL, SEL_KPL));
+
+ /*
+ * Now unmap the trampoline, and free the pages.
+ * Clear pml5 entry used for 1:1 trampoline mapping.
+ */
+ pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]);
+ invlpg((vm_offset_t)v_code);
+ vm_page_free(m_code);
+ vm_page_free(m_pdp);
+ vm_page_free(m_pd);
+ vm_page_free(m_pt);
+
+ /*
+ * Recursively map PML5 to itself in order to get the P5Tmap and
+ * P5Dmap windows.
+ */
+ v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx;
+
+ kernel_pmap->pm_cr3 = KPML5phys;
+ kernel_pmap->pm_pmltop = v_pml5;
+}
+SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL);
+
/*
* Initialize a vm_page's machine-dependent fields.
*/
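One subtlety in pmap_bootstrap_la57() above is the "+ 2" when patching la57_trampoline_gdt_desc: a descriptor loaded with lgdtq is a 16-bit limit followed by a 64-bit base, so the base field to rewrite sits two bytes in. A compile-time sketch of that layout:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct gdt_desc {
	uint16_t	limit;	/* .word la57_trampoline_end - ..._gdt */
	uint64_t	base;	/* the .long 0 patched at run time */
} __attribute__((packed));

static_assert(offsetof(struct gdt_desc, base) == 2, "base lives at +2");

int
main(void)
{
	return (0);
}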
@@ -2190,7 +2431,8 @@
}
for (i = 0; i < lm_ents; i++) {
m = pmap_large_map_getptp_unlocked();
- kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
+ /* XXXKIB la57 */
+ kernel_pml4[LMSPML4I + i] = X86_PG_V |
X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
VM_PAGE_TO_PHYS(m);
}
@@ -3566,44 +3808,57 @@
static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
+ pml5_entry_t *pml5;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ vm_page_t pdpg, pdppg, pml4pg;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
/*
* unmap the page table page
*/
- if (m->pindex >= NUPDE + NUPDPE) {
+ if (m->pindex >= NUPDE + NUPDPE + NUPML4E) {
+ /* PML4 page */
+ MPASS(pmap_is_la57(pmap));
+ pml5 = pmap_pml5e(pmap, va);
+ *pml5 = 0;
+ if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) {
+ pml5 = pmap_pml5e_u(pmap, va);
+ *pml5 = 0;
+ }
+ } else if (m->pindex >= NUPDE + NUPDPE) {
/* PDP page */
- pml4_entry_t *pml4;
pml4 = pmap_pml4e(pmap, va);
*pml4 = 0;
- if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
- pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
+ if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
+ va <= VM_MAXUSER_ADDRESS) {
+ pml4 = pmap_pml4e_u(pmap, va);
*pml4 = 0;
}
} else if (m->pindex >= NUPDE) {
/* PD page */
- pdp_entry_t *pdp;
pdp = pmap_pdpe(pmap, va);
*pdp = 0;
} else {
/* PTE page */
- pd_entry_t *pd;
pd = pmap_pde(pmap, va);
*pd = 0;
}
pmap_resident_count_dec(pmap, 1);
if (m->pindex < NUPDE) {
/* We just released a PT, unhold the matching PD */
- vm_page_t pdpg;
-
pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
pmap_unwire_ptp(pmap, va, pdpg, free);
} else if (m->pindex < NUPDE + NUPDPE) {
/* We just released a PD, unhold the matching PDP */
- vm_page_t pdppg;
-
pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
pmap_unwire_ptp(pmap, va, pdppg, free);
+ } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) {
+ /* We just released a PDP, unhold the matching PML4 */
+ pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME);
+ pmap_unwire_ptp(pmap, va, pml4pg, free);
}
/*
@@ -3659,9 +3914,9 @@
int i;
PMAP_LOCK_INIT(pmap);
- pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
- pmap->pm_pml4u = NULL;
- pmap->pm_cr3 = KPML4phys;
+ pmap->pm_pmltop = kernel_pmap->pm_pmltop;
+ pmap->pm_pmltopu = NULL;
+ pmap->pm_cr3 = kernel_pmap->pm_cr3;
/* hack to keep pmap_pti_pcid_invalidate() alive */
pmap->pm_ucr3 = PMAP_NO_CR3;
pmap->pm_root.rt_root = 0;
@@ -3714,18 +3969,59 @@
/* install large map entries if configured */
for (i = 0; i < lm_ents; i++)
- pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i];
+ pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i];
+}
+
+void
+pmap_pinit_pml5(vm_page_t pml5pg)
+{
+ pml5_entry_t *pm_pml5;
+
+ pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
+
+ /*
+ * Add a pml5 entry at the top of KVA pointing to the existing pml4
+ * table, entering all existing kernel mappings into the level 5 table.
+ */
+ pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
+ pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
+
+ /*
+ * Install self-referential address mapping entry.
+ */
+ pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
+ X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A |
+ pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
}
static void
-pmap_pinit_pml4_pti(vm_page_t pml4pg)
+pmap_pinit_pml4_pti(vm_page_t pml4pgu)
{
- pml4_entry_t *pm_pml4;
+ pml4_entry_t *pm_pml4u;
int i;
- pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+ pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu));
for (i = 0; i < NPML4EPG; i++)
- pm_pml4[i] = pti_pml4[i];
+ pm_pml4u[i] = pti_pml4[i];
+}
+
+static void
+pmap_pinit_pml5_pti(vm_page_t pml5pgu)
+{
+ pml5_entry_t *pm_pml5u;
+
+ pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu));
+
+ /*
+ * Add a pml5 entry at the top of KVA pointing to the existing pml4
+ * pti table, entering all kernel mappings needed for usermode
+ * into the level 5 table.
+ */
+ pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] =
+ pmap_kextract((vm_offset_t)pti_pml4) |
+ X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
+ pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
}
/*
@@ -3735,29 +4031,30 @@
int
pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
{
- vm_page_t pml4pg, pml4pgu;
- vm_paddr_t pml4phys;
+ vm_page_t pmltop_pg, pmltop_pgu;
+ vm_paddr_t pmltop_phys;
int i;
/*
* allocate the page directory page
*/
- pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+ pmltop_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
- pml4phys = VM_PAGE_TO_PHYS(pml4pg);
- pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
+ pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg);
+ pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys);
+
CPU_FOREACH(i) {
pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
pmap->pm_pcids[i].pm_gen = 0;
}
pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */
pmap->pm_ucr3 = PMAP_NO_CR3;
- pmap->pm_pml4u = NULL;
+ pmap->pm_pmltopu = NULL;
pmap->pm_type = pm_type;
- if ((pml4pg->flags & PG_ZERO) == 0)
- pagezero(pmap->pm_pml4);
+ if ((pmltop_pg->flags & PG_ZERO) == 0)
+ pagezero(pmap->pm_pmltop);
/*
* Do not install the host kernel mappings in the nested page
@@ -3766,15 +4063,21 @@
* Install minimal kernel mappings in PTI case.
*/
if (pm_type == PT_X86) {
- pmap->pm_cr3 = pml4phys;
- pmap_pinit_pml4(pml4pg);
+ pmap->pm_cr3 = pmltop_phys;
+ if (pmap_is_la57(pmap))
+ pmap_pinit_pml5(pmltop_pg);
+ else
+ pmap_pinit_pml4(pmltop_pg);
if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {
- pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+ pmltop_pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
- pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
- VM_PAGE_TO_PHYS(pml4pgu));
- pmap_pinit_pml4_pti(pml4pgu);
- pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
+ pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP(
+ VM_PAGE_TO_PHYS(pmltop_pgu));
+ if (pmap_is_la57(pmap))
+ pmap_pinit_pml5_pti(pmltop_pgu);
+ else
+ pmap_pinit_pml4_pti(pmltop_pgu);
+ pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu);
}
if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
rangeset_init(&pmap->pm_pkru, pkru_dup_range,
@@ -3799,13 +4102,87 @@
return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
}
+static pml4_entry_t *
+pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
+ bool addref)
+{
+ vm_pindex_t pml5index;
+ pml5_entry_t *pml5;
+ pml4_entry_t *pml4;
+ vm_page_t pml4pg;
+ pt_entry_t PG_V;
+ bool allocated;
+
+ if (!pmap_is_la57(pmap))
+ return (&pmap->pm_pmltop[pmap_pml4e_index(va)]);
+
+ PG_V = pmap_valid_bit(pmap);
+ pml5index = pmap_pml5e_index(va);
+ pml5 = &pmap->pm_pmltop[pml5index];
+ if ((*pml5 & PG_V) == 0) {
+ if (_pmap_allocpte(pmap, pmap_pml5e_pindex(va), lockp, va) ==
+ NULL)
+ return (NULL);
+ allocated = true;
+ } else {
+ allocated = false;
+ }
+ pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME);
+ pml4 = &pml4[pmap_pml4e_index(va)];
+ if ((*pml4 & PG_V) == 0) {
+ pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME);
+ if (allocated && !addref)
+ pml4pg->ref_count--;
+ else if (!allocated && addref)
+ pml4pg->ref_count++;
+ }
+ return (pml4);
+}
+
+static pdp_entry_t *
+pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
+ bool addref)
+{
+ vm_page_t pdppg;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pt_entry_t PG_V;
+ bool allocated;
+
+ PG_V = pmap_valid_bit(pmap);
+
+ pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false);
+ if (pml4 == NULL)
+ return (NULL);
+
+ if ((*pml4 & PG_V) == 0) {
+ /* Have to allocate a new pdp, recurse */
+ if (_pmap_allocpte(pmap, pmap_pml4e_pindex(va), lockp, va) ==
+ NULL)
+ return (NULL);
+ allocated = true;
+ } else {
+ allocated = false;
+ }
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
+ pdp = &pdp[pmap_pdpe_index(va)];
+ if ((*pdp & PG_V) == 0) {
+ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
+ if (allocated && !addref)
+ pdppg->ref_count--;
+ else if (!allocated && addref)
+ pdppg->ref_count++;
+ }
+ return (pdp);
+}
+
/*
* This routine is called if the desired page table page does not exist.
*
* If page table page allocation fails, this routine may sleep before
* returning NULL. It sleeps only if a lock pointer was given.
*
- * Note: If a page allocation fails at page table level two or three,
+ * Note: If a page allocation fails at page table level two, three, or four,
* one or two pages may be held during the wait, only to be released
* afterwards. This conservative approach is easily argued to avoid
* race conditions.
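The allocated/addref bookkeeping in pmap_allocpte_getpml4() and pmap_allocpte_getpdp() reduces to a single +/-1 fixup, assuming a freshly allocated page-table page already carries one reference from _pmap_allocpte(). A sketch of the four cases:

#include <stdio.h>

int
main(void)
{
	for (int allocated = 0; allocated <= 1; allocated++)
		for (int addref = 0; addref <= 1; addref++) {
			int adj = 0;

			if (allocated && !addref)
				adj = -1;	/* drop the implicit ref */
			else if (!allocated && addref)
				adj = +1;	/* take a new ref */
			printf("allocated=%d addref=%d -> ref_count %+d\n",
			    allocated, addref, adj);
		}
	return (0);
}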
@@ -3823,20 +4200,35 @@
* - for the page directory pointer page,
* ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
* NPML4EPGSHIFT),
- * i.e. index of pml4e is put after the last index of PDPE.
+ * i.e. index of pml4e is put after the last index of PDPE,
+ * - for the PML4 page (if LA57 mode is enabled),
+ * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >>
+ * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT)),
+ * i.e. index of pml5e is put after the last index of PML4E.
*
* Define an order on the paging entries, where all entries of the
* same height are put together, then heights are put from deepest to
* root. Then ptexpindex is the sequential number of the
* corresponding paging entry in this order.
*
- * The root page at PML4 does not participate in this indexing scheme, since
- * it is statically allocated by pmap_pinit() and not by _pmap_allocpte().
+ * The values of NUPDE, NUPDPE, and NUPML4E are fixed by the size of
+ * the LA57 paging structures even in LA48 paging mode, and the
+ * ptepindexes are calculated as if the paging structures were 5-level
+ * regardless of the actual mode of operation.
+ *
+ * The root page at PML4/PML5 does not participate in this indexing scheme,
+ * since it is statically allocated by pmap_pinit() and not by _pmap_allocpte().
*/
static vm_page_t
-_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
+_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
+ vm_offset_t va __unused)
{
- vm_page_t m, pdppg, pdpg;
+ vm_pindex_t pml5index, pml4index;
+ pml5_entry_t *pml5, *pml5u;
+ pml4_entry_t *pml4, *pml4u;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ vm_page_t m, pdpg;
pt_entry_t PG_A, PG_M, PG_RW, PG_V;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
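A standalone restatement of the unified ptepindex numbering described above, computed with the 5-level constants regardless of the mode in effect (shift values 21/30/39/48 as in sys/amd64/include/param.h):

#include <stdint.h>
#include <stdio.h>

#define NUPML5E	(512ULL / 2)		/* userland PML5 pages */
#define NUPML4E	(NUPML5E * 512)		/* userland PML4 pages */
#define NUPDPE	(NUPML4E * 512)		/* userland PDP pages */
#define NUPDE	(NUPDPE * 512)		/* userland PD entries */

/* Each helper returns the pindex of the page-table page for va. */
static uint64_t pde_pindex(uint64_t va)   { return (va >> 21); }
static uint64_t pdpe_pindex(uint64_t va)  { return (NUPDE + (va >> 30)); }
static uint64_t pml4e_pindex(uint64_t va) { return (NUPDE + NUPDPE + (va >> 39)); }
static uint64_t pml5e_pindex(uint64_t va) { return (NUPDE + NUPDPE + NUPML4E + (va >> 48)); }

int
main(void)
{
	uint64_t va = 0x00007f1234561000ULL;	/* sample user VA */

	printf("PT   page pindex: %ju\n", (uintmax_t)pde_pindex(va));
	printf("PD   page pindex: %ju\n", (uintmax_t)pdpe_pindex(va));
	printf("PDP  page pindex: %ju\n", (uintmax_t)pml4e_pindex(va));
	printf("PML4 page pindex: %ju\n", (uintmax_t)pml5e_pindex(va));
	return (0);
}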
@@ -3872,16 +4264,38 @@
* Map the pagetable page into the process address space, if
* it isn't already there.
*/
+ if (ptepindex >= NUPDE + NUPDPE + NUPML4E) {
+ MPASS(pmap_is_la57(pmap));
+
+ pml5index = pmap_pml5e_index(va);
+ pml5 = &pmap->pm_pmltop[pml5index];
+ KASSERT((*pml5 & PG_V) == 0,
+ ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5));
+ *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
- if (ptepindex >= (NUPDE + NUPDPE)) {
- pml4_entry_t *pml4, *pml4u;
- vm_pindex_t pml4index;
+ if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) {
+ if (pmap->pm_ucr3 != PMAP_NO_CR3)
+ *pml5 |= pg_nx;
+ pml5u = &pmap->pm_pmltopu[pml5index];
+ *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
+ PG_A | PG_M;
+ }
+ } else if (ptepindex >= NUPDE + NUPDPE) {
+ pml4index = pmap_pml4e_index(va);
/* Wire up a new PDPE page */
- pml4index = ptepindex - (NUPDE + NUPDPE);
- pml4 = &pmap->pm_pml4[pml4index];
+ pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true);
+ if (pml4 == NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
+ }
+ KASSERT((*pml4 & PG_V) == 0,
+ ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4));
*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
- if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
+
+ if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
+ pml4index < NUPML4E) {
/*
* PTI: Make all user-space mappings in the
* kernel-mode page table no-execute so that
@@ -3892,85 +4306,48 @@
if (pmap->pm_ucr3 != PMAP_NO_CR3)
*pml4 |= pg_nx;
- pml4u = &pmap->pm_pml4u[pml4index];
+ pml4u = &pmap->pm_pmltopu[pml4index];
*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
PG_A | PG_M;
}
-
} else if (ptepindex >= NUPDE) {
- vm_pindex_t pml4index;
- vm_pindex_t pdpindex;
- pml4_entry_t *pml4;
- pdp_entry_t *pdp;
-
/* Wire up a new PDE page */
- pdpindex = ptepindex - NUPDE;
- pml4index = pdpindex >> NPML4EPGSHIFT;
-
- pml4 = &pmap->pm_pml4[pml4index];
- if ((*pml4 & PG_V) == 0) {
- /* Have to allocate a new pdp, recurse */
- if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
- lockp) == NULL) {
- vm_page_unwire_noq(m);
- vm_page_free_zero(m);
- return (NULL);
- }
- } else {
- /* Add reference to pdp page */
- pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
- pdppg->ref_count++;
+ pdp = pmap_allocpte_getpdp(pmap, lockp, va, true);
+ if (pdp == NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
}
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
-
- /* Now find the pdp page */
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
+ KASSERT((*pdp & PG_V) == 0,
+ ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp));
*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
-
} else {
- vm_pindex_t pml4index;
- vm_pindex_t pdpindex;
- pml4_entry_t *pml4;
- pdp_entry_t *pdp;
- pd_entry_t *pd;
-
/* Wire up a new PTE page */
- pdpindex = ptepindex >> NPDPEPGSHIFT;
- pml4index = pdpindex >> NPML4EPGSHIFT;
-
- /* First, find the pdp and check that its valid. */
- pml4 = &pmap->pm_pml4[pml4index];
- if ((*pml4 & PG_V) == 0) {
+ pdp = pmap_allocpte_getpdp(pmap, lockp, va, false);
+ if (pdp == NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
+ }
+ if ((*pdp & PG_V) == 0) {
/* Have to allocate a new pd, recurse */
- if (_pmap_allocpte(pmap, NUPDE + pdpindex,
- lockp) == NULL) {
+ if (_pmap_allocpte(pmap, pmap_pdpe_pindex(va),
+ lockp, va) == NULL) {
vm_page_unwire_noq(m);
vm_page_free_zero(m);
return (NULL);
}
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
} else {
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
- if ((*pdp & PG_V) == 0) {
- /* Have to allocate a new pd, recurse */
- if (_pmap_allocpte(pmap, NUPDE + pdpindex,
- lockp) == NULL) {
- vm_page_unwire_noq(m);
- vm_page_free_zero(m);
- return (NULL);
- }
- } else {
- /* Add reference to the pd page */
- pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
- pdpg->ref_count++;
- }
+ /* Add reference to the pd page */
+ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
+ pdpg->ref_count++;
}
pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
/* Now we know where the page directory page is */
- pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
+ pd = &pd[pmap_pde_index(va)];
+ KASSERT((*pd & PG_V) == 0,
+ ("pmap %p va %#lx pd %#lx", pmap, va, *pd));
*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
}
@@ -4003,7 +4380,7 @@
} else if (va < VM_MAXUSER_ADDRESS) {
/* Allocate a pd page. */
pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT;
- pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
+ pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp, va);
if (pdpg == NULL) {
if (lockp != NULL)
goto retry;
@@ -4064,7 +4441,7 @@
* Here if the pte page isn't mapped, or if it has been
* deallocated.
*/
- m = _pmap_allocpte(pmap, ptepindex, lockp);
+ m = _pmap_allocpte(pmap, ptepindex, lockp, va);
if (m == NULL && lockp != NULL)
goto retry;
}
@@ -4088,28 +4465,35 @@
int i;
KASSERT(pmap->pm_stats.resident_count == 0,
- ("pmap_release: pmap resident count %ld != 0",
- pmap->pm_stats.resident_count));
+ ("pmap_release: pmap %p resident count %ld != 0",
+ pmap, pmap->pm_stats.resident_count));
KASSERT(vm_radix_is_empty(&pmap->pm_root),
- ("pmap_release: pmap has reserved page table page(s)"));
+ ("pmap_release: pmap %p has reserved page table page(s)",
+ pmap));
KASSERT(CPU_EMPTY(&pmap->pm_active),
("releasing active pmap %p", pmap));
- m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
- for (i = 0; i < NKPML4E; i++) /* KVA */
- pmap->pm_pml4[KPML4BASE + i] = 0;
- for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
- pmap->pm_pml4[DMPML4I + i] = 0;
- pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
- for (i = 0; i < lm_ents; i++) /* Large Map */
- pmap->pm_pml4[LMSPML4I + i] = 0;
+ if (pmap_is_la57(pmap)) {
+ pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
+ pmap->pm_pmltop[PML5PML5I] = 0;
+ } else {
+ for (i = 0; i < NKPML4E; i++) /* KVA */
+ pmap->pm_pmltop[KPML4BASE + i] = 0;
+ for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
+ pmap->pm_pmltop[DMPML4I + i] = 0;
+ pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */
+ for (i = 0; i < lm_ents; i++) /* Large Map */
+ pmap->pm_pmltop[LMSPML4I + i] = 0;
+ }
vm_page_unwire_noq(m);
vm_page_free_zero(m);
- if (pmap->pm_pml4u != NULL) {
- m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
+ if (pmap->pm_pmltopu != NULL) {
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->
+ pm_pmltopu));
vm_page_unwire_noq(m);
vm_page_free(m);
}
@@ -5448,6 +5832,7 @@
{
struct rwlock *lock;
vm_offset_t va_next;
+ pml5_entry_t *pml5e;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
@@ -5490,7 +5875,18 @@
if (pmap->pm_stats.resident_count == 0)
break;
- pml4e = pmap_pml4e(pmap, sva);
+ if (pmap_is_la57(pmap)) {
+ pml5e = pmap_pml5e(pmap, sva);
+ if ((*pml5e & PG_V) == 0) {
+ va_next = (sva + NBPML5) & ~PML5MASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ pml4e = pmap_pml5e_to_pml4e(pml5e, sva);
+ } else {
+ pml4e = pmap_pml4e(pmap, sva);
+ }
if ((*pml4e & PG_V) == 0) {
va_next = (sva + NBPML4) & ~PML4MASK;
if (va_next < sva)
@@ -6110,7 +6506,7 @@
*/
nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
- nosleep ? NULL : &lock);
+ nosleep ? NULL : &lock, va);
if (mpte == NULL && nosleep) {
rv = KERN_RESOURCE_SHORTAGE;
goto out;
@@ -6593,7 +6989,8 @@
* Pass NULL instead of the PV list lock
* pointer, because we don't intend to sleep.
*/
- mpte = _pmap_allocpte(pmap, ptepindex, NULL);
+ mpte = _pmap_allocpte(pmap, ptepindex, NULL,
+ va);
if (mpte == NULL)
return (mpte);
}
@@ -9346,11 +9743,11 @@
("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
"%#jx lm_ents %d",
(uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0,
+ KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
"LMSPML4I %#jx lm_ents %d",
(uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME;
+ mphys = kernel_pml4[pml4_idx] & PG_FRAME;
return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
}
@@ -10425,7 +10822,9 @@
mode, range->pdpes, range->pdes, range->ptes);
/* Reset to sentinel value. */
- range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+ range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1);
}
/*
@@ -10519,7 +10918,9 @@
sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
/* Sentinel value. */
- range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+ range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1);
/*
* Iterate over the kernel page tables without holding the kernel pmap
@@ -10549,7 +10950,7 @@
sva |= -1ul << 48;
restart:
- pml4e = kernel_pmap->pm_pml4[i];
+ pml4e = kernel_pml4[i];
if ((pml4e & X86_PG_V) == 0) {
sva = rounddown2(sva, NBPML4);
sysctl_kmaps_dump(sb, &range, sva);
@@ -10632,6 +11033,7 @@
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
pmap_t pmap;
+ pml5_entry_t *pml5;
pml4_entry_t *pml4;
pdp_entry_t *pdp;
pd_entry_t *pde;
@@ -10650,8 +11052,20 @@
pmap = PCPU_GET(curpmap);
PG_V = pmap_valid_bit(pmap);
- pml4 = pmap_pml4e(pmap, va);
- db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4);
+ db_printf("VA 0x%016lx", va);
+
+ if (pmap_is_la57(pmap)) {
+ pml5 = pmap_pml5e(pmap, va);
+ db_printf(" pml5e 0x%016lx", *pml5);
+ if ((*pml5 & PG_V) == 0) {
+ db_printf("\n");
+ return;
+ }
+ pml4 = pmap_pml5e_to_pml4e(pml5, va);
+ } else {
+ pml4 = pmap_pml4e(pmap, va);
+ }
+ db_printf(" pml4e 0x%016lx", *pml4);
if ((*pml4 & PG_V) == 0) {
db_printf("\n");
return;
@@ -10683,4 +11097,95 @@
db_printf("show phys2dmap addr\n");
}
}
+
+static void
+ptpages_show_page(int level, int idx, vm_page_t pg)
+{
+ db_printf("l %d i %d pg %p phys %#lx ref %x\n",
+ level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
+}
+
+static void
+ptpages_show_complain(int level, int idx, uint64_t pte)
+{
+ db_printf("l %d i %d pte %#lx\n", level, idx, pte);
+}
+
+static void
+ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
+{
+ vm_page_t pg3, pg2, pg1;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ int i4, i3, i2;
+
+ pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
+ for (i4 = 0; i4 < num_entries; i4++) {
+ if ((pml4[i4] & PG_V) == 0)
+ continue;
+ pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME);
+ if (pg3 == NULL) {
+ ptpages_show_complain(3, i4, pml4[i4]);
+ continue;
+ }
+ ptpages_show_page(3, i4, pg3);
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3));
+ for (i3 = 0; i3 < NPDPEPG; i3++) {
+ if ((pdp[i3] & PG_V) == 0)
+ continue;
+ pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME);
+ if (pg2 == NULL) {
+ ptpages_show_complain(2, i3, pdp[i3]);
+ continue;
+ }
+ ptpages_show_page(2, i3, pg2);
+ pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
+ for (i2 = 0; i2 < NPDEPG; i2++) {
+ if ((pd[i2] & PG_V) == 0)
+ continue;
+ pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
+ if (pg1 == NULL) {
+ ptpages_show_complain(1, i2, pd[i2]);
+ continue;
+ }
+ ptpages_show_page(1, i2, pg1);
+ }
+ }
+ }
+}
+
+DB_SHOW_COMMAND(ptpages, pmap_ptpages)
+{
+ pmap_t pmap;
+ vm_page_t pg;
+ pml5_entry_t *pml5;
+ uint64_t PG_V;
+ int i5;
+
+ if (have_addr)
+ pmap = (pmap_t)addr;
+ else
+ pmap = PCPU_GET(curpmap);
+
+ PG_V = pmap_valid_bit(pmap);
+
+ if (pmap_is_la57(pmap)) {
+ pml5 = pmap->pm_pmltop;
+ for (i5 = 0; i5 < NUPML5E; i5++) {
+ if ((pml5[i5] & PG_V) == 0)
+ continue;
+ pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
+ if (pg == NULL) {
+ ptpages_show_complain(4, i5, pml5[i5]);
+ continue;
+ }
+ ptpages_show_page(4, i5, pg);
+ ptpages_show_pml4(pg, NPML4EPG, PG_V);
+ }
+ } else {
+ ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
+ (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
+ }
+}
#endif
Index: sys/amd64/amd64/vm_machdep.c
===================================================================
--- sys/amd64/amd64/vm_machdep.c
+++ sys/amd64/amd64/vm_machdep.c
@@ -377,21 +377,67 @@
}
static void
-cpu_procctl_kpti(struct proc *p, int com, int *val)
+cpu_procctl_kpti_ctl(struct proc *p, int val)
{
- if (com == PROC_KPTI_CTL) {
- if (pti && *val == PROC_KPTI_CTL_ENABLE_ON_EXEC)
- p->p_md.md_flags |= P_MD_KPTI;
- if (*val == PROC_KPTI_CTL_DISABLE_ON_EXEC)
- p->p_md.md_flags &= ~P_MD_KPTI;
- } else /* PROC_KPTI_STATUS */ {
- *val = (p->p_md.md_flags & P_MD_KPTI) != 0 ?
- PROC_KPTI_CTL_ENABLE_ON_EXEC:
- PROC_KPTI_CTL_DISABLE_ON_EXEC;
- if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3)
- *val |= PROC_KPTI_STATUS_ACTIVE;
+ if (pti && val == PROC_KPTI_CTL_ENABLE_ON_EXEC)
+ p->p_md.md_flags |= P_MD_KPTI;
+ if (val == PROC_KPTI_CTL_DISABLE_ON_EXEC)
+ p->p_md.md_flags &= ~P_MD_KPTI;
+}
+
+static void
+cpu_procctl_kpti_status(struct proc *p, int *val)
+{
+ *val = (p->p_md.md_flags & P_MD_KPTI) != 0 ?
+ PROC_KPTI_CTL_ENABLE_ON_EXEC:
+ PROC_KPTI_CTL_DISABLE_ON_EXEC;
+ if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3)
+ *val |= PROC_KPTI_STATUS_ACTIVE;
+}
+
+static int
+cpu_procctl_la_ctl(struct proc *p, int val)
+{
+ int error;
+
+ error = 0;
+ switch (val) {
+ case PROC_LA_CTL_LA48_ON_EXEC:
+ p->p_md.md_flags |= P_MD_LA48;
+ p->p_md.md_flags &= ~P_MD_LA57;
+ break;
+ case PROC_LA_CTL_LA57_ON_EXEC:
+ if (la57) {
+ p->p_md.md_flags &= ~P_MD_LA48;
+ p->p_md.md_flags |= P_MD_LA57;
+ } else {
+ error = ENOTSUP;
+ }
+ break;
+ case PROC_LA_CTL_DEFAULT_ON_EXEC:
+ p->p_md.md_flags &= ~(P_MD_LA48 | P_MD_LA57);
+ break;
}
+ return (error);
+}
+
+static void
+cpu_procctl_la_status(struct proc *p, int *val)
+{
+ int res;
+
+ if ((p->p_md.md_flags & P_MD_LA48) != 0)
+ res = PROC_LA_CTL_LA48_ON_EXEC;
+ else if ((p->p_md.md_flags & P_MD_LA57) != 0)
+ res = PROC_LA_CTL_LA57_ON_EXEC;
+ else
+ res = PROC_LA_CTL_DEFAULT_ON_EXEC;
+ if (p->p_sysent->sv_maxuser == VM_MAXUSER_ADDRESS_LA48)
+ res |= PROC_LA_STATUS_LA48;
+ else
+ res |= PROC_LA_STATUS_LA57;
+ *val = res;
}
int
@@ -403,6 +449,8 @@
switch (com) {
case PROC_KPTI_CTL:
case PROC_KPTI_STATUS:
+ case PROC_LA_CTL:
+ case PROC_LA_STATUS:
if (idtype != P_PID) {
error = EINVAL;
break;
@@ -412,22 +460,45 @@
error = priv_check(td, PRIV_IO);
if (error != 0)
break;
+ }
+ if (com == PROC_KPTI_CTL || com == PROC_LA_CTL) {
error = copyin(data, &val, sizeof(val));
if (error != 0)
break;
- if (val != PROC_KPTI_CTL_ENABLE_ON_EXEC &&
- val != PROC_KPTI_CTL_DISABLE_ON_EXEC) {
- error = EINVAL;
- break;
- }
+ }
+ if (com == PROC_KPTI_CTL &&
+ val != PROC_KPTI_CTL_ENABLE_ON_EXEC &&
+ val != PROC_KPTI_CTL_DISABLE_ON_EXEC) {
+ error = EINVAL;
+ break;
+ }
+ if (com == PROC_LA_CTL &&
+ val != PROC_LA_CTL_LA48_ON_EXEC &&
+ val != PROC_LA_CTL_LA57_ON_EXEC &&
+ val != PROC_LA_CTL_DEFAULT_ON_EXEC) {
+ error = EINVAL;
+ break;
}
error = pget(id, PGET_CANSEE | PGET_NOTWEXIT | PGET_NOTID, &p);
- if (error == 0) {
- cpu_procctl_kpti(p, com, &val);
- PROC_UNLOCK(p);
- if (com == PROC_KPTI_STATUS)
- error = copyout(&val, data, sizeof(val));
+ if (error != 0)
+ break;
+ switch (com) {
+ case PROC_KPTI_CTL:
+ cpu_procctl_kpti_ctl(p, val);
+ break;
+ case PROC_KPTI_STATUS:
+ cpu_procctl_kpti_status(p, &val);
+ break;
+ case PROC_LA_CTL:
+ error = cpu_procctl_la_ctl(p, val);
+ break;
+ case PROC_LA_STATUS:
+ cpu_procctl_la_status(p, &val);
+ break;
}
+ PROC_UNLOCK(p);
+ if (com == PROC_KPTI_STATUS || com == PROC_LA_STATUS)
+ error = copyout(&val, data, sizeof(val));
break;
default:
error = EINVAL;
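A hedged userland sketch of the new PROC_LA_CTL / PROC_LA_STATUS commands handled above; the PROC_LA_* constants are assumed to come from the companion sys/procctl.h change, which is not part of this diff:

#include <sys/procctl.h>
#include <sys/wait.h>

#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int val;

	/* Request a 48-bit address space for images exec'ed afterwards. */
	val = PROC_LA_CTL_LA48_ON_EXEC;
	if (procctl(P_PID, getpid(), PROC_LA_CTL, &val) != 0)
		err(1, "PROC_LA_CTL");

	if (procctl(P_PID, getpid(), PROC_LA_STATUS, &val) != 0)
		err(1, "PROC_LA_STATUS");
	printf("current address space is %s\n",
	    (val & PROC_LA_STATUS_LA57) != 0 ? "LA57" : "LA48");
	return (0);
}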
Index: sys/amd64/include/md_var.h
===================================================================
--- sys/amd64/include/md_var.h
+++ sys/amd64/include/md_var.h
@@ -46,6 +46,8 @@
extern vm_paddr_t intel_graphics_stolen_base;
extern vm_paddr_t intel_graphics_stolen_size;
+extern int la57;
+
/*
* The file "conf/ldscript.amd64" defines the symbol "kernphys". Its
* value is the physical address at which the kernel is loaded.
Index: sys/amd64/include/param.h
===================================================================
--- sys/amd64/include/param.h
+++ sys/amd64/include/param.h
@@ -118,6 +118,12 @@
#define PML4SHIFT 39 /* LOG2(NBPML4) */
#define NBPML4 (1UL<<PML4SHIFT)/* bytes/page map lev4 table */
#define PML4MASK (NBPML4-1)
+/* Size of the level 5 page-map level-5 table units */
+#define NPML5EPG (PAGE_SIZE/(sizeof (pml5_entry_t)))
+#define NPML5EPGSHIFT 9 /* LOG2(NPML5EPG) */
+#define PML5SHIFT 48 /* LOG2(NBPML5) */
+#define NBPML5 (1UL<<PML5SHIFT)/* bytes/page map lev5 table */
+#define PML5MASK (NBPML5-1)
#define MAXPAGESIZES 3 /* maximum number of supported page sizes */
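A quick compile-time check of the new level-5 constants, assuming an LP64 target: one PML5 entry maps 2^48 bytes (256 TiB), and a full page of 512 entries spans the whole 57-bit address space:

#include <assert.h>
#include <stdint.h>

#define PML5SHIFT	48
#define NBPML5		(1UL << PML5SHIFT)
#define NPML5EPG	(4096 / sizeof(uint64_t))	/* pml5_entry_t */

static_assert(NBPML5 == 256UL << 40, "one PML5 entry maps 256 TiB");
static_assert(NPML5EPG == 512, "512 entries per PML5 page");
static_assert(NPML5EPG * NBPML5 == 1UL << 57, "full 57-bit address span");

int
main(void)
{
	return (0);
}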
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h
+++ sys/amd64/include/pmap.h
@@ -166,14 +166,22 @@
* Pte related macros. This is complicated by having to deal with
* the sign extension of the 48th bit.
*/
-#define KVADDR(l4, l3, l2, l1) ( \
+#define KV4ADDR(l4, l3, l2, l1) ( \
((unsigned long)-1 << 47) | \
((unsigned long)(l4) << PML4SHIFT) | \
((unsigned long)(l3) << PDPSHIFT) | \
((unsigned long)(l2) << PDRSHIFT) | \
((unsigned long)(l1) << PAGE_SHIFT))
+#define KV5ADDR(l5, l4, l3, l2, l1) ( \
+ ((unsigned long)-1 << 56) | \
+ ((unsigned long)(l5) << PML5SHIFT) | \
+ ((unsigned long)(l4) << PML4SHIFT) | \
+ ((unsigned long)(l3) << PDPSHIFT) | \
+ ((unsigned long)(l2) << PDRSHIFT) | \
+ ((unsigned long)(l1) << PAGE_SHIFT))
-#define UVADDR(l4, l3, l2, l1) ( \
+#define UVADDR(l5, l4, l3, l2, l1) ( \
+ ((unsigned long)(l5) << PML5SHIFT) | \
((unsigned long)(l4) << PML4SHIFT) | \
((unsigned long)(l3) << PDPSHIFT) | \
((unsigned long)(l2) << PDRSHIFT) | \
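The only difference between KV4ADDR and KV5ADDR is which bit gets replicated into the canonical-address hole: bit 47 for 4-level paging, bit 56 for 5-level. A sketch printing the recursive-map bases for the shared self-reference index 256:

#include <stdio.h>

#define KV4ADDR(l4, l3, l2, l1) (((unsigned long)-1 << 47) |	\
	((unsigned long)(l4) << 39) | ((unsigned long)(l3) << 30) | \
	((unsigned long)(l2) << 21) | ((unsigned long)(l1) << 12))

#define KV5ADDR(l5, l4, l3, l2, l1) (((unsigned long)-1 << 56) | \
	((unsigned long)(l5) << 48) |				\
	((unsigned long)(l4) << 39) | ((unsigned long)(l3) << 30) | \
	((unsigned long)(l2) << 21) | ((unsigned long)(l1) << 12))

int
main(void)
{
	/* PML4PML4I == PML5PML5I == 256 per the defines below. */
	printf("4-level recursive base: %#lx\n", KV4ADDR(256UL, 0, 0, 0));
	printf("5-level recursive base: %#lx\n", KV5ADDR(256UL, 0, 0, 0, 0));
	return (0);
}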
@@ -187,9 +195,19 @@
*/
#define NKPML4E 4
-#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */
-#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */
-#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */
+/*
+ * We use consistent numbering of the page table pages for 5-level and
+ * 4-level paging structures.
+ */
+#define NUPML5E (NPML5EPG / 2) /* number of userland PML5
+ pages */
+#define NUPML4E (NUPML5E * NPML4EPG) /* number of userland PML4
+ pages */
+#define NUPDPE (NUPML4E * NPDPEPG) /* number of userland PDP
+ pages */
+#define NUPDE (NUPDPE * NPDEPG) /* number of userland PD
+ entries */
+#define NUP4ML4E (NPML4EPG / 2)
/*
* NDMPML4E is the maximum number of PML4 entries that will be
@@ -216,7 +234,8 @@
* Or, in other words, KPML4I provides bits 39..47 of KERNBASE,
* and KPDPI provides bits 30..38.)
*/
-#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */
+#define PML4PML4I (NPML4EPG / 2) /* Index of recursive pml4 mapping */
+#define PML5PML5I (NPML5EPG / 2) /* Index of recursive pml5 mapping */
#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */
#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
@@ -258,25 +277,34 @@
typedef u_int64_t pt_entry_t;
typedef u_int64_t pdp_entry_t;
typedef u_int64_t pml4_entry_t;
+typedef u_int64_t pml5_entry_t;
/*
* Address of current address space page table maps and directories.
*/
#ifdef _KERNEL
-#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0))
-#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0))
-#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0))
-#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I))
-#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t)))
-#define PTmap ((pt_entry_t *)(addr_PTmap))
-#define PDmap ((pd_entry_t *)(addr_PDmap))
-#define PDPmap ((pd_entry_t *)(addr_PDPmap))
-#define PML4map ((pd_entry_t *)(addr_PML4map))
-#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e))
+#define addr_P4Tmap (KV4ADDR(PML4PML4I, 0, 0, 0))
+#define addr_P4Dmap (KV4ADDR(PML4PML4I, PML4PML4I, 0, 0))
+#define addr_P4DPmap (KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0))
+#define addr_P4ML4map (KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I))
+#define addr_P4ML4pml4e (addr_P4ML4map + (PML4PML4I * sizeof(pml4_entry_t)))
+#define P4Tmap ((pt_entry_t *)(addr_P4Tmap))
+#define P4Dmap ((pd_entry_t *)(addr_P4Dmap))
+
+#define addr_P5Tmap (KV5ADDR(PML5PML5I, 0, 0, 0, 0))
+#define addr_P5Dmap (KV5ADDR(PML5PML5I, PML5PML5I, 0, 0, 0))
+#define addr_P5DPmap (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, 0, 0))
+#define addr_P5ML4map (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I, 0))
+#define addr_P5ML5map \
(KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I))
+#define addr_P5ML5pml5e (addr_P5ML5map + (PML5PML5I * sizeof(pml5_entry_t)))
+#define P5Tmap ((pt_entry_t *)(addr_P5Tmap))
+#define P5Dmap ((pd_entry_t *)(addr_P5Dmap))
extern int nkpt; /* Initial number of kernel page tables */
extern u_int64_t KPDPphys; /* physical address of kernel level 3 */
extern u_int64_t KPML4phys; /* physical address of kernel level 4 */
+extern u_int64_t KPML5phys; /* physical address of kernel level 5 */
/*
* virtual address to page table entry and
@@ -333,8 +361,8 @@
*/
struct pmap {
struct mtx pm_mtx;
- pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
- pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */
+ pml4_entry_t *pm_pmltop; /* KVA of top level page table */
+ pml4_entry_t *pm_pmltopu; /* KVA of user top page table */
uint64_t pm_cr3;
uint64_t pm_ucr3;
TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
@@ -447,6 +475,7 @@
boolean_t pmap_page_is_mapped(vm_page_t m);
void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
void pmap_pinit_pml4(vm_page_t);
+void pmap_pinit_pml5(vm_page_t);
bool pmap_ps_enabled(pmap_t pmap);
void pmap_unmapdev(vm_offset_t, vm_size_t);
void pmap_invalidate_page(pmap_t, vm_offset_t);
@@ -502,6 +531,13 @@
return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}
+static __inline vm_pindex_t
+pmap_pml5e_index(vm_offset_t va)
+{
+
+ return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1));
+}
+
#endif /* !LOCORE */
#endif /* !_MACHINE_PMAP_H_ */
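
pmap_pml5e_index() follows the same shift-and-mask pattern as the existing per-level index helpers. A standalone illustration of the full 5-level decomposition, with hard-coded shifts matching PML5SHIFT/PML4SHIFT/PDPSHIFT/PDRSHIFT/PAGE_SHIFT; the helper and the sample address are illustrative, not from the patch:

	#include <stdint.h>
	#include <stdio.h>

	/* 9 bits of index per paging level, 4 KB pages. */
	static unsigned
	level_index(uint64_t va, unsigned shift)
	{
		return ((va >> shift) & 0x1ff);
	}

	int
	main(void)
	{
		uint64_t va = 0x00ff123456789000ULL;	/* arbitrary example VA */

		printf("pml5 %u pml4 %u pdp %u pd %u pt %u\n",
		    level_index(va, 48), level_index(va, 39),
		    level_index(va, 30), level_index(va, 21),
		    level_index(va, 12));
		return (0);
	}
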
Index: sys/amd64/include/proc.h
===================================================================
--- sys/amd64/include/proc.h
+++ sys/amd64/include/proc.h
@@ -84,6 +84,8 @@
};
#define P_MD_KPTI 0x00000001 /* Enable KPTI on exec */
+#define P_MD_LA48 0x00000002 /* Request LA48 after exec */
+#define P_MD_LA57 0x00000004 /* Request LA57 after exec */
#define KINFO_PROC_SIZE 1088
#define KINFO_PROC32_SIZE 768
Index: sys/amd64/include/vmm.h
===================================================================
--- sys/amd64/include/vmm.h
+++ sys/amd64/include/vmm.h
@@ -520,6 +520,7 @@
PAGING_MODE_32,
PAGING_MODE_PAE,
PAGING_MODE_64,
+ PAGING_MODE_64_LA57,
};
struct vm_guest_paging {
Index: sys/amd64/include/vmparam.h
===================================================================
--- sys/amd64/include/vmparam.h
+++ sys/amd64/include/vmparam.h
@@ -169,25 +169,32 @@
* 0xffffffff80000000 KERNBASE
*/
-#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4BASE, 0, 0, 0)
-#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4BASE + NKPML4E - 1, \
+#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0)
+#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \
NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-#define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0)
-#define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)
+#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0)
+#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0)
-#define LARGEMAP_MIN_ADDRESS KVADDR(LMSPML4I, 0, 0, 0)
-#define LARGEMAP_MAX_ADDRESS KVADDR(LMEPML4I + 1, 0, 0, 0)
+#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0)
+#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0)
-#define KERNBASE KVADDR(KPML4I, KPDPI, 0, 0)
+#define KERNBASE KV4ADDR(KPML4I, KPDPI, 0, 0)
-#define UPT_MAX_ADDRESS KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
-#define UPT_MIN_ADDRESS KVADDR(PML4PML4I, 0, 0, 0)
+#define UPT_MAX_ADDRESS KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
+#define UPT_MIN_ADDRESS KV4ADDR(PML4PML4I, 0, 0, 0)
-#define VM_MAXUSER_ADDRESS UVADDR(NUPML4E, 0, 0, 0)
+#define VM_MAXUSER_ADDRESS_LA57 UVADDR(NUPML5E, 0, 0, 0, 0)
+#define VM_MAXUSER_ADDRESS_LA48 UVADDR(0, NUP4ML4E, 0, 0, 0)
+#define VM_MAXUSER_ADDRESS VM_MAXUSER_ADDRESS_LA57
-#define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE)
-#define USRSTACK SHAREDPAGE
+#define SHAREDPAGE_LA57 (VM_MAXUSER_ADDRESS_LA57 - PAGE_SIZE)
+#define SHAREDPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - PAGE_SIZE)
+#define USRSTACK_LA57 SHAREDPAGE_LA57
+#define USRSTACK_LA48 SHAREDPAGE_LA48
+#define USRSTACK USRSTACK_LA48
+#define PS_STRINGS_LA57 (USRSTACK_LA57 - sizeof(struct ps_strings))
+#define PS_STRINGS_LA48 (USRSTACK_LA48 - sizeof(struct ps_strings))
#define VM_MAX_ADDRESS UPT_MAX_ADDRESS
#define VM_MIN_ADDRESS (0)
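
With half of each 512-entry top-level table reserved for userland, the two limits work out to 2^47 bytes for LA48 and 2^56 bytes for LA57. A quick recomputation, assuming NUP4ML4E == 256 and NUPML5E == 256 as defined earlier:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		/* 256 top-level entries each; << 39 for PML4, << 48 for PML5. */
		uint64_t la48_max = 256ULL << 39;	/* 2^47 */
		uint64_t la57_max = 256ULL << 48;	/* 2^56 */

		printf("LA48 max user VA 0x%016jx\n", (uintmax_t)la48_max);
		printf("LA57 max user VA 0x%016jx\n", (uintmax_t)la57_max);
		return (0);
	}
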
Index: sys/amd64/linux/linux_sysvec.c
===================================================================
--- sys/amd64/linux/linux_sysvec.c
+++ sys/amd64/linux/linux_sysvec.c
@@ -739,9 +739,9 @@
.sv_imgact_try = linux_exec_imgact_try,
.sv_minsigstksz = LINUX_MINSIGSTKSZ,
.sv_minuser = VM_MIN_ADDRESS,
- .sv_maxuser = VM_MAXUSER_ADDRESS,
- .sv_usrstack = USRSTACK,
- .sv_psstrings = PS_STRINGS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS_LA48,
+ .sv_usrstack = USRSTACK_LA48,
+ .sv_psstrings = PS_STRINGS_LA48,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_auxargs = linux_copyout_auxargs,
.sv_copyout_strings = linux_copyout_strings,
@@ -752,7 +752,7 @@
.sv_set_syscall_retval = linux_set_syscall_retval,
.sv_fetch_syscall_args = linux_fetch_syscall_args,
.sv_syscallnames = NULL,
- .sv_shared_page_base = SHAREDPAGE,
+ .sv_shared_page_base = SHAREDPAGE_LA48,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = linux_schedtail,
.sv_thread_detach = linux_thread_detach,
Index: sys/amd64/vmm/amd/svm.c
===================================================================
--- sys/amd64/vmm/amd/svm.c
+++ sys/amd64/vmm/amd/svm.c
@@ -560,7 +560,7 @@
panic("contigmalloc of SVM IO bitmap failed");
svm_sc->vm = vm;
- svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
+ svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pmltop);
/*
* Intercept read and write accesses to all MSRs.
Index: sys/amd64/vmm/intel/vmx.c
===================================================================
--- sys/amd64/vmm/intel/vmx.c
+++ sys/amd64/vmm/intel/vmx.c
@@ -973,7 +973,7 @@
}
vmx->vm = vm;
- vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
+ vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop));
/*
* Clean up EPTP-tagged guest physical and combined mappings
@@ -1871,14 +1871,18 @@
static enum vm_paging_mode
vmx_paging_mode(void)
{
+ uint64_t cr4;
if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
return (PAGING_MODE_FLAT);
- if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
+ cr4 = vmcs_read(VMCS_GUEST_CR4);
+ if (!(cr4 & CR4_PAE))
return (PAGING_MODE_32);
- if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
- return (PAGING_MODE_64);
- else
+ if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) {
+ if (!(cr4 & CR4_LA57))
+ return (PAGING_MODE_64);
+ return (PAGING_MODE_64_LA57);
+ } else
return (PAGING_MODE_PAE);
}
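
The decode order matters: PG first, then PAE, then LME, then LA57. A hedged standalone restatement of the same decision tree, with raw register values passed as parameters instead of vmcs_read() calls and the bit masks written out numerically (CR0_PG, CR4_PAE, EFER_LME, CR4_LA57):

	#include <stdint.h>
	#include <stdio.h>

	enum mode { FLAT, M32, PAE, M64, M64_LA57 };

	static enum mode
	decode_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer)
	{
		if ((cr0 & 0x80000000) == 0)	/* CR0_PG: paging disabled */
			return (FLAT);
		if ((cr4 & 0x20) == 0)		/* CR4_PAE clear */
			return (M32);
		if ((efer & 0x100) == 0)	/* EFER_LME clear */
			return (PAE);
		return ((cr4 & 0x1000) != 0 ? M64_LA57 : M64);	/* CR4_LA57 */
	}

	int
	main(void)
	{
		/* Long mode with CR4.LA57 set decodes to M64_LA57 (prints 4). */
		printf("%d\n", decode_paging_mode(0x80000001, 0x1020, 0x100));
		return (0);
	}
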
Index: sys/amd64/vmm/vmm_instruction_emul.c
===================================================================
--- sys/amd64/vmm/vmm_instruction_emul.c
+++ sys/amd64/vmm/vmm_instruction_emul.c
@@ -2189,8 +2189,12 @@
ptpphys = pte;
nlevels = 2;
- } else
+ } else if (paging->paging_mode == PAGING_MODE_64_LA57) {
+ nlevels = 5;
+ } else {
nlevels = 4;
+ }
+
while (--nlevels >= 0) {
/* Zero out the lower 12 bits and the upper 12 bits */
ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
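
With PAGING_MODE_64_LA57 the walk simply consumes one more 9-bit index group, starting from the PML5. A small sketch of the per-level index extraction, assuming 4 KB pages; ptp_index() is an illustrative helper, not part of the emulator:

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Index of a guest-linear address at a given walk level; level 0
	 * is the page table and level 4 the PML5, matching nlevels - 1.
	 */
	static unsigned
	ptp_index(uint64_t gla, int level)
	{
		return ((gla >> (12 + 9 * level)) & 0x1ff);
	}

	int
	main(void)
	{
		uint64_t gla = 0x00ff123456789abcULL;
		int level;

		for (level = 4; level >= 0; level--)
			printf("level %d index %u\n", level,
			    ptp_index(gla, level));
		return (0);
	}
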
Index: sys/cddl/dev/dtrace/amd64/dtrace_subr.c
===================================================================
--- sys/cddl/dev/dtrace/amd64/dtrace_subr.c
+++ sys/cddl/dev/dtrace/amd64/dtrace_subr.c
@@ -43,6 +43,7 @@
#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/frame.h>
+#include <machine/md_var.h>
#include <machine/psl.h>
#include <machine/trap.h>
#include <vm/pmap.h>
@@ -131,7 +132,7 @@
void
dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
{
- (*func)(0, (uintptr_t) addr_PTmap);
+ (*func)(0, la57 ? (uintptr_t)addr_P5Tmap : (uintptr_t)addr_P4Tmap);
}
void
Index: sys/kern/imgact_elf.c
===================================================================
--- sys/kern/imgact_elf.c
+++ sys/kern/imgact_elf.c
@@ -97,7 +97,8 @@
int32_t *osrel);
static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
static boolean_t __elfN(check_note)(struct image_params *imgp,
- Elf_Brandnote *checknote, int32_t *osrel, uint32_t *fctl0);
+ Elf_Brandnote *checknote, int32_t *osrel, boolean_t *has_fctl0,
+ uint32_t *fctl0);
static vm_prot_t __elfN(trans_prot)(Elf_Word);
static Elf_Word __elfN(untrans_prot)(vm_prot_t);
@@ -309,7 +310,7 @@
{
const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
Elf_Brandinfo *bi, *bi_m;
- boolean_t ret;
+ boolean_t ret, has_fctl0;
int i, interp_name_len;
interp_name_len = interp != NULL ? strlen(interp) + 1 : 0;
@@ -331,11 +332,16 @@
continue;
if (hdr->e_machine == bi->machine && (bi->flags &
(BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
+ has_fctl0 = false;
+ *fctl0 = 0;
+ *osrel = 0;
ret = __elfN(check_note)(imgp, bi->brand_note, osrel,
- fctl0);
+ &has_fctl0, fctl0);
/* Give brand a chance to veto check_note's guess */
- if (ret && bi->header_supported)
- ret = bi->header_supported(imgp);
+ if (ret && bi->header_supported) {
+ ret = bi->header_supported(imgp, osrel,
+ has_fctl0 ? fctl0 : NULL);
+ }
/*
* If note checker claimed the binary, but the
* interpreter path in the image does not
@@ -374,7 +380,7 @@
bi->compat_3_brand) == 0))) {
/* Looks good, but give brand a chance to veto */
if (bi->header_supported == NULL ||
- bi->header_supported(imgp)) {
+ bi->header_supported(imgp, NULL, NULL)) {
/*
* Again, prefer strictly matching
* interpreter path.
@@ -402,7 +408,7 @@
bi->header_supported == NULL)
continue;
if (hdr->e_machine == bi->machine) {
- ret = bi->header_supported(imgp);
+ ret = bi->header_supported(imgp, NULL, NULL);
if (ret)
return (bi);
}
@@ -422,7 +428,7 @@
strlen(bi->interp_path) + 1 == interp_name_len &&
strncmp(interp, bi->interp_path, interp_name_len)
== 0 && (bi->header_supported == NULL ||
- bi->header_supported(imgp)))
+ bi->header_supported(imgp, NULL, NULL)))
return (bi);
}
}
@@ -436,7 +442,7 @@
if (hdr->e_machine == bi->machine &&
__elfN(fallback_brand) == bi->brand &&
(bi->header_supported == NULL ||
- bi->header_supported(imgp)))
+ bi->header_supported(imgp, NULL, NULL)))
return (bi);
}
return (NULL);
@@ -2657,6 +2663,7 @@
};
struct fctl_cb_arg {
+ boolean_t *has_fctl0;
uint32_t *fctl0;
};
@@ -2671,6 +2678,7 @@
p = (uintptr_t)(note + 1);
p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
desc = (const Elf32_Word *)p;
+ *arg->has_fctl0 = TRUE;
*arg->fctl0 = desc[0];
return (TRUE);
}
@@ -2683,7 +2691,7 @@
*/
static boolean_t
__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote,
- int32_t *osrel, uint32_t *fctl0)
+ int32_t *osrel, boolean_t *has_fctl0, uint32_t *fctl0)
{
const Elf_Phdr *phdr;
const Elf_Ehdr *hdr;
@@ -2695,6 +2703,7 @@
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
b_arg.brandnote = brandnote;
b_arg.osrel = osrel;
+ f_arg.has_fctl0 = has_fctl0;
f_arg.fctl0 = fctl0;
for (i = 0; i < hdr->e_phnum; i++) {
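
The extra has_fctl0 out-parameter lets header_supported hooks distinguish "note absent" from "note present with value 0". A hypothetical hook under the new three-argument signature (a kernel-context sketch, not compilable standalone; the LA48 handling is a guess at what an amd64 hook might do with the new P_MD_LA48 flag):

	/* Hypothetical header_supported hook; names are illustrative. */
	static boolean_t
	example_header_supported(struct image_params *imgp,
	    int32_t *osrel __unused, uint32_t *fctl0)
	{
		/*
		 * fctl0 == NULL means the binary carried no feature-control
		 * note at all; a note with value 0 arrives as a non-NULL
		 * pointer to 0, which is why has_fctl0 exists.
		 */
		if (fctl0 != NULL && (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0)
			imgp->proc->p_md.md_flags |= P_MD_LA48;
		return (TRUE);
	}
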
Index: sys/kern/kern_sharedpage.c
===================================================================
--- sys/kern/kern_sharedpage.c
+++ sys/kern/kern_sharedpage.c
@@ -288,3 +288,21 @@
#endif
}
}
+
+void
+exec_sysvec_init_secondary(struct sysentvec *sv, struct sysentvec *sv2)
+{
+ MPASS((sv2->sv_flags & SV_ABI_MASK) == (sv->sv_flags & SV_ABI_MASK));
+ MPASS((sv2->sv_flags & SV_TIMEKEEP) == (sv->sv_flags & SV_TIMEKEEP));
+ MPASS((sv2->sv_flags & SV_SHP) != 0 && (sv->sv_flags & SV_SHP) != 0);
+
+ sv2->sv_shared_page_obj = sv->sv_shared_page_obj;
+ sv2->sv_sigcode_base = sv2->sv_shared_page_base +
+ (sv->sv_sigcode_base - sv->sv_shared_page_base);
+ if ((sv2->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD)
+ return;
+ if ((sv2->sv_flags & SV_TIMEKEEP) != 0) {
+ sv2->sv_timekeep_base = sv2->sv_shared_page_base +
+ (sv->sv_timekeep_base - sv->sv_shared_page_base);
+ }
+}
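
A hypothetical caller, sketching how an LA48 variant of a native sysentvec could be derived; the elf64_freebsd_sysvec_la48/la57 names are illustrative and the SYSINIT registration is omitted:

	static struct sysentvec elf64_freebsd_sysvec_la48;

	static void
	la48_sysvec_init(void *arg __unused)
	{
		elf64_freebsd_sysvec_la48 = elf64_freebsd_sysvec_la57;
		elf64_freebsd_sysvec_la48.sv_maxuser = VM_MAXUSER_ADDRESS_LA48;
		elf64_freebsd_sysvec_la48.sv_usrstack = USRSTACK_LA48;
		elf64_freebsd_sysvec_la48.sv_psstrings = PS_STRINGS_LA48;
		elf64_freebsd_sysvec_la48.sv_shared_page_base = SHAREDPAGE_LA48;
		/* Share the page object; recompute sigcode/timekeep offsets. */
		exec_sysvec_init_secondary(&elf64_freebsd_sysvec_la57,
		    &elf64_freebsd_sysvec_la48);
	}
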
Index: sys/sys/elf_common.h
===================================================================
--- sys/sys/elf_common.h
+++ sys/sys/elf_common.h
@@ -796,6 +796,7 @@
#define NT_FREEBSD_FCTL_PROTMAX_DISABLE 0x00000002
#define NT_FREEBSD_FCTL_STKGAP_DISABLE 0x00000004
#define NT_FREEBSD_FCTL_WXNEEDED 0x00000008
+#define NT_FREEBSD_FCTL_LA48 0x00000010
/* Values for n_type. Used in core files. */
#define NT_PRSTATUS 1 /* Process status. */
Index: sys/sys/imgact_elf.h
===================================================================
--- sys/sys/imgact_elf.h
+++ sys/sys/imgact_elf.h
@@ -87,7 +87,8 @@
const char *interp_newpath;
int flags;
Elf_Brandnote *brand_note;
- boolean_t (*header_supported)(struct image_params *);
+ boolean_t (*header_supported)(struct image_params *,
+ int32_t *, uint32_t *);
#define BI_CAN_EXEC_DYN 0x0001
#define BI_BRAND_NOTE 0x0002 /* May have note.ABI-tag section. */
#define BI_BRAND_NOTE_MANDATORY 0x0004 /* Must have note.ABI-tag section. */
Index: sys/sys/sysent.h
===================================================================
--- sys/sys/sysent.h
+++ sys/sys/sysent.h
@@ -321,6 +321,7 @@
int shared_page_fill(int size, int align, const void *data);
void shared_page_write(int base, int size, const void *data);
void exec_sysvec_init(void *param);
+void exec_sysvec_init_secondary(struct sysentvec *sv, struct sysentvec *sv2);
void exec_inittk(void);
#define INIT_SYSENTVEC(name, sv) \
Index: sys/x86/acpica/acpi_wakeup.c
===================================================================
--- sys/x86/acpica/acpi_wakeup.c
+++ sys/x86/acpica/acpi_wakeup.c
@@ -99,7 +99,7 @@
#endif
#ifdef __amd64__
-#define ACPI_WAKEPAGES 4
+#define ACPI_WAKEPAGES 5
#else
#define ACPI_WAKEPAGES 1
#endif
@@ -414,8 +414,8 @@
static void *wakeaddr;
void *wakepages[ACPI_WAKEPAGES];
#ifdef __amd64__
- uint64_t *pt4, *pt3, *pt2;
- vm_paddr_t pt4pa, pt3pa, pt2pa;
+ uint64_t *pt5, *pt4, *pt3, *pt2;
+ vm_paddr_t pt5pa, pt4pa, pt3pa, pt2pa;
int i;
#endif
@@ -430,6 +430,10 @@
sc->acpi_wakephys = vtophys(wakeaddr);
#ifdef __amd64__
+ if (la57) {
+ pt5 = wakepages[4];
+ pt5pa = vtophys(pt5);
+ }
pt4 = wakepages[1];
pt3 = wakepages[2];
pt2 = wakepages[3];
@@ -448,7 +452,8 @@
#ifdef __amd64__
WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t,
sc->acpi_wakephys + wakeup_64);
- WAKECODE_FIXUP(wakeup_pagetables, uint32_t, pt4pa);
+ WAKECODE_FIXUP(wakeup_pagetables, uint32_t, la57 ? (pt5pa | 0x1) :
+ pt4pa);
#endif
/* Save pointers to some global data. */
@@ -457,7 +462,12 @@
WAKECODE_FIXUP(wakeup_cr3, register_t, pmap_get_kcr3());
#else /* __amd64__ */
/* Create the initial 1GB replicated page tables */
- for (i = 0; i < 512; i++) {
+ for (i = 0; i < NPTEPG; i++) {
+ if (la57) {
+ pt5[i] = (uint64_t)pt4pa;
+ pt5[i] |= PG_V | PG_RW | PG_U;
+ }
+
/*
* Each slot of the level 4 pages points
* to the same level 3 page
@@ -473,7 +483,7 @@
pt3[i] |= PG_V | PG_RW | PG_U;
/* The level 2 page slots are mapped with 2MB pages for 1GB. */
- pt2[i] = i * (2 * 1024 * 1024);
+ pt2[i] = i * NBPDR;
pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
}
#endif /* !__amd64__ */
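
As a cross-check of the loop bounds: each level-2 entry maps NBPDR (2 MB), so NPTEPG entries cover exactly the 1 GB that the replicated upper levels expect. A trivial standalone assertion of that arithmetic:

	#include <assert.h>

	int
	main(void)
	{
		unsigned long nbpdr = 2UL * 1024 * 1024;	/* 2 MB superpage */
		unsigned long nptepg = 512;			/* entries per page */

		/* One level-2 page of 2 MB mappings covers exactly 1 GB. */
		assert(nbpdr * nptepg == 1UL << 30);
		return (0);
	}
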
Index: sys/x86/include/procctl.h
===================================================================
--- sys/x86/include/procctl.h
+++ sys/x86/include/procctl.h
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
- * Copyright (c) 2019 The FreeBSD Foundation
+ * Copyright (c) 2019,2020 The FreeBSD Foundation
*
* Portions of this software were developed by Konstantin Belousov
* under sponsorship from the FreeBSD Foundation.
@@ -35,9 +35,18 @@
#define PROC_KPTI_CTL (PROC_PROCCTL_MD_MIN + 0)
#define PROC_KPTI_STATUS (PROC_PROCCTL_MD_MIN + 1)
+#define PROC_LA_CTL (PROC_PROCCTL_MD_MIN + 2)
+#define PROC_LA_STATUS (PROC_PROCCTL_MD_MIN + 3)
#define PROC_KPTI_CTL_ENABLE_ON_EXEC 1
#define PROC_KPTI_CTL_DISABLE_ON_EXEC 2
#define PROC_KPTI_STATUS_ACTIVE 0x80000000
+#define PROC_LA_CTL_LA48_ON_EXEC 1
+#define PROC_LA_CTL_LA57_ON_EXEC 2
+#define PROC_LA_CTL_DEFAULT_ON_EXEC 3
+
+#define PROC_LA_STATUS_LA48 0x01000000
+#define PROC_LA_STATUS_LA57 0x02000000
+
#endif
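
From userland the new requests follow the usual procctl(2) pattern with an int argument. A minimal sketch exercising both PROC_LA_STATUS and PROC_LA_CTL on the current process:

	#include <sys/procctl.h>
	#include <err.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int arg;

		/* Query the current address-space limit status. */
		if (procctl(P_PID, getpid(), PROC_LA_STATUS, &arg) == -1)
			err(1, "PROC_LA_STATUS");
		printf("la57 %s active\n",
		    (arg & PROC_LA_STATUS_LA57) != 0 ? "is" : "is not");

		/* Request 48-bit user VA for images exec'ed from here on. */
		arg = PROC_LA_CTL_LA48_ON_EXEC;
		if (procctl(P_PID, getpid(), PROC_LA_CTL, &arg) == -1)
			err(1, "PROC_LA_CTL");
		return (0);
	}
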
Index: sys/x86/include/specialreg.h
===================================================================
--- sys/x86/include/specialreg.h
+++ sys/x86/include/specialreg.h
@@ -72,6 +72,7 @@
#define CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */
#define CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */
#define CR4_UMIP 0x00000800 /* User Mode Instruction Prevention */
+#define CR4_LA57 0x00001000 /* Enable 5-level paging */
#define CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */
#define CR4_FSGSBASE 0x00010000 /* Enable FS/GS BASE accessing instructions */
#define CR4_PCIDE 0x00020000 /* Enable Context ID */
Index: usr.bin/elfctl/elfctl.c
===================================================================
--- usr.bin/elfctl/elfctl.c
+++ usr.bin/elfctl/elfctl.c
@@ -67,6 +67,7 @@
"Disable implicit PROT_MAX" },
{ "stackgap", NT_FREEBSD_FCTL_STKGAP_DISABLE, "Disable stack gap" },
{ "wxneeded", NT_FREEBSD_FCTL_WXNEEDED, "Requires W+X mappings" },
+ { "la48", NT_FREEBSD_FCTL_LA48, "amd64: Limit user VA to 48bit" },
};
static struct option long_opts[] = {
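
With this table entry a binary can be tagged from the command line; for instance (a usage sketch following elfctl(1) conventions):

	elfctl -e +la48 ./app	# set NT_FREEBSD_FCTL_LA48 in the feature note
	elfctl ./app		# list the feature-control flags currently set
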
Index: usr.bin/proccontrol/proccontrol.1
===================================================================
--- usr.bin/proccontrol/proccontrol.1
+++ usr.bin/proccontrol/proccontrol.1
@@ -71,6 +71,9 @@
.Xr mmap 2 .
.It Ar kpti
Controls the KPTI enable, AMD64 only.
+.It Ar la48
+Controls limiting the usermode process address space to 48 bits of address,
+AMD64 only, on machines capable of 57-bit addressing.
.El
.Pp
The
Index: usr.bin/proccontrol/proccontrol.c
===================================================================
--- usr.bin/proccontrol/proccontrol.c
+++ usr.bin/proccontrol/proccontrol.c
@@ -48,6 +48,10 @@
#ifdef PROC_KPTI_CTL
MODE_KPTI,
#endif
+#ifdef PROC_LA_CTL
+ MODE_LA57,
+ MODE_LA48,
+#endif
};
static pid_t
@@ -69,13 +73,18 @@
#else
#define KPTI_USAGE
#endif
+#ifdef PROC_LA_CTL
+#define LA_USAGE "|la48|la57"
+#else
+#define LA_USAGE
+#endif
static void __dead2
usage(void)
{
fprintf(stderr, "Usage: proccontrol -m (aslr|protmax|trace|trapcap|"
- "stackgap"KPTI_USAGE") [-q] "
+ "stackgap"KPTI_USAGE LA_USAGE") [-q] "
"[-s (enable|disable)] [-p pid | command]\n");
exit(1);
}
@@ -107,6 +116,12 @@
#ifdef PROC_KPTI_CTL
else if (strcmp(optarg, "kpti") == 0)
mode = MODE_KPTI;
+#endif
+#ifdef PROC_LA_CTL
+ else if (strcmp(optarg, "la57") == 0)
+ mode = MODE_LA57;
+ else if (strcmp(optarg, "la48") == 0)
+ mode = MODE_LA48;
#endif
else
usage();
@@ -163,6 +178,12 @@
case MODE_KPTI:
error = procctl(P_PID, pid, PROC_KPTI_STATUS, &arg);
break;
+#endif
+#ifdef PROC_LA_CTL
+ case MODE_LA57:
+ case MODE_LA48:
+ error = procctl(P_PID, pid, PROC_LA_STATUS, &arg);
+ break;
#endif
default:
usage();
@@ -258,6 +279,27 @@
else
printf(", not active\n");
break;
+#endif
+#ifdef PROC_LA_CTL
+ case MODE_LA57:
+ case MODE_LA48:
+ switch (arg & ~(PROC_LA_STATUS_LA48 |
+ PROC_LA_STATUS_LA57)) {
+ case PROC_LA_CTL_LA48_ON_EXEC:
+ printf("la48 on exec");
+ break;
+ case PROC_LA_CTL_LA57_ON_EXEC:
+ printf("la57 on exec");
+ break;
+ case PROC_LA_CTL_DEFAULT_ON_EXEC:
+ printf("default on exec");
+ break;
+ }
+ if ((arg & PROC_LA_STATUS_LA48) != 0)
+ printf(", la48 active\n");
+ else if ((arg & PROC_LA_STATUS_LA57) != 0)
+ printf(", la57 active\n");
+ break;
#endif
}
} else {
@@ -294,6 +336,18 @@
PROC_KPTI_CTL_DISABLE_ON_EXEC;
error = procctl(P_PID, pid, PROC_KPTI_CTL, &arg);
break;
+#endif
+#ifdef PROC_LA_CTL
+ case MODE_LA57:
+ arg = enable ? PROC_LA_CTL_LA57_ON_EXEC :
+ PROC_LA_CTL_DEFAULT_ON_EXEC;
+ error = procctl(P_PID, pid, PROC_LA_CTL, &arg);
+ break;
+ case MODE_LA48:
+ arg = enable ? PROC_LA_CTL_LA48_ON_EXEC :
+ PROC_LA_CTL_DEFAULT_ON_EXEC;
+ error = procctl(P_PID, pid, PROC_LA_CTL, &arg);
+ break;
#endif
default:
usage();
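
Putting the new modes together, typical invocations would look like this (a usage sketch; flags per the usage() string above):

	proccontrol -m la48 -s enable ./legacy_app	# exec with 48-bit user VA
	proccontrol -m la48 -q -p 1234			# query a process's LA status
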
Index: usr.sbin/bhyve/gdb.c
===================================================================
--- usr.sbin/bhyve/gdb.c
+++ usr.sbin/bhyve/gdb.c
@@ -251,7 +251,8 @@
else if (!(regs[2] & CR4_PAE))
paging->paging_mode = PAGING_MODE_32;
else if (regs[3] & EFER_LME)
- paging->paging_mode = PAGING_MODE_64;
+ paging->paging_mode = (regs[2] & CR4_LA57) ?
+ PAGING_MODE_64_LA57 : PAGING_MODE_64;
else
paging->paging_mode = PAGING_MODE_PAE;
return (0);
