D25273.id74807.diff (79 KB)
Index: sys/amd64/acpica/acpi_wakecode.S
===================================================================
--- sys/amd64/acpica/acpi_wakecode.S
+++ sys/amd64/acpica/acpi_wakecode.S
@@ -148,10 +148,18 @@
mov $bootdata32 - bootgdt, %eax
mov %ax, %ds
- /* Turn on the PAE bit for when paging is enabled */
+ /*
+ * Turn on the PAE and optionally LA57 bits for when paging
+ * is enabled.
+ */
mov %cr4, %eax
orl $CR4_PAE, %eax
- mov %eax, %cr4
+ leal wakeup_pagetables - wakeup_start(%ebx), %ecx
+ movl (%ecx), %ecx
+ testl $0x1, %ecx
+ je 1f
+ orl $CR4_LA57, %eax
+1: mov %eax, %cr4
/*
* Enable EFER.LME so that we get long mode when all the prereqs are
@@ -174,6 +182,7 @@
*/
leal wakeup_pagetables - wakeup_start(%ebx), %eax
movl (%eax), %eax
+ andl $~0x1, %eax
mov %eax, %cr3
/*
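A note on the acpi_wakecode.S hunks above: the resume path smuggles the LA57 flag in bit 0 of the wakeup_pagetables word, which is otherwise a page-aligned physical address. The flag is tested to decide whether to set CR4_LA57, then masked off with andl $~0x1 before the value is loaded into %cr3. A minimal C sketch of that encoding (illustrative only; the helper name is invented):

    #include <stdint.h>

    /*
     * Bit 0 of a page-aligned physical address is always zero, so it is
     * free to carry the "enable 5-level paging" flag into the 32-bit
     * wakeup code, which cannot yet read kernel variables.
     */
    static uint32_t
    wakeup_pagetables_word(uint32_t pml_top_phys, int la57_on)
    {
        return (pml_top_phys | (la57_on ? 1u : 0u));
    }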
Index: sys/amd64/amd64/cpu_switch.S
===================================================================
--- sys/amd64/amd64/cpu_switch.S
+++ sys/amd64/amd64/cpu_switch.S
@@ -382,8 +382,11 @@
* Resuming processor state from pcb.
*/
ENTRY(resumectx)
- /* Switch to KPML4phys. */
+ /* Switch to KPML5/4phys. */
movq KPML4phys,%rax
+ movq KPML5phys,%rcx
+ cmpl $0, la57
+ cmovne %rcx, %rax
movq %rax,%cr3
/* Force kernel segment registers. */
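The cmovne sequence above selects the top-level page table without a branch; in C it is simply the following (sketch; the variable names are the patch's, the wrapper function is invented):

    #include <stdint.h>

    extern uint64_t KPML4phys, KPML5phys;   /* from pmap.c */
    extern int la57;

    static uint64_t
    resume_cr3(void)
    {
        /* Use the 5-level root when LA57 is active, else the 4-level one. */
        return (la57 ? KPML5phys : KPML4phys);
    }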
Index: sys/amd64/amd64/efirt_machdep.c
===================================================================
--- sys/amd64/amd64/efirt_machdep.c
+++ sys/amd64/amd64/efirt_machdep.c
@@ -61,9 +61,10 @@
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
+static pml5_entry_t *efi_pml5;
static pml4_entry_t *efi_pml4;
static vm_object_t obj_1t1_pt;
-static vm_page_t efi_pml4_page;
+static vm_page_t efi_pmltop_page;
static vm_pindex_t efi_1t1_idx;
void
@@ -82,7 +83,8 @@
obj_1t1_pt = NULL;
efi_pml4 = NULL;
- efi_pml4_page = NULL;
+ efi_pml5 = NULL;
+ efi_pmltop_page = NULL;
}
/*
@@ -109,22 +111,38 @@
static pt_entry_t *
efi_1t1_pte(vm_offset_t va)
{
+ pml5_entry_t *pml5e;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t *pde;
pt_entry_t *pte;
vm_page_t m;
- vm_pindex_t pml4_idx, pdp_idx, pd_idx;
+ vm_pindex_t pml5_idx, pml4_idx, pdp_idx, pd_idx;
vm_paddr_t mphys;
pml4_idx = pmap_pml4e_index(va);
- pml4e = &efi_pml4[pml4_idx];
+ if (la57) {
+ pml5_idx = pmap_pml5e_index(va);
+ pml5e = &efi_pml5[pml5_idx];
+ if (*pml5e == 0) {
+ m = efi_1t1_page();
+ mphys = VM_PAGE_TO_PHYS(m);
+ *pml5e = mphys | X86_PG_RW | X86_PG_V;
+ } else {
+ mphys = *pml5e & PG_FRAME;
+ }
+ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(mphys);
+ pml4e = &pml4e[pml4_idx];
+ } else {
+ pml4e = &efi_pml4[pml4_idx];
+ }
+
if (*pml4e == 0) {
m = efi_1t1_page();
mphys = VM_PAGE_TO_PHYS(m);
*pml4e = mphys | X86_PG_RW | X86_PG_V;
} else {
- mphys = *pml4e & ~PAGE_MASK;
+ mphys = *pml4e & PG_FRAME;
}
pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys);
@@ -135,7 +153,7 @@
mphys = VM_PAGE_TO_PHYS(m);
*pdpe = mphys | X86_PG_RW | X86_PG_V;
} else {
- mphys = *pdpe & ~PAGE_MASK;
+ mphys = *pdpe & PG_FRAME;
}
pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
@@ -146,7 +164,7 @@
mphys = VM_PAGE_TO_PHYS(m);
*pde = mphys | X86_PG_RW | X86_PG_V;
} else {
- mphys = *pde & ~PAGE_MASK;
+ mphys = *pde & PG_FRAME;
}
pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
@@ -161,6 +179,7 @@
{
struct efi_md *p;
pt_entry_t *pte;
+ void *pml;
vm_offset_t va;
uint64_t idx;
int bits, i, mode;
@@ -170,10 +189,16 @@
VM_PROT_ALL, 0, NULL);
efi_1t1_idx = 0;
VM_OBJECT_WLOCK(obj_1t1_pt);
- efi_pml4_page = efi_1t1_page();
+ efi_pmltop_page = efi_1t1_page();
VM_OBJECT_WUNLOCK(obj_1t1_pt);
- efi_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(efi_pml4_page));
- pmap_pinit_pml4(efi_pml4_page);
+ pml = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(efi_pmltop_page));
+ if (la57) {
+ efi_pml5 = pml;
+ pmap_pinit_pml5(efi_pmltop_page);
+ } else {
+ efi_pml4 = pml;
+ pmap_pinit_pml4(efi_pmltop_page);
+ }
for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p,
descsz)) {
@@ -279,7 +304,7 @@
if (pmap_pcid_enabled && !invpcid_works)
PCPU_SET(curpmap, NULL);
- load_cr3(VM_PAGE_TO_PHYS(efi_pml4_page) | (pmap_pcid_enabled ?
+ load_cr3(VM_PAGE_TO_PHYS(efi_pmltop_page) | (pmap_pcid_enabled ?
curpmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid : 0));
/*
* If PCID is enabled, the clear CR3_PCID_SAVE bit in the loaded %cr3
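Each level of efi_1t1_pte() above repeats the same allocate-if-empty step, and the hunks also switch the masking from ~PAGE_MASK to PG_FRAME so that attribute bits above bit 51 (e.g. NX) are not carried into the next-level physical address. The repeated step, factored out as a sketch (the helper is hypothetical; the idiom and symbols are from the file):

    /*
     * Allocate a page-table page behind *entry if it is empty and return
     * the physical address of the next level.  PG_FRAME, unlike
     * ~PAGE_MASK, also strips the high attribute bits.
     */
    static vm_paddr_t
    efi_1t1_descend(uint64_t *entry)
    {
        vm_page_t m;
        vm_paddr_t mphys;

        if (*entry == 0) {
            m = efi_1t1_page();
            mphys = VM_PAGE_TO_PHYS(m);
            *entry = mphys | X86_PG_RW | X86_PG_V;
        } else {
            mphys = *entry & PG_FRAME;
        }
        return (mphys);
    }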
Index: sys/amd64/amd64/elf_machdep.c
===================================================================
--- sys/amd64/amd64/elf_machdep.c
+++ sys/amd64/amd64/elf_machdep.c
@@ -49,7 +49,7 @@
#include <machine/fpu.h>
#include <machine/md_var.h>
-struct sysentvec elf64_freebsd_sysvec = {
+struct sysentvec elf64_freebsd_sysvec_la48 = {
.sv_size = SYS_MAXSYSCALL,
.sv_table = sysent,
.sv_errsize = 0,
@@ -64,9 +64,9 @@
.sv_imgact_try = NULL,
.sv_minsigstksz = MINSIGSTKSZ,
.sv_minuser = VM_MIN_ADDRESS,
- .sv_maxuser = VM_MAXUSER_ADDRESS,
- .sv_usrstack = USRSTACK,
- .sv_psstrings = PS_STRINGS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS_LA48,
+ .sv_usrstack = USRSTACK_LA48,
+ .sv_psstrings = PS_STRINGS_LA48,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs),
.sv_copyout_strings = exec_copyout_strings,
@@ -78,14 +78,64 @@
.sv_set_syscall_retval = cpu_set_syscall_retval,
.sv_fetch_syscall_args = cpu_fetch_syscall_args,
.sv_syscallnames = syscallnames,
- .sv_shared_page_base = SHAREDPAGE,
+ .sv_shared_page_base = SHAREDPAGE_LA48,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = NULL,
.sv_thread_detach = NULL,
.sv_trap = NULL,
.sv_stackgap = elf64_stackgap,
};
-INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec);
+
+struct sysentvec elf64_freebsd_sysvec_la57 = {
+ .sv_size = SYS_MAXSYSCALL,
+ .sv_table = sysent,
+ .sv_errsize = 0,
+ .sv_errtbl = NULL,
+ .sv_transtrap = NULL,
+ .sv_fixup = __elfN(freebsd_fixup),
+ .sv_sendsig = sendsig,
+ .sv_sigcode = sigcode,
+ .sv_szsigcode = &szsigcode,
+ .sv_name = "FreeBSD ELF64",
+ .sv_coredump = __elfN(coredump),
+ .sv_imgact_try = NULL,
+ .sv_minsigstksz = MINSIGSTKSZ,
+ .sv_minuser = VM_MIN_ADDRESS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS_LA57,
+ .sv_usrstack = USRSTACK_LA57,
+ .sv_psstrings = PS_STRINGS_LA57,
+ .sv_stackprot = VM_PROT_ALL,
+ .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs),
+ .sv_copyout_strings = exec_copyout_strings,
+ .sv_setregs = exec_setregs,
+ .sv_fixlimit = NULL,
+ .sv_maxssiz = NULL,
+ .sv_flags = SV_ABI_FREEBSD | SV_ASLR | SV_LP64 | SV_SHP |
+ SV_TIMEKEEP,
+ .sv_set_syscall_retval = cpu_set_syscall_retval,
+ .sv_fetch_syscall_args = cpu_fetch_syscall_args,
+ .sv_syscallnames = syscallnames,
+ .sv_shared_page_base = SHAREDPAGE_LA57,
+ .sv_shared_page_len = PAGE_SIZE,
+ .sv_schedtail = NULL,
+ .sv_thread_detach = NULL,
+ .sv_trap = NULL,
+ .sv_stackgap = elf64_stackgap,
+};
+
+static void
+amd64_init_sysvecs(void *arg)
+{
+ amd64_lower_shared_page(&elf64_freebsd_sysvec_la48);
+ if (la57) {
+ exec_sysvec_init(&elf64_freebsd_sysvec_la57);
+ exec_sysvec_init_secondary(&elf64_freebsd_sysvec_la57,
+ &elf64_freebsd_sysvec_la48);
+ } else {
+ exec_sysvec_init(&elf64_freebsd_sysvec_la48);
+ }
+}
+SYSINIT(elf64_sysvec, SI_SUB_EXEC, SI_ORDER_ANY, amd64_init_sysvecs, NULL);
void
amd64_lower_shared_page(struct sysentvec *sv)
@@ -98,29 +148,57 @@
}
}
-/*
- * Do this fixup before INIT_SYSENTVEC (SI_ORDER_ANY) because the latter
- * uses the value of sv_shared_page_base.
- */
-SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST,
- (sysinit_cfunc_t) amd64_lower_shared_page,
- &elf64_freebsd_sysvec);
+static boolean_t
+freebsd_brand_info_la57_img_compat(struct image_params *imgp,
+ int32_t *osrel __unused, uint32_t *fctl0)
+{
+ if ((imgp->proc->p_md.md_flags & P_MD_LA57) != 0)
+ return (TRUE);
+ if (fctl0 == NULL || (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0)
+ return (FALSE);
+ if ((imgp->proc->p_md.md_flags & P_MD_LA48) != 0)
+ return (FALSE);
+ return (TRUE);
+}
-static Elf64_Brandinfo freebsd_brand_info = {
+static Elf64_Brandinfo freebsd_brand_info_la48 = {
.brand = ELFOSABI_FREEBSD,
.machine = EM_X86_64,
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/libexec/ld-elf.so.1",
- .sysvec = &elf64_freebsd_sysvec,
+ .sysvec = &elf64_freebsd_sysvec_la48,
.interp_newpath = NULL,
.brand_note = &elf64_freebsd_brandnote,
- .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
+};
+
+static Elf64_Brandinfo freebsd_brand_info_la57 = {
+ .brand = ELFOSABI_FREEBSD,
+ .machine = EM_X86_64,
+ .compat_3_brand = "FreeBSD",
+ .emul_path = NULL,
+ .interp_path = "/libexec/ld-elf.so.1",
+ .sysvec = &elf64_freebsd_sysvec_la57,
+ .interp_newpath = NULL,
+ .brand_note = &elf64_freebsd_brandnote,
+ .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE,
+ .header_supported = freebsd_brand_info_la57_img_compat,
};
+static void
+sysinit_register_elf64_brand_entries(void *arg __unused)
+{
+ /*
+ * _57 must go first so it can either claim the image, or hand
+ * it to _48.
+ */
+ if (la57)
+ elf64_insert_brand_entry(&freebsd_brand_info_la57);
+ elf64_insert_brand_entry(&freebsd_brand_info_la48);
+}
SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST,
- (sysinit_cfunc_t) elf64_insert_brand_entry,
- &freebsd_brand_info);
+ sysinit_register_elf64_brand_entries, NULL);
static Elf64_Brandinfo freebsd_brand_oinfo = {
.brand = ELFOSABI_FREEBSD,
@@ -128,15 +206,14 @@
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/usr/libexec/ld-elf.so.1",
- .sysvec = &elf64_freebsd_sysvec,
+ .sysvec = &elf64_freebsd_sysvec_la48,
.interp_newpath = NULL,
.brand_note = &elf64_freebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE
};
SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY,
- (sysinit_cfunc_t) elf64_insert_brand_entry,
- &freebsd_brand_oinfo);
+ (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_oinfo);
static Elf64_Brandinfo kfreebsd_brand_info = {
.brand = ELFOSABI_FREEBSD,
@@ -144,15 +221,14 @@
.compat_3_brand = "FreeBSD",
.emul_path = NULL,
.interp_path = "/lib/ld-kfreebsd-x86-64.so.1",
- .sysvec = &elf64_freebsd_sysvec,
+ .sysvec = &elf64_freebsd_sysvec_la48,
.interp_newpath = NULL,
.brand_note = &elf64_kfreebsd_brandnote,
.flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY
};
SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY,
- (sysinit_cfunc_t) elf64_insert_brand_entry,
- &kfreebsd_brand_info);
+ (sysinit_cfunc_t) elf64_insert_brand_entry, &kfreebsd_brand_info);
void
elf64_dump_thread(struct thread *td, void *dst, size_t *off)
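The header_supported hook above decides which brand, and therefore which address-space layout, claims an ELF image. Restated as a standalone predicate (sketch; wants_la57 is an invented name, the flag macros come from this patch):

    #include <stdbool.h>
    #include <stdint.h>

    static bool
    wants_la57(uint32_t md_flags, const uint32_t *fctl0)
    {
        if ((md_flags & P_MD_LA57) != 0)        /* procctl(2) forced LA57 */
            return (true);
        if (fctl0 == NULL ||                    /* no feature-control note, or */
            (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0) /* binary opted out */
            return (false);
        return ((md_flags & P_MD_LA48) == 0);   /* unless LA48 was forced */
    }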
Index: sys/amd64/amd64/genassym.c
===================================================================
--- sys/amd64/amd64/genassym.c
+++ sys/amd64/amd64/genassym.c
@@ -99,11 +99,10 @@
ASSYM(PAGE_SIZE, PAGE_SIZE);
ASSYM(NPTEPG, NPTEPG);
ASSYM(NPDEPG, NPDEPG);
-ASSYM(addr_PTmap, addr_PTmap);
-ASSYM(addr_PDmap, addr_PDmap);
-ASSYM(addr_PDPmap, addr_PDPmap);
-ASSYM(addr_PML4map, addr_PML4map);
-ASSYM(addr_PML4pml4e, addr_PML4pml4e);
+ASSYM(addr_P4Tmap, addr_P4Tmap);
+ASSYM(addr_P4Dmap, addr_P4Dmap);
+ASSYM(addr_P5Tmap, addr_P5Tmap);
+ASSYM(addr_P5Dmap, addr_P5Dmap);
ASSYM(PDESIZE, sizeof(pd_entry_t));
ASSYM(PTESIZE, sizeof(pt_entry_t));
ASSYM(PAGE_SHIFT, PAGE_SHIFT);
Index: sys/amd64/amd64/locore.S
===================================================================
--- sys/amd64/amd64/locore.S
+++ sys/amd64/amd64/locore.S
@@ -36,13 +36,8 @@
/*
* Compiled KERNBASE location
*/
- .globl kernbase,loc_PTmap,loc_PDmap,loc_PDPmap,loc_PML4map,loc_PML4pml4e,dmapbase,dmapend
+ .globl kernbase, loc_PTmap, loc_PDmap, loc_PDPmap, dmapbase, dmapend
.set kernbase,KERNBASE
- .set loc_PTmap,addr_PTmap
- .set loc_PDmap,addr_PDmap
- .set loc_PDPmap,addr_PDPmap
- .set loc_PML4map,addr_PML4map
- .set loc_PML4pml4e,addr_PML4pml4e
.set dmapbase,DMAP_MIN_ADDRESS
.set dmapend,DMAP_MAX_ADDRESS
@@ -82,6 +77,62 @@
0: hlt
jmp 0b
+/* la57_trampoline(%rdi pml5) */
+NON_GPROF_ENTRY(la57_trampoline)
+ movq %rsp,%r11
+ movq %rbx,%r10
+ leaq la57_trampoline_end(%rip),%rsp
+
+ movq %cr0,%rdx
+ lgdtq la57_trampoline_gdt_desc(%rip)
+
+ pushq $(2<<3)
+ leaq l1(%rip),%rax
+ leaq l2(%rip),%rbx
+
+ pushq %rax
+ lretq
+ .code32
+
+l1: movl $(3<<3),%eax
+ movl %eax,%ss
+
+ movl %edx,%eax
+ andl $~CR0_PG,%eax
+ movl %eax,%cr0
+
+ movl %cr4,%eax
+ orl $CR4_LA57,%eax
+ movl %eax,%cr4
+
+ movl %edi,%cr3
+ movl %edx,%cr0
+
+ pushl $(1<<3)
+ pushl %ebx
+ lretl
+ .code64
+
+l2: movq %r11,%rsp
+ movq %r10,%rbx
+ retq
+ .p2align 4,0
+NON_GPROF_ENTRY(la57_trampoline_gdt_desc)
+ .word la57_trampoline_end - la57_trampoline_gdt
+ .long 0 /* filled by pmap_bootstrap_la57 */
+ .p2align 4,0
+NON_GPROF_ENTRY(la57_trampoline_gdt)
+ .long 0x00000000 /* null desc */
+ .long 0x00000000
+ .long 0x00000000 /* 64bit code */
+ .long 0x00209800
+ .long 0x0000ffff /* 32bit code */
+ .long 0x00cf9b00
+ .long 0x0000ffff /* universal data */
+ .long 0x00cf9300
+ .dcb.l 16,0
+NON_GPROF_ENTRY(la57_trampoline_end)
+
.bss
ALIGN_DATA /* just to be sure */
.globl bootstack
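The trampoline above exists because CR4.LA57 can only be changed while paging is disabled, and paging can only be disabled outside 64-bit mode, hence the two far transfers through the temporary GDT. An annotated outline in pseudo-C (a sketch of the control flow only; the real code must execute from an identity-mapped page, which pmap_bootstrap_la57() later in this diff arranges):

    /* Steps performed by la57_trampoline(%rdi = physical PML5): */
    void
    la57_trampoline_outline(unsigned long pml5_phys)
    {
        /* 1. lretq into the temporary 32-bit code segment          */
        /* 2. clear CR0.PG                  -- paging off           */
        /* 3. set CR4.LA57                                          */
        /* 4. load %cr3 with pml5_phys      -- new 5-level root     */
        /* 5. restore CR0.PG                -- paging on, 5 levels  */
        /* 6. lretl back into the 64-bit segment and return         */
        (void)pml5_phys;
    }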
Index: sys/amd64/amd64/mp_machdep.c
===================================================================
--- sys/amd64/amd64/mp_machdep.c
+++ sys/amd64/amd64/mp_machdep.c
@@ -96,7 +96,7 @@
#define GiB(v) (v ## ULL << 30)
-#define AP_BOOTPT_SZ (PAGE_SIZE * 3)
+#define AP_BOOTPT_SZ (PAGE_SIZE * 4)
/* Temporary variables for init_secondary() */
char *doublefault_stack;
@@ -104,6 +104,8 @@
char *nmi_stack;
char *dbg_stack;
+extern u_int mptramp_la57;
+
/*
* Local data and functions.
*/
@@ -236,6 +238,8 @@
assign_cpu_ids();
+ mptramp_la57 = la57;
+
/* Start each Application Processor */
init_ops.start_all_aps();
@@ -391,9 +395,9 @@
int
native_start_all_aps(void)
{
- u_int64_t *pt4, *pt3, *pt2;
+ u_int64_t *pt5, *pt4, *pt3, *pt2;
u_int32_t mpbioswarmvec;
- int apic_id, cpu, domain, i;
+ int apic_id, cpu, domain, i, xo;
u_char mpbiosreason;
mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
@@ -402,18 +406,38 @@
bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
/* Locate the page tables, they'll be below the trampoline */
- pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
+ if (la57) {
+ pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
+ xo = 1;
+ } else {
+ xo = 0;
+ }
+ pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
/* Create the initial 1GB replicated page tables */
for (i = 0; i < 512; i++) {
- /* Each slot of the level 4 pages points to the same level 3 page */
- pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
+ if (la57) {
+ pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+ PAGE_SIZE);
+ pt5[i] |= PG_V | PG_RW | PG_U;
+ }
+
+ /*
+ * Each slot of the level 4 pages points to the same
+ * level 3 page.
+ */
+ pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+ (xo + 1) * PAGE_SIZE);
pt4[i] |= PG_V | PG_RW | PG_U;
- /* Each slot of the level 3 pages points to the same level 2 page */
- pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
+ /*
+ * Each slot of the level 3 pages points to the same
+ * level 2 page.
+ */
+ pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
+ ((xo + 2) * PAGE_SIZE));
pt3[i] |= PG_V | PG_RW | PG_U;
/* The level 2 page slots are mapped with 2MB pages for 1GB. */
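The xo offset above captures the layout change: with LA57 an extra PML5 page is prepended at mptramp_pagetables and every lower level shifts down by one page, which is why AP_BOOTPT_SZ grows from three pages to four. A sketch of the resulting layout (the helper is invented):

    #include <stdint.h>

    #define AP_PAGE_SIZE    4096

    /*
     * Physical address of the AP boot page-table page for a level:
     * level 5 exists only with la57; levels 4..2 follow, shifted by xo.
     */
    static uint64_t
    ap_bootpt_pa(uint64_t mptramp_pagetables, int la57, int level)
    {
        int xo = la57 ? 1 : 0;

        if (level == 5)
            return (la57 ? mptramp_pagetables : 0);
        return (mptramp_pagetables + (xo + (4 - level)) * AP_PAGE_SIZE);
    }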
Index: sys/amd64/amd64/mpboot.S
===================================================================
--- sys/amd64/amd64/mpboot.S
+++ sys/amd64/amd64/mpboot.S
@@ -90,10 +90,13 @@
mov $bootdata-gdt, %eax
mov %ax, %ds
- /* Turn on the PAE bit for when paging is enabled */
+ /* Turn on the PAE and optionally LA57 bits for when paging is enabled */
mov %cr4, %eax
orl $CR4_PAE, %eax
- mov %eax, %cr4
+ cmpb $0, mptramp_la57-mptramp_start(%ebx)
+ je 1f
+ orl $CR4_LA57, %eax
+1: mov %eax, %cr4
/*
* Enable EFER.LME so that we get long mode when all the prereqs are
@@ -132,7 +135,7 @@
/*
* At this point paging is enabled, and we are in "compatibility" mode.
* We do another far jump to reload %cs with the 64 bit selector.
- * %cr3 points to a 4-level page table page.
+ * %cr3 points to a 4- or 5-level page table page.
* We cannot yet jump all the way to the kernel because we can only
* specify a 32 bit linear address. So, yet another trampoline.
*
@@ -209,6 +212,11 @@
mptramp_pagetables:
.long 0
+ /* 5-level paging? */
+ .globl mptramp_la57
+mptramp_la57:
+ .long 0
+
/*
* The pseudo descriptor for lgdt to use.
*/
@@ -251,8 +259,12 @@
* Load a real %cr3 that has all the direct map stuff and switches
* off the 1GB replicated mirror. Load a stack pointer and jump
* into AP startup code in C.
- */
+ */
+ cmpl $0, la57
+ jne 2f
movq KPML4phys, %rax
- movq %rax, %cr3
+ jmp 3f
+2: movq KPML5phys, %rax
+3: movq %rax, %cr3
movq bootSTK, %rsp
jmp init_secondary
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -398,6 +398,19 @@
SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
&pg_ps_enabled, 0, "Are large page mappings enabled?");
+int __read_frequently la57 = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+ &la57, 0,
+ "5-level paging for host is enabled");
+
+static bool
+pmap_is_la57(pmap_t pmap)
+{
+ if (pmap->pm_type == PT_X86)
+ return (la57);
+ return (false); /* XXXKIB handle EPT */
+}
+
#define PAT_INDEX_SIZE 8
static int pat_index[PAT_INDEX_SIZE]; /* cache mode to PAT index conversion */
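The vm.pmap.la57 knob above is a read-only tunable: it can be set from the loader (for example, vm.pmap.la57=0 in loader.conf forces 4-level paging on LA57-capable hardware; see the TUNABLE_INT_FETCH in pmap_bootstrap_la57() below) and inspected at runtime. A userland check might look like this (sketch using the standard sysctl(3) API):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
        int la57;
        size_t len = sizeof(la57);

        if (sysctlbyname("vm.pmap.la57", &la57, &len, NULL, 0) == -1) {
            perror("sysctlbyname");
            return (1);
        }
        printf("5-level paging: %s\n", la57 ? "enabled" : "disabled");
        return (0);
    }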
@@ -405,7 +418,10 @@
static u_int64_t KPDphys; /* phys addr of kernel level 2 */
u_int64_t KPDPphys; /* phys addr of kernel level 3 */
u_int64_t KPML4phys; /* phys addr of kernel level 4 */
+u_int64_t KPML5phys; /* phys addr of kernel level 5,
+ if supported */
+static pml4_entry_t *kernel_pml4;
static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
static int ndmpdpphys; /* number of DMPDPphys pages */
@@ -1257,7 +1273,7 @@
static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
- struct rwlock **lockp);
+ struct rwlock **lockp, vm_offset_t va);
static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
struct rwlock **lockp);
static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
@@ -1271,20 +1287,85 @@
/* Inline functions */
/********************/
-/* Return a non-clipped PD index for a given VA */
+/*
+ * Return non-clipped indexes for a given VA; these are the indexes
+ * of the page table pages at the corresponding levels.
+ */
static __inline vm_pindex_t
pmap_pde_pindex(vm_offset_t va)
{
return (va >> PDRSHIFT);
}
+static __inline vm_pindex_t
+pmap_pdpe_pindex(vm_offset_t va)
+{
+ return (NUPDE + (va >> PDPSHIFT));
+}
+
+static __inline vm_pindex_t
+pmap_pml4e_pindex(vm_offset_t va)
+{
+ return (NUPDE + NUPDPE + (va >> PML4SHIFT));
+}
+
+static __inline vm_pindex_t
+pmap_pml5e_pindex(vm_offset_t va)
+{
+ return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e(pmap_t pmap, vm_offset_t va)
+{
+
+ MPASS(pmap_is_la57(pmap));
+ return (&pmap->pm_pmltop[pmap_pml5e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e_u(pmap_t pmap, vm_offset_t va)
+{
+
+ MPASS(pmap_is_la57(pmap));
+ return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va)
+{
+ pml4_entry_t *pml4e;
+
+ /* XXX MPASS(pmap_is_la57(pmap)); */
+ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
+ return (&pml4e[pmap_pml4e_index(va)]);
+}
/* Return a pointer to the PML4 slot that corresponds to a VA */
static __inline pml4_entry_t *
pmap_pml4e(pmap_t pmap, vm_offset_t va)
{
+ pml5_entry_t *pml5e;
+ pml4_entry_t *pml4e;
+ pt_entry_t PG_V;
- return (&pmap->pm_pml4[pmap_pml4e_index(va)]);
+ if (pmap_is_la57(pmap)) {
+ pml5e = pmap_pml5e(pmap, va);
+ PG_V = pmap_valid_bit(pmap);
+ if ((*pml5e & PG_V) == 0)
+ return (NULL);
+ pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME);
+ } else {
+ pml4e = pmap->pm_pmltop;
+ }
+ return (&pml4e[pmap_pml4e_index(va)]);
+}
+
+static __inline pml4_entry_t *
+pmap_pml4e_u(pmap_t pmap, vm_offset_t va)
+{
+ MPASS(!pmap_is_la57(pmap));
+ return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]);
}
/* Return a pointer to the PDP slot that corresponds to a VA */
@@ -1306,7 +1387,7 @@
PG_V = pmap_valid_bit(pmap);
pml4e = pmap_pml4e(pmap, va);
- if ((*pml4e & PG_V) == 0)
+ if (pml4e == NULL || (*pml4e & PG_V) == 0)
return (NULL);
return (pmap_pml4e_to_pdpe(pml4e, va));
}
@@ -1387,21 +1468,37 @@
PMAP_INLINE pt_entry_t *
vtopte(vm_offset_t va)
{
- u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ u_int64_t mask;
KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
- return (PTmap + ((va >> PAGE_SHIFT) & mask));
+ if (la57) {
+ mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1);
+ return (P5Tmap + ((va >> PAGE_SHIFT) & mask));
+ } else {
+ mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT)) - 1);
+ return (P4Tmap + ((va >> PAGE_SHIFT) & mask));
+ }
}
static __inline pd_entry_t *
vtopde(vm_offset_t va)
{
- u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ u_int64_t mask;
KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
- return (PDmap + ((va >> PDRSHIFT) & mask));
+ if (la57) {
+ mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1);
+ return (P5Dmap + ((va >> PDRSHIFT) & mask));
+ } else {
+ mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT)) - 1);
+ return (P4Dmap + ((va >> PDRSHIFT) & mask));
+ }
}
static u_int64_t
@@ -1658,6 +1755,8 @@
p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
}
+
+ kernel_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
}
/*
@@ -1730,7 +1829,7 @@
* later unmapped (using pmap_remove()) and freed.
*/
PMAP_LOCK_INIT(kernel_pmap);
- kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
+ kernel_pmap->pm_pmltop = kernel_pml4;
kernel_pmap->pm_cr3 = KPML4phys;
kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
CPU_FILL(&kernel_pmap->pm_active); /* don't allow deactivation */
@@ -1891,6 +1990,148 @@
load_cr4(cr4);
}
+extern const char la57_trampoline[], la57_trampoline_gdt_desc[],
+ la57_trampoline_gdt[], la57_trampoline_end[];
+
+static void
+pmap_bootstrap_la57(void *arg __unused)
+{
+ char *v_code;
+ pml5_entry_t *v_pml5;
+ pml4_entry_t *v_pml4;
+ pdp_entry_t *v_pdp;
+ pd_entry_t *v_pd;
+ pt_entry_t *v_pt;
+ vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5;
+ void (*la57_tramp)(uint64_t pml5);
+ struct region_descriptor r_gdt;
+
+ if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0)
+ return;
+ if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57))
+ la57 = 1;
+ if (!la57)
+ return;
+
+ r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
+ r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
+
+ m_code = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_code->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_code);
+ v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code));
+ m_pml5 = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pml5->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pml5);
+ KPML5phys = VM_PAGE_TO_PHYS(m_pml5);
+ v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys);
+ m_pml4 = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pml4->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pml4);
+ v_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
+ m_pdp = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pdp->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pdp);
+ v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+ m_pd = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pd->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pd);
+ v_pd = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd));
+ m_pt = vm_page_alloc_contig(NULL, 0,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+ 1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+ if ((m_pt->flags & PG_ZERO) == 0)
+ pmap_zero_page(m_pt);
+ v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt));
+
+ /*
+ * Map m_code 1:1; it appears below 4G in KVA because its physical
+ * address is below 4G.  Since kernel KVA is in the upper half,
+ * the pml4e should be zero and free for temporary use.
+ */
+ kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+
+ /*
+ * Add a pml5 entry at the top of KVA pointing to the existing
+ * pml4 table, bringing all existing kernel mappings into the
+ * level 5 table.
+ */
+ v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | pg_g;
+
+ /*
+ * Add pml5 entry for 1:1 trampoline mapping after LA57 is turned on.
+ */
+ v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+ v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+ VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+ X86_PG_M;
+
+ /*
+ * Copy and call the 48->57 trampoline, and hope we return, alive.
+ */
+ bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline);
+ *(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) =
+ la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code);
+ la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code);
+ la57_tramp(KPML5phys);
+
+ /*
+ * The GDT was necessarily reset by the trampoline; switch back to our gdt.
+ */
+ lgdt(&r_gdt);
+ wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
+ load_ds(_udatasel);
+ load_es(_udatasel);
+ load_fs(_ufssel);
+ ssdtosyssd(&gdt_segs[GPROC0_SEL],
+ (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
+ ltr(GSEL(GPROC0_SEL, SEL_KPL));
+
+ /*
+ * Now unmap the trampoline, and free the pages.
+ * Clear pml5 entry used for 1:1 trampoline mapping.
+ */
+ pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]);
+ invlpg((vm_offset_t)v_code);
+ vm_page_free(m_code);
+ vm_page_free(m_pdp);
+ vm_page_free(m_pd);
+ vm_page_free(m_pt);
+
+ /*
+ * Recursively map PML5 to itself in order to get PTmap and
+ * PDmap.
+ */
+ v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx;
+
+ kernel_pmap->pm_cr3 = KPML5phys;
+ kernel_pmap->pm_pmltop = v_pml5;
+}
+SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL);
+
/*
* Initialize a vm_page's machine-dependent fields.
*/
@@ -2190,7 +2431,8 @@
}
for (i = 0; i < lm_ents; i++) {
m = pmap_large_map_getptp_unlocked();
- kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
+ /* XXXKIB la57 */
+ kernel_pml4[LMSPML4I + i] = X86_PG_V |
X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
VM_PAGE_TO_PHYS(m);
}
@@ -3566,44 +3808,57 @@
static void
_pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
{
+ pml5_entry_t *pml5;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ vm_page_t pdpg, pdppg, pml4pg;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+
/*
* unmap the page table page
*/
- if (m->pindex >= NUPDE + NUPDPE) {
+ if (m->pindex >= NUPDE + NUPDPE + NUPML4E) {
+ /* PML4 page */
+ MPASS(pmap_is_la57(pmap));
+ pml5 = pmap_pml5e(pmap, va);
+ *pml5 = 0;
+ if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) {
+ pml5 = pmap_pml5e_u(pmap, va);
+ *pml5 = 0;
+ }
+ } else if (m->pindex >= NUPDE + NUPDPE) {
/* PDP page */
- pml4_entry_t *pml4;
pml4 = pmap_pml4e(pmap, va);
*pml4 = 0;
- if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
- pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
+ if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
+ va <= VM_MAXUSER_ADDRESS) {
+ pml4 = pmap_pml4e_u(pmap, va);
*pml4 = 0;
}
} else if (m->pindex >= NUPDE) {
/* PD page */
- pdp_entry_t *pdp;
pdp = pmap_pdpe(pmap, va);
*pdp = 0;
} else {
/* PTE page */
- pd_entry_t *pd;
pd = pmap_pde(pmap, va);
*pd = 0;
}
pmap_resident_count_dec(pmap, 1);
if (m->pindex < NUPDE) {
/* We just released a PT, unhold the matching PD */
- vm_page_t pdpg;
-
pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
pmap_unwire_ptp(pmap, va, pdpg, free);
} else if (m->pindex < NUPDE + NUPDPE) {
/* We just released a PD, unhold the matching PDP */
- vm_page_t pdppg;
-
pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
pmap_unwire_ptp(pmap, va, pdppg, free);
+ } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) {
+ /* We just released a PDP, unhold the matching PML4 */
+ pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME);
+ pmap_unwire_ptp(pmap, va, pml4pg, free);
}
/*
@@ -3659,9 +3914,9 @@
int i;
PMAP_LOCK_INIT(pmap);
- pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
- pmap->pm_pml4u = NULL;
- pmap->pm_cr3 = KPML4phys;
+ pmap->pm_pmltop = kernel_pmap->pm_pmltop;
+ pmap->pm_pmltopu = NULL;
+ pmap->pm_cr3 = kernel_pmap->pm_cr3;
/* hack to keep pmap_pti_pcid_invalidate() alive */
pmap->pm_ucr3 = PMAP_NO_CR3;
pmap->pm_root.rt_root = 0;
@@ -3714,18 +3969,59 @@
/* install large map entries if configured */
for (i = 0; i < lm_ents; i++)
- pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i];
+ pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i];
+}
+
+void
+pmap_pinit_pml5(vm_page_t pml5pg)
+{
+ pml5_entry_t *pm_pml5;
+
+ pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg));
+
+ /*
+ * Add a pml5 entry at the top of KVA pointing to the existing
+ * pml4 table, bringing all existing kernel mappings into the
+ * level 5 table.
+ */
+ pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
+ X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
+ pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
+
+ /*
+ * Install self-referential address mapping entry.
+ */
+ pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) |
+ X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A |
+ pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
}
static void
-pmap_pinit_pml4_pti(vm_page_t pml4pg)
+pmap_pinit_pml4_pti(vm_page_t pml4pgu)
{
- pml4_entry_t *pm_pml4;
+ pml4_entry_t *pm_pml4u;
int i;
- pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
+ pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu));
for (i = 0; i < NPML4EPG; i++)
- pm_pml4[i] = pti_pml4[i];
+ pm_pml4u[i] = pti_pml4[i];
+}
+
+static void
+pmap_pinit_pml5_pti(vm_page_t pml5pgu)
+{
+ pml5_entry_t *pm_pml5u;
+
+ pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu));
+
+ /*
+ * Add a pml5 entry at the top of KVA pointing to the existing
+ * pml4 PTI table, bringing all kernel mappings needed for
+ * usermode into the level 5 table.
+ */
+ pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] =
+ pmap_kextract((vm_offset_t)pti_pml4) |
+ X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g |
+ pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE);
}
/*
@@ -3735,29 +4031,30 @@
int
pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
{
- vm_page_t pml4pg, pml4pgu;
- vm_paddr_t pml4phys;
+ vm_page_t pmltop_pg, pmltop_pgu;
+ vm_paddr_t pmltop_phys;
int i;
/*
* allocate the page directory page
*/
- pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
+ pmltop_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
- pml4phys = VM_PAGE_TO_PHYS(pml4pg);
- pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
+ pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg);
+ pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys);
+
CPU_FOREACH(i) {
pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
pmap->pm_pcids[i].pm_gen = 0;
}
pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */
pmap->pm_ucr3 = PMAP_NO_CR3;
- pmap->pm_pml4u = NULL;
+ pmap->pm_pmltopu = NULL;
pmap->pm_type = pm_type;
- if ((pml4pg->flags & PG_ZERO) == 0)
- pagezero(pmap->pm_pml4);
+ if ((pmltop_pg->flags & PG_ZERO) == 0)
+ pagezero(pmap->pm_pmltop);
/*
* Do not install the host kernel mappings in the nested page
@@ -3766,15 +4063,21 @@
* Install minimal kernel mappings in PTI case.
*/
if (pm_type == PT_X86) {
- pmap->pm_cr3 = pml4phys;
- pmap_pinit_pml4(pml4pg);
+ pmap->pm_cr3 = pmltop_phys;
+ if (pmap_is_la57(pmap))
+ pmap_pinit_pml5(pmltop_pg);
+ else
+ pmap_pinit_pml4(pmltop_pg);
if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {
- pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
+ pmltop_pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
- pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
- VM_PAGE_TO_PHYS(pml4pgu));
- pmap_pinit_pml4_pti(pml4pgu);
- pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
+ pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP(
+ VM_PAGE_TO_PHYS(pmltop_pgu));
+ if (pmap_is_la57(pmap))
+ pmap_pinit_pml5_pti(pmltop_pgu);
+ else
+ pmap_pinit_pml4_pti(pmltop_pgu);
+ pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu);
}
if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
rangeset_init(&pmap->pm_pkru, pkru_dup_range,
@@ -3799,13 +4102,87 @@
return (pmap_pinit_type(pmap, PT_X86, pmap_flags));
}
+static pml4_entry_t *
+pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
+ bool addref)
+{
+ vm_pindex_t pml5index;
+ pml5_entry_t *pml5;
+ pml4_entry_t *pml4;
+ vm_page_t pml4pg;
+ pt_entry_t PG_V;
+ bool allocated;
+
+ if (!pmap_is_la57(pmap))
+ return (&pmap->pm_pmltop[pmap_pml4e_index(va)]);
+
+ PG_V = pmap_valid_bit(pmap);
+ pml5index = pmap_pml5e_index(va);
+ pml5 = &pmap->pm_pmltop[pml5index];
+ if ((*pml5 & PG_V) == 0) {
+ if (_pmap_allocpte(pmap, pmap_pml5e_pindex(va), lockp, va) ==
+ NULL)
+ return (NULL);
+ allocated = true;
+ } else {
+ allocated = false;
+ }
+ pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME);
+ pml4 = &pml4[pmap_pml4e_index(va)];
+ if ((*pml4 & PG_V) == 0) {
+ pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME);
+ if (allocated && !addref)
+ pml4pg->ref_count--;
+ else if (!allocated && addref)
+ pml4pg->ref_count++;
+ }
+ return (pml4);
+}
+
+static pdp_entry_t *
+pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va,
+ bool addref)
+{
+ vm_page_t pdppg;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pt_entry_t PG_V;
+ bool allocated;
+
+ PG_V = pmap_valid_bit(pmap);
+
+ pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false);
+ if (pml4 == NULL)
+ return (NULL);
+
+ if ((*pml4 & PG_V) == 0) {
+ /* Have to allocate a new pdp, recurse */
+ if (_pmap_allocpte(pmap, pmap_pml4e_pindex(va), lockp, va) ==
+ NULL)
+ return (NULL);
+ allocated = true;
+ } else {
+ allocated = false;
+ }
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
+ pdp = &pdp[pmap_pdpe_index(va)];
+ if ((*pdp & PG_V) == 0) {
+ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
+ if (allocated && !addref)
+ pdppg->ref_count--;
+ else if (!allocated && addref)
+ pdppg->ref_count++;
+ }
+ return (pdp);
+}
+
/*
* This routine is called if the desired page table page does not exist.
*
* If page table page allocation fails, this routine may sleep before
* returning NULL. It sleeps only if a lock pointer was given.
*
- * Note: If a page allocation fails at page table level two or three,
+ * Note: If a page allocation fails at page table level two, three, or four,
* one or two pages may be held during the wait, only to be released
* afterwards. This conservative approach is easily argued to avoid
* race conditions.
@@ -3823,20 +4200,35 @@
* - for the page directory pointer page,
* ptepindex = NUPDE + NUPDPE + (pmap_pde_index(va) >> (NPDEPGSHIFT +
* NPML4EPGSHIFT),
- * i.e. index of pml4e is put after the last index of PDPE.
+ * i.e. index of pml4e is put after the last index of PDPE,
+ * - for the PML4 page (if LA57 mode is enabled),
+ * ptepindex = NUPDE + NUPDPE + NUPML4E + (pmap_pde_index(va) >>
+ * (NPDEPGSHIFT + NPML4EPGSHIFT + NPML5EPGSHIFT),
+ * i.e. index of pml5e is put after the last index of PML4E.
*
* Define an order on the paging entries, where all entries of the
* same height are put together, then heights are put from deepest to
* root. Then ptexpindex is the sequential number of the
* corresponding paging entry in this order.
*
- * The root page at PML4 does not participate in this indexing scheme, since
- * it is statically allocated by pmap_pinit() and not by _pmap_allocpte().
+ * The values of NUPDE, NUPDPE, and NUPML4E are fixed by the size of
+ * the LA57 paging structures even in LA48 paging mode, and the
+ * ptepindexes are calculated as if the paging structures were 5-level
+ * regardless of the actual mode of operation.
+ *
+ * The root page at PML4/PML5 does not participate in this indexing scheme,
+ * since it is statically allocated by pmap_pinit() and not by _pmap_allocpte().
*/
static vm_page_t
-_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
+_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp,
+ vm_offset_t va __unused)
{
- vm_page_t m, pdppg, pdpg;
+ vm_pindex_t pml5index, pml4index;
+ pml5_entry_t *pml5, *pml5u;
+ pml4_entry_t *pml4, *pml4u;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ vm_page_t m, pdpg;
pt_entry_t PG_A, PG_M, PG_RW, PG_V;
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
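The comment block above defines one flat numbering for all page-table pages: PT pages occupy [0, NUPDE), PD pages the next NUPDPE slots, PDP pages the next NUPML4E, and PML4 pages come last, always computed with the LA57-sized constants. Restated as a single function (sketch mirroring the pmap_{pde,pdpe,pml4e,pml5e}_pindex() inlines from this patch):

    /*
     * Flat ptepindex of the page-table page at the given level that maps
     * va: level 1 = PT page, 2 = PD page, 3 = PDP page, 4 = PML4 page.
     */
    static vm_pindex_t
    ptepindex_for_level(vm_offset_t va, int level)
    {
        switch (level) {
        case 1:
            return (va >> PDRSHIFT);
        case 2:
            return (NUPDE + (va >> PDPSHIFT));
        case 3:
            return (NUPDE + NUPDPE + (va >> PML4SHIFT));
        default:
            return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT));
        }
    }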
@@ -3872,16 +4264,38 @@
* Map the pagetable page into the process address space, if
* it isn't already there.
*/
+ if (ptepindex >= NUPDE + NUPDPE + NUPML4E) {
+ MPASS(pmap_is_la57(pmap));
+
+ pml5index = pmap_pml5e_index(va);
+ pml5 = &pmap->pm_pmltop[pml5index];
+ KASSERT((*pml5 & PG_V) == 0,
+ ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5));
+ *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
- if (ptepindex >= (NUPDE + NUPDPE)) {
- pml4_entry_t *pml4, *pml4u;
- vm_pindex_t pml4index;
+ if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) {
+ if (pmap->pm_ucr3 != PMAP_NO_CR3)
+ *pml5 |= pg_nx;
+ pml5u = &pmap->pm_pmltopu[pml5index];
+ *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
+ PG_A | PG_M;
+ }
+ } else if (ptepindex >= NUPDE + NUPDPE) {
+ pml4index = pmap_pml4e_index(va);
/* Wire up a new PDPE page */
- pml4index = ptepindex - (NUPDE + NUPDPE);
- pml4 = &pmap->pm_pml4[pml4index];
+ pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true);
+ if (pml4 == NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
+ }
+ KASSERT((*pml4 & PG_V) == 0,
+ ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4));
*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
- if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
+
+ if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL &&
+ pml4index < NUPML4E) {
/*
* PTI: Make all user-space mappings in the
* kernel-mode page table no-execute so that
@@ -3892,85 +4306,48 @@
if (pmap->pm_ucr3 != PMAP_NO_CR3)
*pml4 |= pg_nx;
- pml4u = &pmap->pm_pml4u[pml4index];
+ pml4u = &pmap->pm_pmltopu[pml4index];
*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
PG_A | PG_M;
}
-
} else if (ptepindex >= NUPDE) {
- vm_pindex_t pml4index;
- vm_pindex_t pdpindex;
- pml4_entry_t *pml4;
- pdp_entry_t *pdp;
-
/* Wire up a new PDE page */
- pdpindex = ptepindex - NUPDE;
- pml4index = pdpindex >> NPML4EPGSHIFT;
-
- pml4 = &pmap->pm_pml4[pml4index];
- if ((*pml4 & PG_V) == 0) {
- /* Have to allocate a new pdp, recurse */
- if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
- lockp) == NULL) {
- vm_page_unwire_noq(m);
- vm_page_free_zero(m);
- return (NULL);
- }
- } else {
- /* Add reference to pdp page */
- pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
- pdppg->ref_count++;
+ pdp = pmap_allocpte_getpdp(pmap, lockp, va, true);
+ if (pdp == NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
}
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
-
- /* Now find the pdp page */
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
+ KASSERT((*pdp & PG_V) == 0,
+ ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp));
*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
-
} else {
- vm_pindex_t pml4index;
- vm_pindex_t pdpindex;
- pml4_entry_t *pml4;
- pdp_entry_t *pdp;
- pd_entry_t *pd;
-
/* Wire up a new PTE page */
- pdpindex = ptepindex >> NPDPEPGSHIFT;
- pml4index = pdpindex >> NPML4EPGSHIFT;
-
- /* First, find the pdp and check that its valid. */
- pml4 = &pmap->pm_pml4[pml4index];
- if ((*pml4 & PG_V) == 0) {
+ pdp = pmap_allocpte_getpdp(pmap, lockp, va, false);
+ if (pdp == NULL) {
+ vm_page_unwire_noq(m);
+ vm_page_free_zero(m);
+ return (NULL);
+ }
+ if ((*pdp & PG_V) == 0) {
/* Have to allocate a new pd, recurse */
- if (_pmap_allocpte(pmap, NUPDE + pdpindex,
- lockp) == NULL) {
+ if (_pmap_allocpte(pmap, pmap_pdpe_pindex(va),
+ lockp, va) == NULL) {
vm_page_unwire_noq(m);
vm_page_free_zero(m);
return (NULL);
}
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
} else {
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
- if ((*pdp & PG_V) == 0) {
- /* Have to allocate a new pd, recurse */
- if (_pmap_allocpte(pmap, NUPDE + pdpindex,
- lockp) == NULL) {
- vm_page_unwire_noq(m);
- vm_page_free_zero(m);
- return (NULL);
- }
- } else {
- /* Add reference to the pd page */
- pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
- pdpg->ref_count++;
- }
+ /* Add reference to the pd page */
+ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
+ pdpg->ref_count++;
}
pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
/* Now we know where the page directory page is */
- pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
+ pd = &pd[pmap_pde_index(va)];
+ KASSERT((*pd & PG_V) == 0,
+ ("pmap %p va %#lx pd %#lx", pmap, va, *pd));
*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
}
@@ -4003,7 +4380,7 @@
} else if (va < VM_MAXUSER_ADDRESS) {
/* Allocate a pd page. */
pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT;
- pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
+ pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp, va);
if (pdpg == NULL) {
if (lockp != NULL)
goto retry;
@@ -4064,7 +4441,7 @@
* Here if the pte page isn't mapped, or if it has been
* deallocated.
*/
- m = _pmap_allocpte(pmap, ptepindex, lockp);
+ m = _pmap_allocpte(pmap, ptepindex, lockp, va);
if (m == NULL && lockp != NULL)
goto retry;
}
@@ -4088,28 +4465,35 @@
int i;
KASSERT(pmap->pm_stats.resident_count == 0,
- ("pmap_release: pmap resident count %ld != 0",
- pmap->pm_stats.resident_count));
+ ("pmap_release: pmap %p resident count %ld != 0",
+ pmap, pmap->pm_stats.resident_count));
KASSERT(vm_radix_is_empty(&pmap->pm_root),
- ("pmap_release: pmap has reserved page table page(s)"));
+ ("pmap_release: pmap %p has reserved page table page(s)",
+ pmap));
KASSERT(CPU_EMPTY(&pmap->pm_active),
("releasing active pmap %p", pmap));
- m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4));
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop));
- for (i = 0; i < NKPML4E; i++) /* KVA */
- pmap->pm_pml4[KPML4BASE + i] = 0;
- for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
- pmap->pm_pml4[DMPML4I + i] = 0;
- pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */
- for (i = 0; i < lm_ents; i++) /* Large Map */
- pmap->pm_pml4[LMSPML4I + i] = 0;
+ if (pmap_is_la57(pmap)) {
+ pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0;
+ pmap->pm_pmltop[PML5PML5I] = 0;
+ } else {
+ for (i = 0; i < NKPML4E; i++) /* KVA */
+ pmap->pm_pmltop[KPML4BASE + i] = 0;
+ for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
+ pmap->pm_pmltop[DMPML4I + i] = 0;
+ pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */
+ for (i = 0; i < lm_ents; i++) /* Large Map */
+ pmap->pm_pmltop[LMSPML4I + i] = 0;
+ }
vm_page_unwire_noq(m);
vm_page_free_zero(m);
- if (pmap->pm_pml4u != NULL) {
- m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u));
+ if (pmap->pm_pmltopu != NULL) {
+ m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->
+ pm_pmltopu));
vm_page_unwire_noq(m);
vm_page_free(m);
}
@@ -5448,6 +5832,7 @@
{
struct rwlock *lock;
vm_offset_t va_next;
+ pml5_entry_t *pml5e;
pml4_entry_t *pml4e;
pdp_entry_t *pdpe;
pd_entry_t ptpaddr, *pde;
@@ -5490,7 +5875,18 @@
if (pmap->pm_stats.resident_count == 0)
break;
- pml4e = pmap_pml4e(pmap, sva);
+ if (pmap_is_la57(pmap)) {
+ pml5e = pmap_pml5e(pmap, sva);
+ if ((*pml5e & PG_V) == 0) {
+ va_next = (sva + NBPML5) & ~PML5MASK;
+ if (va_next < sva)
+ va_next = eva;
+ continue;
+ }
+ pml4e = pmap_pml5e_to_pml4e(pml5e, sva);
+ } else {
+ pml4e = pmap_pml4e(pmap, sva);
+ }
if ((*pml4e & PG_V) == 0) {
va_next = (sva + NBPML4) & ~PML4MASK;
if (va_next < sva)
@@ -6110,7 +6506,7 @@
*/
nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
- nosleep ? NULL : &lock);
+ nosleep ? NULL : &lock, va);
if (mpte == NULL && nosleep) {
rv = KERN_RESOURCE_SHORTAGE;
goto out;
@@ -6593,7 +6989,8 @@
* Pass NULL instead of the PV list lock
* pointer, because we don't intend to sleep.
*/
- mpte = _pmap_allocpte(pmap, ptepindex, NULL);
+ mpte = _pmap_allocpte(pmap, ptepindex, NULL,
+ va);
if (mpte == NULL)
return (mpte);
}
@@ -9346,11 +9743,11 @@
("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
"%#jx lm_ents %d",
(uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0,
+ KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
"LMSPML4I %#jx lm_ents %d",
(uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
- mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME;
+ mphys = kernel_pml4[pml4_idx] & PG_FRAME;
return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
}
@@ -10425,7 +10822,9 @@
mode, range->pdpes, range->pdes, range->ptes);
/* Reset to sentinel value. */
- range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+ range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1);
}
/*
@@ -10519,7 +10918,9 @@
sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
/* Sentinel value. */
- range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+ range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+ NPDEPG - 1, NPTEPG - 1);
/*
* Iterate over the kernel page tables without holding the kernel pmap
@@ -10549,7 +10950,7 @@
sva |= -1ul << 48;
restart:
- pml4e = kernel_pmap->pm_pml4[i];
+ pml4e = kernel_pml4[i];
if ((pml4e & X86_PG_V) == 0) {
sva = rounddown2(sva, NBPML4);
sysctl_kmaps_dump(sb, &range, sva);
@@ -10632,6 +11033,7 @@
DB_SHOW_COMMAND(pte, pmap_print_pte)
{
pmap_t pmap;
+ pml5_entry_t *pml5;
pml4_entry_t *pml4;
pdp_entry_t *pdp;
pd_entry_t *pde;
@@ -10650,8 +11052,20 @@
pmap = PCPU_GET(curpmap);
PG_V = pmap_valid_bit(pmap);
- pml4 = pmap_pml4e(pmap, va);
- db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4);
+ db_printf("VA 0x%016lx", va);
+
+ if (pmap_is_la57(pmap)) {
+ pml5 = pmap_pml5e(pmap, va);
+ db_printf(" pml5e 0x%016lx", *pml5);
+ if ((*pml5 & PG_V) == 0) {
+ db_printf("\n");
+ return;
+ }
+ pml4 = pmap_pml5e_to_pml4e(pml5, va);
+ } else {
+ pml4 = pmap_pml4e(pmap, va);
+ }
+ db_printf(" pml4e 0x%016lx", *pml4);
if ((*pml4 & PG_V) == 0) {
db_printf("\n");
return;
@@ -10683,4 +11097,95 @@
db_printf("show phys2dmap addr\n");
}
}
+
+static void
+ptpages_show_page(int level, int idx, vm_page_t pg)
+{
+ db_printf("l %d i %d pg %p phys %#lx ref %x\n",
+ level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
+}
+
+static void
+ptpages_show_complain(int level, int idx, uint64_t pte)
+{
+ db_printf("l %d i %d pte %#lx\n", level, idx, pte);
+}
+
+static void
+ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
+{
+ vm_page_t pg3, pg2, pg1;
+ pml4_entry_t *pml4;
+ pdp_entry_t *pdp;
+ pd_entry_t *pd;
+ int i4, i3, i2;
+
+ pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
+ for (i4 = 0; i4 < num_entries; i4++) {
+ if ((pml4[i4] & PG_V) == 0)
+ continue;
+ pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME);
+ if (pg3 == NULL) {
+ ptpages_show_complain(3, i4, pml4[i4]);
+ continue;
+ }
+ ptpages_show_page(3, i4, pg3);
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3));
+ for (i3 = 0; i3 < NPDPEPG; i3++) {
+ if ((pdp[i3] & PG_V) == 0)
+ continue;
+ pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME);
+ if (pg2 == NULL) {
+ ptpages_show_complain(2, i3, pdp[i3]);
+ continue;
+ }
+ ptpages_show_page(2, i3, pg2);
+ pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2));
+ for (i2 = 0; i2 < NPDEPG; i2++) {
+ if ((pd[i2] & PG_V) == 0)
+ continue;
+ pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME);
+ if (pg1 == NULL) {
+ ptpages_show_complain(1, i2, pd[i2]);
+ continue;
+ }
+ ptpages_show_page(1, i2, pg1);
+ }
+ }
+ }
+}
+
+DB_SHOW_COMMAND(ptpages, pmap_ptpages)
+{
+ pmap_t pmap;
+ vm_page_t pg;
+ pml5_entry_t *pml5;
+ uint64_t PG_V;
+ int i5;
+
+ if (have_addr)
+ pmap = (pmap_t)addr;
+ else
+ pmap = PCPU_GET(curpmap);
+
+ PG_V = pmap_valid_bit(pmap);
+
+ if (pmap_is_la57(pmap)) {
+ pml5 = pmap->pm_pmltop;
+ for (i5 = 0; i5 < NUPML5E; i5++) {
+ if ((pml5[i5] & PG_V) == 0)
+ continue;
+ pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME);
+ if (pg == NULL) {
+ ptpages_show_complain(4, i5, pml5[i5]);
+ continue;
+ }
+ ptpages_show_page(4, i5, pg);
+ ptpages_show_pml4(pg, NPML4EPG, PG_V);
+ }
+ } else {
+ ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
+ (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V);
+ }
+}
#endif
Index: sys/amd64/amd64/vm_machdep.c
===================================================================
--- sys/amd64/amd64/vm_machdep.c
+++ sys/amd64/amd64/vm_machdep.c
@@ -377,21 +377,67 @@
}
static void
-cpu_procctl_kpti(struct proc *p, int com, int *val)
+cpu_procctl_kpti_ctl(struct proc *p, int val)
{
- if (com == PROC_KPTI_CTL) {
- if (pti && *val == PROC_KPTI_CTL_ENABLE_ON_EXEC)
- p->p_md.md_flags |= P_MD_KPTI;
- if (*val == PROC_KPTI_CTL_DISABLE_ON_EXEC)
- p->p_md.md_flags &= ~P_MD_KPTI;
- } else /* PROC_KPTI_STATUS */ {
- *val = (p->p_md.md_flags & P_MD_KPTI) != 0 ?
- PROC_KPTI_CTL_ENABLE_ON_EXEC:
- PROC_KPTI_CTL_DISABLE_ON_EXEC;
- if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3)
- *val |= PROC_KPTI_STATUS_ACTIVE;
+ if (pti && val == PROC_KPTI_CTL_ENABLE_ON_EXEC)
+ p->p_md.md_flags |= P_MD_KPTI;
+ if (val == PROC_KPTI_CTL_DISABLE_ON_EXEC)
+ p->p_md.md_flags &= ~P_MD_KPTI;
+}
+
+static void
+cpu_procctl_kpti_status(struct proc *p, int *val)
+{
+ *val = (p->p_md.md_flags & P_MD_KPTI) != 0 ?
+ PROC_KPTI_CTL_ENABLE_ON_EXEC:
+ PROC_KPTI_CTL_DISABLE_ON_EXEC;
+ if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3)
+ *val |= PROC_KPTI_STATUS_ACTIVE;
+}
+
+static int
+cpu_procctl_la_ctl(struct proc *p, int val)
+{
+ int error;
+
+ error = 0;
+ switch (val) {
+ case PROC_LA_CTL_LA48_ON_EXEC:
+ p->p_md.md_flags |= P_MD_LA48;
+ p->p_md.md_flags &= ~P_MD_LA57;
+ break;
+ case PROC_LA_CTL_LA57_ON_EXEC:
+ if (la57) {
+ p->p_md.md_flags &= ~P_MD_LA48;
+ p->p_md.md_flags |= P_MD_LA57;
+ } else {
+ error = ENOTSUP;
+ }
+ break;
+ case PROC_LA_CTL_DEFAULT_ON_EXEC:
+ p->p_md.md_flags &= ~(P_MD_LA48 | P_MD_LA57);
+ break;
}
+ return (error);
+}
+
+static void
+cpu_procctl_la_status(struct proc *p, int *val)
+{
+ int res;
+
+ if ((p->p_md.md_flags & P_MD_LA48) != 0)
+ res = PROC_LA_CTL_LA48_ON_EXEC;
+ else if ((p->p_md.md_flags & P_MD_LA57) != 0)
+ res = PROC_LA_CTL_LA57_ON_EXEC;
+ else
+ res = PROC_LA_CTL_DEFAULT_ON_EXEC;
+ if (p->p_sysent->sv_maxuser == VM_MAXUSER_ADDRESS_LA48)
+ res |= PROC_LA_STATUS_LA48;
+ else
+ res |= PROC_LA_STATUS_LA57;
+ *val = res;
}
int
@@ -403,6 +449,8 @@
switch (com) {
case PROC_KPTI_CTL:
case PROC_KPTI_STATUS:
+ case PROC_LA_CTL:
+ case PROC_LA_STATUS:
if (idtype != P_PID) {
error = EINVAL;
break;
@@ -412,22 +460,45 @@
error = priv_check(td, PRIV_IO);
if (error != 0)
break;
+ }
+ if (com == PROC_KPTI_CTL || com == PROC_LA_CTL) {
error = copyin(data, &val, sizeof(val));
if (error != 0)
break;
- if (val != PROC_KPTI_CTL_ENABLE_ON_EXEC &&
- val != PROC_KPTI_CTL_DISABLE_ON_EXEC) {
- error = EINVAL;
- break;
- }
+ }
+ if (com == PROC_KPTI_CTL &&
+ val != PROC_KPTI_CTL_ENABLE_ON_EXEC &&
+ val != PROC_KPTI_CTL_DISABLE_ON_EXEC) {
+ error = EINVAL;
+ break;
+ }
+ if (com == PROC_LA_CTL &&
+ val != PROC_LA_CTL_LA48_ON_EXEC &&
+ val != PROC_LA_CTL_LA57_ON_EXEC &&
+ val != PROC_LA_CTL_DEFAULT_ON_EXEC) {
+ error = EINVAL;
+ break;
}
error = pget(id, PGET_CANSEE | PGET_NOTWEXIT | PGET_NOTID, &p);
- if (error == 0) {
- cpu_procctl_kpti(p, com, &val);
- PROC_UNLOCK(p);
- if (com == PROC_KPTI_STATUS)
- error = copyout(&val, data, sizeof(val));
+ if (error != 0)
+ break;
+ switch (com) {
+ case PROC_KPTI_CTL:
+ cpu_procctl_kpti_ctl(p, val);
+ break;
+ case PROC_KPTI_STATUS:
+ cpu_procctl_kpti_status(p, &val);
+ break;
+ case PROC_LA_CTL:
+ error = cpu_procctl_la_ctl(p, val);
+ break;
+ case PROC_LA_STATUS:
+ cpu_procctl_la_status(p, &val);
+ break;
}
+ PROC_UNLOCK(p);
+ if (com == PROC_KPTI_STATUS || com == PROC_LA_STATUS)
+ error = copyout(&val, data, sizeof(val));
break;
default:
error = EINVAL;
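Userland reaches the machinery above through procctl(2). For example, a process can request that subsequent execs get the 48-bit layout (sketch; request_la48 is an invented wrapper around the commands this patch adds):

    #include <sys/procctl.h>
    #include <unistd.h>

    static int
    request_la48(void)
    {
        int val = PROC_LA_CTL_LA48_ON_EXEC;

        /* Takes effect at the next exec; PROC_LA_STATUS reads it back. */
        return (procctl(P_PID, getpid(), PROC_LA_CTL, &val));
    }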
Index: sys/amd64/include/md_var.h
===================================================================
--- sys/amd64/include/md_var.h
+++ sys/amd64/include/md_var.h
@@ -46,6 +46,8 @@
extern vm_paddr_t intel_graphics_stolen_base;
extern vm_paddr_t intel_graphics_stolen_size;
+extern int la57;
+
/*
* The file "conf/ldscript.amd64" defines the symbol "kernphys". Its
* value is the physical address at which the kernel is loaded.
Index: sys/amd64/include/param.h
===================================================================
--- sys/amd64/include/param.h
+++ sys/amd64/include/param.h
@@ -118,6 +118,12 @@
#define PML4SHIFT 39 /* LOG2(NBPML4) */
#define NBPML4 (1UL<<PML4SHIFT)/* bytes/page map lev4 table */
#define PML4MASK (NBPML4-1)
+/* Size of the level 5 page-map level-5 table units */
+#define NPML5EPG (PAGE_SIZE/(sizeof (pml5_entry_t)))
+#define NPML5EPGSHIFT 9 /* LOG2(NPML5EPG) */
+#define PML5SHIFT 48 /* LOG2(NBPML5) */
+#define NBPML5 (1UL<<PML5SHIFT)/* bytes/page map lev5 table */
+#define PML5MASK (NBPML5-1)
#define MAXPAGESIZES 3 /* maximum number of supported page sizes */
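A quick consistency check of the new constants above (sketch): each PML5 entry spans NBPML5 = 2^48 bytes (256 TiB), and a page of NPML5EPG = 512 entries covers the full 57-bit virtual address space.

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t nbpml5 = UINT64_C(1) << 48;            /* NBPML5 */

        assert(nbpml5 == UINT64_C(256) << 40);          /* 256 TiB/entry */
        assert(512 * nbpml5 == UINT64_C(1) << 57);      /* 128 PiB total */
        return (0);
    }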
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h
+++ sys/amd64/include/pmap.h
@@ -166,14 +166,22 @@
* Pte related macros. This is complicated by having to deal with
* the sign extension of the 48th bit.
*/
-#define KVADDR(l4, l3, l2, l1) ( \
+#define KV4ADDR(l4, l3, l2, l1) ( \
((unsigned long)-1 << 47) | \
((unsigned long)(l4) << PML4SHIFT) | \
((unsigned long)(l3) << PDPSHIFT) | \
((unsigned long)(l2) << PDRSHIFT) | \
((unsigned long)(l1) << PAGE_SHIFT))
+#define KV5ADDR(l5, l4, l3, l2, l1) ( \
+ ((unsigned long)-1 << 56) | \
+ ((unsigned long)(l5) << PML5SHIFT) | \
+ ((unsigned long)(l4) << PML4SHIFT) | \
+ ((unsigned long)(l3) << PDPSHIFT) | \
+ ((unsigned long)(l2) << PDRSHIFT) | \
+ ((unsigned long)(l1) << PAGE_SHIFT))
-#define UVADDR(l4, l3, l2, l1) ( \
+#define UVADDR(l5, l4, l3, l2, l1) ( \
+ ((unsigned long)(l5) << PML5SHIFT) | \
((unsigned long)(l4) << PML4SHIFT) | \
((unsigned long)(l3) << PDPSHIFT) | \
((unsigned long)(l2) << PDRSHIFT) | \
@@ -187,9 +195,19 @@
*/
#define NKPML4E 4
-#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */
-#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */
-#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */
+/*
+ * We use consistent numbering of the page table pages for 5-level and
+ * 4-level paging structures.
+ */
+#define NUPML5E (NPML5EPG / 2) /* number of userland PML5
+ pages */
+#define NUPML4E (NUPML5E * NPML4EPG) /* number of userland PML4
+ pages */
+#define NUPDPE (NUPML4E * NPDPEPG) /* number of userland PDP
+ pages */
+#define NUPDE (NUPDPE * NPDEPG) /* number of userland PD
+ entries */
+#define NUP4ML4E (NPML4EPG / 2) /* userland PML4 pages with LA48 */
/*
* NDMPML4E is the maximum number of PML4 entries that will be
@@ -216,7 +234,8 @@
* Or, in other words, KPML4I provides bits 39..47 of KERNBASE,
* and KPDPI provides bits 30..38.)
*/
-#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */
+#define PML4PML4I (NPML4EPG / 2) /* Index of recursive pml4 mapping */
+#define PML5PML5I (NPML5EPG / 2) /* Index of recursive pml5 mapping */
#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */
#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
@@ -258,25 +277,34 @@
typedef u_int64_t pt_entry_t;
typedef u_int64_t pdp_entry_t;
typedef u_int64_t pml4_entry_t;
+typedef u_int64_t pml5_entry_t;
/*
* Address of current address space page table maps and directories.
*/
#ifdef _KERNEL
-#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0))
-#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0))
-#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0))
-#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I))
-#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t)))
-#define PTmap ((pt_entry_t *)(addr_PTmap))
-#define PDmap ((pd_entry_t *)(addr_PDmap))
-#define PDPmap ((pd_entry_t *)(addr_PDPmap))
-#define PML4map ((pd_entry_t *)(addr_PML4map))
-#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e))
+#define addr_P4Tmap (KV4ADDR(PML4PML4I, 0, 0, 0))
+#define addr_P4Dmap (KV4ADDR(PML4PML4I, PML4PML4I, 0, 0))
+#define addr_P4DPmap (KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0))
+#define addr_P4ML4map (KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I))
+#define addr_P4ML4pml4e (addr_P4ML4map + (PML4PML4I * sizeof(pml4_entry_t)))
+#define P4Tmap ((pt_entry_t *)(addr_P4Tmap))
+#define P4Dmap ((pd_entry_t *)(addr_P4Dmap))
+
+#define addr_P5Tmap (KV5ADDR(PML5PML5I, 0, 0, 0, 0))
+#define addr_P5Dmap (KV5ADDR(PML5PML5I, PML5PML5I, 0, 0, 0))
+#define addr_P5DPmap (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, 0, 0))
+#define addr_P5ML4map (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I, 0))
+#define addr_P5ML5map \
+ (KV5ADDR(PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I, PML5PML5I))
+#define addr_P5ML5pml5e (addr_P5ML5map + (PML5PML5I * sizeof(pml5_entry_t)))
+#define P5Tmap ((pt_entry_t *)(addr_P5Tmap))
+#define P5Dmap ((pd_entry_t *)(addr_P5Dmap))
extern int nkpt; /* Initial number of kernel page tables */
extern u_int64_t KPDPphys; /* physical address of kernel level 3 */
extern u_int64_t KPML4phys; /* physical address of kernel level 4 */
+extern u_int64_t KPML5phys; /* physical address of kernel level 5 */
/*
* virtual address to page table entry and
@@ -333,8 +361,8 @@
*/
struct pmap {
struct mtx pm_mtx;
- pml4_entry_t *pm_pml4; /* KVA of level 4 page table */
- pml4_entry_t *pm_pml4u; /* KVA of user l4 page table */
+ pml4_entry_t *pm_pmltop; /* KVA of top level page table */
+ pml4_entry_t *pm_pmltopu; /* KVA of user top page table */
uint64_t pm_cr3;
uint64_t pm_ucr3;
TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */
@@ -447,6 +475,7 @@
boolean_t pmap_page_is_mapped(vm_page_t m);
void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
void pmap_pinit_pml4(vm_page_t);
+void pmap_pinit_pml5(vm_page_t);
bool pmap_ps_enabled(pmap_t pmap);
void pmap_unmapdev(vm_offset_t, vm_size_t);
void pmap_invalidate_page(pmap_t, vm_offset_t);
@@ -502,6 +531,13 @@
return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
}
+static __inline vm_pindex_t
+pmap_pml5e_index(vm_offset_t va)
+{
+
+ return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1));
+}
+
#endif /* !LOCORE */
#endif /* !_MACHINE_PMAP_H_ */
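Two worked examples of the new addressing, as a standalone sketch (not part of the patch). With PML5PML5I == 256, KV5ADDR forces the canonical kernel form of the address, so the recursive map sits at the bottom of the kernel half of the 57-bit space; decomposing the highest LA57 user page shows pmap_pml5e_index() peeling off the new top 9 bits:

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* KV5ADDR(256, 0, 0, 0, 0): bits 56..63 all set. */
        uint64_t p5tmap = ((uint64_t)-1 << 56) | (256ULL << 48);
        assert(p5tmap == 0xff00000000000000ULL);    /* addr_P5Tmap */

        /* Highest LA57 user page: 2^56 - 4096. */
        uint64_t va = 0x00fffffffffff000ULL;
        assert(((va >> 48) & 0x1ff) == 255);    /* pmap_pml5e_index, NUPML5E - 1 */
        assert(((va >> 39) & 0x1ff) == 511);    /* pmap_pml4e_index */
        assert(((va >> 12) & 0x1ff) == 511);    /* pmap_pte_index */
        return (0);
    }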
Index: sys/amd64/include/proc.h
===================================================================
--- sys/amd64/include/proc.h
+++ sys/amd64/include/proc.h
@@ -84,6 +84,8 @@
};
#define P_MD_KPTI 0x00000001 /* Enable KPTI on exec */
+#define P_MD_LA48 0x00000002 /* Request LA48 after exec */
+#define P_MD_LA57 0x00000004 /* Request LA57 after exec */
#define KINFO_PROC_SIZE 1088
#define KINFO_PROC32_SIZE 768
Index: sys/amd64/include/vmm.h
===================================================================
--- sys/amd64/include/vmm.h
+++ sys/amd64/include/vmm.h
@@ -520,6 +520,7 @@
PAGING_MODE_32,
PAGING_MODE_PAE,
PAGING_MODE_64,
+ PAGING_MODE_64_LA57,
};
struct vm_guest_paging {
Index: sys/amd64/include/vmparam.h
===================================================================
--- sys/amd64/include/vmparam.h
+++ sys/amd64/include/vmparam.h
@@ -169,25 +169,32 @@
* 0xffffffff80000000 KERNBASE
*/
-#define VM_MIN_KERNEL_ADDRESS KVADDR(KPML4BASE, 0, 0, 0)
-#define VM_MAX_KERNEL_ADDRESS KVADDR(KPML4BASE + NKPML4E - 1, \
+#define VM_MIN_KERNEL_ADDRESS KV4ADDR(KPML4BASE, 0, 0, 0)
+#define VM_MAX_KERNEL_ADDRESS KV4ADDR(KPML4BASE + NKPML4E - 1, \
NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-#define DMAP_MIN_ADDRESS KVADDR(DMPML4I, 0, 0, 0)
-#define DMAP_MAX_ADDRESS KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)
+#define DMAP_MIN_ADDRESS KV4ADDR(DMPML4I, 0, 0, 0)
+#define DMAP_MAX_ADDRESS KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0)
-#define LARGEMAP_MIN_ADDRESS KVADDR(LMSPML4I, 0, 0, 0)
-#define LARGEMAP_MAX_ADDRESS KVADDR(LMEPML4I + 1, 0, 0, 0)
+#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0)
+#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0)
-#define KERNBASE KVADDR(KPML4I, KPDPI, 0, 0)
+#define KERNBASE KV4ADDR(KPML4I, KPDPI, 0, 0)
-#define UPT_MAX_ADDRESS KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
-#define UPT_MIN_ADDRESS KVADDR(PML4PML4I, 0, 0, 0)
+#define UPT_MAX_ADDRESS KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
+#define UPT_MIN_ADDRESS KV4ADDR(PML4PML4I, 0, 0, 0)
-#define VM_MAXUSER_ADDRESS UVADDR(NUPML4E, 0, 0, 0)
+#define VM_MAXUSER_ADDRESS_LA57 UVADDR(NUPML5E, 0, 0, 0, 0)
+#define VM_MAXUSER_ADDRESS_LA48 UVADDR(0, NUP4ML4E, 0, 0, 0)
+#define VM_MAXUSER_ADDRESS VM_MAXUSER_ADDRESS_LA57
-#define SHAREDPAGE (VM_MAXUSER_ADDRESS - PAGE_SIZE)
-#define USRSTACK SHAREDPAGE
+#define SHAREDPAGE_LA57 (VM_MAXUSER_ADDRESS_LA57 - PAGE_SIZE)
+#define SHAREDPAGE_LA48 (VM_MAXUSER_ADDRESS_LA48 - PAGE_SIZE)
+#define USRSTACK_LA57 SHAREDPAGE_LA57
+#define USRSTACK_LA48 SHAREDPAGE_LA48
+#define USRSTACK USRSTACK_LA48
+#define PS_STRINGS_LA57 (USRSTACK_LA57 - sizeof(struct ps_strings))
+#define PS_STRINGS_LA48 (USRSTACK_LA48 - sizeof(struct ps_strings))
#define VM_MAX_ADDRESS UPT_MAX_ADDRESS
#define VM_MIN_ADDRESS (0)
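Numerically, VM_MAXUSER_ADDRESS_LA57 = UVADDR(NUPML5E, 0, 0, 0, 0) = 256 << 48 = 2^56 (64 PiB), while VM_MAXUSER_ADDRESS_LA48 = UVADDR(0, NUP4ML4E, 0, 0, 0) = 256 << 39 = 2^47 (128 TiB), i.e. the old limit. Note that USRSTACK defaults to its LA48 location; only VM_MAXUSER_ADDRESS itself is raised. A compile-time check of the arithmetic (sketch, assuming the macros above are in scope):

    _Static_assert(VM_MAXUSER_ADDRESS_LA57 == 1UL << 56, "64 PiB user VA");
    _Static_assert(VM_MAXUSER_ADDRESS_LA48 == 1UL << 47, "128 TiB user VA");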
Index: sys/amd64/linux/linux_sysvec.c
===================================================================
--- sys/amd64/linux/linux_sysvec.c
+++ sys/amd64/linux/linux_sysvec.c
@@ -739,9 +739,9 @@
.sv_imgact_try = linux_exec_imgact_try,
.sv_minsigstksz = LINUX_MINSIGSTKSZ,
.sv_minuser = VM_MIN_ADDRESS,
- .sv_maxuser = VM_MAXUSER_ADDRESS,
- .sv_usrstack = USRSTACK,
- .sv_psstrings = PS_STRINGS,
+ .sv_maxuser = VM_MAXUSER_ADDRESS_LA48,
+ .sv_usrstack = USRSTACK_LA48,
+ .sv_psstrings = PS_STRINGS_LA48,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_auxargs = linux_copyout_auxargs,
.sv_copyout_strings = linux_copyout_strings,
@@ -752,7 +752,7 @@
.sv_set_syscall_retval = linux_set_syscall_retval,
.sv_fetch_syscall_args = linux_fetch_syscall_args,
.sv_syscallnames = NULL,
- .sv_shared_page_base = SHAREDPAGE,
+ .sv_shared_page_base = SHAREDPAGE_LA48,
.sv_shared_page_len = PAGE_SIZE,
.sv_schedtail = linux_schedtail,
.sv_thread_detach = linux_thread_detach,
Index: sys/amd64/vmm/amd/svm.c
===================================================================
--- sys/amd64/vmm/amd/svm.c
+++ sys/amd64/vmm/amd/svm.c
@@ -560,7 +560,7 @@
panic("contigmalloc of SVM IO bitmap failed");
svm_sc->vm = vm;
- svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4);
+ svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pmltop);
/*
* Intercept read and write accesses to all MSRs.
Index: sys/amd64/vmm/intel/vmx.c
===================================================================
--- sys/amd64/vmm/intel/vmx.c
+++ sys/amd64/vmm/intel/vmx.c
@@ -973,7 +973,7 @@
}
vmx->vm = vm;
- vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4));
+ vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop));
/*
* Clean up EPTP-tagged guest physical and combined mappings
@@ -1871,14 +1871,18 @@
static enum vm_paging_mode
vmx_paging_mode(void)
{
+ uint64_t cr4;
if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
return (PAGING_MODE_FLAT);
- if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
+ cr4 = vmcs_read(VMCS_GUEST_CR4);
+ if (!(cr4 & CR4_PAE))
return (PAGING_MODE_32);
- if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
- return (PAGING_MODE_64);
- else
+ if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) {
+ if (!(cr4 & CR4_LA57))
+ return (PAGING_MODE_64);
+ return (PAGING_MODE_64_LA57);
+ } else
return (PAGING_MODE_PAE);
}
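For reference, the classification implemented here (and mirrored in bhyve's gdb.c below) reduces to one decision chain over the guest control registers:

    CR0.PG   == 0  ->  PAGING_MODE_FLAT
    CR4.PAE  == 0  ->  PAGING_MODE_32
    EFER.LME == 0  ->  PAGING_MODE_PAE
    CR4.LA57 == 0  ->  PAGING_MODE_64
    otherwise      ->  PAGING_MODE_64_LA57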
Index: sys/amd64/vmm/vmm_instruction_emul.c
===================================================================
--- sys/amd64/vmm/vmm_instruction_emul.c
+++ sys/amd64/vmm/vmm_instruction_emul.c
@@ -2189,8 +2189,12 @@
ptpphys = pte;
nlevels = 2;
- } else
+ } else if (paging->paging_mode == PAGING_MODE_64_LA57) {
+ nlevels = 5;
+ } else {
nlevels = 4;
+ }
+
while (--nlevels >= 0) {
/* Zero out the lower 12 bits and the upper 12 bits */
ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
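The frame-masking context line above works unchanged for the five-level walk: shifting right 12, left 24, then right 12 clears bits 0-11 and 52-63 in one pass, keeping exactly the physical-frame bits 12..51. A standalone restatement of the trick (sketch):

    #include <assert.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t pte = 0xfff0000deadbeef8ULL;   /* NX + flags + frame */
        uint64_t frame = pte;

        frame >>= 12; frame <<= 24; frame >>= 12;   /* as in the walker */
        assert(frame == (pte & 0x000ffffffffff000ULL));
        return (0);
    }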
Index: sys/cddl/dev/dtrace/amd64/dtrace_subr.c
===================================================================
--- sys/cddl/dev/dtrace/amd64/dtrace_subr.c
+++ sys/cddl/dev/dtrace/amd64/dtrace_subr.c
@@ -43,6 +43,7 @@
#include <machine/clock.h>
#include <machine/cpufunc.h>
#include <machine/frame.h>
+#include <machine/md_var.h>
#include <machine/psl.h>
#include <machine/trap.h>
#include <vm/pmap.h>
@@ -131,7 +132,7 @@
void
dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
{
- (*func)(0, (uintptr_t) addr_PTmap);
+ (*func)(0, la57 ? (uintptr_t)addr_P5Tmap : (uintptr_t)addr_P4Tmap);
}
void
Index: sys/kern/imgact_elf.c
===================================================================
--- sys/kern/imgact_elf.c
+++ sys/kern/imgact_elf.c
@@ -97,7 +97,8 @@
int32_t *osrel);
static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel);
static boolean_t __elfN(check_note)(struct image_params *imgp,
- Elf_Brandnote *checknote, int32_t *osrel, uint32_t *fctl0);
+ Elf_Brandnote *checknote, int32_t *osrel, boolean_t *has_fctl0,
+ uint32_t *fctl0);
static vm_prot_t __elfN(trans_prot)(Elf_Word);
static Elf_Word __elfN(untrans_prot)(vm_prot_t);
@@ -309,7 +310,7 @@
{
const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
Elf_Brandinfo *bi, *bi_m;
- boolean_t ret;
+ boolean_t ret, has_fctl0;
int i, interp_name_len;
interp_name_len = interp != NULL ? strlen(interp) + 1 : 0;
@@ -331,11 +332,16 @@
continue;
if (hdr->e_machine == bi->machine && (bi->flags &
(BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) {
+ has_fctl0 = false;
+ *fctl0 = 0;
+ *osrel = 0;
ret = __elfN(check_note)(imgp, bi->brand_note, osrel,
- fctl0);
+ &has_fctl0, fctl0);
/* Give brand a chance to veto check_note's guess */
- if (ret && bi->header_supported)
- ret = bi->header_supported(imgp);
+ if (ret && bi->header_supported) {
+ ret = bi->header_supported(imgp, osrel,
+ has_fctl0 ? fctl0 : NULL);
+ }
/*
* If note checker claimed the binary, but the
* interpreter path in the image does not
@@ -374,7 +380,7 @@
bi->compat_3_brand) == 0))) {
/* Looks good, but give brand a chance to veto */
if (bi->header_supported == NULL ||
- bi->header_supported(imgp)) {
+ bi->header_supported(imgp, NULL, NULL)) {
/*
* Again, prefer strictly matching
* interpreter path.
@@ -402,7 +408,7 @@
bi->header_supported == NULL)
continue;
if (hdr->e_machine == bi->machine) {
- ret = bi->header_supported(imgp);
+ ret = bi->header_supported(imgp, NULL, NULL);
if (ret)
return (bi);
}
@@ -422,7 +428,7 @@
strlen(bi->interp_path) + 1 == interp_name_len &&
strncmp(interp, bi->interp_path, interp_name_len)
== 0 && (bi->header_supported == NULL ||
- bi->header_supported(imgp)))
+ bi->header_supported(imgp, NULL, NULL)))
return (bi);
}
}
@@ -436,7 +442,7 @@
if (hdr->e_machine == bi->machine &&
__elfN(fallback_brand) == bi->brand &&
(bi->header_supported == NULL ||
- bi->header_supported(imgp)))
+ bi->header_supported(imgp, NULL, NULL)))
return (bi);
}
return (NULL);
@@ -2657,6 +2663,7 @@
};
struct fctl_cb_arg {
+ boolean_t *has_fctl0;
uint32_t *fctl0;
};
@@ -2671,6 +2678,7 @@
p = (uintptr_t)(note + 1);
p += roundup2(note->n_namesz, ELF_NOTE_ROUNDSIZE);
desc = (const Elf32_Word *)p;
+ *arg->has_fctl0 = TRUE;
*arg->fctl0 = desc[0];
return (TRUE);
}
@@ -2683,7 +2691,7 @@
*/
static boolean_t
__elfN(check_note)(struct image_params *imgp, Elf_Brandnote *brandnote,
- int32_t *osrel, uint32_t *fctl0)
+ int32_t *osrel, boolean_t *has_fctl0, uint32_t *fctl0)
{
const Elf_Phdr *phdr;
const Elf_Ehdr *hdr;
@@ -2695,6 +2703,7 @@
phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
b_arg.brandnote = brandnote;
b_arg.osrel = osrel;
+ f_arg.has_fctl0 = has_fctl0;
f_arg.fctl0 = fctl0;
for (i = 0; i < hdr->e_phnum; i++) {
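The has_fctl0 plumbing lets a brand distinguish "no NT_FREEBSD_FCTL note at all" (it receives fctl0 == NULL) from "note present with every flag clear" (fctl0 points to 0); zeroing *fctl0 and *osrel before check_note also keeps stale values from a previous brand iteration out of the veto call. A userland restatement of the new header_supported contract (hypothetical sketch, not from the patch):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NT_FREEBSD_FCTL_LA48 0x00000010 /* from sys/sys/elf_common.h */

    static bool
    example_header_supported(int32_t *osrel, uint32_t *fctl0)
    {
        /* fctl0 == NULL: the image carried no feature-control note. */
        if (fctl0 != NULL && (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0)
            printf("image requests the 48-bit user VA layout\n");
        return (true);  /* never veto in this sketch */
    }

    int
    main(void)
    {
        uint32_t fctl0 = NT_FREEBSD_FCTL_LA48;

        example_header_supported(NULL, &fctl0); /* note present */
        example_header_supported(NULL, NULL);   /* note absent */
        return (0);
    }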
Index: sys/kern/kern_sharedpage.c
===================================================================
--- sys/kern/kern_sharedpage.c
+++ sys/kern/kern_sharedpage.c
@@ -288,3 +288,21 @@
#endif
}
}
+
+void
+exec_sysvec_init_secondary(struct sysentvec *sv, struct sysentvec *sv2)
+{
+ MPASS((sv2->sv_flags & SV_ABI_MASK) == (sv->sv_flags & SV_ABI_MASK));
+ MPASS((sv2->sv_flags & SV_TIMEKEEP) == (sv->sv_flags & SV_TIMEKEEP));
+ MPASS((sv2->sv_flags & SV_SHP) != 0 && (sv->sv_flags & SV_SHP) != 0);
+
+ sv2->sv_shared_page_obj = sv->sv_shared_page_obj;
+ sv2->sv_sigcode_base = sv2->sv_shared_page_base +
+ (sv->sv_sigcode_base - sv->sv_shared_page_base);
+ if ((sv2->sv_flags & SV_ABI_MASK) != SV_ABI_FREEBSD)
+ return;
+ if ((sv2->sv_flags & SV_TIMEKEEP) != 0) {
+ sv2->sv_timekeep_base = sv2->sv_shared_page_base +
+ (sv->sv_timekeep_base - sv->sv_shared_page_base);
+ }
+}
Index: sys/sys/elf_common.h
===================================================================
--- sys/sys/elf_common.h
+++ sys/sys/elf_common.h
@@ -796,6 +796,7 @@
#define NT_FREEBSD_FCTL_PROTMAX_DISABLE 0x00000002
#define NT_FREEBSD_FCTL_STKGAP_DISABLE 0x00000004
#define NT_FREEBSD_FCTL_WXNEEDED 0x00000008
+#define NT_FREEBSD_FCTL_LA48 0x00000010
/* Values for n_type. Used in core files. */
#define NT_PRSTATUS 1 /* Process status. */
Index: sys/sys/imgact_elf.h
===================================================================
--- sys/sys/imgact_elf.h
+++ sys/sys/imgact_elf.h
@@ -87,7 +87,8 @@
const char *interp_newpath;
int flags;
Elf_Brandnote *brand_note;
- boolean_t (*header_supported)(struct image_params *);
+ boolean_t (*header_supported)(struct image_params *,
+ int32_t *, uint32_t *);
#define BI_CAN_EXEC_DYN 0x0001
#define BI_BRAND_NOTE 0x0002 /* May have note.ABI-tag section. */
#define BI_BRAND_NOTE_MANDATORY 0x0004 /* Must have note.ABI-tag section. */
Index: sys/sys/sysent.h
===================================================================
--- sys/sys/sysent.h
+++ sys/sys/sysent.h
@@ -321,6 +321,7 @@
int shared_page_fill(int size, int align, const void *data);
void shared_page_write(int base, int size, const void *data);
void exec_sysvec_init(void *param);
+void exec_sysvec_init_secondary(struct sysentvec *sv, struct sysentvec *sv2);
void exec_inittk(void);
#define INIT_SYSENTVEC(name, sv) \
Index: sys/x86/acpica/acpi_wakeup.c
===================================================================
--- sys/x86/acpica/acpi_wakeup.c
+++ sys/x86/acpica/acpi_wakeup.c
@@ -99,7 +99,7 @@
#endif
#ifdef __amd64__
-#define ACPI_WAKEPAGES 4
+#define ACPI_WAKEPAGES 5
#else
#define ACPI_WAKEPAGES 1
#endif
@@ -414,8 +414,8 @@
static void *wakeaddr;
void *wakepages[ACPI_WAKEPAGES];
#ifdef __amd64__
- uint64_t *pt4, *pt3, *pt2;
- vm_paddr_t pt4pa, pt3pa, pt2pa;
+ uint64_t *pt5, *pt4, *pt3, *pt2;
+ vm_paddr_t pt5pa, pt4pa, pt3pa, pt2pa;
int i;
#endif
@@ -430,6 +430,10 @@
sc->acpi_wakephys = vtophys(wakeaddr);
#ifdef __amd64__
+ if (la57) {
+ pt5 = wakepages[4];
+ pt5pa = vtophys(pt5);
+ }
pt4 = wakepages[1];
pt3 = wakepages[2];
pt2 = wakepages[3];
@@ -448,7 +452,8 @@
#ifdef __amd64__
WAKECODE_FIXUP((wakeup_sw64 + 1), uint32_t,
sc->acpi_wakephys + wakeup_64);
- WAKECODE_FIXUP(wakeup_pagetables, uint32_t, pt4pa);
+ WAKECODE_FIXUP(wakeup_pagetables, uint32_t, la57 ? (pt5pa | 0x1) :
+ pt4pa);
#endif
/* Save pointers to some global data. */
@@ -457,7 +462,12 @@
WAKECODE_FIXUP(wakeup_cr3, register_t, pmap_get_kcr3());
#else /* __amd64__ */
/* Create the initial 1GB replicated page tables */
- for (i = 0; i < 512; i++) {
+ for (i = 0; i < NPTEPG; i++) {
+ if (la57) {
+ pt5[i] = (uint64_t)pt4pa;
+ pt5[i] |= PG_V | PG_RW | PG_U;
+ }
+
/*
* Each slot of the level 4 pages points
* to the same level 3 page
@@ -473,7 +483,7 @@
pt3[i] |= PG_V | PG_RW | PG_U;
/* The level 2 page slots are mapped with 2MB pages for 1GB. */
- pt2[i] = i * (2 * 1024 * 1024);
+ pt2[i] = i * NBPDR;
pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
}
#endif /* !__amd64__ */
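The wakeup tables are fully replicated: every slot of each upper level points at the same next-level page, and the bottom level identity-maps the first 1 GiB with 2 MiB pages, so the mapping works no matter which top-level indexes the wakeup code's physical address selects; LA57 merely adds one more replicated level on top. The coverage arithmetic (sketch):

    /* 512 PDEs x 2 MiB (NBPDR) == 1 GiB identity map. */
    _Static_assert(512UL * (2UL * 1024 * 1024) == 1UL << 30,
        "wakeup identity map covers 1 GiB");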
Index: sys/x86/include/procctl.h
===================================================================
--- sys/x86/include/procctl.h
+++ sys/x86/include/procctl.h
@@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
- * Copyright (c) 2019 The FreeBSD Foundation
+ * Copyright (c) 2019,2020 The FreeBSD Foundation
*
* Portions of this software were developed by Konstantin Belousov
* under sponsorship from the FreeBSD Foundation.
@@ -35,9 +35,18 @@
#define PROC_KPTI_CTL (PROC_PROCCTL_MD_MIN + 0)
#define PROC_KPTI_STATUS (PROC_PROCCTL_MD_MIN + 1)
+#define PROC_LA_CTL (PROC_PROCCTL_MD_MIN + 2)
+#define PROC_LA_STATUS (PROC_PROCCTL_MD_MIN + 3)
#define PROC_KPTI_CTL_ENABLE_ON_EXEC 1
#define PROC_KPTI_CTL_DISABLE_ON_EXEC 2
#define PROC_KPTI_STATUS_ACTIVE 0x80000000
+#define PROC_LA_CTL_LA48_ON_EXEC 1
+#define PROC_LA_CTL_LA57_ON_EXEC 2
+#define PROC_LA_CTL_DEFAULT_ON_EXEC 3
+
+#define PROC_LA_STATUS_LA48 0x01000000
+#define PROC_LA_STATUS_LA57 0x02000000
+
#endif
Index: sys/x86/include/specialreg.h
===================================================================
--- sys/x86/include/specialreg.h
+++ sys/x86/include/specialreg.h
@@ -72,6 +72,7 @@
#define CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */
#define CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */
#define CR4_UMIP 0x00000800 /* User Mode Instruction Prevention */
+#define CR4_LA57 0x00001000 /* Enable 5-level paging */
#define CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */
#define CR4_FSGSBASE 0x00010000 /* Enable FS/GS BASE accessing instructions */
#define CR4_PCIDE 0x00020000 /* Enable Context ID */
Index: usr.bin/elfctl/elfctl.c
===================================================================
--- usr.bin/elfctl/elfctl.c
+++ usr.bin/elfctl/elfctl.c
@@ -67,6 +67,7 @@
"Disable implicit PROT_MAX" },
{ "stackgap", NT_FREEBSD_FCTL_STKGAP_DISABLE, "Disable stack gap" },
{ "wxneeded", NT_FREEBSD_FCTL_WXNEEDED, "Requires W+X mappings" },
+ { "la48", NT_FREEBSD_FCTL_LA48, "amd64: Limit user VA to 48bit" },
};
static struct option long_opts[] = {
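With this feature-table entry in place, a binary can be pinned to the 48-bit layout through the usual elfctl workflow, e.g. elfctl -e +la48 prog, which sets NT_FREEBSD_FCTL_LA48 in the feature-control note that the imgact_elf has_fctl0 machinery above reads back at exec time.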
Index: usr.bin/proccontrol/proccontrol.1
===================================================================
--- usr.bin/proccontrol/proccontrol.1
+++ usr.bin/proccontrol/proccontrol.1
@@ -71,6 +71,9 @@
.Xr mmap 2 .
.It Ar kpti
Controls the KPTI enable, AMD64 only.
+.It Ar la48
+Controls limiting of the usermode process address space to 48 bits,
+AMD64 only, on machines capable of 57-bit addressing.
.El
.Pp
The
Index: usr.bin/proccontrol/proccontrol.c
===================================================================
--- usr.bin/proccontrol/proccontrol.c
+++ usr.bin/proccontrol/proccontrol.c
@@ -48,6 +48,10 @@
#ifdef PROC_KPTI_CTL
MODE_KPTI,
#endif
+#ifdef PROC_LA_CTL
+ MODE_LA57,
+ MODE_LA48,
+#endif
};
static pid_t
@@ -69,13 +73,18 @@
#else
#define KPTI_USAGE
#endif
+#ifdef PROC_LA_CTL
+#define LA_USAGE "|la48|la57"
+#else
+#define LA_USAGE
+#endif
static void __dead2
usage(void)
{
fprintf(stderr, "Usage: proccontrol -m (aslr|protmax|trace|trapcap|"
- "stackgap"KPTI_USAGE") [-q] "
+ "stackgap"KPTI_USAGE LA_USAGE") [-q] "
"[-s (enable|disable)] [-p pid | command]\n");
exit(1);
}
@@ -107,6 +116,12 @@
#ifdef PROC_KPTI_CTL
else if (strcmp(optarg, "kpti") == 0)
mode = MODE_KPTI;
+#endif
+#ifdef PROC_LA_CTL
+ else if (strcmp(optarg, "la57") == 0)
+ mode = MODE_LA57;
+ else if (strcmp(optarg, "la48") == 0)
+ mode = MODE_LA48;
#endif
else
usage();
@@ -163,6 +178,12 @@
case MODE_KPTI:
error = procctl(P_PID, pid, PROC_KPTI_STATUS, &arg);
break;
+#endif
+#ifdef PROC_LA_CTL
+ case MODE_LA57:
+ case MODE_LA48:
+ error = procctl(P_PID, pid, PROC_LA_STATUS, &arg);
+ break;
#endif
default:
usage();
@@ -258,6 +279,27 @@
else
printf(", not active\n");
break;
+#endif
+#ifdef PROC_LA_CTL
+ case MODE_LA57:
+ case MODE_LA48:
+ switch (arg & ~(PROC_LA_STATUS_LA48 |
+ PROC_LA_STATUS_LA57)) {
+ case PROC_LA_CTL_LA48_ON_EXEC:
+ printf("la48 on exec");
+ break;
+ case PROC_LA_CTL_LA57_ON_EXEC:
+ printf("la57 on exec");
+ break;
+ case PROC_LA_CTL_DEFAULT_ON_EXEC:
+ printf("default on exec");
+ break;
+ }
+ if ((arg & PROC_LA_STATUS_LA48) != 0)
+ printf(", la48 active\n");
+ else if ((arg & PROC_LA_STATUS_LA57) != 0)
+ printf(", la57 active\n");
+ break;
#endif
}
} else {
@@ -294,6 +336,18 @@
PROC_KPTI_CTL_DISABLE_ON_EXEC;
error = procctl(P_PID, pid, PROC_KPTI_CTL, &arg);
break;
+#endif
+#ifdef PROC_LA_CTL
+ case MODE_LA57:
+ arg = enable ? PROC_LA_CTL_LA57_ON_EXEC :
+ PROC_LA_CTL_DEFAULT_ON_EXEC;
+ error = procctl(P_PID, pid, PROC_LA_CTL, &arg);
+ break;
+ case MODE_LA48:
+ arg = enable ? PROC_LA_CTL_LA48_ON_EXEC :
+ PROC_LA_CTL_DEFAULT_ON_EXEC;
+ error = procctl(P_PID, pid, PROC_LA_CTL, &arg);
+ break;
#endif
default:
usage();
Index: usr.sbin/bhyve/gdb.c
===================================================================
--- usr.sbin/bhyve/gdb.c
+++ usr.sbin/bhyve/gdb.c
@@ -251,7 +251,8 @@
else if (!(regs[2] & CR4_PAE))
paging->paging_mode = PAGING_MODE_32;
else if (regs[3] & EFER_LME)
- paging->paging_mode = PAGING_MODE_64;
+ paging->paging_mode = (regs[2] & CR4_LA57) ?
+ PAGING_MODE_64_LA57 : PAGING_MODE_64;
else
paging->paging_mode = PAGING_MODE_PAE;
return (0);