Index: sys/amd64/amd64/efirt_machdep.c =================================================================== --- sys/amd64/amd64/efirt_machdep.c +++ sys/amd64/amd64/efirt_machdep.c @@ -61,9 +61,10 @@ #include #include +static pml5_entry_t *efi_pml5; static pml4_entry_t *efi_pml4; static vm_object_t obj_1t1_pt; -static vm_page_t efi_pml4_page; +static vm_page_t efi_pmltop_page; static vm_pindex_t efi_1t1_idx; void @@ -82,7 +83,8 @@ obj_1t1_pt = NULL; efi_pml4 = NULL; - efi_pml4_page = NULL; + efi_pml5 = NULL; + efi_pmltop_page = NULL; } /* @@ -109,22 +111,38 @@ static pt_entry_t * efi_1t1_pte(vm_offset_t va) { + pml5_entry_t *pml5e; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t *pde; pt_entry_t *pte; vm_page_t m; - vm_pindex_t pml4_idx, pdp_idx, pd_idx; + vm_pindex_t pml5_idx, pml4_idx, pdp_idx, pd_idx; vm_paddr_t mphys; pml4_idx = pmap_pml4e_index(va); - pml4e = &efi_pml4[pml4_idx]; + if (la57) { + pml5_idx = pmap_pml5e_index(va); + pml5e = &efi_pml5[pml5_idx]; + if (*pml5e == 0) { + m = efi_1t1_page(); + mphys = VM_PAGE_TO_PHYS(m); + *pml5e = mphys | X86_PG_RW | X86_PG_V; + } else { + mphys = *pml5e & PG_FRAME; + } + pml4e = (pml4_entry_t *)PHYS_TO_DMAP(mphys); + pml4e = &pml4e[pml4_idx]; + } else { + pml4e = &efi_pml4[pml4_idx]; + } + if (*pml4e == 0) { m = efi_1t1_page(); mphys = VM_PAGE_TO_PHYS(m); *pml4e = mphys | X86_PG_RW | X86_PG_V; } else { - mphys = *pml4e & ~PAGE_MASK; + mphys = *pml4e & PG_FRAME; } pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys); @@ -135,7 +153,7 @@ mphys = VM_PAGE_TO_PHYS(m); *pdpe = mphys | X86_PG_RW | X86_PG_V; } else { - mphys = *pdpe & ~PAGE_MASK; + mphys = *pdpe & PG_FRAME; } pde = (pd_entry_t *)PHYS_TO_DMAP(mphys); @@ -146,7 +164,7 @@ mphys = VM_PAGE_TO_PHYS(m); *pde = mphys | X86_PG_RW | X86_PG_V; } else { - mphys = *pde & ~PAGE_MASK; + mphys = *pde & PG_FRAME; } pte = (pt_entry_t *)PHYS_TO_DMAP(mphys); @@ -161,6 +179,7 @@ { struct efi_md *p; pt_entry_t *pte; + void *pml; vm_offset_t va; uint64_t idx; int bits, i, mode; @@ -170,10 +189,16 @@ VM_PROT_ALL, 0, NULL); efi_1t1_idx = 0; VM_OBJECT_WLOCK(obj_1t1_pt); - efi_pml4_page = efi_1t1_page(); + efi_pmltop_page = efi_1t1_page(); VM_OBJECT_WUNLOCK(obj_1t1_pt); - efi_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(efi_pml4_page)); - pmap_pinit_pml4(efi_pml4_page); + pml = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(efi_pmltop_page)); + if (la57) { + efi_pml5 = pml; + pmap_pinit_pml5(efi_pmltop_page); + } else { + efi_pml4 = pml; + pmap_pinit_pml4(efi_pmltop_page); + } for (i = 0, p = map; i < ndesc; i++, p = efi_next_descriptor(p, descsz)) { @@ -279,7 +304,7 @@ if (pmap_pcid_enabled && !invpcid_works) PCPU_SET(curpmap, NULL); - load_cr3(VM_PAGE_TO_PHYS(efi_pml4_page) | (pmap_pcid_enabled ? + load_cr3(VM_PAGE_TO_PHYS(efi_pmltop_page) | (pmap_pcid_enabled ? 
curpmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid : 0)); /* * If PCID is enabled, the clear CR3_PCID_SAVE bit in the loaded %cr3 Index: sys/amd64/amd64/elf_machdep.c =================================================================== --- sys/amd64/amd64/elf_machdep.c +++ sys/amd64/amd64/elf_machdep.c @@ -49,7 +49,7 @@ #include #include -struct sysentvec elf64_freebsd_sysvec = { +struct sysentvec elf64_freebsd_sysvec_la48 = { .sv_size = SYS_MAXSYSCALL, .sv_table = sysent, .sv_errsize = 0, @@ -64,9 +64,9 @@ .sv_imgact_try = NULL, .sv_minsigstksz = MINSIGSTKSZ, .sv_minuser = VM_MIN_ADDRESS, - .sv_maxuser = VM_MAXUSER_ADDRESS, - .sv_usrstack = USRSTACK, - .sv_psstrings = PS_STRINGS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA48, + .sv_usrstack = USRSTACK_LA48, + .sv_psstrings = PS_STRINGS_LA48, .sv_stackprot = VM_PROT_ALL, .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs), .sv_copyout_strings = exec_copyout_strings, @@ -78,14 +78,52 @@ .sv_set_syscall_retval = cpu_set_syscall_retval, .sv_fetch_syscall_args = cpu_fetch_syscall_args, .sv_syscallnames = syscallnames, - .sv_shared_page_base = SHAREDPAGE, + .sv_shared_page_base = SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = NULL, .sv_thread_detach = NULL, .sv_trap = NULL, .sv_stackgap = elf64_stackgap, }; -INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec); + +struct sysentvec elf64_freebsd_sysvec_la57 = { + .sv_size = SYS_MAXSYSCALL, + .sv_table = sysent, + .sv_errsize = 0, + .sv_errtbl = NULL, + .sv_transtrap = NULL, + .sv_fixup = __elfN(freebsd_fixup), + .sv_sendsig = sendsig, + .sv_sigcode = sigcode, + .sv_szsigcode = &szsigcode, + .sv_name = "FreeBSD ELF64", + .sv_coredump = __elfN(coredump), + .sv_imgact_try = NULL, + .sv_minsigstksz = MINSIGSTKSZ, + .sv_minuser = VM_MIN_ADDRESS, + .sv_maxuser = VM_MAXUSER_ADDRESS_LA57, + .sv_usrstack = USRSTACK_LA57, + .sv_psstrings = PS_STRINGS_LA57, + .sv_stackprot = VM_PROT_ALL, + .sv_copyout_auxargs = __elfN(freebsd_copyout_auxargs), + .sv_copyout_strings = exec_copyout_strings, + .sv_setregs = exec_setregs, + .sv_fixlimit = NULL, + .sv_maxssiz = NULL, + .sv_flags = SV_ABI_FREEBSD | SV_ASLR | SV_LP64 | SV_SHP | + SV_TIMEKEEP, + .sv_set_syscall_retval = cpu_set_syscall_retval, + .sv_fetch_syscall_args = cpu_fetch_syscall_args, + .sv_syscallnames = syscallnames, + .sv_shared_page_base = SHAREDPAGE_LA57, + .sv_shared_page_len = PAGE_SIZE, + .sv_schedtail = NULL, + .sv_thread_detach = NULL, + .sv_trap = NULL, + .sv_stackgap = elf64_stackgap, +}; + +INIT_SYSENTVEC(elf64_sysvec, &elf64_freebsd_sysvec_la57); void amd64_lower_shared_page(struct sysentvec *sv) @@ -103,24 +141,59 @@ * uses the value of sv_shared_page_base. 
*/ SYSINIT(elf64_sysvec_fixup, SI_SUB_EXEC, SI_ORDER_FIRST, - (sysinit_cfunc_t) amd64_lower_shared_page, - &elf64_freebsd_sysvec); + (sysinit_cfunc_t) amd64_lower_shared_page, &elf64_freebsd_sysvec_la48); + +static boolean_t +freebsd_brand_info_la57_checker(struct image_params *imgp, + int32_t *osrel __unused, uint32_t *fctl0) +{ + if ((imgp->proc->p_md.md_flags & P_MD_LA57) != 0) + return (TRUE); + if (fctl0 == NULL || (*fctl0 & NT_FREEBSD_FCTL_LA48) != 0) + return (FALSE); + if ((imgp->proc->p_md.md_flags & P_MD_LA48) != 0) + return (FALSE); + return (TRUE); +} -static Elf64_Brandinfo freebsd_brand_info = { +static Elf64_Brandinfo freebsd_brand_info_la48 = { .brand = ELFOSABI_FREEBSD, .machine = EM_X86_64, .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/libexec/ld-elf.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_freebsd_brandnote, - .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, +}; + +static Elf64_Brandinfo freebsd_brand_info_la57 = { + .brand = ELFOSABI_FREEBSD, + .machine = EM_X86_64, + .compat_3_brand = "FreeBSD", + .emul_path = NULL, + .interp_path = "/libexec/ld-elf.so.1", + .sysvec = &elf64_freebsd_sysvec_la57, + .interp_newpath = NULL, + .brand_note = &elf64_freebsd_brandnote, + .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE, + .header_supported = freebsd_brand_info_la57_checker, }; +static void +sysinit_register_elf64_brand_entries(void *arg __unused) +{ + /* + * _57 must go first so it can either claim the image, or hand + * it to _48. + */ + if (la57) + elf64_insert_brand_entry(&freebsd_brand_info_la57); + elf64_insert_brand_entry(&freebsd_brand_info_la48); +} SYSINIT(elf64, SI_SUB_EXEC, SI_ORDER_FIRST, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &freebsd_brand_info); + sysinit_register_elf64_brand_entries, NULL); static Elf64_Brandinfo freebsd_brand_oinfo = { .brand = ELFOSABI_FREEBSD, @@ -128,15 +201,14 @@ .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/usr/libexec/ld-elf.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_freebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE }; SYSINIT(oelf64, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &freebsd_brand_oinfo); + (sysinit_cfunc_t) elf64_insert_brand_entry, &freebsd_brand_oinfo); static Elf64_Brandinfo kfreebsd_brand_info = { .brand = ELFOSABI_FREEBSD, @@ -144,15 +216,14 @@ .compat_3_brand = "FreeBSD", .emul_path = NULL, .interp_path = "/lib/ld-kfreebsd-x86-64.so.1", - .sysvec = &elf64_freebsd_sysvec, + .sysvec = &elf64_freebsd_sysvec_la48, .interp_newpath = NULL, .brand_note = &elf64_kfreebsd_brandnote, .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE_MANDATORY }; SYSINIT(kelf64, SI_SUB_EXEC, SI_ORDER_ANY, - (sysinit_cfunc_t) elf64_insert_brand_entry, - &kfreebsd_brand_info); + (sysinit_cfunc_t) elf64_insert_brand_entry, &kfreebsd_brand_info); void elf64_dump_thread(struct thread *td, void *dst, size_t *off) Index: sys/amd64/amd64/genassym.c =================================================================== --- sys/amd64/amd64/genassym.c +++ sys/amd64/amd64/genassym.c @@ -99,11 +99,10 @@ ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); -ASSYM(addr_PTmap, addr_PTmap); -ASSYM(addr_PDmap, addr_PDmap); -ASSYM(addr_PDPmap, addr_PDPmap); -ASSYM(addr_PML4map, addr_PML4map); -ASSYM(addr_PML4pml4e, addr_PML4pml4e); +ASSYM(addr_P4Tmap, addr_P4Tmap); 
+ASSYM(addr_P4Dmap, addr_P4Dmap); +ASSYM(addr_P5Tmap, addr_P5Tmap); +ASSYM(addr_P5Dmap, addr_P5Dmap); ASSYM(PDESIZE, sizeof(pd_entry_t)); ASSYM(PTESIZE, sizeof(pt_entry_t)); ASSYM(PAGE_SHIFT, PAGE_SHIFT); Index: sys/amd64/amd64/locore.S =================================================================== --- sys/amd64/amd64/locore.S +++ sys/amd64/amd64/locore.S @@ -36,13 +36,8 @@ /* * Compiled KERNBASE location */ - .globl kernbase,loc_PTmap,loc_PDmap,loc_PDPmap,loc_PML4map,loc_PML4pml4e,dmapbase,dmapend + .globl kernbase, loc_PTmap, loc_PDmap, loc_PDPmap, dmapbase, dmapend .set kernbase,KERNBASE - .set loc_PTmap,addr_PTmap - .set loc_PDmap,addr_PDmap - .set loc_PDPmap,addr_PDPmap - .set loc_PML4map,addr_PML4map - .set loc_PML4pml4e,addr_PML4pml4e .set dmapbase,DMAP_MIN_ADDRESS .set dmapend,DMAP_MAX_ADDRESS @@ -82,6 +77,62 @@ 0: hlt jmp 0b +/* la57_trampoline(%rdi pml5) */ +NON_GPROF_ENTRY(la57_trampoline) + movq %rsp,%r11 + movq %rbx,%r10 + leaq la57_trampoline_end(%rip),%rsp + + movq %cr0,%rdx + lgdtq la57_trampoline_gdt_desc(%rip) + + pushq $(2<<3) + leaq l1(%rip),%rax + leaq l2(%rip),%rbx + + pushq %rax + lretq + .code32 + +l1: movl $(3<<3),%eax + movl %eax,%ss + + movl %edx,%eax + andl $~CR0_PG,%eax + movl %eax,%cr0 + + movl %cr4,%eax + orl $CR4_LA57,%eax + movl %eax,%cr4 + + movl %edi,%cr3 + movl %edx,%cr0 + + pushl $(1<<3) + pushl %ebx + lretl + .code64 + +l2: movq %r11,%rsp + movq %r10,%rbx + retq + .p2align 4,0 +NON_GPROF_ENTRY(la57_trampoline_gdt_desc) + .word la57_trampoline_end - la57_trampoline_gdt + .long 0 /* filled by pmap_bootstrap_la57 */ + .p2align 4,0 +NON_GPROF_ENTRY(la57_trampoline_gdt) + .long 0x00000000 /* null desc */ + .long 0x00000000 + .long 0x00000000 /* 64bit code */ + .long 0x00209800 + .long 0x0000ffff /* 32bit code */ + .long 0x00cf9b00 + .long 0x0000ffff /* universal data */ + .long 0x00cf9300 + .dcb.l 16,0 +NON_GPROF_ENTRY(la57_trampoline_end) + .bss ALIGN_DATA /* just to be sure */ .globl bootstack Index: sys/amd64/amd64/mp_machdep.c =================================================================== --- sys/amd64/amd64/mp_machdep.c +++ sys/amd64/amd64/mp_machdep.c @@ -95,7 +95,7 @@ #define GiB(v) (v ## ULL << 30) -#define AP_BOOTPT_SZ (PAGE_SIZE * 3) +#define AP_BOOTPT_SZ (PAGE_SIZE * 4) /* Temporary variables for init_secondary() */ char *doublefault_stack; @@ -103,6 +103,8 @@ char *nmi_stack; char *dbg_stack; +extern u_int mptramp_la57; + /* * Local data and functions. 
*/ @@ -263,6 +265,8 @@ assign_cpu_ids(); + mptramp_la57 = la57; + /* Start each Application Processor */ init_ops.start_all_aps(); @@ -415,9 +419,9 @@ int native_start_all_aps(void) { - u_int64_t *pt4, *pt3, *pt2; + u_int64_t *pt5, *pt4, *pt3, *pt2; u_int32_t mpbioswarmvec; - int apic_id, cpu, domain, i; + int apic_id, cpu, domain, i, xo; u_char mpbiosreason; mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN); @@ -426,18 +430,38 @@ bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size); /* Locate the page tables, they'll be below the trampoline */ - pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); + if (la57) { + pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables); + xo = 1; + } else { + xo = 0; + } + pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE); pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t); pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t); /* Create the initial 1GB replicated page tables */ for (i = 0; i < 512; i++) { - /* Each slot of the level 4 pages points to the same level 3 page */ - pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE); + if (la57) { + pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + PAGE_SIZE); + pt5[i] |= PG_V | PG_RW | PG_U; + } + + /* + * Each slot of the level 4 pages points to the same + * level 3 page. + */ + pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + (xo + 1) * PAGE_SIZE); pt4[i] |= PG_V | PG_RW | PG_U; - /* Each slot of the level 3 pages points to the same level 2 page */ - pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE)); + /* + * Each slot of the level 3 pages points to the same + * level 2 page. + */ + pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + + ((xo + 2) * PAGE_SIZE)); pt3[i] |= PG_V | PG_RW | PG_U; /* The level 2 page slots are mapped with 2MB pages for 1GB. */ Index: sys/amd64/amd64/mpboot.S =================================================================== --- sys/amd64/amd64/mpboot.S +++ sys/amd64/amd64/mpboot.S @@ -90,10 +90,13 @@ mov $bootdata-gdt, %eax mov %ax, %ds - /* Turn on the PAE bit for when paging is enabled */ + /* Turn on the PAE and optionally LA57 bit for when paging is enabled */ mov %cr4, %eax orl $CR4_PAE, %eax - mov %eax, %cr4 + cmpb $0, mptramp_la57-mptramp_start(%ebx) + je 1f + orl $CR4_LA57, %eax +1: mov %eax, %cr4 /* * Enable EFER.LME so that we get long mode when all the prereqs are @@ -132,7 +135,7 @@ /* * At this point paging is enabled, and we are in "compatibility" mode. * We do another far jump to reload %cs with the 64 bit selector. - * %cr3 points to a 4-level page table page. + * %cr3 points to a 4- or 5-level page table page. * We cannot yet jump all the way to the kernel because we can only * specify a 32 bit linear address. So, yet another trampoline. * @@ -209,6 +212,11 @@ mptramp_pagetables: .long 0 + /* 5-level paging ? */ + .globl mptramp_la57 +mptramp_la57: + .long 0 + /* * The pseudo descriptor for lgdt to use. */ @@ -251,8 +259,12 @@ * Load a real %cr3 that has all the direct map stuff and switches * off the 1GB replicated mirror. Load a stack pointer and jump * into AP startup code in C. 
- */
+ */
+	cmpl	$0, la57
+	jne	2f
 	movq	KPML4phys, %rax
-	movq	%rax, %cr3
+	jmp	3f
+2:	movq	KPML5phys, %rax
+3:	movq	%rax, %cr3
 	movq	bootSTK, %rsp
 	jmp	init_secondary
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -398,6 +398,19 @@
 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &pg_ps_enabled, 0, "Are large page mappings enabled?");
 
+int __read_frequently la57 = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, la57, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &la57, 0,
+    "5-level paging for host is enabled");
+
+static bool
+pmap_is_la57(pmap_t pmap)
+{
+	if (pmap->pm_type == PT_X86)
+		return (la57);
+	return (false);		/* XXXKIB handle EPT */
+}
+
 #define	PAT_INDEX_SIZE	8
 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
 
@@ -405,7 +418,10 @@
 static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
 u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
 u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
+u_int64_t		KPML5phys;	/* phys addr of kernel level 5,
+					   if supported */
+static pml4_entry_t	*kernel_pml4;
 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 static int		ndmpdpphys;	/* number of DMPDPphys pages */
@@ -1257,7 +1273,7 @@
 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
-		struct rwlock **lockp);
+		struct rwlock **lockp, vm_offset_t va);
 static pd_entry_t *pmap_alloc_pde(pmap_t pmap, vm_offset_t va, vm_page_t *pdpgp,
 		struct rwlock **lockp);
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
@@ -1271,20 +1287,85 @@
 /* Inline functions */
 /********************/
 
-/* Return a non-clipped PD index for a given VA */
+/*
+ * Return non-clipped indexes for a given VA; these are the page table
+ * page indexes at the corresponding level.
+ */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return (va >> PDRSHIFT); } +static __inline vm_pindex_t +pmap_pdpe_pindex(vm_offset_t va) +{ + return (NUPDE + (va >> PDPSHIFT)); +} + +static __inline vm_pindex_t +pmap_pml4e_pindex(vm_offset_t va) +{ + return (NUPDE + NUPDPE + (va >> PML4SHIFT)); +} + +static __inline vm_pindex_t +pmap_pml5e_pindex(vm_offset_t va) +{ + return (NUPDE + NUPDPE + NUPML4E + (va >> PML5SHIFT)); +} + +static __inline pml4_entry_t * +pmap_pml5e(pmap_t pmap, vm_offset_t va) +{ + + MPASS(pmap_is_la57(pmap)); + return (&pmap->pm_pmltop[pmap_pml5e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml5e_u(pmap_t pmap, vm_offset_t va) +{ + + MPASS(pmap_is_la57(pmap)); + return (&pmap->pm_pmltopu[pmap_pml5e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml5e_to_pml4e(pml5_entry_t *pml5e, vm_offset_t va) +{ + pml4_entry_t *pml4e; + + /* XXX MPASS(pmap_is_la57(pmap); */ + pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); + return (&pml4e[pmap_pml4e_index(va)]); +} /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { + pml5_entry_t *pml5e; + pml4_entry_t *pml4e; + pt_entry_t PG_V; - return (&pmap->pm_pml4[pmap_pml4e_index(va)]); + if (pmap_is_la57(pmap)) { + pml5e = pmap_pml5e(pmap, va); + PG_V = pmap_valid_bit(pmap); + if ((*pml5e & PG_V) == 0) + return (NULL); + pml4e = (pml4_entry_t *)PHYS_TO_DMAP(*pml5e & PG_FRAME); + } else { + pml4e = pmap->pm_pmltop; + } + return (&pml4e[pmap_pml4e_index(va)]); +} + +static __inline pml4_entry_t * +pmap_pml4e_u(pmap_t pmap, vm_offset_t va) +{ + MPASS(!pmap_is_la57(pmap)); + return (&pmap->pm_pmltopu[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ @@ -1306,7 +1387,7 @@ PG_V = pmap_valid_bit(pmap); pml4e = pmap_pml4e(pmap, va); - if ((*pml4e & PG_V) == 0) + if (pml4e == NULL || (*pml4e & PG_V) == 0) return (NULL); return (pmap_pml4e_to_pdpe(pml4e, va)); } @@ -1387,21 +1468,37 @@ PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { - u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va)); - return (PTmap + ((va >> PAGE_SHIFT) & mask)); + if (la57) { + mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); + return (P5Tmap + ((va >> PAGE_SHIFT) & mask)); + } else { + mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT)) - 1); + return (P4Tmap + ((va >> PAGE_SHIFT) & mask)); + } } static __inline pd_entry_t * vtopde(vm_offset_t va) { - u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); + u_int64_t mask; KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va)); - return (PDmap + ((va >> PDRSHIFT) & mask)); + if (la57) { + mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT + NPML5EPGSHIFT)) - 1); + return (P5Dmap + ((va >> PDRSHIFT) & mask)); + } else { + mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + + NPML4EPGSHIFT)) - 1); + return (P4Dmap + ((va >> PDRSHIFT) & mask)); + } } static u_int64_t @@ -1658,6 +1755,8 @@ p4_p[KPML4BASE + i] = KPDPphys + ptoa(i); p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V; } + + kernel_pml4 = (pml5_entry_t *)PHYS_TO_DMAP(KPML4phys); } /* @@ -1730,7 +1829,7 @@ * later unmapped (using pmap_remove()) and freed. 
 */
	PMAP_LOCK_INIT(kernel_pmap);
-	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
+	kernel_pmap->pm_pmltop = kernel_pml4;
 	kernel_pmap->pm_cr3 = KPML4phys;
 	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
@@ -1891,6 +1990,148 @@
 	load_cr4(cr4);
 }
 
+extern const char la57_trampoline[], la57_trampoline_gdt_desc[],
+    la57_trampoline_gdt[], la57_trampoline_end[];
+
+static void
+pmap_bootstrap_la57(void *arg __unused)
+{
+	char *v_code;
+	pml5_entry_t *v_pml5;
+	pml4_entry_t *v_pml4;
+	pdp_entry_t *v_pdp;
+	pd_entry_t *v_pd;
+	pt_entry_t *v_pt;
+	vm_page_t m_code, m_pml4, m_pdp, m_pd, m_pt, m_pml5;
+	void (*la57_tramp)(uint64_t pml5);
+	struct region_descriptor r_gdt;
+
+	if ((cpu_stdext_feature2 & CPUID_STDEXT2_LA57) == 0)
+		return;
+	if (!TUNABLE_INT_FETCH("vm.pmap.la57", &la57))
+		la57 = 1;
+	if (!la57)
+		return;
+
+	r_gdt.rd_limit = NGDT * sizeof(struct user_segment_descriptor) - 1;
+	r_gdt.rd_base = (long)__pcpu[0].pc_gdt;
+
+	m_code = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_code->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_code);
+	v_code = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_code));
+	m_pml5 = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pml5->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pml5);
+	KPML5phys = VM_PAGE_TO_PHYS(m_pml5);
+	v_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(KPML5phys);
+	m_pml4 = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pml4->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pml4);
+	v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
+	m_pdp = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pdp->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pdp);
+	v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+	m_pd = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pd->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pd);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd));
+	m_pt = vm_page_alloc_contig(NULL, 0,
+	    VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_ZERO | VM_ALLOC_NOOBJ,
+	    1, 0, (1ULL << 32), PAGE_SIZE, 0, VM_MEMATTR_DEFAULT);
+	if ((m_pt->flags & PG_ZERO) == 0)
+		pmap_zero_page(m_pt);
+	v_pt = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pt));
+
+	/*
+	 * Map m_code 1:1; it appears below 4G in KVA because its
+	 * physical address is below 4G.  Since kernel KVA is in the
+	 * upper half, the pml4e should be zero and free for temporary
+	 * use.
+	 */
+	kernel_pmap->pm_pmltop[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+	v_pdp[pmap_pdpe_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pd) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+	v_pd[pmap_pde_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pt) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+	v_pt[pmap_pte_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_code) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+
+	/*
+	 * Add a pml5 entry at the top of KVA pointing to the existing
+	 * pml4 table, entering all existing kernel mappings into the
+	 * level 5 table.
+	 */
+	v_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M | pg_g;
+
+	/*
+	 * Add a pml5 entry for the 1:1 trampoline mapping after LA57 is
+	 * turned on.
+	 */
+	v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pml4) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+	v_pml4[pmap_pml4e_index(VM_PAGE_TO_PHYS(m_code))] =
+	    VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V | X86_PG_RW | X86_PG_A |
+	    X86_PG_M;
+
+	/*
+	 * Copy and call the 48->57 trampoline, hope we return there, alive.
+	 */
+	bcopy(la57_trampoline, v_code, la57_trampoline_end - la57_trampoline);
+	*(u_long *)(v_code + 2 + (la57_trampoline_gdt_desc - la57_trampoline)) =
+	    la57_trampoline_gdt - la57_trampoline + VM_PAGE_TO_PHYS(m_code);
+	la57_tramp = (void (*)(uint64_t))VM_PAGE_TO_PHYS(m_code);
+	la57_tramp(KPML5phys);
+
+	/*
+	 * The GDT was necessarily reset by the trampoline; switch back
+	 * to our own.
+	 */
+	lgdt(&r_gdt);
+	wrmsr(MSR_GSBASE, (uint64_t)&__pcpu[0]);
+	load_ds(_udatasel);
+	load_es(_udatasel);
+	load_fs(_ufssel);
+	ssdtosyssd(&gdt_segs[GPROC0_SEL],
+	    (struct system_segment_descriptor *)&__pcpu[0].pc_gdt[GPROC0_SEL]);
+	ltr(GSEL(GPROC0_SEL, SEL_KPL));
+
+	/*
+	 * Now unmap the trampoline, and free the pages.
+	 * Clear the pml5 entry used for the 1:1 trampoline mapping.
+	 */
+	pte_clear(&v_pml5[pmap_pml5e_index(VM_PAGE_TO_PHYS(m_code))]);
+	invlpg((vm_offset_t)v_code);
+	vm_page_free(m_code);
+	vm_page_free(m_pdp);
+	vm_page_free(m_pd);
+	vm_page_free(m_pt);
+
+	/*
+	 * Recursively map PML5 to itself in order to get PTmap and
+	 * PDmap.
+	 */
+	v_pml5[PML5PML5I] = KPML5phys | X86_PG_RW | X86_PG_V | pg_nx;
+
+	kernel_pmap->pm_cr3 = KPML5phys;
+	kernel_pmap->pm_pmltop = v_pml5;
+}
+SYSINIT(la57, SI_SUB_KMEM, SI_ORDER_ANY, pmap_bootstrap_la57, NULL);
+
 /*
  * Initialize a vm_page's machine-dependent fields.
*/ @@ -2190,7 +2431,8 @@ } for (i = 0; i < lm_ents; i++) { m = pmap_large_map_getptp_unlocked(); - kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V | + /* XXXKIB la57 */ + kernel_pml4[LMSPML4I + i] = X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx | VM_PAGE_TO_PHYS(m); } @@ -3564,44 +3806,57 @@ static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free) { + pml5_entry_t *pml5; + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pd_entry_t *pd; + vm_page_t pdpg, pdppg, pml4pg; PMAP_LOCK_ASSERT(pmap, MA_OWNED); + /* * unmap the page table page */ - if (m->pindex >= NUPDE + NUPDPE) { + if (m->pindex >= NUPDE + NUPDPE + NUPML4E) { + /* PML4 page */ + MPASS(pmap_is_la57(pmap)); + pml5 = pmap_pml5e(pmap, va); + *pml5 = 0; + if (pmap->pm_pmltopu != NULL && va <= VM_MAXUSER_ADDRESS) { + pml5 = pmap_pml5e_u(pmap, va); + *pml5 = 0; + } + } else if (m->pindex >= NUPDE + NUPDPE) { /* PDP page */ - pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); *pml4 = 0; - if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) { - pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)]; + if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && + va <= VM_MAXUSER_ADDRESS) { + pml4 = pmap_pml4e_u(pmap, va); *pml4 = 0; } } else if (m->pindex >= NUPDE) { /* PD page */ - pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); *pdp = 0; } else { /* PTE page */ - pd_entry_t *pd; pd = pmap_pde(pmap, va); *pd = 0; } pmap_resident_count_dec(pmap, 1); if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ - vm_page_t pdpg; - pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdpg, free); } else if (m->pindex < NUPDE + NUPDPE) { /* We just released a PD, unhold the matching PDP */ - vm_page_t pdppg; - pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_ptp(pmap, va, pdppg, free); + } else if (m->pindex < NUPDE + NUPDPE + NUPML4E && pmap_is_la57(pmap)) { + /* We just released a PDP, unhold the matching PML4 */ + pml4pg = PHYS_TO_VM_PAGE(*pmap_pml5e(pmap, va) & PG_FRAME); + pmap_unwire_ptp(pmap, va, pml4pg, free); } /* @@ -3657,9 +3912,9 @@ int i; PMAP_LOCK_INIT(pmap); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys); - pmap->pm_pml4u = NULL; - pmap->pm_cr3 = KPML4phys; + pmap->pm_pmltop = kernel_pmap->pm_pmltop; + pmap->pm_pmltopu = NULL; + pmap->pm_cr3 = kernel_pmap->pm_cr3; /* hack to keep pmap_pti_pcid_invalidate() alive */ pmap->pm_ucr3 = PMAP_NO_CR3; pmap->pm_root.rt_root = 0; @@ -3712,18 +3967,59 @@ /* install large map entries if configured */ for (i = 0; i < lm_ents; i++) - pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pml4[LMSPML4I + i]; + pm_pml4[LMSPML4I + i] = kernel_pmap->pm_pmltop[LMSPML4I + i]; +} + +void +pmap_pinit_pml5(vm_page_t pml5pg) +{ + pml5_entry_t *pm_pml5; + + pm_pml5 = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pg)); + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 table, + * entering all existing kernel mapping into level 5 table. + */ + pm_pml5[pmap_pml5e_index(UPT_MAX_ADDRESS)] = KPML4phys | X86_PG_V | + X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); + + /* + * Install self-referential address mapping entry. 
+ */ + pm_pml5[PML5PML5I] = VM_PAGE_TO_PHYS(pml5pg) | + X86_PG_RW | X86_PG_V | X86_PG_M | X86_PG_A | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } static void -pmap_pinit_pml4_pti(vm_page_t pml4pg) +pmap_pinit_pml4_pti(vm_page_t pml4pgu) { - pml4_entry_t *pm_pml4; + pml4_entry_t *pm_pml4u; int i; - pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); + pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pgu)); for (i = 0; i < NPML4EPG; i++) - pm_pml4[i] = pti_pml4[i]; + pm_pml4u[i] = pti_pml4[i]; +} + +static void +pmap_pinit_pml5_pti(vm_page_t pml5pgu) +{ + pml5_entry_t *pm_pml5u; + + pm_pml5u = (pml5_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml5pgu)); + + /* + * Add pml5 entry at top of KVA pointing to existing pml4 pti + * table, entering all kernel mapping needed for usermode, + * into level 5 table. + */ + pm_pml5u[pmap_pml5e_index(UPT_MAX_ADDRESS)] = + pmap_kextract((vm_offset_t)pti_pml4) | + X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M | pg_g | + pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE); } /* @@ -3733,29 +4029,30 @@ int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags) { - vm_page_t pml4pg, pml4pgu; - vm_paddr_t pml4phys; + vm_page_t pmltop_pg, pmltop_pgu; + vm_paddr_t pmltop_phys; int i; /* * allocate the page directory page */ - pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + pmltop_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK); - pml4phys = VM_PAGE_TO_PHYS(pml4pg); - pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys); + pmltop_phys = VM_PAGE_TO_PHYS(pmltop_pg); + pmap->pm_pmltop = (pml5_entry_t *)PHYS_TO_DMAP(pmltop_phys); + CPU_FOREACH(i) { pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE; pmap->pm_pcids[i].pm_gen = 0; } pmap->pm_cr3 = PMAP_NO_CR3; /* initialize to an invalid value */ pmap->pm_ucr3 = PMAP_NO_CR3; - pmap->pm_pml4u = NULL; + pmap->pm_pmltopu = NULL; pmap->pm_type = pm_type; - if ((pml4pg->flags & PG_ZERO) == 0) - pagezero(pmap->pm_pml4); + if ((pmltop_pg->flags & PG_ZERO) == 0) + pagezero(pmap->pm_pmltop); /* * Do not install the host kernel mappings in the nested page @@ -3764,15 +4061,21 @@ * Install minimal kernel mappings in PTI case. 
*/ if (pm_type == PT_X86) { - pmap->pm_cr3 = pml4phys; - pmap_pinit_pml4(pml4pg); + pmap->pm_cr3 = pmltop_phys; + if (pmap_is_la57(pmap)) + pmap_pinit_pml5(pmltop_pg); + else + pmap_pinit_pml4(pmltop_pg); if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) { - pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | + pmltop_pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK); - pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP( - VM_PAGE_TO_PHYS(pml4pgu)); - pmap_pinit_pml4_pti(pml4pgu); - pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu); + pmap->pm_pmltopu = (pml4_entry_t *)PHYS_TO_DMAP( + VM_PAGE_TO_PHYS(pmltop_pgu)); + if (pmap_is_la57(pmap)) + pmap_pinit_pml5_pti(pmltop_pgu); + else + pmap_pinit_pml4_pti(pmltop_pgu); + pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pmltop_pgu); } if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) { rangeset_init(&pmap->pm_pkru, pkru_dup_range, @@ -3797,21 +4100,101 @@ return (pmap_pinit_type(pmap, PT_X86, pmap_flags)); } +static pml4_entry_t * +pmap_allocpte_getpml4(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, + bool addref) +{ + vm_pindex_t pml5index; + pml5_entry_t *pml5; + pml4_entry_t *pml4; + vm_page_t pml4pg; + pt_entry_t PG_V; + bool allocated; + + if (!pmap_is_la57(pmap)) + return (&pmap->pm_pmltop[pmap_pml4e_index(va)]); + + PG_V = pmap_valid_bit(pmap); + pml5index = pmap_pml5e_index(va); + pml5 = &pmap->pm_pmltop[pml5index]; + if ((*pml5 & PG_V) == 0) { + if (_pmap_allocpte(pmap, pmap_pml5e_pindex(va), lockp, va) == + NULL) + return (NULL); + allocated = true; + } else { + allocated = false; + } + pml4 = (pml4_entry_t *)PHYS_TO_DMAP(*pml5 & PG_FRAME); + pml4 = &pml4[pmap_pml4e_index(va)]; + if ((*pml4 & PG_V) == 0) { + pml4pg = PHYS_TO_VM_PAGE(*pml5 & PG_FRAME); + if (allocated && !addref) + pml4pg->ref_count--; + else if (!allocated && addref) + pml4pg->ref_count++; + } + return (pml4); +} + +static pdp_entry_t * +pmap_allocpte_getpdp(pmap_t pmap, struct rwlock **lockp, vm_offset_t va, + bool addref) +{ + vm_page_t pdppg; + pml4_entry_t *pml4; + pdp_entry_t *pdp; + pt_entry_t PG_V; + bool allocated; + + PG_V = pmap_valid_bit(pmap); + + pml4 = pmap_allocpte_getpml4(pmap, lockp, va, false); + if (pml4 == NULL) + return (NULL); + + if ((*pml4 & PG_V) == 0) { + /* Have to allocate a new pdp, recurse */ + if (_pmap_allocpte(pmap, pmap_pml4e_pindex(va), lockp, va) == + NULL) + return (NULL); + allocated = true; + } else { + allocated = false; + } + pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); + pdp = &pdp[pmap_pdpe_index(va)]; + if ((*pdp & PG_V) == 0) { + pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); + if (allocated && !addref) + pdppg->ref_count--; + else if (!allocated && addref) + pdppg->ref_count++; + } + return (pdp); +} + /* * This routine is called if the desired page table page does not exist. * * If page table page allocation fails, this routine may sleep before * returning NULL. It sleeps only if a lock pointer was given. * - * Note: If a page allocation fails at page table level two or three, + * Note: If a page allocation fails at page table level two, three, or four, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. 
*/ static vm_page_t -_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp) +_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp, + vm_offset_t va __unused) { - vm_page_t m, pdppg, pdpg; + vm_pindex_t pml5index, pml4index; + pml5_entry_t *pml5, *pml5u; + pml4_entry_t *pml4, *pml4u; + pdp_entry_t *pdp; + pd_entry_t *pd; + vm_page_t m, pdpg; pt_entry_t PG_A, PG_M, PG_RW, PG_V; PMAP_LOCK_ASSERT(pmap, MA_OWNED); @@ -3847,16 +4230,38 @@ * Map the pagetable page into the process address space, if * it isn't already there. */ + if (ptepindex >= NUPDE + NUPDPE + NUPML4E) { + MPASS(pmap_is_la57(pmap)); + + pml5index = pmap_pml5e_index(va); + pml5 = &pmap->pm_pmltop[pml5index]; + KASSERT((*pml5 & PG_V) == 0, + ("pmap %p va %#lx pml5 %#lx", pmap, va, *pml5)); + *pml5 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; - if (ptepindex >= (NUPDE + NUPDPE)) { - pml4_entry_t *pml4, *pml4u; - vm_pindex_t pml4index; + if (pmap->pm_pmltopu != NULL && pml5index < NUPML5E) { + if (pmap->pm_ucr3 != PMAP_NO_CR3) + *pml5 |= pg_nx; + pml5u = &pmap->pm_pmltopu[pml5index]; + *pml5u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | + PG_A | PG_M; + } + } else if (ptepindex >= NUPDE + NUPDPE) { + pml4index = pmap_pml4e_index(va); /* Wire up a new PDPE page */ - pml4index = ptepindex - (NUPDE + NUPDPE); - pml4 = &pmap->pm_pml4[pml4index]; + pml4 = pmap_allocpte_getpml4(pmap, lockp, va, true); + if (pml4 == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + KASSERT((*pml4 & PG_V) == 0, + ("pmap %p va %#lx pml4 %#lx", pmap, va, *pml4)); *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; - if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) { + + if (!pmap_is_la57(pmap) && pmap->pm_pmltopu != NULL && + pml4index < NUPML4E) { /* * PTI: Make all user-space mappings in the * kernel-mode page table no-execute so that @@ -3867,85 +4272,48 @@ if (pmap->pm_ucr3 != PMAP_NO_CR3) *pml4 |= pg_nx; - pml4u = &pmap->pm_pml4u[pml4index]; + pml4u = &pmap->pm_pmltopu[pml4index]; *pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } - } else if (ptepindex >= NUPDE) { - vm_pindex_t pml4index; - vm_pindex_t pdpindex; - pml4_entry_t *pml4; - pdp_entry_t *pdp; - /* Wire up a new PDE page */ - pdpindex = ptepindex - NUPDE; - pml4index = pdpindex >> NPML4EPGSHIFT; - - pml4 = &pmap->pm_pml4[pml4index]; - if ((*pml4 & PG_V) == 0) { - /* Have to allocate a new pdp, recurse */ - if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); - } - } else { - /* Add reference to pdp page */ - pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); - pdppg->ref_count++; + pdp = pmap_allocpte_getpdp(pmap, lockp, va, true); + if (pdp == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); } - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - - /* Now find the pdp page */ - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; + KASSERT((*pdp & PG_V) == 0, + ("pmap %p va %#lx pdp %#lx", pmap, va, *pdp)); *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; - } else { - vm_pindex_t pml4index; - vm_pindex_t pdpindex; - pml4_entry_t *pml4; - pdp_entry_t *pdp; - pd_entry_t *pd; - /* Wire up a new PTE page */ - pdpindex = ptepindex >> NPDPEPGSHIFT; - pml4index = pdpindex >> NPML4EPGSHIFT; - - /* First, find the pdp and check that its valid. 
*/ - pml4 = &pmap->pm_pml4[pml4index]; - if ((*pml4 & PG_V) == 0) { + pdp = pmap_allocpte_getpdp(pmap, lockp, va, false); + if (pdp == NULL) { + vm_page_unwire_noq(m); + vm_page_free_zero(m); + return (NULL); + } + if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ - if (_pmap_allocpte(pmap, NUPDE + pdpindex, - lockp) == NULL) { + if (_pmap_allocpte(pmap, pmap_pdpe_pindex(va), + lockp, va) == NULL) { vm_page_unwire_noq(m); vm_page_free_zero(m); return (NULL); } - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { - pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); - pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; - if ((*pdp & PG_V) == 0) { - /* Have to allocate a new pd, recurse */ - if (_pmap_allocpte(pmap, NUPDE + pdpindex, - lockp) == NULL) { - vm_page_unwire_noq(m); - vm_page_free_zero(m); - return (NULL); - } - } else { - /* Add reference to the pd page */ - pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); - pdpg->ref_count++; - } + /* Add reference to the pd page */ + pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); + pdpg->ref_count++; } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ - pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; + pd = &pd[pmap_pde_index(va)]; + KASSERT((*pd & PG_V) == 0, + ("pmap %p va %#lx pd %#lx", pmap, va, *pd)); *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } @@ -3978,7 +4346,7 @@ } else if (va < VM_MAXUSER_ADDRESS) { /* Allocate a pd page. */ pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT; - pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp); + pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp, va); if (pdpg == NULL) { if (lockp != NULL) goto retry; @@ -4039,7 +4407,7 @@ * Here if the pte page isn't mapped, or if it has been * deallocated. 
*/ - m = _pmap_allocpte(pmap, ptepindex, lockp); + m = _pmap_allocpte(pmap, ptepindex, lockp, va); if (m == NULL && lockp != NULL) goto retry; } @@ -4063,28 +4431,35 @@ int i; KASSERT(pmap->pm_stats.resident_count == 0, - ("pmap_release: pmap resident count %ld != 0", - pmap->pm_stats.resident_count)); + ("pmap_release: pmap %p resident count %ld != 0", + pmap, pmap->pm_stats.resident_count)); KASSERT(vm_radix_is_empty(&pmap->pm_root), - ("pmap_release: pmap has reserved page table page(s)")); + ("pmap_release: pmap %p has reserved page table page(s)", + pmap)); KASSERT(CPU_EMPTY(&pmap->pm_active), ("releasing active pmap %p", pmap)); - m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4)); + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pmltop)); - for (i = 0; i < NKPML4E; i++) /* KVA */ - pmap->pm_pml4[KPML4BASE + i] = 0; - for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ - pmap->pm_pml4[DMPML4I + i] = 0; - pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ - for (i = 0; i < lm_ents; i++) /* Large Map */ - pmap->pm_pml4[LMSPML4I + i] = 0; + if (pmap_is_la57(pmap)) { + pmap->pm_pmltop[pmap_pml5e_index(UPT_MAX_ADDRESS)] = 0; + pmap->pm_pmltop[PML5PML5I] = 0; + } else { + for (i = 0; i < NKPML4E; i++) /* KVA */ + pmap->pm_pmltop[KPML4BASE + i] = 0; + for (i = 0; i < ndmpdpphys; i++)/* Direct Map */ + pmap->pm_pmltop[DMPML4I + i] = 0; + pmap->pm_pmltop[PML4PML4I] = 0; /* Recursive Mapping */ + for (i = 0; i < lm_ents; i++) /* Large Map */ + pmap->pm_pmltop[LMSPML4I + i] = 0; + } vm_page_unwire_noq(m); vm_page_free_zero(m); - if (pmap->pm_pml4u != NULL) { - m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_pml4u)); + if (pmap->pm_pmltopu != NULL) { + m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap-> + pm_pmltopu)); vm_page_unwire_noq(m); vm_page_free(m); } @@ -5423,6 +5798,7 @@ { struct rwlock *lock; vm_offset_t va_next; + pml5_entry_t *pml5e; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; @@ -5465,7 +5841,18 @@ if (pmap->pm_stats.resident_count == 0) break; - pml4e = pmap_pml4e(pmap, sva); + if (pmap_is_la57(pmap)) { + pml5e = pmap_pml5e(pmap, sva); + if ((*pml5e & PG_V) == 0) { + va_next = (sva + NBPML5) & ~PML5MASK; + if (va_next < sva) + va_next = eva; + continue; + } + pml4e = pmap_pml5e_to_pml4e(pml5e, sva); + } else { + pml4e = pmap_pml4e(pmap, sva); + } if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; if (va_next < sva) @@ -6085,7 +6472,7 @@ */ nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0; mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va), - nosleep ? NULL : &lock); + nosleep ? NULL : &lock, va); if (mpte == NULL && nosleep) { rv = KERN_RESOURCE_SHORTAGE; goto out; @@ -6568,7 +6955,8 @@ * Pass NULL instead of the PV list lock * pointer, because we don't intend to sleep. 
 */
-			mpte = _pmap_allocpte(pmap, ptepindex, NULL);
+			mpte = _pmap_allocpte(pmap, ptepindex, NULL,
+			    va);
 			if (mpte == NULL)
 				return (mpte);
 		}
@@ -9358,11 +9746,11 @@
 	    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
 	    "%#jx lm_ents %d",
 	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
-	KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0,
+	KASSERT((kernel_pml4[pml4_idx] & X86_PG_V) != 0,
 	    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
 	    "LMSPML4I %#jx lm_ents %d",
 	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
-	mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME;
+	mphys = kernel_pml4[pml4_idx] & PG_FRAME;
 	return ((pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va));
 }
 
@@ -10437,7 +10825,9 @@
 	    mode, range->pdpes, range->pdes, range->ptes);
 
 	/* Reset to sentinel value. */
-	range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+	range->sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1);
 }
 
 /*
@@ -10531,7 +10921,9 @@
 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
 
 	/* Sentinel value. */
-	range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+	range.sva = la57 ? KV5ADDR(NPML5EPG - 1, NPML4EPG - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1) : KV4ADDR(NPML4EPG - 1, NPDPEPG - 1,
+	    NPDEPG - 1, NPTEPG - 1);
 
 	/*
 	 * Iterate over the kernel page tables without holding the kernel pmap
@@ -10561,7 +10953,7 @@
 			sva |= -1ul << 48;
 
 restart:
-		pml4e = kernel_pmap->pm_pml4[i];
+		pml4e = kernel_pml4[i];
 		if ((pml4e & X86_PG_V) == 0) {
 			sva = rounddown2(sva, NBPML4);
 			sysctl_kmaps_dump(sb, &range, sva);
@@ -10644,6 +11036,7 @@
 DB_SHOW_COMMAND(pte, pmap_print_pte)
 {
 	pmap_t pmap;
+	pml5_entry_t *pml5;
 	pml4_entry_t *pml4;
 	pdp_entry_t *pdp;
 	pd_entry_t *pde;
@@ -10662,8 +11055,20 @@
 		pmap = PCPU_GET(curpmap);
 
 	PG_V = pmap_valid_bit(pmap);
-	pml4 = pmap_pml4e(pmap, va);
-	db_printf("VA 0x%016lx pml4e 0x%016lx", va, *pml4);
+	db_printf("VA 0x%016lx", va);
+
+	if (pmap_is_la57(pmap)) {
+		pml5 = pmap_pml5e(pmap, va);
+		db_printf(" pml5e 0x%016lx", *pml5);
+		if ((*pml5 & PG_V) == 0) {
+			db_printf("\n");
+			return;
+		}
+		pml4 = pmap_pml5e_to_pml4e(pml5, va);
+	} else {
+		pml4 = pmap_pml4e(pmap, va);
+	}
+	db_printf(" pml4e 0x%016lx", *pml4);
 	if ((*pml4 & PG_V) == 0) {
 		db_printf("\n");
 		return;
 	}
@@ -10695,4 +11100,95 @@
 		db_printf("show phys2dmap addr\n");
 	}
 }
+
+static void
+ptpages_show_page(int level, int idx, vm_page_t pg)
+{
+	db_printf("l %d i %d pg %p phys %#lx ref %x\n",
+	    level, idx, pg, VM_PAGE_TO_PHYS(pg), pg->ref_count);
+}
+
+static void
+ptpages_show_complain(int level, int idx, uint64_t pte)
+{
+	db_printf("l %d i %d pte %#lx\n", level, idx, pte);
+}
+
+static void
+ptpages_show_pml4(vm_page_t pg4, int num_entries, uint64_t PG_V)
+{
+	vm_page_t pg3, pg2, pg1;
+	pml4_entry_t *pml4;
+	pdp_entry_t *pdp;
+	pd_entry_t *pd;
+	int i4, i3, i2;
+
+	pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg4));
+	for (i4 = 0; i4 < num_entries; i4++) {
+		if ((pml4[i4] & PG_V) == 0)
+			continue;
+		pg3 = PHYS_TO_VM_PAGE(pml4[i4] & PG_FRAME);
+		if (pg3 == NULL) {
+			ptpages_show_complain(3, i4, pml4[i4]);
+			continue;
+		}
+		ptpages_show_page(3, i4, pg3);
+		pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg3));
+		for (i3 = 0; i3 < NPDPEPG; i3++) {
+			if ((pdp[i3] & PG_V) == 0)
+				continue;
+			pg2 = PHYS_TO_VM_PAGE(pdp[i3] & PG_FRAME);
+			if (pg2 == NULL) {
+				ptpages_show_complain(2, i3, pdp[i3]);
+				continue;
+			}
+			ptpages_show_page(2, i3, pg2);
+			pd = (pd_entry_t
*)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pg2)); + for (i2 = 0; i2 < NPDEPG; i2++) { + if ((pd[i2] & PG_V) == 0) + continue; + pg1 = PHYS_TO_VM_PAGE(pd[i2] & PG_FRAME); + if (pg1 == NULL) { + ptpages_show_complain(1, i2, pd[i2]); + continue; + } + ptpages_show_page(1, i2, pg1); + } + } + } +} + +DB_SHOW_COMMAND(ptpages, pmap_ptpages) +{ + pmap_t pmap; + vm_page_t pg; + pml5_entry_t *pml5; + uint64_t PG_V; + int i5; + + if (have_addr) + pmap = (pmap_t)addr; + else + pmap = PCPU_GET(curpmap); + + PG_V = pmap_valid_bit(pmap); + + if (pmap_is_la57(pmap)) { + pml5 = pmap->pm_pmltop; + for (i5 = 0; i5 < NUPML5E; i5++) { + if ((pml5[i5] & PG_V) == 0) + continue; + pg = PHYS_TO_VM_PAGE(pml5[i5] & PG_FRAME); + if (pg == NULL) { + ptpages_show_complain(4, i5, pml5[i5]); + continue; + } + ptpages_show_page(4, i5, pg); + ptpages_show_pml4(pg, NPML4EPG, PG_V); + } + } else { + ptpages_show_pml4(PHYS_TO_VM_PAGE(DMAP_TO_PHYS( + (vm_offset_t)pmap->pm_pmltop)), NUP4ML4E, PG_V); + } +} #endif Index: sys/amd64/amd64/vm_machdep.c =================================================================== --- sys/amd64/amd64/vm_machdep.c +++ sys/amd64/amd64/vm_machdep.c @@ -377,21 +377,67 @@ } static void -cpu_procctl_kpti(struct proc *p, int com, int *val) +cpu_procctl_kpti_ctl(struct proc *p, int val) { - if (com == PROC_KPTI_CTL) { - if (pti && *val == PROC_KPTI_CTL_ENABLE_ON_EXEC) - p->p_md.md_flags |= P_MD_KPTI; - if (*val == PROC_KPTI_CTL_DISABLE_ON_EXEC) - p->p_md.md_flags &= ~P_MD_KPTI; - } else /* PROC_KPTI_STATUS */ { - *val = (p->p_md.md_flags & P_MD_KPTI) != 0 ? - PROC_KPTI_CTL_ENABLE_ON_EXEC: - PROC_KPTI_CTL_DISABLE_ON_EXEC; - if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3) - *val |= PROC_KPTI_STATUS_ACTIVE; + if (pti && val == PROC_KPTI_CTL_ENABLE_ON_EXEC) + p->p_md.md_flags |= P_MD_KPTI; + if (val == PROC_KPTI_CTL_DISABLE_ON_EXEC) + p->p_md.md_flags &= ~P_MD_KPTI; +} + +static void +cpu_procctl_kpti_status(struct proc *p, int *val) +{ + *val = (p->p_md.md_flags & P_MD_KPTI) != 0 ? 
+ PROC_KPTI_CTL_ENABLE_ON_EXEC: + PROC_KPTI_CTL_DISABLE_ON_EXEC; + if (vmspace_pmap(p->p_vmspace)->pm_ucr3 != PMAP_NO_CR3) + *val |= PROC_KPTI_STATUS_ACTIVE; +} + +static int +cpu_procctl_la_ctl(struct proc *p, int val) +{ + int error; + + error = 0; + switch (val) { + case PROC_LA_CTL_LA48_ON_EXEC: + p->p_md.md_flags |= P_MD_LA48; + p->p_md.md_flags &= ~P_MD_LA57; + break; + case PROC_LA_CTL_LA57_ON_EXEC: + if (la57) { + p->p_md.md_flags &= ~P_MD_LA48; + p->p_md.md_flags |= P_MD_LA57; + } else { + error = ENOTSUP; + } + break; + case PROC_LA_CTL_DEFAULT_ON_EXEC: + p->p_md.md_flags &= ~(P_MD_LA48 | P_MD_LA57); + break; } + return (error); +} + +static void +cpu_procctl_la_status(struct proc *p, int *val) +{ + int res; + + if ((p->p_md.md_flags & P_MD_LA48) != 0) + res = PROC_LA_CTL_LA48_ON_EXEC; + else if ((p->p_md.md_flags & P_MD_LA57) != 0) + res = PROC_LA_CTL_LA57_ON_EXEC; + else + res = PROC_LA_CTL_DEFAULT_ON_EXEC; + if (p->p_sysent->sv_maxuser == VM_MAXUSER_ADDRESS_LA48) + res |= PROC_LA_STATUS_LA48; + else + res |= PROC_LA_STATUS_LA57; + *val = res; } int @@ -403,6 +449,8 @@ switch (com) { case PROC_KPTI_CTL: case PROC_KPTI_STATUS: + case PROC_LA_CTL: + case PROC_LA_STATUS: if (idtype != P_PID) { error = EINVAL; break; @@ -412,22 +460,45 @@ error = priv_check(td, PRIV_IO); if (error != 0) break; + } + if (com == PROC_KPTI_CTL || com == PROC_LA_CTL) { error = copyin(data, &val, sizeof(val)); if (error != 0) break; - if (val != PROC_KPTI_CTL_ENABLE_ON_EXEC && - val != PROC_KPTI_CTL_DISABLE_ON_EXEC) { - error = EINVAL; - break; - } + } + if (com == PROC_KPTI_CTL && + val != PROC_KPTI_CTL_ENABLE_ON_EXEC && + val != PROC_KPTI_CTL_DISABLE_ON_EXEC) { + error = EINVAL; + break; + } + if (com == PROC_LA_CTL && + val != PROC_LA_CTL_LA48_ON_EXEC && + val != PROC_LA_CTL_LA57_ON_EXEC && + val != PROC_LA_CTL_DEFAULT_ON_EXEC) { + error = EINVAL; + break; } error = pget(id, PGET_CANSEE | PGET_NOTWEXIT | PGET_NOTID, &p); - if (error == 0) { - cpu_procctl_kpti(p, com, &val); - PROC_UNLOCK(p); - if (com == PROC_KPTI_STATUS) - error = copyout(&val, data, sizeof(val)); + if (error != 0) + break; + switch (com) { + case PROC_KPTI_CTL: + cpu_procctl_kpti_ctl(p, val); + break; + case PROC_KPTI_STATUS: + cpu_procctl_kpti_status(p, &val); + break; + case PROC_LA_CTL: + error = cpu_procctl_la_ctl(p, val); + break; + case PROC_LA_STATUS: + cpu_procctl_la_status(p, &val); + break; } + PROC_UNLOCK(p); + if (com == PROC_KPTI_STATUS || com == PROC_LA_STATUS) + error = copyout(&val, data, sizeof(val)); break; default: error = EINVAL; Index: sys/amd64/include/md_var.h =================================================================== --- sys/amd64/include/md_var.h +++ sys/amd64/include/md_var.h @@ -46,6 +46,8 @@ extern vm_paddr_t intel_graphics_stolen_base; extern vm_paddr_t intel_graphics_stolen_size; +extern int la57; + /* * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its * value is the physical address at which the kernel is loaded. 
Index: sys/amd64/include/param.h
===================================================================
--- sys/amd64/include/param.h
+++ sys/amd64/include/param.h
@@ -118,6 +118,12 @@
 #define	PML4SHIFT	39		/* LOG2(NBPML4) */
 #define	NBPML4		(1UL<<PML4SHIFT)/* bytes/page map lev4 table */
 #define	PML4MASK	(NBPML4-1)
+
+#define	NPML5EPG	(PAGE_SIZE/(sizeof (pml5_entry_t)))
+#define	NPML5EPGSHIFT	9		/* LOG2(NPML5EPG) */
+#define	PML5SHIFT	48		/* LOG2(NBPML5) */
+#define	NBPML5		(1UL<<PML5SHIFT)/* bytes/page map lev5 table */
+#define	PML5MASK	(NBPML5-1)
 
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h
+++ sys/amd64/include/pmap.h
@@ -448,9 +448,17 @@
 static __inline vm_pindex_t
 pmap_pml4e_index(vm_offset_t va)
 {
 
 	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
 }
 
+static __inline vm_pindex_t
+pmap_pml5e_index(vm_offset_t va)
+{
+
+	return ((va >> PML5SHIFT) & ((1ul << NPML5EPGSHIFT) - 1));
+}
+
 #endif /* !LOCORE */
 #endif /* !_MACHINE_PMAP_H_ */
Index: sys/amd64/include/proc.h
===================================================================
--- sys/amd64/include/proc.h
+++ sys/amd64/include/proc.h
@@ -84,6 +84,8 @@
 };
 
 #define	P_MD_KPTI	0x00000001	/* Enable KPTI on exec */
+#define	P_MD_LA48	0x00000002	/* Request LA48 after exec */
+#define	P_MD_LA57	0x00000004	/* Request LA57 after exec */
 
 #define	KINFO_PROC_SIZE 1088
 #define	KINFO_PROC32_SIZE 768
Index: sys/amd64/include/vmparam.h
===================================================================
--- sys/amd64/include/vmparam.h
+++ sys/amd64/include/vmparam.h
@@ -169,25 +169,32 @@
  * 0xffffffff80000000	KERNBASE
  */
 
-#define	VM_MIN_KERNEL_ADDRESS	KVADDR(KPML4BASE, 0, 0, 0)
-#define	VM_MAX_KERNEL_ADDRESS	KVADDR(KPML4BASE + NKPML4E - 1, \
+#define	VM_MIN_KERNEL_ADDRESS	KV4ADDR(KPML4BASE, 0, 0, 0)
+#define	VM_MAX_KERNEL_ADDRESS	KV4ADDR(KPML4BASE + NKPML4E - 1, \
 					NPDPEPG-1, NPDEPG-1, NPTEPG-1)
 
-#define	DMAP_MIN_ADDRESS	KVADDR(DMPML4I, 0, 0, 0)
-#define	DMAP_MAX_ADDRESS	KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)
+#define	DMAP_MIN_ADDRESS	KV4ADDR(DMPML4I, 0, 0, 0)
+#define	DMAP_MAX_ADDRESS	KV4ADDR(DMPML4I + NDMPML4E, 0, 0, 0)
 
-#define	LARGEMAP_MIN_ADDRESS	KVADDR(LMSPML4I, 0, 0, 0)
-#define	LARGEMAP_MAX_ADDRESS	KVADDR(LMEPML4I + 1, 0, 0, 0)
+#define	LARGEMAP_MIN_ADDRESS	KV4ADDR(LMSPML4I, 0, 0, 0)
+#define	LARGEMAP_MAX_ADDRESS	KV4ADDR(LMEPML4I + 1, 0, 0, 0)
 
-#define	KERNBASE		KVADDR(KPML4I, KPDPI, 0, 0)
+#define	KERNBASE		KV4ADDR(KPML4I, KPDPI, 0, 0)
 
-#define	UPT_MAX_ADDRESS		KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
-#define	UPT_MIN_ADDRESS		KVADDR(PML4PML4I, 0, 0, 0)
+#define	UPT_MAX_ADDRESS		KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
+#define	UPT_MIN_ADDRESS		KV4ADDR(PML4PML4I, 0, 0, 0)
 
-#define	VM_MAXUSER_ADDRESS	UVADDR(NUPML4E, 0, 0, 0)
+#define	VM_MAXUSER_ADDRESS_LA57	UVADDR(NUPML5E, 0, 0, 0, 0)
+#define	VM_MAXUSER_ADDRESS_LA48	UVADDR(0, NUP4ML4E, 0, 0, 0)
+#define	VM_MAXUSER_ADDRESS	VM_MAXUSER_ADDRESS_LA57
 
-#define	SHAREDPAGE		(VM_MAXUSER_ADDRESS - PAGE_SIZE)
-#define	USRSTACK		SHAREDPAGE
+#define	SHAREDPAGE_LA57		(VM_MAXUSER_ADDRESS_LA57 - PAGE_SIZE)
+#define	SHAREDPAGE_LA48		(VM_MAXUSER_ADDRESS_LA48 - PAGE_SIZE)
+#define	USRSTACK_LA57		SHAREDPAGE_LA57
+#define	USRSTACK_LA48		SHAREDPAGE_LA48
+#define	USRSTACK		USRSTACK_LA48
+#define	PS_STRINGS_LA57		(USRSTACK_LA57 - sizeof(struct ps_strings))
+#define	PS_STRINGS_LA48		(USRSTACK_LA48 - sizeof(struct ps_strings))
 
 #define	VM_MAX_ADDRESS		UPT_MAX_ADDRESS
 #define	VM_MIN_ADDRESS		(0)
Index: sys/amd64/linux/linux_sysvec.c
===================================================================
--- sys/amd64/linux/linux_sysvec.c
+++ sys/amd64/linux/linux_sysvec.c
@@ -739,9 +739,9 @@
 	.sv_imgact_try	= linux_exec_imgact_try,
 	.sv_minsigstksz	= LINUX_MINSIGSTKSZ,
 	.sv_minuser	= VM_MIN_ADDRESS,
-	.sv_maxuser	= VM_MAXUSER_ADDRESS,
-	.sv_usrstack	= USRSTACK,
-	.sv_psstrings	= PS_STRINGS,
+	.sv_maxuser	= VM_MAXUSER_ADDRESS_LA48,
+	.sv_usrstack	= USRSTACK_LA48,
+	.sv_psstrings	= PS_STRINGS_LA48,
 	.sv_stackprot	= VM_PROT_ALL,
 	.sv_copyout_auxargs = linux_copyout_auxargs,
 	.sv_copyout_strings =
linux_copyout_strings, @@ -752,7 +752,7 @@ .sv_set_syscall_retval = linux_set_syscall_retval, .sv_fetch_syscall_args = linux_fetch_syscall_args, .sv_syscallnames = NULL, - .sv_shared_page_base = SHAREDPAGE, + .sv_shared_page_base = SHAREDPAGE_LA48, .sv_shared_page_len = PAGE_SIZE, .sv_schedtail = linux_schedtail, .sv_thread_detach = linux_thread_detach, Index: sys/amd64/vmm/amd/svm.c =================================================================== --- sys/amd64/vmm/amd/svm.c +++ sys/amd64/vmm/amd/svm.c @@ -560,7 +560,7 @@ panic("contigmalloc of SVM IO bitmap failed"); svm_sc->vm = vm; - svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); + svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pmltop); /* * Intercept read and write accesses to all MSRs. Index: sys/amd64/vmm/intel/vmx.c =================================================================== --- sys/amd64/vmm/intel/vmx.c +++ sys/amd64/vmm/intel/vmx.c @@ -973,7 +973,7 @@ } vmx->vm = vm; - vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pmltop)); /* * Clean up EPTP-tagged guest physical and combined mappings Index: sys/cddl/dev/dtrace/amd64/dtrace_subr.c =================================================================== --- sys/cddl/dev/dtrace/amd64/dtrace_subr.c +++ sys/cddl/dev/dtrace/amd64/dtrace_subr.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -131,7 +132,7 @@ void dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit)) { - (*func)(0, (uintptr_t) addr_PTmap); + (*func)(0, la57 ? (uintptr_t)addr_P5Tmap : (uintptr_t)addr_P4Tmap); } void Index: sys/kern/imgact_elf.c =================================================================== --- sys/kern/imgact_elf.c +++ sys/kern/imgact_elf.c @@ -97,7 +97,8 @@ int32_t *osrel); static bool kfreebsd_trans_osrel(const Elf_Note *note, int32_t *osrel); static boolean_t __elfN(check_note)(struct image_params *imgp, - Elf_Brandnote *checknote, int32_t *osrel, uint32_t *fctl0); + Elf_Brandnote *checknote, int32_t *osrel, boolean_t *has_fctl0, + uint32_t *fctl0); static vm_prot_t __elfN(trans_prot)(Elf_Word); static Elf_Word __elfN(untrans_prot)(vm_prot_t); @@ -309,7 +310,7 @@ { const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header; Elf_Brandinfo *bi, *bi_m; - boolean_t ret; + boolean_t ret, has_fctl0; int i, interp_name_len; interp_name_len = interp != NULL ? strlen(interp) + 1 : 0; @@ -331,11 +332,16 @@ continue; if (hdr->e_machine == bi->machine && (bi->flags & (BI_BRAND_NOTE|BI_BRAND_NOTE_MANDATORY)) != 0) { + has_fctl0 = false; + *fctl0 = 0; + *osrel = 0; ret = __elfN(check_note)(imgp, bi->brand_note, osrel, - fctl0); + &has_fctl0, fctl0); /* Give brand a chance to veto check_note's guess */ - if (ret && bi->header_supported) - ret = bi->header_supported(imgp); + if (ret && bi->header_supported) { + ret = bi->header_supported(imgp, osrel, + has_fctl0 ? fctl0 : NULL); + } /* * If note checker claimed the binary, but the * interpreter path in the image does not @@ -374,7 +380,7 @@ bi->compat_3_brand) == 0))) { /* Looks good, but give brand a chance to veto */ if (bi->header_supported == NULL || - bi->header_supported(imgp)) { + bi->header_supported(imgp, NULL, NULL)) { /* * Again, prefer strictly matching * interpreter path. 
Index: sys/sys/elf_common.h
===================================================================
--- sys/sys/elf_common.h
+++ sys/sys/elf_common.h
@@ -796,6 +796,7 @@
 #define NT_FREEBSD_FCTL_PROTMAX_DISABLE 0x00000002
 #define NT_FREEBSD_FCTL_STKGAP_DISABLE 0x00000004
 #define NT_FREEBSD_FCTL_WXNEEDED 0x00000008
+#define NT_FREEBSD_FCTL_LA48 0x00000010

 /* Values for n_type. Used in core files. */
 #define NT_PRSTATUS 1 /* Process status. */
Index: sys/sys/imgact_elf.h
===================================================================
--- sys/sys/imgact_elf.h
+++ sys/sys/imgact_elf.h
@@ -87,7 +87,8 @@
 	const char *interp_newpath;
 	int flags;
 	Elf_Brandnote *brand_note;
-	boolean_t (*header_supported)(struct image_params *);
+	boolean_t (*header_supported)(struct image_params *,
+	    int32_t *, uint32_t *);
 #define BI_CAN_EXEC_DYN 0x0001
 #define BI_BRAND_NOTE 0x0002 /* May have note.ABI-tag section. */
 #define BI_BRAND_NOTE_MANDATORY 0x0004 /* Must have note.ABI-tag section. */
Index: sys/x86/include/procctl.h
===================================================================
--- sys/x86/include/procctl.h
+++ sys/x86/include/procctl.h
@@ -1,7 +1,7 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
- * Copyright (c) 2019 The FreeBSD Foundation
+ * Copyright (c) 2019,2020 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
@@ -35,9 +35,18 @@
 #define PROC_KPTI_CTL (PROC_PROCCTL_MD_MIN + 0)
 #define PROC_KPTI_STATUS (PROC_PROCCTL_MD_MIN + 1)
+#define PROC_LA_CTL (PROC_PROCCTL_MD_MIN + 2)
+#define PROC_LA_STATUS (PROC_PROCCTL_MD_MIN + 3)

 #define PROC_KPTI_CTL_ENABLE_ON_EXEC 1
 #define PROC_KPTI_CTL_DISABLE_ON_EXEC 2
 #define PROC_KPTI_STATUS_ACTIVE 0x80000000

+#define PROC_LA_CTL_LA48_ON_EXEC 1
+#define PROC_LA_CTL_LA57_ON_EXEC 2
+#define PROC_LA_CTL_DEFAULT_ON_EXEC 3
+
+#define PROC_LA_STATUS_LA48 0x01000000
+#define PROC_LA_STATUS_LA57 0x02000000
+
 #endif
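The new commands are driven through procctl(2). A minimal userspace sketch, assuming the machine-dependent constants above are reachable through <sys/procctl.h> in the same way proccontrol(1) below relies on:

#include <sys/procctl.h>
#include <sys/wait.h>	/* idtype_t and P_PID */
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int arg;

	/* Request a 48-bit address space for this process's children. */
	arg = PROC_LA_CTL_LA48_ON_EXEC;
	if (procctl(P_PID, getpid(), PROC_LA_CTL, &arg) != 0)
		err(1, "PROC_LA_CTL");

	/* Read back the on-exec setting and the active width. */
	if (procctl(P_PID, getpid(), PROC_LA_STATUS, &arg) != 0)
		err(1, "PROC_LA_STATUS");
	printf("la48 %sactive, la57 %sactive\n",
	    (arg & PROC_LA_STATUS_LA48) != 0 ? "" : "not ",
	    (arg & PROC_LA_STATUS_LA57) != 0 ? "" : "not ");
	return (0);
}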
Index: sys/x86/include/specialreg.h
===================================================================
--- sys/x86/include/specialreg.h
+++ sys/x86/include/specialreg.h
@@ -72,6 +72,7 @@
 #define CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */
 #define CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */
 #define CR4_UMIP 0x00000800 /* User Mode Instruction Prevention */
+#define CR4_LA57 0x00001000 /* Enable 5-level paging */
 #define CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */
 #define CR4_FSGSBASE 0x00010000 /* Enable FS/GS BASE accessing instructions */
 #define CR4_PCIDE 0x00020000 /* Enable Context ID */
Index: usr.bin/elfctl/elfctl.c
===================================================================
--- usr.bin/elfctl/elfctl.c
+++ usr.bin/elfctl/elfctl.c
@@ -67,6 +67,7 @@
 	    "Disable implicit PROT_MAX" },
 	{ "stackgap", NT_FREEBSD_FCTL_STKGAP_DISABLE, "Disable stack gap" },
 	{ "wxneeded", NT_FREEBSD_FCTL_WXNEEDED, "Requires W+X mappings" },
+	{ "la48", NT_FREEBSD_FCTL_LA48, "amd64: Limit user VA to 48bit" },
 };

 static struct option long_opts[] = {
Index: usr.bin/proccontrol/proccontrol.1
===================================================================
--- usr.bin/proccontrol/proccontrol.1
+++ usr.bin/proccontrol/proccontrol.1
@@ -71,6 +71,9 @@
 .Xr mmap 2 .
 .It Ar kpti
 Controls the KPTI enable, AMD64 only.
+.It Ar la48
+Controls limiting the usermode process address space to 48 address bits,
+AMD64 only, on machines capable of 57-bit addressing.
 .El
 .Pp
 The
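Together with the elfctl table entry above, the limit can be applied either per-binary or per-process. For example, with the patched tools: elfctl -e +la48 a.out tags a binary so it always executes with the 48-bit address space, while proccontrol -m la48 -s enable command applies the same limit to a single command on exec.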
Index: usr.bin/proccontrol/proccontrol.c
===================================================================
--- usr.bin/proccontrol/proccontrol.c
+++ usr.bin/proccontrol/proccontrol.c
@@ -48,6 +48,10 @@
 #ifdef PROC_KPTI_CTL
 	MODE_KPTI,
 #endif
+#ifdef PROC_LA_CTL
+	MODE_LA57,
+	MODE_LA48,
+#endif
 };

 static pid_t
@@ -69,13 +73,18 @@
 #else
 #define KPTI_USAGE
 #endif
+#ifdef PROC_LA_CTL
+#define LA_USAGE "|la48|la57"
+#else
+#define LA_USAGE
+#endif

 static void __dead2
 usage(void)
 {

 	fprintf(stderr, "Usage: proccontrol -m (aslr|protmax|trace|trapcap|"
-	    "stackgap"KPTI_USAGE") [-q] "
+	    "stackgap"KPTI_USAGE LA_USAGE") [-q] "
 	    "[-s (enable|disable)] [-p pid | command]\n");
 	exit(1);
 }
@@ -107,6 +116,12 @@
 #ifdef PROC_KPTI_CTL
 		else if (strcmp(optarg, "kpti") == 0)
 			mode = MODE_KPTI;
+#endif
+#ifdef PROC_LA_CTL
+		else if (strcmp(optarg, "la57") == 0)
+			mode = MODE_LA57;
+		else if (strcmp(optarg, "la48") == 0)
+			mode = MODE_LA48;
 #endif
 		else
 			usage();
@@ -163,6 +178,12 @@
 		case MODE_KPTI:
 			error = procctl(P_PID, pid, PROC_KPTI_STATUS, &arg);
 			break;
+#endif
+#ifdef PROC_LA_CTL
+		case MODE_LA57:
+		case MODE_LA48:
+			error = procctl(P_PID, pid, PROC_LA_STATUS, &arg);
+			break;
 #endif
 		default:
 			usage();
@@ -258,6 +279,27 @@
 			else
 				printf(", not active\n");
 			break;
+#endif
+#ifdef PROC_LA_CTL
+		case MODE_LA57:
+		case MODE_LA48:
+			switch (arg & ~(PROC_LA_STATUS_LA48 |
+			    PROC_LA_STATUS_LA57)) {
+			case PROC_LA_CTL_LA48_ON_EXEC:
+				printf("la48 on exec");
+				break;
+			case PROC_LA_CTL_LA57_ON_EXEC:
+				printf("la57 on exec");
+				break;
+			case PROC_LA_CTL_DEFAULT_ON_EXEC:
+				printf("default on exec");
+				break;
+			}
+			if ((arg & PROC_LA_STATUS_LA48) != 0)
+				printf(", la48 active\n");
+			else if ((arg & PROC_LA_STATUS_LA57) != 0)
+				printf(", la57 active\n");
+			break;
 #endif
 		}
 	} else {
@@ -294,6 +336,18 @@
 			    PROC_KPTI_CTL_DISABLE_ON_EXEC;
 			error = procctl(P_PID, pid, PROC_KPTI_CTL, &arg);
 			break;
+#endif
+#ifdef PROC_LA_CTL
+		case MODE_LA57:
+			arg = enable ? PROC_LA_CTL_LA57_ON_EXEC :
+			    PROC_LA_CTL_DEFAULT_ON_EXEC;
+			error = procctl(P_PID, pid, PROC_LA_CTL, &arg);
+			break;
+		case MODE_LA48:
+			arg = enable ? PROC_LA_CTL_LA48_ON_EXEC :
+			    PROC_LA_CTL_DEFAULT_ON_EXEC;
+			error = procctl(P_PID, pid, PROC_LA_CTL, &arg);
+			break;
 #endif
 		default:
 			usage();
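The status decoding above also makes proccontrol usable as a quick probe of the current policy: proccontrol -m la48 -q -p <pid> prints one of "la48 on exec", "la57 on exec", or "default on exec", followed by whether la48 or la57 is active in the target process. The exact strings here follow from the printf calls above rather than from a captured run.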