Index: sys/amd64/amd64/initcpu.c
===================================================================
--- sys/amd64/amd64/initcpu.c
+++ sys/amd64/amd64/initcpu.c
@@ -218,7 +218,7 @@
 	if (!IS_BSP() && (cpu_stdext_feature & CPUID_STDEXT_SMEP))
 		cr4 |= CR4_SMEP;
 	load_cr4(cr4);
-	if ((amd_feature & AMDID_NX) != 0) {
+	if (IS_BSP() && (amd_feature & AMDID_NX) != 0) {
 		msr = rdmsr(MSR_EFER) | EFER_NXE;
 		wrmsr(MSR_EFER, msr);
 		pg_nx = PG_NX;
Index: sys/amd64/amd64/mpboot.S
===================================================================
--- sys/amd64/amd64/mpboot.S
+++ sys/amd64/amd64/mpboot.S
@@ -221,15 +221,31 @@
 
 /*
  * From here on down is executed in the kernel .text section.
- *
- * Load a real %cr3 that has all the direct map stuff and switches
- * off the 1GB replicated mirror.  Load a stack pointer and jump
- * into AP startup code in C.
  */
 	.text
 	.code64
 	.p2align 4,0
 entry_64:
+	/*
+	 * If the BSP reported NXE support, enable EFER.NXE for all APs
+	 * prior to loading %cr3.  This avoids page faults if the AP
+	 * encounters memory marked with the NX bit prior to detecting and
+	 * enabling NXE support.
+	 */
+	movq	pg_nx, %rbx
+	testq	%rbx, %rbx
+	je	1f
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	orl	$EFER_NXE, %eax
+	wrmsr
+
+1:
+	/*
+	 * Load a real %cr3 that has all the direct map stuff and switches
+	 * off the 1GB replicated mirror.  Load a stack pointer and jump
+	 * into AP startup code in C.
+	 */
	movq	KPML4phys, %rax
	movq	%rax, %cr3
	movq	bootSTK, %rsp
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -397,6 +397,12 @@
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 
+static int pmap_kernelro = 1;
+
+SYSCTL_INT(_vm_pmap, OID_AUTO, kernelro, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &pmap_kernelro, 0,
+    "Map the read-only portions of the kernel with read-only permissions");
+
 /*
  * All those kernel PT submaps that BSD is so fond of
  */
@@ -887,15 +893,71 @@
 	nkpt = pt_pages;
 }
 
+/*
+ * Returns the proper write/execute permission for a physical page that is
+ * part of the initial boot allocations.
+ *
+ * If the page has kernel text, it is marked as read-only. If the page has
+ * kernel read-only data, it is marked as read-only/not-executable. If the
+ * page has only read-write data, it is marked as read-write/not-executable.
+ * If the page is below/above the kernel range, it is marked as read-write.
+ *
+ * This function operates on 2M pages, since we map the kernel space that
+ * way.
+ *
+ * Note that this doesn't currently provide any protection for modules.
+ */
+static inline pt_entry_t
+bootaddr_rwx(vm_paddr_t pa)
+{
+
+	/*
+	 * Everything in the same 2M page as the start of the kernel
+	 * should be static.  On the other hand, things in the same 2M
+	 * page as the end of the kernel could be read-write/executable,
+	 * as the kernel image is not guaranteed to end on a 2M boundary.
+	 */
+	if (pa < trunc_2mpage(btext - KERNBASE) ||
+	    pa >= trunc_2mpage(_end - KERNBASE))
+		return (X86_PG_RW);
+	/*
+	 * The linker should ensure that the read-only and read-write
+	 * portions don't share the same 2M page, so this shouldn't
+	 * impact read-only data.  However, in any case, any page with
+	 * read-write data needs to be read-write.
+	 */
+	if (pa >= trunc_2mpage(brwsection - KERNBASE))
+		return (X86_PG_RW | pg_nx);
+	/*
+	 * Mark any 2M page containing kernel text as read-only.  Mark
+	 * other pages with read-only data as read-only and not executable.
+	 * (It is likely a small portion of the read-only data section will
+	 * be marked as read-only, but executable.  This should be acceptable
+	 * since the read-only protection will keep the data from changing.)
+	 * Note that fixups to the .text section will still work until we
+	 * set CR0.WP.
+	 */
+	if (pa < round_2mpage(etext - KERNBASE))
+		return (pmap_kernelro ? 0 : X86_PG_RW);
+	return (pg_nx | (pmap_kernelro ? 0 : X86_PG_RW));
+}
+
 static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
-	int i, j, ndm1g, nkpdpe;
+	int i, j, ndm1g, nkpdpe, nkdmpde;
 	pt_entry_t *pt_p;
 	pd_entry_t *pd_p;
 	pdp_entry_t *pdp_p;
 	pml4_entry_t *p4_p;
+	uint64_t DMPDkernphys;
 
+	/*
+	 * Determine if we are marking the read-only portion with read-only
+	 * permissions.
+	 */
+	TUNABLE_INT_FETCH("vm.pmap.kernelro", &pmap_kernelro);
+
 	/* Allocate page table pages for the direct map */
 	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
@@ -913,8 +975,14 @@
 	}
 	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 	ndm1g = 0;
-	if ((amd_feature & AMDID_PAGE1GB) != 0)
+	if ((amd_feature & AMDID_PAGE1GB) != 0) {
 		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
+		if (pmap_kernelro) {
+			nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE),
+			    NBPDP);
+			DMPDkernphys = allocpages(firstaddr, nkdmpde);
+		}
+	}
 	if (ndm1g < ndmpdp)
 		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
@@ -940,11 +1008,10 @@
 	KPDphys = allocpages(firstaddr, nkpdpe);
 
 	/* Fill in the underlying page table pages */
-	/* Nominally read-only (but really R/W) from zero to physfree */
 	/* XXX not fully used, underneath 2M pages */
 	pt_p = (pt_entry_t *)KPTphys;
 	for (i = 0; ptoa(i) < *firstaddr; i++)
-		pt_p[i] = ptoa(i) | X86_PG_RW | X86_PG_V | pg_g;
+		pt_p[i] = ptoa(i) | X86_PG_V | pg_g | bootaddr_rwx(ptoa(i));
 
 	/* Now map the page tables at their location within PTmap */
 	pd_p = (pd_entry_t *)KPDphys;
@@ -954,8 +1021,8 @@
 	/* Map from zero to end of allocations under 2M pages */
 	/* This replaces some of the KPTphys entries above */
 	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
-		pd_p[i] = (i << PDRSHIFT) | X86_PG_RW | X86_PG_V | PG_PS |
-		    pg_g;
+		pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
+		    bootaddr_rwx(i << PDRSHIFT);
 
 	/*
 	 * Because we map the physical blocks in 2M pages, adjust firstaddr
@@ -995,6 +1062,22 @@
 	for (j = 0; i < ndmpdp; i++, j++) {
 		pdp_p[i] = DMPDphys + ptoa(j);
 		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_U;
+	}
+
+	/*
+	 * Instead of using a 1G page for the memory containing the kernel,
+	 * use 2M pages with appropriate permissions.  (If using 1G pages,
+	 * this will partially overwrite the PDPEs above.)
+	 */
+	if (ndm1g && pmap_kernelro) {
+		pd_p = (pd_entry_t *)DMPDkernphys;
+		for (i = 0; i < NPDPEPG; i++)
+			pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
+			    X86_PG_M | X86_PG_A | pg_nx |
+			    bootaddr_rwx(i << PDRSHIFT);
+		for (i = 0; i < nkdmpde; i++)
+			pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
+			    X86_PG_V | PG_U;
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
Index: sys/amd64/include/cpu.h
===================================================================
--- sys/amd64/include/cpu.h
+++ sys/amd64/include/cpu.h
@@ -68,7 +68,9 @@
 };
 
 extern struct	cpu_ops cpu_ops;
+extern char	brwsection[];
 extern char	btext[];
+extern char	_end[];
 extern char	etext[];
 
 /* Resume hook for VMM. */
Index: sys/conf/kern.pre.mk
===================================================================
--- sys/conf/kern.pre.mk
+++ sys/conf/kern.pre.mk
@@ -120,6 +120,10 @@
 LDFLAGS+=	-Wl,--build-id=sha1
 .endif
 
+.if ${MACHINE_CPUARCH} == "amd64"
+LDFLAGS+=	-z max-page-size=2097152 -z common-page-size=4096
+.endif
+
 NORMAL_C= ${CC} -c ${CFLAGS} ${WERROR} ${PROF} ${.IMPSRC}
 NORMAL_S= ${CC:N${CCACHE_BIN}} -c ${ASM_CFLAGS} ${WERROR} ${.IMPSRC}
 PROFILE_C= ${CC} -c ${CFLAGS} ${WERROR} ${.IMPSRC}
Index: sys/conf/ldscript.amd64
===================================================================
--- sys/conf/ldscript.amd64
+++ sys/conf/ldscript.amd64
@@ -80,6 +80,7 @@
   /* Adjust the address for the data segment.  We want to adjust up to the
      same address within the page on the next page up.  */
   . = ALIGN (CONSTANT (MAXPAGESIZE)) - ((CONSTANT (MAXPAGESIZE) - .) & (CONSTANT (MAXPAGESIZE) - 1)); . = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE));
+  PROVIDE (brwsection = .);
   /* Exception handling  */
   .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) }
   .gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) }
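
For reference, below is a minimal standalone sketch (not part of the patch) of the per-2M-page permission policy that bootaddr_rwx() implements in the pmap.c hunk above. Everything in it is an illustrative stand-in: perm_2mpage(), kern_btext, kern_etext, kern_brwsection, kern_end, kernelro, and the PG_RW_BIT/PG_NX_BIT/NBPDR_2M constants are placeholders, not the kernel's symbols, and the KERNBASE subtraction is omitted because the placeholder values are already physical offsets.

/*
 * Illustration only: mirrors the decision order of bootaddr_rwx() using
 * made-up section offsets and bit values.
 */
#include <stdint.h>
#include <stdio.h>

#define NBPDR_2M	(UINT64_C(1) << 21)	/* 2M superpage size */
#define TRUNC_2M(x)	((x) & ~(NBPDR_2M - 1))
#define ROUND_2M(x)	(((x) + NBPDR_2M - 1) & ~(NBPDR_2M - 1))

#define PG_RW_BIT	(UINT64_C(1) << 1)	/* writable */
#define PG_NX_BIT	(UINT64_C(1) << 63)	/* not executable */

/* Assumed physical offsets of the kernel image (placeholders). */
static const uint64_t kern_btext = 0x0200000;		/* start of text */
static const uint64_t kern_etext = 0x0800000;		/* end of text */
static const uint64_t kern_brwsection = 0x0a00000;	/* start of r/w data */
static const uint64_t kern_end = 0x0c00000;		/* end of the image */

static int kernelro = 1;	/* stands in for the vm.pmap.kernelro tunable */

static uint64_t
perm_2mpage(uint64_t pa)
{
	/* Outside the kernel image: read-write and executable. */
	if (pa < TRUNC_2M(kern_btext) || pa >= TRUNC_2M(kern_end))
		return (PG_RW_BIT);
	/* Read-write data: writable but never executable. */
	if (pa >= TRUNC_2M(kern_brwsection))
		return (PG_RW_BIT | PG_NX_BIT);
	/* Any 2M page holding text: executable, read-only when enabled. */
	if (pa < ROUND_2M(kern_etext))
		return (kernelro ? 0 : PG_RW_BIT);
	/* Remaining read-only data: never executable. */
	return (PG_NX_BIT | (kernelro ? 0 : PG_RW_BIT));
}

int
main(void)
{
	uint64_t pa, perm;

	for (pa = 0; pa < UINT64_C(0x1000000); pa += NBPDR_2M) {
		perm = perm_2mpage(pa);
		printf("pa 0x%07jx: %s, %s\n", (uintmax_t)pa,
		    (perm & PG_RW_BIT) ? "read-write" : "read-only",
		    (perm & PG_NX_BIT) ? "no-execute" : "executable");
	}
	return (0);
}

Since the new sysctl is declared CTLFLAG_RDTUN | CTLFLAG_NOFETCH and create_pagetables() pulls it in with TUNABLE_INT_FETCH(), the protection should be controllable only at boot, e.g. by setting vm.pmap.kernelro=0 as a loader tunable; it is read-only at runtime.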