Index: lib/libc/amd64/Symbol.map
===================================================================
--- lib/libc/amd64/Symbol.map
+++ lib/libc/amd64/Symbol.map
@@ -44,6 +44,13 @@
 	vfork;
 };
 
+FBSD_1.6 {
+	x86_pkru_get_perm;
+	x86_pkru_set_perm;
+	x86_pkru_protect_range;
+	x86_pkru_unprotect_range;
+};
+
 /*
  *
  * FreeBSD private ABI
Index: lib/libc/i386/Symbol.map
===================================================================
--- lib/libc/i386/Symbol.map
+++ lib/libc/i386/Symbol.map
@@ -46,6 +46,13 @@
 	___tls_get_addr;
 };
 
+FBSD_1.6 {
+	x86_pkru_get_perm;
+	x86_pkru_set_perm;
+	x86_pkru_protect_range;
+	x86_pkru_unprotect_range;
+};
+
 FBSDprivate_1.0 {
 	/* PSEUDO syscalls */
 	_getlogin;
Index: lib/libc/x86/sys/Makefile.inc
===================================================================
--- lib/libc/x86/sys/Makefile.inc
+++ lib/libc/x86/sys/Makefile.inc
@@ -3,7 +3,8 @@
 .PATH: ${LIBC_SRCTOP}/x86/sys
 
 SRCS+= \
-	__vdso_gettc.c
+	__vdso_gettc.c \
+	pkru.c
 
 .if ${MACHINE_CPUARCH} == "amd64" && ${MK_HYPERV} != "no"
 CFLAGS+=	-DWANT_HYPERV
Index: lib/libc/x86/sys/pkru.c
===================================================================
--- /dev/null
+++ lib/libc/x86/sys/pkru.c
@@ -0,0 +1,138 @@
+/*-
+ * Copyright (c) 2019 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+#include <machine/sysarch.h>
+#include <x86/ifunc.h>
+#include <errno.h>
+#include <string.h>
+
+#define	MAX_PKRU_IDX	0xf
+#ifdef __i386__
+#define	X86_SET_PKRU	I386_SET_PKRU
+#define	X86_CLEAR_PKRU	I386_CLEAR_PKRU
+#else
+#define	X86_SET_PKRU	AMD64_SET_PKRU
+#define	X86_CLEAR_PKRU	AMD64_CLEAR_PKRU
+#endif
+
+static int
+x86_pkru_get_perm_unsup(u_int keyidx, int *access, int *modify)
+{
+
+	errno = EOPNOTSUPP;
+	return (-1);
+}
+
+static int
+x86_pkru_get_perm_hw(u_int keyidx, int *access, int *modify)
+{
+	uint32_t pkru;
+
+	if (keyidx > MAX_PKRU_IDX) {
+		errno = EINVAL;
+		return (-1);
+	}
+	keyidx *= 2;
+	pkru = rdpkru();
+	*access = (pkru & (1 << keyidx)) == 0;
+	*modify = (pkru & (2 << keyidx)) == 0;
+	return (0);
+}
+
+DEFINE_UIFUNC(, int, x86_pkru_get_perm, (u_int, int *, int *), static)
+{
+
+	return ((cpu_stdext_feature2 & CPUID_STDEXT2_OSPKE) == 0 ?
+	    x86_pkru_get_perm_unsup : x86_pkru_get_perm_hw);
+}
+
+static int
+x86_pkru_set_perm_unsup(u_int keyidx, int access, int modify)
+{
+
+	errno = EOPNOTSUPP;
+	return (-1);
+}
+
+static int
+x86_pkru_set_perm_hw(u_int keyidx, int access, int modify)
+{
+	uint32_t pkru;
+
+	if (keyidx > MAX_PKRU_IDX) {
+		errno = EINVAL;
+		return (-1);
+	}
+	keyidx *= 2;
+	pkru = rdpkru();
+	pkru &= ~(3 << keyidx);
+	if (!access)
+		pkru |= 1 << keyidx;
+	if (!modify)
+		pkru |= 2 << keyidx;
+	wrpkru(pkru);
+	return (0);
+}
+
+DEFINE_UIFUNC(, int, x86_pkru_set_perm, (u_int, int, int), static)
+{
+
+	return ((cpu_stdext_feature2 & CPUID_STDEXT2_OSPKE) == 0 ?
+	    x86_pkru_set_perm_unsup : x86_pkru_set_perm_hw);
+}
+
+int
+x86_pkru_protect_range(void *addr, unsigned long len, u_int keyidx, int flags)
+{
+	struct amd64_set_pkru a64pkru;
+
+	memset(&a64pkru, 0, sizeof(a64pkru));
+	a64pkru.addr = addr;
+	a64pkru.len = len;
+	a64pkru.keyidx = keyidx;
+	a64pkru.flags = flags;
+	return (sysarch(X86_SET_PKRU, &a64pkru));
+}
+
+int
+x86_pkru_unprotect_range(void *addr, unsigned long len)
+{
+	struct amd64_set_pkru a64pkru;
+
+	memset(&a64pkru, 0, sizeof(a64pkru));
+	a64pkru.addr = addr;
+	a64pkru.len = len;
+	return (sysarch(X86_CLEAR_PKRU, &a64pkru));
+}
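A minimal usage sketch for the wrappers above (illustration only, not part of the change; it assumes a CPU with OSPKE enabled, otherwise the ifunc resolves to the *_unsup variants and the permission calls fail with EOPNOTSUPP). PKRU keeps two bits per key: bit 2k disables all access (AD) and bit 2k+1 disables writes (WD), which is the 1 << keyidx / 2 << keyidx arithmetic after x86_pkru_get_perm_hw() and x86_pkru_set_perm_hw() double the index.

/* pkru_demo.c: tag a page with key 1 and revoke write permission. */
#include <sys/mman.h>
#include <machine/sysarch.h>
#include <err.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char *p;
	long psz;
	int access, modify;

	psz = sysconf(_SC_PAGESIZE);
	p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	/* Assign protection key 1 to the page. */
	if (x86_pkru_protect_range(p, psz, 1, 0) != 0)
		err(1, "x86_pkru_protect_range");

	/* Keep read access for key 1, but disable writes (the WD bit). */
	if (x86_pkru_set_perm(1, 1, 0) != 0)
		err(1, "x86_pkru_set_perm");
	if (x86_pkru_get_perm(1, &access, &modify) == 0)
		printf("key 1: access %d modify %d\n", access, modify);

	/* A store through p would now fault with PGEX_PK set. */
	x86_pkru_unprotect_range(p, psz);
	return (0);
}
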
Index: sys/amd64/amd64/initcpu.c
===================================================================
--- sys/amd64/amd64/initcpu.c
+++ sys/amd64/amd64/initcpu.c
@@ -233,6 +233,9 @@
 	if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE)
 		cr4 |= CR4_FSGSBASE;
 
+	if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU)
+		cr4 |= CR4_PKE;
+
 	/*
 	 * Postpone enabling the SMEP on the boot CPU until the page
 	 * tables are switched from the boot loader identity mapping
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -48,7 +48,7 @@
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
- * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * Copyright (c) 2014-2019 The FreeBSD Foundation
  * All rights reserved.
 
  * This software was developed for the FreeBSD Project by Jake Burkholder,
@@ -121,6 +121,7 @@
 #include
 #include
 #include
+#include <sys/rangeset.h>
 #include
 #include
 #include
@@ -155,6 +156,7 @@
 #ifdef SMP
 #include
 #endif
+#include <machine/sysarch.h>
 #include
 
 static __inline boolean_t
@@ -285,6 +287,13 @@
 	return (mask);
 }
 
+static __inline pt_entry_t
+pmap_pku_mask_bit(pmap_t pmap)
+{
+
+	return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
+}
+
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
@@ -428,6 +437,22 @@
 static vm_pindex_t pti_pg_idx;
 static bool pti_finalized;
 
+struct pmap_pkru_range {
+	struct rs_el	pkru_rs_el;
+	u_int		pkru_keyidx;
+	int		pkru_flags;
+};
+
+static uma_zone_t pmap_pkru_ranges_zone;
+static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
+static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static void *pkru_dup_range(void *ctx, void *data);
+static void pkru_free_range(void *ctx, void *node);
+static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
+static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static void pmap_pkru_deassign_all(pmap_t pmap);
+
 static int
 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
 {
@@ -2866,6 +2891,12 @@
 		pmap->pm_pcids[i].pm_gen = 1;
 	}
 	pmap_activate_boot(pmap);
+
+	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+		pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
+		    sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
+		    UMA_ALIGN_PTR, 0);
+	}
 }
 
 void
@@ -2954,6 +2985,10 @@
 			pmap_pinit_pml4_pti(pml4pgu);
 			pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
 		}
+		if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+			rangeset_init(&pmap->pm_pkru, pkru_dup_range,
+			    pkru_free_range, pmap, M_NOWAIT);
+		}
 	}
 
 	pmap->pm_root.rt_root = 0;
@@ -3250,6 +3285,9 @@
 		vm_page_unwire_noq(m);
 		vm_page_free(m);
 	}
+	if (pmap->pm_type == PT_X86 &&
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
+		rangeset_fini(&pmap->pm_pkru);
 }
 
 static int
@@ -4080,7 +4118,7 @@
 {
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
-	pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
+	pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 	struct spglist free;
@@ -4093,6 +4131,7 @@
 	PG_RW = pmap_rw_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
+	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpde = *pde;
@@ -4525,6 +4564,7 @@
 out:
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
+	pmap_pkru_on_remove(pmap, sva, eva);
 	PMAP_UNLOCK(pmap);
 	pmap_delayed_invl_finished();
 	vm_page_free_pages_toq(&free, true);
@@ -4836,7 +4876,7 @@
 {
 	pd_entry_t newpde;
 	pt_entry_t *firstpte, oldpte, pa, *pte;
-	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
+	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
 	vm_page_t mpte;
 	int PG_PTE_CACHE;
 
@@ -4845,6 +4885,7 @@
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
+	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -5072,6 +5113,8 @@
 	origpte = *pte;
 	pv = NULL;
 
+	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
+		newpte |= pmap_pkru_get(pmap, va);
 
 	/*
 	 * Is the specified virtual address already mapped?
@@ -5291,6 +5334,25 @@
 			    " in pmap %p", va, pmap);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
+
+	/*
+	 * If the pkru is not the same for the whole pde range, return
+	 * failure and let vm_fault() cope.  Check after pde allocation,
+	 * since it could sleep.
+	 */
+	if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
+		SLIST_INIT(&free);
+		if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
+			pmap_invalidate_page(pmap, va);
+			vm_page_free_pages_toq(&free, true);
+		}
+		return (KERN_FAILURE);
+	}
+	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
+		newpde &= ~X86_PG_PKU_MASK;
+		newpde |= pmap_pkru_get(pmap, va);
+	}
+
 	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 	pde = &pde[pmap_pde_index(va)];
 	oldpde = *pde;
@@ -5550,7 +5612,7 @@
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 	if (va < VM_MAXUSER_ADDRESS)
-		newpte |= PG_U;
+		newpte |= PG_U | pmap_pkru_get(pmap, va);
 	pte_store(pte, newpte);
 	return (mpte);
 }
@@ -5926,6 +5988,33 @@
 	PMAP_UNLOCK(dst_pmap);
 }
 
+int
+pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
+{
+	int error;
+
+	if (dst_pmap->pm_type != src_pmap->pm_type ||
+	    dst_pmap->pm_type != PT_X86 ||
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
+		return (0);
+	for (;;) {
+		if (dst_pmap < src_pmap) {
+			PMAP_LOCK(dst_pmap);
+			PMAP_LOCK(src_pmap);
+		} else {
+			PMAP_LOCK(src_pmap);
+			PMAP_LOCK(dst_pmap);
+		}
+		error = pmap_pkru_copy(dst_pmap, src_pmap);
+		PMAP_UNLOCK(src_pmap);
+		PMAP_UNLOCK(dst_pmap);
+		if (error != ENOMEM)
+			break;
+		vm_wait(NULL);
+	}
+	return (error);
+}
+
 /*
  * Zero the specified hardware page.
  */
@@ -6325,6 +6414,7 @@
 	if (lock != NULL)
 		rw_wunlock(lock);
 	pmap_invalidate_all(pmap);
+	pmap_pkru_deassign_all(pmap);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, true);
 }
@@ -8961,6 +9051,289 @@
 	VM_OBJECT_WUNLOCK(pti_obj);
 }
 
+static void *
+pkru_dup_range(void *ctx __unused, void *data)
+{
+	struct pmap_pkru_range *node, *new_node;
+
+	new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
+	if (new_node == NULL)
+		return (NULL);
+	node = data;
+	memcpy(new_node, node, sizeof(*node));
+	return (new_node);
+}
+
+static void
+pkru_free_range(void *ctx __unused, void *node)
+{
+
+	uma_zfree(pmap_pkru_ranges_zone, node);
+}
+
+static int
+pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
+    int flags)
+{
+	struct pmap_pkru_range *ppr;
+	int error;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	MPASS(pmap->pm_type == PT_X86);
+	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+	if ((flags & AMD64_PKRU_EXCL) != 0 &&
+	    !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
+		return (EBUSY);
+	ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
+	if (ppr == NULL)
+		return (ENOMEM);
+	ppr->pkru_keyidx = keyidx;
+	ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
+	error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
+	if (error != 0)
+		uma_zfree(pmap_pkru_ranges_zone, ppr);
+	return (error);
+}
+
+static int
+pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	MPASS(pmap->pm_type == PT_X86);
+	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+	return (rangeset_remove(&pmap->pm_pkru, sva, eva));
+}
+
+static void
+pmap_pkru_deassign_all(pmap_t pmap)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	if (pmap->pm_type == PT_X86 &&
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
+		rangeset_remove_all(&pmap->pm_pkru);
+}
+
+static bool
+pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	struct pmap_pkru_range *ppr, *prev_ppr;
+	vm_offset_t va;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	if (pmap->pm_type != PT_X86 ||
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
+	    sva >= VM_MAXUSER_ADDRESS)
+		return (true);
+	MPASS(eva <= VM_MAXUSER_ADDRESS);
+	for (va = sva, prev_ppr = NULL; va < eva;) {
+		ppr = rangeset_get(&pmap->pm_pkru, va);
+		if (va == sva)
+			prev_ppr = ppr;	/* establish the baseline */
+		else if ((ppr == NULL) ^ (prev_ppr == NULL))
+			return (false);
+		if (ppr == NULL) {
+			va += PAGE_SIZE;
+			continue;
+		}
+		if (prev_ppr != ppr &&
+		    prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
+			return (false);
+		prev_ppr = ppr;
+		va = ppr->pkru_rs_el.re_end;
+	}
+	return (true);
+}
+
+static pt_entry_t
+pmap_pkru_get(pmap_t pmap, vm_offset_t va)
+{
+	struct pmap_pkru_range *ppr;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	if (pmap->pm_type != PT_X86 ||
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
+	    va >= VM_MAXUSER_ADDRESS)
+		return (0);
+	ppr = rangeset_get(&pmap->pm_pkru, va);
+	if (ppr != NULL)
+		return (X86_PG_PKU(ppr->pkru_keyidx));
+	return (0);
+}
+
+static bool
+pred_pkru_on_remove(void *ctx __unused, void *r)
+{
+	struct pmap_pkru_range *ppr;
+
+	ppr = r;
+	return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
+}
+
+static void
+pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	if (pmap->pm_type == PT_X86 &&
+	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+		rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
+		    pred_pkru_on_remove);
+	}
+}
+
+static int
+pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
+{
+
+	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
+	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
+	MPASS(dst_pmap->pm_type == PT_X86);
+	MPASS(src_pmap->pm_type == PT_X86);
+	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+	if (src_pmap->pm_pkru.rs_data_ctx == NULL)
+		return (0);
+	return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
+}
+
+static void
+pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+    u_int keyidx)
+{
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t newpde, ptpaddr, *pde;
+	pt_entry_t newpte, *ptep, pte;
+	vm_offset_t va, va_next;
+	bool changed;
+
+	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+	MPASS(pmap->pm_type == PT_X86);
+	MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
+
+	for (changed = false, va = sva; va < eva; va = va_next) {
+		pml4e = pmap_pml4e(pmap, va);
+		if ((*pml4e & X86_PG_V) == 0) {
+			va_next = (va + NBPML4) & ~PML4MASK;
+			if (va_next < va)
+				va_next = eva;
+			continue;
+		}
+
+		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
+		if ((*pdpe & X86_PG_V) == 0) {
+			va_next = (va + NBPDP) & ~PDPMASK;
+			if (va_next < va)
+				va_next = eva;
+			continue;
+		}
+
+		va_next = (va + NBPDR) & ~PDRMASK;
+		if (va_next < va)
+			va_next = eva;
+
+		pde = pmap_pdpe_to_pde(pdpe, va);
+		ptpaddr = *pde;
+		if (ptpaddr == 0)
+			continue;
+
+		MPASS((ptpaddr & X86_PG_V) != 0);
+		if ((ptpaddr & PG_PS) != 0) {
+			if (va + NBPDR == va_next && eva >= va_next) {
+				newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
+				    X86_PG_PKU(keyidx);
+				if (newpde != ptpaddr) {
+					*pde = newpde;
+					changed = true;
+				}
+				continue;
+			} else if (!pmap_demote_pde(pmap, pde, va)) {
+				continue;
+			}
+		}
+
+		if (va_next > eva)
+			va_next = eva;
+
+		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
+		    ptep++, va += PAGE_SIZE) {
+			pte = *ptep;
+			if ((pte & X86_PG_V) == 0)
+				continue;
+			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
+			if (newpte != pte) {
+				*ptep = newpte;
+				changed = true;
+			}
+		}
+	}
+	if (changed)
+		pmap_invalidate_range(pmap, sva, eva);
+}
+
+static int
+pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+    u_int keyidx, int flags)
+{
+
+	if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
+	    (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
+		return (EINVAL);
+	if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
+		return (EFAULT);
+	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
+		return (ENOTSUP);
+	return (0);
+}
+
+int
+pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
+    int flags)
+{
+	int error;
+
+	sva = trunc_page(sva);
+	eva = round_page(eva);
+	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
+	if (error != 0)
+		return (error);
+	for (;;) {
+		PMAP_LOCK(pmap);
+		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
+		if (error == 0)
+			pmap_pkru_update_range(pmap, sva, eva, keyidx);
+		PMAP_UNLOCK(pmap);
+		if (error != ENOMEM)
+			break;
+		vm_wait(NULL);
+	}
+	return (error);
+}
+
+int
+pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+	int error;
+
+	sva = trunc_page(sva);
+	eva = round_page(eva);
+	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
+	if (error != 0)
+		return (error);
+	for (;;) {
+		PMAP_LOCK(pmap);
+		error = pmap_pkru_deassign(pmap, sva, eva);
+		if (error == 0)
+			pmap_pkru_update_range(pmap, sva, eva, 0);
+		PMAP_UNLOCK(pmap);
+		if (error != ENOMEM)
+			break;
+		vm_wait(NULL);
+	}
+	return (error);
+}
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
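The libc wrappers in pkru.c above reduce to the two sysarch(2) operations whose handlers follow in sys_machdep.c. A sketch of a direct invocation on amd64 (illustration only, not part of the change): AMD64_PKRU_EXCL makes the assignment fail with EBUSY if any part of the range already carries a key, and AMD64_PKRU_PERSIST keeps the assignment when the mapping itself is removed, see pred_pkru_on_remove() above.

/* Direct sysarch(2) use; illustration only. */
#include <sys/types.h>
#include <sys/mman.h>
#include <machine/sysarch.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	struct amd64_set_pkru sp;
	void *p;
	long psz;

	psz = sysconf(_SC_PAGESIZE);
	p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	sp.addr = p;
	sp.len = psz;
	sp.keyidx = 2;
	sp.flags = AMD64_PKRU_EXCL | AMD64_PKRU_PERSIST;
	if (sysarch(AMD64_SET_PKRU, &sp) != 0)
		err(1, "AMD64_SET_PKRU");

	sp.keyidx = 0;		/* the kernel requires keyidx == 0 and flags == 0 here */
	sp.flags = 0;
	if (sysarch(AMD64_CLEAR_PKRU, &sp) != 0)
		err(1, "AMD64_CLEAR_PKRU");
	return (0);
}
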
Index: sys/amd64/amd64/sys_machdep.c
===================================================================
--- sys/amd64/amd64/sys_machdep.c
+++ sys/amd64/amd64/sys_machdep.c
@@ -44,6 +44,7 @@
 #include
 #include
 #include
+#include <sys/pcpu.h>
 #include
 #include
 #include
@@ -53,6 +54,7 @@
 #include
 #include
 #include 		/* for kernel_map */
+#include <vm/vm_map.h>
 #include
 #include
 
@@ -170,13 +172,16 @@
 int
 sysarch(struct thread *td, struct sysarch_args *uap)
 {
-	int error = 0;
-	struct pcb *pcb = curthread->td_pcb;
+	struct pcb *pcb;
+	struct vm_map *map;
 	uint32_t i386base;
 	uint64_t a64base;
 	struct i386_ioperm_args iargs;
 	struct i386_get_xfpustate i386xfpu;
+	struct i386_set_pkru i386pkru;
 	struct amd64_get_xfpustate a64xfpu;
+	struct amd64_set_pkru a64pkru;
+	int error;
 
 #ifdef CAPABILITY_MODE
 	/*
@@ -194,11 +199,15 @@
 	case I386_GET_GSBASE:
 	case I386_SET_GSBASE:
 	case I386_GET_XFPUSTATE:
+	case I386_SET_PKRU:
+	case I386_CLEAR_PKRU:
 	case AMD64_GET_FSBASE:
 	case AMD64_SET_FSBASE:
 	case AMD64_GET_GSBASE:
 	case AMD64_SET_GSBASE:
 	case AMD64_GET_XFPUSTATE:
+	case AMD64_SET_PKRU:
+	case AMD64_CLEAR_PKRU:
 		break;
 
 	case I386_SET_IOPERM:
@@ -214,6 +223,10 @@
 
 	if (uap->op == I386_GET_LDT || uap->op == I386_SET_LDT)
 		return (sysarch_ldt(td, uap, UIO_USERSPACE));
+
+	error = 0;
+	pcb = td->td_pcb;
+
 	/*
 	 * XXXKIB check that the BSM generation code knows to encode
 	 * the op argument.
@@ -233,11 +246,27 @@
 		a64xfpu.addr = (void *)(uintptr_t)i386xfpu.addr;
 		a64xfpu.len = i386xfpu.len;
 		break;
+	case I386_SET_PKRU:
+	case I386_CLEAR_PKRU:
+		if ((error = copyin(uap->parms, &i386pkru,
+		    sizeof(struct i386_set_pkru))) != 0)
+			return (error);
+		a64pkru.addr = (void *)(uintptr_t)i386pkru.addr;
+		a64pkru.len = i386pkru.len;
+		a64pkru.keyidx = i386pkru.keyidx;
+		a64pkru.flags = i386pkru.flags;
+		break;
 	case AMD64_GET_XFPUSTATE:
 		if ((error = copyin(uap->parms, &a64xfpu,
 		    sizeof(struct amd64_get_xfpustate))) != 0)
			return (error);
 		break;
+	case AMD64_SET_PKRU:
+	case AMD64_CLEAR_PKRU:
+		if ((error = copyin(uap->parms, &a64pkru,
+		    sizeof(struct amd64_set_pkru))) != 0)
+			return (error);
+		break;
 	default:
 		break;
 	}
@@ -326,6 +355,30 @@
 		    a64xfpu.addr, a64xfpu.len);
 		break;
 
+	case I386_SET_PKRU:
+	case AMD64_SET_PKRU:
+		map = &td->td_proc->p_vmspace->vm_map;
+		vm_map_lock_read(map);
+		error = pmap_pkru_set(PCPU_GET(curpmap),
+		    (vm_offset_t)a64pkru.addr, (vm_offset_t)a64pkru.addr +
+		    a64pkru.len, a64pkru.keyidx, a64pkru.flags);
+		vm_map_unlock_read(map);
+		break;
+
+	case I386_CLEAR_PKRU:
+	case AMD64_CLEAR_PKRU:
+		if (a64pkru.flags != 0 || a64pkru.keyidx != 0) {
+			error = EINVAL;
+			break;
+		}
+		map = &td->td_proc->p_vmspace->vm_map;
+		vm_map_lock_read(map);
+		error = pmap_pkru_clear(PCPU_GET(curpmap),
+		    (vm_offset_t)a64pkru.addr,
+		    (vm_offset_t)a64pkru.addr + a64pkru.len);
+		vm_map_unlock_read(map);
+		break;
+
 	default:
 		error = EINVAL;
 		break;
Index: sys/amd64/amd64/trap.c
===================================================================
--- sys/amd64/amd64/trap.c
+++ sys/amd64/amd64/trap.c
@@ -807,6 +807,20 @@
 		return (-1);
 	}
 
+	/*
+	 * User-mode protection key violation (PKU).  May happen
+	 * either from usermode or from the kernel, if copyin accessed
+	 * a key-protected mapping.
+	 */
+	if ((frame->tf_err & PGEX_PK) != 0) {
+		if (eva > VM_MAXUSER_ADDRESS) {
+			trap_fatal(frame, eva);
+			return (-1);
+		}
+		rv = KERN_PROTECTION_FAILURE;
+		goto after_vmfault;
+	}
+
 	/*
 	 * If nx protection of the usermode portion of kernel page
 	 * tables caused trap, panic.
@@ -842,6 +856,7 @@
 #endif
 		return (0);
 	}
+after_vmfault:
 	if (!usermode) {
 		if (td->td_intr_nesting_level == 0 &&
 		    curpcb->pcb_onfault != NULL) {
@@ -885,10 +900,12 @@
 #endif
 	if (type == T_PAGEFLT) {
 		printf("fault virtual address	= 0x%lx\n", eva);
-		printf("fault code		= %s %s %s, %s\n",
+		printf("fault code		= %s %s %s%s%s, %s\n",
 		    code & PGEX_U ? "user" : "supervisor",
 		    code & PGEX_W ? "write" : "read",
 		    code & PGEX_I ? "instruction" : "data",
+		    code & PGEX_PK ? " prot key" : "",
+		    code & PGEX_SGX ? " SGX" : "",
 		    code & PGEX_RSV ? "reserved bits in PTE" :
 		    code & PGEX_P ? "protection violation" : "page not present");
 	}
Index: sys/amd64/include/cpufunc.h
===================================================================
--- sys/amd64/include/cpufunc.h
+++ sys/amd64/include/cpufunc.h
@@ -627,6 +627,22 @@
 	__asm __volatile("mwait" : : "a" (hints), "c" (extensions));
 }
 
+static __inline uint32_t
+rdpkru(void)
+{
+	uint32_t res;
+
+	__asm __volatile("rdpkru" : "=a" (res) : "c" (0) : "edx");
+	return (res);
+}
+
+static __inline void
+wrpkru(uint32_t mask)
+{
+
+	__asm __volatile("wrpkru" : : "a" (mask), "c" (0), "d" (0));
+}
+
 #ifdef _KERNEL
 /* This is defined in <machine/specialreg.h> but is too painful to get to */
 #ifndef	MSR_FSBASE
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h
+++ sys/amd64/include/pmap.h
@@ -66,6 +66,7 @@
 #define	X86_PG_AVAIL2	0x400	/* < programmers use */
 #define	X86_PG_AVAIL3	0x800	/*    \ */
 #define	X86_PG_PDE_PAT	0x1000	/* PAT PAT index */
+#define	X86_PG_PKU(idx)	((pt_entry_t)(idx) << 59)
 #define	X86_PG_NX	(1ul<<63) /* No-execute */
 #define	X86_PG_AVAIL(x)	(1ul << (x))
 
@@ -73,6 +74,10 @@
 #define	X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
 #define	X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
 
+/* Protection key indexes */
+#define	PMAP_MAX_PKRU_IDX	0xf
+#define	X86_PG_PKU_MASK		X86_PG_PKU(PMAP_MAX_PKRU_IDX)
+
 /*
  * Intel extended page table (EPT) bit definitions.
  */
@@ -120,7 +125,7 @@
  * (PTE) page mappings have identical settings for the following fields:
  */
 #define	PG_PTE_PROMOTE	(PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
-	    PG_M | PG_A | PG_U | PG_RW | PG_V)
+	    PG_M | PG_A | PG_U | PG_RW | PG_V | PG_PKU_MASK)
 
 /*
  * Page Protection Exception bits
@@ -131,6 +136,8 @@
 #define	PGEX_U		0x04	/* access from User mode (UPL) */
 #define	PGEX_RSV	0x08	/* reserved PTE field is non-zero */
 #define	PGEX_I		0x10	/* during an instruction fetch */
+#define	PGEX_PK		0x20	/* protection key violation */
+#define	PGEX_SGX	0x40	/* SGX-related */
 
 /*
  * undef the PG_xx macros that define bits in the regular x86 PTEs that
@@ -240,6 +247,8 @@
 #include
 #include
 #include
+#include <sys/_pctrie.h>
+#include <sys/_rangeset.h>
 #include
 
@@ -334,6 +343,7 @@
 	long			pm_eptgen;	/* EPT pmap generation id */
 	int			pm_flags;
 	struct pmap_pcids	pm_pcids[MAXCPU];
+	struct rangeset		pm_pkru;
 };
 
 /* flags */
@@ -452,6 +462,9 @@
 void	pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
 void	pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
 	    vm_offset_t eva);
+int	pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+int	pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+	    u_int keyidx, int flags);
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3909,6 +3909,7 @@
 kern/subr_power.c		standard
 kern/subr_prf.c			standard
 kern/subr_prof.c		standard
+kern/subr_rangeset.c		standard
 kern/subr_rman.c		standard
 kern/subr_rtc.c			standard
 kern/subr_sbuf.c		standard
Index: sys/i386/include/cpufunc.h
===================================================================
--- sys/i386/include/cpufunc.h
+++ sys/i386/include/cpufunc.h
@@ -700,6 +700,22 @@
 	write_eflags(eflags);
 }
 
+static __inline uint32_t
+rdpkru(void)
+{
+	uint32_t res;
+
+	__asm __volatile("rdpkru" : "=a" (res) : "c" (0) : "edx");
+	return (res);
+}
+
+static __inline void
+wrpkru(uint32_t mask)
+{
+
+	__asm __volatile("wrpkru" : : "a" (mask), "c" (0), "d" (0));
+}
+
 #else /* !(__GNUCLIKE_ASM && __CC_SUPPORTS___INLINE) */
 
 int	breakpoint(void);
Index: sys/i386/include/pmap.h
===================================================================
--- sys/i386/include/pmap.h
+++ sys/i386/include/pmap.h
@@ -372,6 +372,13 @@
 #define	pmap_page_is_write_mapped(m)	(((m)->aflags & PGA_WRITEABLE) != 0)
 #define	pmap_unmapbios(va, sz)	pmap_unmapdev((va), (sz))
 
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+	return (0);
+}
+
 /*
  * Only the following functions or macros may be used before pmap_bootstrap()
  * is called: pmap_kenter(), pmap_kextract(), pmap_kremove(), vtophys(), and
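Before the implementation, a usage sketch for the rangeset KPI that subr_rangeset.c and sys/sys/rangeset.h below introduce (the element type, callbacks, and malloc type here are hypothetical; the pkru code in pmap.c plays the same role with a UMA zone). The key contract is that struct rs_el must sit at the start of the client element:

/* Hypothetical rangeset client; illustration only. */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/rangeset.h>

static MALLOC_DEFINE(M_DEMO, "demo", "rangeset demo elements");

struct demo_range {
	struct rs_el	dr_el;		/* must be first */
	int		dr_tag;
};

static void *
demo_dup(void *ctx __unused, void *data)
{
	struct demo_range *d;

	d = malloc(sizeof(*d), M_DEMO, M_NOWAIT);
	if (d != NULL)
		memcpy(d, data, sizeof(*d));
	return (d);
}

static void
demo_free(void *ctx __unused, void *data)
{

	free(data, M_DEMO);
}

static void
demo(void)
{
	struct rangeset rs;
	struct demo_range *d;

	rangeset_init(&rs, demo_dup, demo_free, NULL, M_NOWAIT);
	d = malloc(sizeof(*d), M_DEMO, M_WAITOK | M_ZERO);
	d->dr_tag = 42;
	/* Keys are arbitrary uint64_t values; [start, end) is half-open. */
	if (rangeset_insert(&rs, 0x1000, 0x5000, d) == 0) {
		d = rangeset_get(&rs, 0x2fff);	/* back to the element */
		KASSERT(d != NULL && d->dr_tag == 42, ("lookup failed"));
	}
	rangeset_fini(&rs);	/* frees remaining elements via demo_free() */
}
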
Index: sys/kern/subr_rangeset.c
===================================================================
--- /dev/null
+++ sys/kern/subr_rangeset.c
@@ -0,0 +1,364 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/pctrie.h>
+#include <sys/rangeset.h>
+#include <vm/uma.h>
+
+#ifdef DIAGNOSTIC
+static void rangeset_check(struct rangeset *rs);
+#else
+#define	rangeset_check(rs)
+#endif
+
+static uma_zone_t rs_node_zone;
+
+static void
+rs_rangeset_init(void *arg __unused)
+{
+
+	rs_node_zone = uma_zcreate("rangeset pctrie nodes",
+	    pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL,
+	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+}
+SYSINIT(rs, SI_SUB_LOCK, SI_ORDER_ANY, rs_rangeset_init, NULL);
+
+static void *
+rs_node_alloc(struct pctrie *ptree)
+{
+	struct rangeset *rs;
+
+	rs = __containerof(ptree, struct rangeset, rs_trie);
+	return (uma_zalloc(rs_node_zone, rs->rs_alloc_flags));
+}
+
+static void
+rs_node_free(struct pctrie *ptree __unused, void *node)
+{
+
+	uma_zfree(rs_node_zone, node);
+}
+
+void
+rangeset_init(struct rangeset *rs, rs_dup_data_t dup_data,
+    rs_free_data_t free_data, void *data_ctx, u_int alloc_flags)
+{
+
+	pctrie_init(&rs->rs_trie);
+	rs->rs_dup_data = dup_data;
+	rs->rs_free_data = free_data;
+	rs->rs_data_ctx = data_ctx;
+	rs->rs_alloc_flags = alloc_flags;
+}
+
+void
+rangeset_fini(struct rangeset *rs)
+{
+
+	rangeset_check(rs);
+	rangeset_remove_all(rs);
+}
+
+bool
+rangeset_check_empty(struct rangeset *rs, uint64_t start, uint64_t end)
+{
+	struct rs_el *r;
+	uint64_t *r1;
+
+	rangeset_check(rs);
+	r1 = pctrie_lookup_le(&rs->rs_trie, end);
+	if (r1 != NULL) {
+		r = __containerof(r1, struct rs_el, re_start);
+		if (r->re_end > start)
+			return (false);
+	}
+	return (true);
+}
+
+int
+rangeset_insert(struct rangeset *rs, uint64_t start, uint64_t end,
+    void *data)
+{
+	struct rs_el *r;
+	int error;
+
+	rangeset_check(rs);
+	error = rangeset_remove(rs, start, end);
+	if (error != 0)
+		return (error);
+	r = data;
+	r->re_start = start;
+	r->re_end = end;
+	error = pctrie_insert(&rs->rs_trie, &r->re_start, rs_node_alloc);
+	rangeset_check(rs);
+	return (error);
+}
+
+int
+rangeset_remove_pred(struct rangeset *rs, uint64_t start, uint64_t end,
+    rs_pred_t pred)
+{
+	struct rs_el *r, *rn;
+	uint64_t *r1;
+	int error;
+
+	rangeset_check(rs);
+	error = 0;
+	for (; end > 0 && start < end;) {
+		r1 = pctrie_lookup_le(&rs->rs_trie, end - 1);
+		if (r1 == NULL)
+			break;
+		r = __containerof(r1, struct rs_el, re_start);
+
+		/*
+		 * ------============================--|-------|----
+		 *       rs                         re  s       e
+		 */
+		if (r->re_end <= start)
+			break;
+
+		if (r->re_end <= end) {
+			if (r->re_start < start) {
+				/*
+				 * ------========|==============-------|----
+				 *       rs      s             re      e
+				 */
+				if (pred(rs->rs_data_ctx, r))
+					r->re_end = start;
+				break;
+			}
+
+			/*
+			 * ------|--------===================----------|----
+			 *       s        rs                re         e
+			 */
+			end = r->re_start;
+			if (pred(rs->rs_data_ctx, r)) {
+				pctrie_remove(&rs->rs_trie, r->re_start,
+				    rs_node_free);
+				rs->rs_free_data(rs->rs_data_ctx, r);
+			}
+			continue;
+		}
+
+		/*
+		 * ------|--------====================|==========----
+		 *       s        rs                  e         re
+		 */
+		if (r->re_start >= start) {
+			if (pred(rs->rs_data_ctx, r)) {
+				pctrie_remove(&rs->rs_trie, r->re_start,
+				    rs_node_free);
+				r->re_start = end;
+				error = pctrie_insert(&rs->rs_trie,
+				    &r->re_start, rs_node_alloc);
+				/*
+				 * The insert above must succeed
+				 * because rs_node zone is marked
+				 * nofree and we freed one element
+				 * just before.
+				 */
+				MPASS(error == 0);
+			} else {
+				end = r->re_start;
+			}
+			continue;
+		}
+
+		/*
+		 * ------=========|===================|==========----
+		 *       rs       s                   e         re
+		 */
+		if (pred(rs->rs_data_ctx, r)) {
+			/*
+			 * Split.  Can only happen once, and then if
+			 * any allocation fails, the rangeset is kept
+			 * intact.
+			 */
+			rn = rs->rs_dup_data(rs->rs_data_ctx, r);
+			if (rn == NULL) {
+				error = ENOMEM;
+				break;
+			}
+			rn->re_start = end;
+			rn->re_end = r->re_end;
+			error = pctrie_insert(&rs->rs_trie, &rn->re_start,
+			    rs_node_alloc);
+			if (error != 0) {
+				rs->rs_free_data(rs->rs_data_ctx, rn);
+				break;
+			}
+			r->re_end = start;
+		}
+		break;
+	}
+	rangeset_check(rs);
+	return (error);
+}
+
+static bool
+rangeset_true_pred(void *ctx __unused, void *r __unused)
+{
+
+	return (true);
+}
+
+int
+rangeset_remove(struct rangeset *rs, uint64_t start, uint64_t end)
+{
+
+	return (rangeset_remove_pred(rs, start, end, rangeset_true_pred));
+}
+
+void
+rangeset_remove_all(struct rangeset *rs)
+{
+	struct rs_el *r;
+	uint64_t *r1;
+
+	for (;;) {
+		r1 = pctrie_lookup_ge(&rs->rs_trie, 0);
+		if (r1 == NULL)
+			break;
+		r = __containerof(r1, struct rs_el, re_start);
+		pctrie_remove(&rs->rs_trie, r->re_start, rs_node_free);
+		rs->rs_free_data(rs->rs_data_ctx, r);
+	}
+}
+
+void *
+rangeset_get(struct rangeset *rs, uint64_t place)
+{
+	struct rs_el *r;
+	uint64_t *r1;
+
+	rangeset_check(rs);
+	r1 = pctrie_lookup_le(&rs->rs_trie, place);
+	if (r1 == NULL)
+		return (NULL);
+	r = __containerof(r1, struct rs_el, re_start);
+	if (r->re_end <= place)
+		return (NULL);
+	return (r);
+}
+
+int
+rangeset_copy(struct rangeset *dst_rs, struct rangeset *src_rs)
+{
+	struct rs_el *src_r, *dst_r;
+	uint64_t cursor, *r1;
+	int error;
+
+	MPASS(pctrie_is_empty(&dst_rs->rs_trie));
+	rangeset_check(src_rs);
+	MPASS(dst_rs->rs_dup_data == src_rs->rs_dup_data);
+
+	error = 0;
+	for (cursor = 0;; cursor = src_r->re_start + 1) {
+		r1 = pctrie_lookup_ge(&src_rs->rs_trie, cursor);
+		if (r1 == NULL)
+			break;
+		src_r = __containerof(r1, struct rs_el, re_start);
+		dst_r = dst_rs->rs_dup_data(dst_rs->rs_data_ctx, src_r);
+		if (dst_r == NULL) {
+			error = ENOMEM;
+			break;
+		}
+		error = pctrie_insert(&dst_rs->rs_trie, &dst_r->re_start,
+		    rs_node_alloc);
+		if (error != 0)
+			break;
+	}
+	if (error != 0)
+		rangeset_remove_all(dst_rs);
+	return (error);
+}
+
+#ifdef DIAGNOSTIC
+static void
+rangeset_check(struct rangeset *rs)
+{
+	struct rs_el *r, *rp;
+	uint64_t cursor, *r1;
+
+	for (cursor = 0, rp = NULL;; cursor = r->re_start + 1, rp = r) {
+		r1 = pctrie_lookup_ge(&rs->rs_trie, cursor);
+		if (r1 == NULL)
+			break;
+		r = __containerof(r1, struct rs_el, re_start);
+		KASSERT(r->re_start < r->re_end,
+		    ("invalid interval rs %p elem %p (%#jx, %#jx)",
+		    rs, r, (uintmax_t)r->re_start, (uintmax_t)r->re_end));
+		if (rp != NULL) {
+			KASSERT(rp->re_end <= r->re_start,
+			    ("non-ascending neighbors rs %p "
+			    "prev elem %p (%#jx, %#jx) elem %p (%#jx, %#jx)",
+			    rs, rp, (uintmax_t)rp->re_start,
+			    (uintmax_t)rp->re_end, r, (uintmax_t)r->re_start,
+			    (uintmax_t)r->re_end));
+		}
+	}
+}
+#endif
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <sys/kdb.h>
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(rangeset, rangeset_show_fn)
+{
+	struct rangeset *rs;
+	struct rs_el *r;
+	uint64_t cursor, *r1;
+
+	if (!have_addr) {
+		db_printf("show rangeset addr\n");
+		return;
+	}
+
+	rs = (struct rangeset *)addr;
+	db_printf("rangeset %p\n", rs);
+	for (cursor = 0;; cursor = r->re_start + 1) {
+		r1 = pctrie_lookup_ge(&rs->rs_trie, cursor);
+		if (r1 == NULL)
+			break;
+		r = __containerof(r1, struct rs_el, re_start);
+		db_printf("  el %p start %#jx end %#jx\n",
+		    r, (uintmax_t)r->re_start, (uintmax_t)r->re_end);
+	}
+}
+#endif
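One subtlety of rangeset_remove_pred() above that matters to the pmap code: punching a hole in the middle of an element duplicates it through rs_dup_data(), so removal can fail with ENOMEM while the set is left unchanged. That is why pmap_pkru_set() and pmap_pkru_clear() retry around vm_wait(). Continuing the hypothetical demo_range sketch from earlier (illustration only):

static void
demo_split(struct rangeset *rs, struct demo_range *d)
{
	int error;

	if (rangeset_insert(rs, 0x1000, 0x5000, d) != 0)
		return;
	/*
	 * Remove the middle: on success [0x1000, 0x2000) keeps the old
	 * element and [0x3000, 0x5000) gets a copy made by rs_dup_data();
	 * on ENOMEM the original [0x1000, 0x5000) element is untouched.
	 */
	error = rangeset_remove(rs, 0x2000, 0x3000);
	if (error == ENOMEM)
		printf("split failed, rangeset left unchanged\n");
}
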
Index: sys/sys/_rangeset.h
===================================================================
--- /dev/null
+++ sys/sys/_rangeset.h
@@ -0,0 +1,51 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS__RANGESET_H
+#define	_SYS__RANGESET_H
+
+#include <sys/_pctrie.h>
+
+typedef void *(*rs_dup_data_t)(void *ctx, void *data);
+typedef void (*rs_free_data_t)(void *ctx, void *data);
+
+struct rangeset {
+	struct pctrie	rs_trie;
+	rs_dup_data_t	rs_dup_data;
+	rs_free_data_t	rs_free_data;
+	void		*rs_data_ctx;
+	u_int		rs_alloc_flags;
+};
+
+#endif
Index: sys/sys/rangeset.h
===================================================================
--- /dev/null
+++ sys/sys/rangeset.h
@@ -0,0 +1,88 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_RANGESET_H
+#define	_SYS_RANGESET_H
+
+#ifdef _KERNEL
+
+#include <sys/_rangeset.h>
+
+typedef bool (*rs_pred_t)(void *ctx, void *r);
+
+/*
+ * This structure must be embedded at the start of the rangeset element.
+ */
+struct rs_el {
+	uint64_t	re_start;	/* pctrie key */
+	uint64_t	re_end;
+};
+
+void	rangeset_init(struct rangeset *rs, rs_dup_data_t dup_data,
+	    rs_free_data_t free_data, void *rs_data_ctx, u_int alloc_flags);
+void	rangeset_fini(struct rangeset *rs);
+
+bool	rangeset_check_empty(struct rangeset *rs, uint64_t start,
+	    uint64_t end);
+
+/*
+ * r points to the application data, with struct rs_el at the beginning.
+ */
+int	rangeset_insert(struct rangeset *rs, uint64_t start, uint64_t end,
+	    void *r);
+
+/*
+ * Guarantees that on error the rangeset is not modified.  Removal
+ * might need to split an element if its start/end completely covers
+ * the removed range, in which case ENOMEM might be returned.
+ */
+void	rangeset_remove_all(struct rangeset *rs);
+int	rangeset_remove(struct rangeset *rs, uint64_t start, uint64_t end);
+int	rangeset_remove_pred(struct rangeset *rs, uint64_t start,
+	    uint64_t end, rs_pred_t pred);
+
+/*
+ * Returns the pointer to the whole data item, with struct rs_el
+ * embedded at the beginning.
+ */
+void	*rangeset_get(struct rangeset *rs, uint64_t place);
+
+/*
+ * Copies src_rs entries into dst_rs.  dst_rs must be empty.
+ * Leaves dst_rs empty on failure.
+ */
+int	rangeset_copy(struct rangeset *dst_rs, struct rangeset *src_rs);
+
+#endif
+
+#endif
Index: sys/vm/pmap.h
===================================================================
--- sys/vm/pmap.h
+++ sys/vm/pmap.h
@@ -168,6 +168,7 @@
 void		 pmap_unwire(pmap_t pmap, vm_offset_t start, vm_offset_t end);
 void		 pmap_zero_page(vm_page_t);
 void		 pmap_zero_page_area(vm_page_t, int off, int size);
+int		 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
 
 #define	pmap_resident_count(pm)	((pm)->pm_stats.resident_count)
 #define	pmap_wired_count(pm)	((pm)->pm_stats.wired_count)
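For reference while reading the vm_fault.c change below: a 4K PTE (and a PG_PS PDE) carries the 4-bit key index in bits 62:59, which is what X86_PG_PKU() and X86_PG_PKU_MASK in amd64/include/pmap.h encode, and pmap_enter_pde() now returns KERN_FAILURE when that field would not be uniform across a 2M range, forcing the fallback to base pages. The field arithmetic in isolation (illustration only, with a hypothetical PTE value):

/* PTE protection-key field arithmetic; illustration only. */
#include <stdint.h>
#include <stdio.h>

#define	PG_PKU(idx)	((uint64_t)(idx) << 59)	/* PTE bits 62:59 */
#define	PG_PKU_MASK	PG_PKU(0xf)

int
main(void)
{
	/* Hypothetical PTE: NX set, key field initially zero. */
	uint64_t pte = UINT64_C(0x8000000123400867);

	/* What pmap_pkru_update_range() does to every valid PTE: */
	pte = (pte & ~PG_PKU_MASK) | PG_PKU(6);
	printf("key index = %ju\n", (uintmax_t)((pte & PG_PKU_MASK) >> 59));
	return (0);
}
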
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -479,8 +479,20 @@
 			    fault_flags, true);
 	}
 	VM_OBJECT_WUNLOCK(fs->first_object);
-	pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | (wired ?
-	    PMAP_ENTER_WIRED : 0), psind);
+	rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
+	    (wired ? PMAP_ENTER_WIRED : 0), psind);
+#if defined(__amd64__)
+	if (psind > 0 && rv == KERN_FAILURE) {
+		for (i = 0; i < npages; i++) {
+			rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
+			    &m[i], prot, fault_type |
+			    (wired ? PMAP_ENTER_WIRED : 0), 0);
+			MPASS(rv == KERN_SUCCESS);
+		}
+	}
+#else
+	MPASS(rv == KERN_SUCCESS);
+#endif
 	VM_OBJECT_WLOCK(fs->first_object);
 	m_mtx = NULL;
 	for (i = 0; i < npages; i++) {
Index: sys/vm/vm_map.c
===================================================================
--- sys/vm/vm_map.c
+++ sys/vm/vm_map.c
@@ -3424,7 +3424,7 @@
 	vm_map_t new_map, old_map;
 	vm_map_entry_t new_entry, old_entry;
 	vm_object_t object;
-	int locked;
+	int error, locked;
 	vm_inherit_t inh;
 
 	old_map = &vm1->vm_map;
@@ -3433,6 +3433,7 @@
 	    pmap_pinit);
 	if (vm2 == NULL)
 		return (NULL);
+
 	vm2->vm_taddr = vm1->vm_taddr;
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
@@ -3443,6 +3444,14 @@
 	locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
 	KASSERT(locked, ("vmspace_fork: lock failed"));
 
+	error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
+	if (error != 0) {
+		sx_xunlock(&old_map->lock);
+		sx_xunlock(&new_map->lock);
+		vmspace_free(vm2);
+		return (NULL);
+	}
+
 	old_entry = old_map->header.next;
 	while (old_entry != &old_map->header) {
Index: sys/x86/include/specialreg.h
===================================================================
--- sys/x86/include/specialreg.h
+++ sys/x86/include/specialreg.h
@@ -77,6 +77,7 @@
 #define	CR4_XSAVE	0x00040000	/* XSETBV/XGETBV */
 #define	CR4_SMEP	0x00100000	/* Supervisor-Mode Execution Prevention */
 #define	CR4_SMAP	0x00200000	/* Supervisor-Mode Access Prevention */
+#define	CR4_PKE		0x00400000	/* Protection Keys Enable */
 
 /*
  * Bits in AMD64 special registers.  EFER is 64 bits wide.
Index: sys/x86/include/sysarch.h
===================================================================
--- sys/x86/include/sysarch.h
+++ sys/x86/include/sysarch.h
@@ -52,6 +52,8 @@
 #define	I386_GET_GSBASE		9
 #define	I386_SET_GSBASE		10
 #define	I386_GET_XFPUSTATE	11
+#define	I386_SET_PKRU		12
+#define	I386_CLEAR_PKRU		13
 
 /* Leave space for 0-127 for to avoid translating syscalls */
 #define	AMD64_GET_FSBASE	128
@@ -59,6 +61,12 @@
 #define	AMD64_GET_GSBASE	130
 #define	AMD64_SET_GSBASE	131
 #define	AMD64_GET_XFPUSTATE	132
+#define	AMD64_SET_PKRU		133
+#define	AMD64_CLEAR_PKRU	134
+
+/* Flags for AMD64_SET_PKRU */
+#define	AMD64_PKRU_EXCL		0x0001
+#define	AMD64_PKRU_PERSIST	0x0002
 
 struct i386_ioperm_args {
 	unsigned int start;
@@ -94,12 +102,26 @@
 	int	len;
 };
 
+struct i386_set_pkru {
+	unsigned int	addr;
+	unsigned int	len;
+	unsigned int	keyidx;
+	int		flags;
+};
+
 struct amd64_get_xfpustate {
 	void	*addr;
 	int	len;
 };
 #endif
 
+struct amd64_set_pkru {
+	void		*addr;
+	unsigned long	len;
+	unsigned int	keyidx;
+	int		flags;
+};
+
 #ifndef _KERNEL
 union descriptor;
 struct dbreg;
@@ -120,6 +142,11 @@
 int	amd64_get_gsbase(void **);
 int	amd64_set_fsbase(void *);
 int	amd64_set_gsbase(void *);
+int	x86_pkru_get_perm(unsigned int keyidx, int *access, int *modify);
+int	x86_pkru_set_perm(unsigned int keyidx, int access, int modify);
+int	x86_pkru_protect_range(void *addr, unsigned long len,
+	    unsigned int keyidx, int flags);
+int	x86_pkru_unprotect_range(void *addr, unsigned long len);
 int	sysarch(int, void *);
 __END_DECLS
 #else