D18893.id53208.diff

Index: lib/libc/amd64/Symbol.map
===================================================================
--- lib/libc/amd64/Symbol.map
+++ lib/libc/amd64/Symbol.map
@@ -44,6 +44,13 @@
vfork;
};
+FBSD_1.6 {
+ x86_pkru_get_perm;
+ x86_pkru_set_perm;
+ x86_pkru_protect_range;
+ x86_pkru_unprotect_range;
+};
+
/*
*
* FreeBSD private ABI
Index: lib/libc/i386/Symbol.map
===================================================================
--- lib/libc/i386/Symbol.map
+++ lib/libc/i386/Symbol.map
@@ -46,6 +46,13 @@
___tls_get_addr;
};
+FBSD_1.6 {
+ x86_pkru_get_perm;
+ x86_pkru_set_perm;
+ x86_pkru_protect_range;
+ x86_pkru_unprotect_range;
+};
+
FBSDprivate_1.0 {
/* PSEUDO syscalls */
_getlogin;
Index: lib/libc/x86/sys/Makefile.inc
===================================================================
--- lib/libc/x86/sys/Makefile.inc
+++ lib/libc/x86/sys/Makefile.inc
@@ -3,7 +3,8 @@
.PATH: ${LIBC_SRCTOP}/x86/sys
SRCS+= \
- __vdso_gettc.c
+ __vdso_gettc.c \
+ pkru.c
.if ${MACHINE_CPUARCH} == "amd64" && ${MK_HYPERV} != "no"
CFLAGS+= -DWANT_HYPERV
Index: lib/libc/x86/sys/pkru.c
===================================================================
--- /dev/null
+++ lib/libc/x86/sys/pkru.c
@@ -0,0 +1,138 @@
+/*-
+ * Copyright (c) 2019 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <machine/cpufunc.h>
+#include <machine/specialreg.h>
+#include <machine/sysarch.h>
+#include <x86/ifunc.h>
+#include <errno.h>
+#include <string.h>
+
+#define MAX_PKRU_IDX 0xf
+#ifdef __i386__
+#define X86_SET_PKRU I386_SET_PKRU
+#define X86_CLEAR_PKRU I386_CLEAR_PKRU
+#else
+#define X86_SET_PKRU AMD64_SET_PKRU
+#define X86_CLEAR_PKRU AMD64_CLEAR_PKRU
+#endif
+
+static int
+x86_pkru_get_perm_unsup(u_int keyidx, int *access, int *modify)
+{
+
+ errno = EOPNOTSUPP;
+ return (-1);
+}
+
+static int
+x86_pkru_get_perm_hw(u_int keyidx, int *access, int *modify)
+{
+ uint32_t pkru;
+
+ if (keyidx > MAX_PKRU_IDX) {
+ errno = EINVAL;
+ return (-1);
+ }
+ keyidx *= 2;
+ pkru = rdpkru();
+ *access = (pkru & (1 << keyidx)) == 0;
+ *modify = (pkru & (2 << keyidx)) == 0;
+ return (0);
+}
+
+DEFINE_UIFUNC(, int, x86_pkru_get_perm, (u_int, int *, int *), static)
+{
+
+ return ((cpu_stdext_feature2 & CPUID_STDEXT2_OSPKE) == 0 ?
+ x86_pkru_get_perm_unsup : x86_pkru_get_perm_hw);
+}
+
+static int
+x86_pkru_set_perm_unsup(u_int keyidx, int access, int modify)
+{
+
+ errno = EOPNOTSUPP;
+ return (-1);
+}
+
+static int
+x86_pkru_set_perm_hw(u_int keyidx, int access, int modify)
+{
+ uint32_t pkru;
+
+ if (keyidx > MAX_PKRU_IDX) {
+ errno = EINVAL;
+ return (-1);
+ }
+ keyidx *= 2;
+ pkru = rdpkru();
+ pkru &= ~(3 << keyidx);
+ if (!access)
+ pkru |= 1 << keyidx;
+ if (!modify)
+ pkru |= 2 << keyidx;
+ wrpkru(pkru);
+ return (0);
+}
+
+DEFINE_UIFUNC(, int, x86_pkru_set_perm, (u_int, int, int), static)
+{
+
+ return ((cpu_stdext_feature2 & CPUID_STDEXT2_OSPKE) == 0 ?
+ x86_pkru_set_perm_unsup : x86_pkru_set_perm_hw);
+}
+
+int
+x86_pkru_protect_range(void *addr, unsigned long len, u_int keyidx, int flags)
+{
+ struct amd64_set_pkru a64pkru;
+
+ memset(&a64pkru, 0, sizeof(a64pkru));
+ a64pkru.addr = addr;
+ a64pkru.len = len;
+ a64pkru.keyidx = keyidx;
+ a64pkru.flags = flags;
+ return (sysarch(X86_SET_PKRU, &a64pkru));
+}
+
+int
+x86_pkru_unprotect_range(void *addr, unsigned long len)
+{
+ struct amd64_set_pkru a64pkru;
+
+ memset(&a64pkru, 0, sizeof(a64pkru));
+ a64pkru.addr = addr;
+ a64pkru.len = len;
+ return (sysarch(X86_CLEAR_PKRU, &a64pkru));
+}
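
For review context, a minimal userland sketch of how these wrappers compose (not part of the diff; the key index 1 and the anonymous mapping are illustrative, and the wrappers set errno and return -1 when OSPKE is unavailable):

#include <sys/types.h>
#include <sys/mman.h>
#include <machine/sysarch.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	char *buf;
	size_t pgsz;

	pgsz = getpagesize();
	buf = mmap(NULL, pgsz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (buf == MAP_FAILED)
		err(1, "mmap");
	/* Ask the kernel to tag the page's PTEs with key 1. */
	if (x86_pkru_protect_range(buf, pgsz, 1, 0) != 0)
		err(1, "x86_pkru_protect_range");
	/* Revoke both access and write permission for key 1. */
	if (x86_pkru_set_perm(1, 0, 0) != 0)
		err(1, "x86_pkru_set_perm");
	/* A load or store through buf here would fault with PGEX_PK. */
	if (x86_pkru_set_perm(1, 1, 1) != 0)
		err(1, "x86_pkru_set_perm");
	buf[0] = 'x';		/* accessible again */
	if (x86_pkru_unprotect_range(buf, pgsz) != 0)
		err(1, "x86_pkru_unprotect_range");
	return (0);
}
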
Index: sys/amd64/amd64/initcpu.c
===================================================================
--- sys/amd64/amd64/initcpu.c
+++ sys/amd64/amd64/initcpu.c
@@ -233,6 +233,9 @@
if (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE)
cr4 |= CR4_FSGSBASE;
+ if (cpu_stdext_feature2 & CPUID_STDEXT2_PKU)
+ cr4 |= CR4_PKE;
+
/*
* Postpone enabling the SMEP on the boot CPU until the page
* tables are switched from the boot loader identity mapping
Index: sys/amd64/amd64/pmap.c
===================================================================
--- sys/amd64/amd64/pmap.c
+++ sys/amd64/amd64/pmap.c
@@ -48,7 +48,7 @@
*/
/*-
* Copyright (c) 2003 Networks Associates Technology, Inc.
- * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * Copyright (c) 2014-2019 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Jake Burkholder,
@@ -121,6 +121,7 @@
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/rangeset.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/turnstile.h>
@@ -155,6 +156,7 @@
#ifdef SMP
#include <machine/smp.h>
#endif
+#include <machine/sysarch.h>
#include <machine/tss.h>
static __inline boolean_t
@@ -285,6 +287,13 @@
return (mask);
}
+static __inline pt_entry_t
+pmap_pku_mask_bit(pmap_t pmap)
+{
+
+ return (pmap->pm_type == PT_X86 ? X86_PG_PKU_MASK : 0);
+}
+
#if !defined(DIAGNOSTIC)
#ifdef __GNUC_GNU_INLINE__
#define PMAP_INLINE __attribute__((__gnu_inline__)) inline
@@ -428,6 +437,22 @@
static vm_pindex_t pti_pg_idx;
static bool pti_finalized;
+struct pmap_pkru_range {
+ struct rs_el pkru_rs_el;
+ u_int pkru_keyidx;
+ int pkru_flags;
+};
+
+static uma_zone_t pmap_pkru_ranges_zone;
+static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
+static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static void *pkru_dup_range(void *ctx, void *data);
+static void pkru_free_range(void *ctx, void *node);
+static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
+static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+static void pmap_pkru_deassign_all(pmap_t pmap);
+
static int
pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
{
@@ -2866,6 +2891,12 @@
pmap->pm_pcids[i].pm_gen = 1;
}
pmap_activate_boot(pmap);
+
+ if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+ pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
+ sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ }
}
void
@@ -2954,6 +2985,10 @@
pmap_pinit_pml4_pti(pml4pgu);
pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
}
+ if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+ rangeset_init(&pmap->pm_pkru, pkru_dup_range,
+ pkru_free_range, pmap, M_NOWAIT);
+ }
}
pmap->pm_root.rt_root = 0;
@@ -3250,6 +3285,9 @@
vm_page_unwire_noq(m);
vm_page_free(m);
}
+ if (pmap->pm_type == PT_X86 &&
+ (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
+ rangeset_fini(&pmap->pm_pkru);
}
static int
@@ -4080,7 +4118,7 @@
{
pd_entry_t newpde, oldpde;
pt_entry_t *firstpte, newpte;
- pt_entry_t PG_A, PG_G, PG_M, PG_RW, PG_V;
+ pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
vm_paddr_t mptepa;
vm_page_t mpte;
struct spglist free;
@@ -4093,6 +4131,7 @@
PG_RW = pmap_rw_bit(pmap);
PG_V = pmap_valid_bit(pmap);
PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
+ PG_PKU_MASK = pmap_pku_mask_bit(pmap);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
oldpde = *pde;
@@ -4525,6 +4564,7 @@
out:
if (anyvalid)
pmap_invalidate_all(pmap);
+ pmap_pkru_on_remove(pmap, sva, eva);
PMAP_UNLOCK(pmap);
pmap_delayed_invl_finished();
vm_page_free_pages_toq(&free, true);
@@ -4836,7 +4876,7 @@
{
pd_entry_t newpde;
pt_entry_t *firstpte, oldpte, pa, *pte;
- pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V;
+ pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
vm_page_t mpte;
int PG_PTE_CACHE;
@@ -4845,6 +4885,7 @@
PG_M = pmap_modified_bit(pmap);
PG_V = pmap_valid_bit(pmap);
PG_RW = pmap_rw_bit(pmap);
+ PG_PKU_MASK = pmap_pku_mask_bit(pmap);
PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
PMAP_LOCK_ASSERT(pmap, MA_OWNED);
@@ -5032,6 +5073,8 @@
lock = NULL;
PMAP_LOCK(pmap);
+ if (va < VM_MAXUSER_ADDRESS)
+ newpte |= pmap_pkru_get(pmap, va);
if (psind == 1) {
/* Assert the required virtual and physical alignment. */
KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
@@ -5066,6 +5109,10 @@
rv = KERN_RESOURCE_SHORTAGE;
goto out;
}
+ if (pmap->pm_type == PT_X86) {
+ newpte &= ~X86_PG_PKU_MASK;
+ newpte |= pmap_pkru_get(pmap, va);
+ }
goto retry;
} else
panic("pmap_enter: invalid page directory va=%#lx", va);
@@ -5291,6 +5338,21 @@
" in pmap %p", va, pmap);
return (KERN_RESOURCE_SHORTAGE);
}
+
+ /*
+ * If pkru is not the same for the whole pde range, return failure
+ * and let vm_fault() cope. Check after pde allocation, since
+ * it could sleep.
+ */
+ if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
+ SLIST_INIT(&free);
+ if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
+ pmap_invalidate_page(pmap, va);
+ vm_page_free_pages_toq(&free, true);
+ }
+ return (KERN_FAILURE);
+ }
+
pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
pde = &pde[pmap_pde_index(va)];
oldpde = *pde;
@@ -5550,7 +5612,7 @@
if ((prot & VM_PROT_EXECUTE) == 0)
newpte |= pg_nx;
if (va < VM_MAXUSER_ADDRESS)
- newpte |= PG_U;
+ newpte |= PG_U | pmap_pkru_get(pmap, va);
pte_store(pte, newpte);
return (mpte);
}
@@ -5926,6 +5988,33 @@
PMAP_UNLOCK(dst_pmap);
}
+int
+pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
+{
+ int error;
+
+ if (dst_pmap->pm_type != src_pmap->pm_type ||
+ dst_pmap->pm_type != PT_X86 ||
+ (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
+ return (0);
+ for (;;) {
+ if (dst_pmap < src_pmap) {
+ PMAP_LOCK(dst_pmap);
+ PMAP_LOCK(src_pmap);
+ } else {
+ PMAP_LOCK(src_pmap);
+ PMAP_LOCK(dst_pmap);
+ }
+ error = pmap_pkru_copy(dst_pmap, src_pmap);
+ PMAP_UNLOCK(src_pmap);
+ PMAP_UNLOCK(dst_pmap);
+ if (error != ENOMEM)
+ break;
+ vm_wait(NULL);
+ }
+ return (error);
+}
+
/*
* Zero the specified hardware page.
*/
@@ -6325,6 +6414,7 @@
if (lock != NULL)
rw_wunlock(lock);
pmap_invalidate_all(pmap);
+ pmap_pkru_deassign_all(pmap);
PMAP_UNLOCK(pmap);
vm_page_free_pages_toq(&free, true);
}
@@ -8961,6 +9051,285 @@
VM_OBJECT_WUNLOCK(pti_obj);
}
+static void *
+pkru_dup_range(void *ctx __unused, void *data)
+{
+ struct pmap_pkru_range *node, *new_node;
+
+ new_node = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
+ if (new_node == NULL)
+ return (NULL);
+ node = data;
+ memcpy(new_node, node, sizeof(*node));
+ return (new_node);
+}
+
+static void
+pkru_free_range(void *ctx __unused, void *node)
+{
+
+ uma_zfree(pmap_pkru_ranges_zone, node);
+}
+
+static int
+pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
+ int flags)
+{
+ struct pmap_pkru_range *ppr;
+ int error;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ MPASS(pmap->pm_type == PT_X86);
+ MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+ if ((flags & AMD64_PKRU_EXCL) != 0 &&
+ !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
+ return (EBUSY);
+ ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
+ if (ppr == NULL)
+ return (ENOMEM);
+ ppr->pkru_keyidx = keyidx;
+ ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
+ error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
+ if (error != 0)
+ uma_zfree(pmap_pkru_ranges_zone, ppr);
+ return (error);
+}
+
+static int
+pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ MPASS(pmap->pm_type == PT_X86);
+ MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+ return (rangeset_remove(&pmap->pm_pkru, sva, eva));
+}
+
+static void
+pmap_pkru_deassign_all(pmap_t pmap)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ if (pmap->pm_type == PT_X86 &&
+ (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
+ rangeset_remove_all(&pmap->pm_pkru);
+}
+
+static bool
+pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ struct pmap_pkru_range *ppr, *prev_ppr;
+ vm_offset_t va;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ if (pmap->pm_type != PT_X86 ||
+ (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
+ sva >= VM_MAXUSER_ADDRESS)
+ return (true);
+ MPASS(eva <= VM_MAXUSER_ADDRESS);
+ for (va = sva, prev_ppr = NULL; va < eva; prev_ppr = ppr) {
+ ppr = rangeset_get(&pmap->pm_pkru, va);
+ if (va == sva)
+ prev_ppr = ppr;
+ else if ((ppr == NULL) ^ (prev_ppr == NULL))
+ return (false);
+ if (ppr == NULL) {
+ va += PAGE_SIZE;
+ continue;
+ }
+ if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
+ return (false);
+ va = ppr->pkru_rs_el.re_end;
+ }
+ return (true);
+}
+
+static pt_entry_t
+pmap_pkru_get(pmap_t pmap, vm_offset_t va)
+{
+ struct pmap_pkru_range *ppr;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ if (pmap->pm_type != PT_X86 ||
+ (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
+ va >= VM_MAXUSER_ADDRESS)
+ return (0);
+ ppr = rangeset_get(&pmap->pm_pkru, va);
+ if (ppr != NULL)
+ return (X86_PG_PKU(ppr->pkru_keyidx));
+ return (0);
+}
+
+static bool
+pred_pkru_on_remove(void *ctx __unused, void *r)
+{
+ struct pmap_pkru_range *ppr;
+
+ ppr = r;
+ return ((ppr->pkru_flags & AMD64_PKRU_PERSIST) == 0);
+}
+
+static void
+pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ if (pmap->pm_type == PT_X86 &&
+ (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
+ rangeset_remove_pred(&pmap->pm_pkru, sva, eva,
+ pred_pkru_on_remove);
+ }
+}
+
+static int
+pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
+{
+
+ PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
+ PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
+ MPASS(dst_pmap->pm_type == PT_X86);
+ MPASS(src_pmap->pm_type == PT_X86);
+ MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
+ if (src_pmap->pm_pkru.rs_data_ctx == NULL)
+ return (0);
+ return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
+}
+
+static void
+pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+ u_int keyidx)
+{
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ pd_entry_t newpde, ptpaddr, *pde;
+ pt_entry_t newpte, *ptep, pte;
+ vm_offset_t msva, va_next;
+ bool changed;
+
+ PMAP_LOCK_ASSERT(pmap, MA_OWNED);
+ MPASS(pmap->pm_type == PT_X86);
+ MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
+
+ for (changed = false, msva = sva; msva < eva; msva = va_next) {
+ pml4e = pmap_pml4e(pmap, msva);
+ if ((*pml4e & X86_PG_V) == 0) {
+ va_next = (msva + NBPML4) & ~PML4MASK;
+ if (va_next < msva)
+ va_next = eva;
+ continue;
+ }
+
+ pdpe = pmap_pml4e_to_pdpe(pml4e, msva);
+ if ((*pdpe & X86_PG_V) == 0) {
+ va_next = (msva + NBPDP) & ~PDPMASK;
+ if (va_next < msva)
+ va_next = eva;
+ continue;
+ }
+
+ va_next = (msva + NBPDR) & ~PDRMASK;
+ if (va_next < msva)
+ va_next = eva;
+
+ pde = pmap_pdpe_to_pde(pdpe, msva);
+ ptpaddr = *pde;
+ if (ptpaddr == 0)
+ continue;
+
+ MPASS((ptpaddr & X86_PG_V) != 0);
+ if ((ptpaddr & PG_PS) != 0) {
+ if (msva + NBPDR == va_next && eva >= va_next) {
+ newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
+ X86_PG_PKU(keyidx);
+ if (newpde != ptpaddr) {
+ *pde = newpde;
+ changed = true;
+ }
+ continue;
+ } else if (!pmap_demote_pde(pmap, pde, msva)) {
+ continue;
+ }
+ }
+
+ if (va_next > eva)
+ va_next = eva;
+
+ for (ptep = pmap_pde_to_pte(pde, msva); msva != va_next;
+ ptep++, msva += PAGE_SIZE) {
+ pte = *ptep;
+ if ((pte & X86_PG_V) == 0)
+ continue;
+ newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
+ if (newpte != pte) {
+ *ptep = newpte;
+ changed = true;
+ }
+ }
+ }
+ if (changed)
+ pmap_invalidate_range(pmap, sva, eva);
+}
+
+static int
+pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+ u_int keyidx, int flags)
+{
+
+ if (pmap->pm_type != PT_X86 || keyidx > PMAP_MAX_PKRU_IDX ||
+ (flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
+ return (EINVAL);
+ if (eva <= sva || eva > VM_MAXUSER_ADDRESS)
+ return (EFAULT);
+ if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
+ return (ENOTSUP);
+ return (0);
+}
+
+int
+pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
+ int flags)
+{
+ int error;
+
+ sva = trunc_page(sva);
+ eva = round_page(eva);
+ error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
+ if (error != 0)
+ return (error);
+ for (;;) {
+ PMAP_LOCK(pmap);
+ error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
+ if (error == 0)
+ pmap_pkru_update_range(pmap, sva, eva, keyidx);
+ PMAP_UNLOCK(pmap);
+ if (error != ENOMEM)
+ break;
+ vm_wait(NULL);
+ }
+ return (error);
+}
+
+int
+pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
+{
+ int error;
+
+ sva = trunc_page(sva);
+ eva = round_page(eva);
+ error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
+ if (error != 0)
+ return (error);
+ for (;;) {
+ PMAP_LOCK(pmap);
+ error = pmap_pkru_deassign(pmap, sva, eva);
+ if (error == 0)
+ pmap_pkru_update_range(pmap, sva, eva, 0);
+ PMAP_UNLOCK(pmap);
+ if (error != ENOMEM)
+ break;
+ vm_wait(NULL);
+ }
+ return (error);
+}
+
#include "opt_ddb.h"
#ifdef DDB
#include <sys/kdb.h>
Index: sys/amd64/amd64/sys_machdep.c
===================================================================
--- sys/amd64/amd64/sys_machdep.c
+++ sys/amd64/amd64/sys_machdep.c
@@ -44,6 +44,7 @@
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/pcpu.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/smp.h>
@@ -53,6 +54,7 @@
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h> /* for kernel_map */
+#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <machine/frame.h>
@@ -170,13 +172,16 @@
int
sysarch(struct thread *td, struct sysarch_args *uap)
{
- int error = 0;
- struct pcb *pcb = curthread->td_pcb;
+ struct pcb *pcb;
+ struct vm_map *map;
uint32_t i386base;
uint64_t a64base;
struct i386_ioperm_args iargs;
struct i386_get_xfpustate i386xfpu;
+ struct i386_set_pkru i386pkru;
struct amd64_get_xfpustate a64xfpu;
+ struct amd64_set_pkru a64pkru;
+ int error;
#ifdef CAPABILITY_MODE
/*
@@ -194,11 +199,15 @@
case I386_GET_GSBASE:
case I386_SET_GSBASE:
case I386_GET_XFPUSTATE:
+ case I386_SET_PKRU:
+ case I386_CLEAR_PKRU:
case AMD64_GET_FSBASE:
case AMD64_SET_FSBASE:
case AMD64_GET_GSBASE:
case AMD64_SET_GSBASE:
case AMD64_GET_XFPUSTATE:
+ case AMD64_SET_PKRU:
+ case AMD64_CLEAR_PKRU:
break;
case I386_SET_IOPERM:
@@ -214,6 +223,10 @@
if (uap->op == I386_GET_LDT || uap->op == I386_SET_LDT)
return (sysarch_ldt(td, uap, UIO_USERSPACE));
+
+ error = 0;
+ pcb = td->td_pcb;
+
/*
* XXXKIB check that the BSM generation code knows to encode
* the op argument.
@@ -233,11 +246,27 @@
a64xfpu.addr = (void *)(uintptr_t)i386xfpu.addr;
a64xfpu.len = i386xfpu.len;
break;
+ case I386_SET_PKRU:
+ case I386_CLEAR_PKRU:
+ if ((error = copyin(uap->parms, &i386pkru,
+ sizeof(struct i386_set_pkru))) != 0)
+ return (error);
+ a64pkru.addr = (void *)(uintptr_t)i386pkru.addr;
+ a64pkru.len = i386pkru.len;
+ a64pkru.keyidx = i386pkru.keyidx;
+ a64pkru.flags = i386pkru.flags;
+ break;
case AMD64_GET_XFPUSTATE:
if ((error = copyin(uap->parms, &a64xfpu,
sizeof(struct amd64_get_xfpustate))) != 0)
return (error);
break;
+ case AMD64_SET_PKRU:
+ case AMD64_CLEAR_PKRU:
+ if ((error = copyin(uap->parms, &a64pkru,
+ sizeof(struct amd64_set_pkru))) != 0)
+ return (error);
+ break;
default:
break;
}
@@ -326,6 +355,30 @@
a64xfpu.addr, a64xfpu.len);
break;
+ case I386_SET_PKRU:
+ case AMD64_SET_PKRU:
+ map = &td->td_proc->p_vmspace->vm_map;
+ vm_map_lock_read(map);
+ error = pmap_pkru_set(PCPU_GET(curpmap),
+ (vm_offset_t)a64pkru.addr, (vm_offset_t)a64pkru.addr +
+ a64pkru.len, a64pkru.keyidx, a64pkru.flags);
+ vm_map_unlock_read(map);
+ break;
+
+ case I386_CLEAR_PKRU:
+ case AMD64_CLEAR_PKRU:
+ if (a64pkru.flags != 0 || a64pkru.keyidx != 0) {
+ error = EINVAL;
+ break;
+ }
+ map = &td->td_proc->p_vmspace->vm_map;
+ vm_map_lock_read(map);
+ error = pmap_pkru_clear(PCPU_GET(curpmap),
+ (vm_offset_t)a64pkru.addr,
+ (vm_offset_t)a64pkru.addr + a64pkru.len);
+ vm_map_unlock_read(map);
+ break;
+
default:
error = EINVAL;
break;
Index: sys/amd64/amd64/trap.c
===================================================================
--- sys/amd64/amd64/trap.c
+++ sys/amd64/amd64/trap.c
@@ -807,6 +807,20 @@
return (-1);
}
+ /*
+ * User-mode protection key violation (PKU). May happen
+ * either from usermode or from kernel if copyin accessed
+ * key-protected mapping.
+ */
+ if ((frame->tf_err & PGEX_PK) != 0) {
+ if (eva > VM_MAXUSER_ADDRESS) {
+ trap_fatal(frame, eva);
+ return (-1);
+ }
+ rv = KERN_PROTECTION_FAILURE;
+ goto after_vmfault;
+ }
+
/*
* If nx protection of the usermode portion of kernel page
* tables caused trap, panic.
@@ -842,6 +856,7 @@
#endif
return (0);
}
+after_vmfault:
if (!usermode) {
if (td->td_intr_nesting_level == 0 &&
curpcb->pcb_onfault != NULL) {
@@ -885,10 +900,12 @@
#endif
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%lx\n", eva);
- printf("fault code = %s %s %s, %s\n",
+ printf("fault code = %s %s %s%s%s, %s\n",
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_I ? "instruction" : "data",
+ code & PGEX_PK ? " prot key" : "",
+ code & PGEX_SGX ? " SGX" : "",
code & PGEX_RSV ? "reserved bits in PTE" :
code & PGEX_P ? "protection violation" : "page not present");
}
Index: sys/amd64/include/cpufunc.h
===================================================================
--- sys/amd64/include/cpufunc.h
+++ sys/amd64/include/cpufunc.h
@@ -627,6 +627,22 @@
__asm __volatile("mwait" : : "a" (hints), "c" (extensions));
}
+static __inline uint32_t
+rdpkru(void)
+{
+ uint32_t res;
+
+ __asm __volatile("rdpkru" : "=a" (res) : "c" (0) : "edx");
+ return (res);
+}
+
+static __inline void
+wrpkru(uint32_t mask)
+{
+
+ __asm __volatile("wrpkru" : : "a" (mask), "c" (0), "d" (0));
+}
+
#ifdef _KERNEL
/* This is defined in <machine/specialreg.h> but is too painful to get to */
#ifndef MSR_FSBASE
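
A note on the register these inlines access: PKRU packs two bits per key, access-disable (AD) at bit 2*i and write-disable (WD) at bit 2*i+1, which is why the libc wrappers test 1 << keyidx and 2 << keyidx after doubling keyidx. A sketch of a derived helper (the function name is invented here for illustration, not part of the diff; it assumes the cpufunc.h context above):

/*
 * Sketch only: write-protect key 'keyidx' (WD set, AD clear),
 * leaving the permissions of all other keys untouched.
 */
static __inline void
pkru_write_protect(u_int keyidx)
{
	uint32_t pkru;

	pkru = rdpkru();
	pkru &= ~(1u << (2 * keyidx));	/* clear AD: loads allowed */
	pkru |= 2u << (2 * keyidx);	/* set WD: stores fault */
	wrpkru(pkru);
}
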
Index: sys/amd64/include/pmap.h
===================================================================
--- sys/amd64/include/pmap.h
+++ sys/amd64/include/pmap.h
@@ -66,6 +66,7 @@
#define X86_PG_AVAIL2 0x400 /* < programmers use */
#define X86_PG_AVAIL3 0x800 /* \ */
#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */
+#define X86_PG_PKU(idx) ((pt_entry_t)idx << 59)
#define X86_PG_NX (1ul<<63) /* No-execute */
#define X86_PG_AVAIL(x) (1ul << (x))
@@ -73,6 +74,10 @@
#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD)
+/* Protection keys indexes */
+#define PMAP_MAX_PKRU_IDX 0xf
+#define X86_PG_PKU_MASK X86_PG_PKU(PMAP_MAX_PKRU_IDX)
+
/*
* Intel extended page table (EPT) bit definitions.
*/
@@ -120,7 +125,7 @@
* (PTE) page mappings have identical settings for the following fields:
*/
#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \
- PG_M | PG_A | PG_U | PG_RW | PG_V)
+ PG_M | PG_A | PG_U | PG_RW | PG_V | PG_PKU_MASK)
/*
* Page Protection Exception bits
@@ -131,6 +136,8 @@
#define PGEX_U 0x04 /* access from User mode (UPL) */
#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */
#define PGEX_I 0x10 /* during an instruction fetch */
+#define PGEX_PK 0x20 /* protection key violation */
+#define PGEX_SGX 0x40 /* SGX-related */
/*
* undef the PG_xx macros that define bits in the regular x86 PTEs that
@@ -240,6 +247,8 @@
#include <sys/_cpuset.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
+#include <sys/_pctrie.h>
+#include <sys/_rangeset.h>
#include <vm/_vm_radix.h>
@@ -334,6 +343,7 @@
long pm_eptgen; /* EPT pmap generation id */
int pm_flags;
struct pmap_pcids pm_pcids[MAXCPU];
+ struct rangeset pm_pkru;
};
/* flags */
@@ -452,6 +462,9 @@
void pmap_pti_pcid_invlpg(uint64_t ucr3, uint64_t kcr3, vm_offset_t va);
void pmap_pti_pcid_invlrng(uint64_t ucr3, uint64_t kcr3, vm_offset_t sva,
vm_offset_t eva);
+int pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
+int pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
+ u_int keyidx, int flags);
#endif /* _KERNEL */
/* Return various clipped indexes for a given VA */
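
The new PTE field occupies the previously unused bits 62:59, directly below X86_PG_NX at bit 63: X86_PG_PKU(idx) shifts the 4-bit key into place and X86_PG_PKU_MASK selects the whole field. A hedged compile-time sketch of that layout (amd64 only, not part of the diff):

/* Sanity sketch: the key field is bits 62:59, disjoint from NX. */
_Static_assert(X86_PG_PKU(1) == (1ul << 59), "PKU base bit");
_Static_assert(X86_PG_PKU_MASK == (0xful << 59), "PKU field width");
_Static_assert((X86_PG_PKU_MASK & X86_PG_NX) == 0, "PKU disjoint from NX");
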
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -3909,6 +3909,7 @@
kern/subr_power.c standard
kern/subr_prf.c standard
kern/subr_prof.c standard
+kern/subr_rangeset.c standard
kern/subr_rman.c standard
kern/subr_rtc.c standard
kern/subr_sbuf.c standard
Index: sys/i386/include/cpufunc.h
===================================================================
--- sys/i386/include/cpufunc.h
+++ sys/i386/include/cpufunc.h
@@ -700,6 +700,22 @@
write_eflags(eflags);
}
+static __inline uint32_t
+rdpkru(void)
+{
+ uint32_t res;
+
+ __asm __volatile("rdpkru" : "=a" (res) : "c" (0) : "edx");
+ return (res);
+}
+
+static __inline void
+wrpkru(uint32_t mask)
+{
+
+ __asm __volatile("wrpkru" : : "a" (mask), "c" (0), "d" (0));
+}
+
#else /* !(__GNUCLIKE_ASM && __CC_SUPPORTS___INLINE) */
int breakpoint(void);
Index: sys/i386/include/pmap.h
===================================================================
--- sys/i386/include/pmap.h
+++ sys/i386/include/pmap.h
@@ -372,6 +372,13 @@
#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0)
#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz))
+static inline int
+pmap_vmspace_copy(pmap_t dst_pmap __unused, pmap_t src_pmap __unused)
+{
+
+ return (0);
+}
+
/*
* Only the following functions or macros may be used before pmap_bootstrap()
* is called: pmap_kenter(), pmap_kextract(), pmap_kremove(), vtophys(), and
Index: sys/kern/subr_rangeset.c
===================================================================
--- /dev/null
+++ sys/kern/subr_rangeset.c
@@ -0,0 +1,370 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/pctrie.h>
+#include <sys/rangeset.h>
+#include <vm/uma.h>
+
+#ifdef DIAGNOSTIC
+static void rangeset_check(struct rangeset *rs);
+#else
+#define rangeset_check(rs)
+#endif
+
+static uma_zone_t rs_node_zone;
+static void
+rs_rangeset_init(void *arg __unused)
+{
+
+ rs_node_zone = uma_zcreate("rangeset pctrie nodes",
+ pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL,
+ UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+}
+SYSINIT(rs, SI_SUB_LOCK, SI_ORDER_ANY, rs_rangeset_init, NULL);
+
+static void *
+rs_node_alloc(struct pctrie *ptree)
+{
+ struct rangeset *rs;
+
+ rs = __containerof(ptree, struct rangeset, rs_trie);
+ return (uma_zalloc(rs_node_zone, rs->rs_alloc_flags));
+}
+
+static void
+rs_node_free(struct pctrie *ptree __unused, void *node)
+{
+
+ uma_zfree(rs_node_zone, node);
+}
+
+void
+rangeset_init(struct rangeset *rs, rs_dup_data_t dup_data,
+ rs_free_data_t free_data, void *data_ctx, u_int alloc_flags)
+{
+
+ pctrie_init(&rs->rs_trie);
+ rs->rs_dup_data = dup_data;
+ rs->rs_free_data = free_data;
+ rs->rs_data_ctx = data_ctx;
+ rs->rs_alloc_flags = alloc_flags;
+}
+
+void
+rangeset_fini(struct rangeset *rs)
+{
+
+ rangeset_check(rs);
+ rangeset_remove_all(rs);
+}
+
+bool
+rangeset_check_empty(struct rangeset *rs, uint64_t start, uint64_t end)
+{
+ struct rs_el *r;
+ uint64_t *r1;
+
+ rangeset_check(rs);
+ r1 = pctrie_lookup_ge(&rs->rs_trie, start);
+ if (r1 != NULL) {
+ r = __containerof(r1, struct rs_el, re_start);
+ if (r->re_start < end)
+ return (false);
+ }
+ r1 = pctrie_lookup_le(&rs->rs_trie, end);
+ if (r1 != NULL) {
+ r = __containerof(r1, struct rs_el, re_start);
+ if (r->re_end > start)
+ return (false);
+ }
+ return (true);
+}
+
+int
+rangeset_insert(struct rangeset *rs, uint64_t start, uint64_t end,
+ void *data)
+{
+ struct rs_el *r;
+ int error;
+
+ rangeset_check(rs);
+ error = rangeset_remove(rs, start, end);
+ if (error != 0)
+ return (error);
+ r = data;
+ r->re_start = start;
+ r->re_end = end;
+ error = pctrie_insert(&rs->rs_trie, &r->re_start, rs_node_alloc);
+ rangeset_check(rs);
+ return (error);
+}
+
+int
+rangeset_remove_pred(struct rangeset *rs, uint64_t start, uint64_t end,
+ rs_pred_t pred)
+{
+ struct rs_el *r, *rn;
+ uint64_t *r1;
+ int error;
+
+ rangeset_check(rs);
+ error = 0;
+ for (; end > 0 && start < end;) {
+ r1 = pctrie_lookup_le(&rs->rs_trie, end - 1);
+ if (r1 == NULL)
+ break;
+ r = __containerof(r1, struct rs_el, re_start);
+
+ /*
+ * ------============================--|-------|----
+ *       rs                        re  s       e
+ */
+ if (r->re_end <= start)
+ break;
+
+ if (r->re_end <= end) {
+ if (r->re_start < start) {
+ /*
+ * ------========|==============-------|----
+ *       rs      s            re       e
+ */
+ if (pred(rs->rs_data_ctx, r))
+ r->re_end = start;
+ break;
+ }
+
+ /*
+ * ------|--------===================----------|----
+ *       s        rs               re          e
+ */
+ end = r->re_start;
+ if (pred(rs->rs_data_ctx, r)) {
+ pctrie_remove(&rs->rs_trie, r->re_start,
+ rs_node_free);
+ rs->rs_free_data(rs->rs_data_ctx, r);
+ }
+ continue;
+ }
+
+ /*
+ * ------|--------====================|==========----
+ *       s        rs                  e        re
+ */
+ if (r->re_start >= start) {
+ if (pred(rs->rs_data_ctx, r)) {
+ pctrie_remove(&rs->rs_trie, r->re_start,
+ rs_node_free);
+ r->re_start = end;
+ error = pctrie_insert(&rs->rs_trie,
+ &r->re_start, rs_node_alloc);
+ /*
+ * The insert above must succeed
+ * because rs_node zone is marked
+ * nofree and we freed one element
+ * just before.
+ */
+ MPASS(error == 0);
+ } else {
+ end = r->re_start;
+ }
+ continue;
+ }
+
+ /*
+ * ------=========|===================|==========----
+ *       rs       s                   e        re
+ */
+ if (pred(rs->rs_data_ctx, r)) {
+ /*
+ * Split. Can only happen once, and then if
+ * any allocation fails, the rangeset is kept
+ * intact.
+ */
+ rn = rs->rs_dup_data(rs->rs_data_ctx, r);
+ if (rn == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ rn->re_start = end;
+ rn->re_end = r->re_end;
+ error = pctrie_insert(&rs->rs_trie, &rn->re_start,
+ rs_node_alloc);
+ if (error != 0) {
+ rs->rs_free_data(rs->rs_data_ctx, rn);
+ break;
+ }
+ r->re_end = start;
+ }
+ break;
+ }
+ rangeset_check(rs);
+ return (error);
+}
+
+static bool
+rangeset_true_pred(void *ctx __unused, void *r __unused)
+{
+
+ return (true);
+}
+
+int
+rangeset_remove(struct rangeset *rs, uint64_t start, uint64_t end)
+{
+
+ return (rangeset_remove_pred(rs, start, end, rangeset_true_pred));
+}
+
+void
+rangeset_remove_all(struct rangeset *rs)
+{
+ struct rs_el *r;
+ uint64_t *r1;
+
+ for (;;) {
+ r1 = pctrie_lookup_ge(&rs->rs_trie, 0);
+ if (r1 == NULL)
+ break;
+ r = __containerof(r1, struct rs_el, re_start);
+ pctrie_remove(&rs->rs_trie, r->re_start, rs_node_free);
+ rs->rs_free_data(rs->rs_data_ctx, r);
+ }
+}
+
+void *
+rangeset_get(struct rangeset *rs, uint64_t place)
+{
+ struct rs_el *r;
+ uint64_t *r1;
+
+ rangeset_check(rs);
+ r1 = pctrie_lookup_le(&rs->rs_trie, place);
+ if (r1 == NULL)
+ return (NULL);
+ r = __containerof(r1, struct rs_el, re_start);
+ if (r->re_end <= place)
+ return (NULL);
+ return (r);
+}
+
+int
+rangeset_copy(struct rangeset *dst_rs, struct rangeset *src_rs)
+{
+ struct rs_el *src_r, *dst_r;
+ uint64_t cursor, *r1;
+ int error;
+
+ MPASS(pctrie_is_empty(&dst_rs->rs_trie));
+ rangeset_check(src_rs);
+ MPASS(dst_rs->rs_dup_data == src_rs->rs_dup_data);
+
+ error = 0;
+ for (cursor = 0;; cursor = src_r->re_start + 1) {
+ r1 = pctrie_lookup_ge(&src_rs->rs_trie, cursor);
+ if (r1 == NULL)
+ break;
+ src_r = __containerof(r1, struct rs_el, re_start);
+ dst_r = dst_rs->rs_dup_data(dst_rs->rs_data_ctx, src_r);
+ if (dst_r == NULL) {
+ error = ENOMEM;
+ break;
+ }
+ error = pctrie_insert(&dst_rs->rs_trie, &dst_r->re_start,
+ rs_node_alloc);
+ if (error != 0)
+ break;
+ }
+ if (error != 0)
+ rangeset_remove_all(dst_rs);
+ return (error);
+}
+
+#ifdef DIAGNOSTIC
+static void
+rangeset_check(struct rangeset *rs)
+{
+ struct rs_el *r, *rp;
+ uint64_t cursor, *r1;
+
+ for (cursor = 0, rp = NULL;; cursor = r->re_start + 1, rp = r) {
+ r1 = pctrie_lookup_ge(&rs->rs_trie, cursor);
+ if (r1 == NULL)
+ break;
+ r = __containerof(r1, struct rs_el, re_start);
+ KASSERT(r->re_start < r->re_end,
+ ("invalid interval rs %p elem %p (%#jx, %#jx)",
+ rs, r, (uintmax_t)r->re_start, (uintmax_t)r->re_end));
+ if (rp != NULL) {
+ KASSERT(rp->re_end <= r->re_start,
+ ("non-ascending neighbors rs %p "
+ "prev elem %p (%#jx, %#jx) elem %p (%#jx, %#jx)",
+ rs, rp, (uintmax_t)rp->re_start,
+ (uintmax_t)rp->re_end, r, (uintmax_t)r->re_start,
+ (uintmax_t)r->re_end));
+ }
+ }
+}
+#endif
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <sys/kernel.h>
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(rangeset, rangeset_show_fn)
+{
+ struct rangeset *rs;
+ struct rs_el *r;
+ uint64_t cursor, *r1;
+
+ if (!have_addr) {
+ db_printf("show rangeset addr\n");
+ return;
+ }
+
+ rs = (struct rangeset *)addr;
+ db_printf("rangeset %p\n", rs);
+ for (cursor = 0;; cursor = r->re_start + 1) {
+ r1 = pctrie_lookup_ge(&rs->rs_trie, cursor);
+ if (r1 == NULL)
+ break;
+ r = __containerof(r1, struct rs_el, re_start);
+ db_printf(" el %p start %#jx end %#jx\n",
+ r, r->re_start, r->re_end);
+ }
+}
+#endif
Index: sys/sys/_rangeset.h
===================================================================
--- /dev/null
+++ sys/sys/_rangeset.h
@@ -0,0 +1,51 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS__RANGESET_H
+#define _SYS__RANGESET_H
+
+#include <sys/_pctrie.h>
+
+typedef void *(*rs_dup_data_t)(void *ctx, void *data);
+typedef void (*rs_free_data_t)(void *ctx, void *data);
+
+struct rangeset {
+ struct pctrie rs_trie;
+ rs_dup_data_t rs_dup_data;
+ rs_free_data_t rs_free_data;
+ void *rs_data_ctx;
+ u_int rs_alloc_flags;
+};
+
+#endif
+
Index: sys/sys/rangeset.h
===================================================================
--- /dev/null
+++ sys/sys/rangeset.h
@@ -0,0 +1,88 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2019 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_RANGESET_H
+#define _SYS_RANGESET_H
+
+#ifdef _KERNEL
+
+#include <sys/_rangeset.h>
+
+typedef bool (*rs_pred_t)(void *ctx, void *r);
+
+/*
+ * This structure must be embedded at the start of the rangeset element.
+ */
+struct rs_el {
+ uint64_t re_start; /* pctrie key */
+ uint64_t re_end;
+};
+
+void rangeset_init(struct rangeset *rs, rs_dup_data_t dup_data,
+ rs_free_data_t free_data, void *rs_data_ctx, u_int alloc_flags);
+void rangeset_fini(struct rangeset *rs);
+
+bool rangeset_check_empty(struct rangeset *rs, uint64_t start,
+ uint64_t end);
+
+/*
+ * r points to the app data with struct rs_el at the beginning.
+ */
+int rangeset_insert(struct rangeset *rs, uint64_t start, uint64_t end,
+ void *r);
+
+/*
+ * Guarantees that on error the rangeset is not modified. Remove
+ * might need to split element if its start/end completely cover the
+ * removed range, in which case ENOMEM might be returned.
+ */
+void rangeset_remove_all(struct rangeset *rs);
+int rangeset_remove(struct rangeset *rs, uint64_t start, uint64_t end);
+int rangeset_remove_pred(struct rangeset *rs, uint64_t start,
+ uint64_t end, rs_pred_t pred);
+
+/*
+ * Really returns the pointer to the data with struct rs_el embedded
+ * at the beginning.
+ */
+void *rangeset_get(struct rangeset *rs, uint64_t place);
+
+/*
+ * Copies src_rs entries into dst_rs. dst_rs must be empty.
+ * Leaves dst_rs empty on failure.
+ */
+int rangeset_copy(struct rangeset *dst_rs, struct rangeset *src_rs);
+
+#endif
+
+#endif
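
To make the embedding convention above concrete, a hedged kernel-side sketch (not part of the diff; struct my_range and the helper names are invented for illustration), mirroring how struct pmap_pkru_range in pmap.c places struct rs_el first:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/rangeset.h>

struct my_range {
	struct rs_el	mr_el;		/* must be the first member */
	int		mr_tag;		/* client payload */
};

static void *
my_dup(void *ctx __unused, void *data)
{
	struct my_range *nr;

	nr = malloc(sizeof(*nr), M_TEMP, M_NOWAIT);
	if (nr != NULL)
		memcpy(nr, data, sizeof(*nr));
	return (nr);
}

static void
my_free(void *ctx __unused, void *data)
{

	free(data, M_TEMP);
}

static int
my_example(void)
{
	struct rangeset rs;
	struct my_range *r;
	int error;

	rangeset_init(&rs, my_dup, my_free, NULL, M_NOWAIT);
	r = malloc(sizeof(*r), M_TEMP, M_WAITOK);
	r->mr_tag = 42;
	/* Ranges are half-open: [start, end). */
	error = rangeset_insert(&rs, 0x1000, 0x3000, r);
	if (error != 0)
		free(r, M_TEMP);	/* caller still owns r on failure */
	else if (rangeset_get(&rs, 0x2000) != NULL)
		/* Trims the element to [0x2000, 0x3000). */
		error = rangeset_remove(&rs, 0x1000, 0x2000);
	rangeset_fini(&rs);		/* frees remaining elements */
	return (error);
}
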
Index: sys/vm/pmap.h
===================================================================
--- sys/vm/pmap.h
+++ sys/vm/pmap.h
@@ -168,6 +168,7 @@
void pmap_unwire(pmap_t pmap, vm_offset_t start, vm_offset_t end);
void pmap_zero_page(vm_page_t);
void pmap_zero_page_area(vm_page_t, int off, int size);
+int pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap);
#define pmap_resident_count(pm) ((pm)->pm_stats.resident_count)
#define pmap_wired_count(pm) ((pm)->pm_stats.wired_count)
Index: sys/vm/vm_fault.c
===================================================================
--- sys/vm/vm_fault.c
+++ sys/vm/vm_fault.c
@@ -479,8 +479,20 @@
fault_flags, true);
}
VM_OBJECT_WUNLOCK(fs->first_object);
- pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type | (wired ?
- PMAP_ENTER_WIRED : 0), psind);
+ rv = pmap_enter(fs->map->pmap, vaddr, m, prot, fault_type |
+ (wired ? PMAP_ENTER_WIRED : 0), psind);
+#if defined(__amd64__)
+ if (psind > 0 && rv == KERN_FAILURE) {
+ for (i = 0; i < npages; i++) {
+ rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i),
+ &m[i], prot, fault_type |
+ (wired ? PMAP_ENTER_WIRED : 0), 0);
+ MPASS(rv == KERN_SUCCESS);
+ }
+ }
+#else
+ MPASS(rv == KERN_SUCCESS);
+#endif
VM_OBJECT_WLOCK(fs->first_object);
m_mtx = NULL;
for (i = 0; i < npages; i++) {
Index: sys/vm/vm_map.c
===================================================================
--- sys/vm/vm_map.c
+++ sys/vm/vm_map.c
@@ -3424,7 +3424,7 @@
vm_map_t new_map, old_map;
vm_map_entry_t new_entry, old_entry;
vm_object_t object;
- int locked;
+ int error, locked;
vm_inherit_t inh;
old_map = &vm1->vm_map;
@@ -3433,6 +3433,7 @@
pmap_pinit);
if (vm2 == NULL)
return (NULL);
+
vm2->vm_taddr = vm1->vm_taddr;
vm2->vm_daddr = vm1->vm_daddr;
vm2->vm_maxsaddr = vm1->vm_maxsaddr;
@@ -3443,6 +3444,14 @@
locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
KASSERT(locked, ("vmspace_fork: lock failed"));
+ error = pmap_vmspace_copy(new_map->pmap, old_map->pmap);
+ if (error != 0) {
+ sx_xunlock(&old_map->lock);
+ sx_xunlock(&new_map->lock);
+ vmspace_free(vm2);
+ return (NULL);
+ }
+
old_entry = old_map->header.next;
while (old_entry != &old_map->header) {
Index: sys/x86/include/specialreg.h
===================================================================
--- sys/x86/include/specialreg.h
+++ sys/x86/include/specialreg.h
@@ -77,6 +77,7 @@
#define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */
#define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */
#define CR4_SMAP 0x00200000 /* Supervisor-Mode Access Prevention */
+#define CR4_PKE 0x00400000 /* Protection Keys Enable */
/*
* Bits in AMD64 special registers. EFER is 64 bits wide.
Index: sys/x86/include/sysarch.h
===================================================================
--- sys/x86/include/sysarch.h
+++ sys/x86/include/sysarch.h
@@ -52,6 +52,8 @@
#define I386_GET_GSBASE 9
#define I386_SET_GSBASE 10
#define I386_GET_XFPUSTATE 11
+#define I386_SET_PKRU 12
+#define I386_CLEAR_PKRU 13
/* Leave space for 0-127 to avoid translating syscalls */
#define AMD64_GET_FSBASE 128
@@ -59,6 +61,12 @@
#define AMD64_GET_GSBASE 130
#define AMD64_SET_GSBASE 131
#define AMD64_GET_XFPUSTATE 132
+#define AMD64_SET_PKRU 133
+#define AMD64_CLEAR_PKRU 134
+
+/* Flags for AMD64_SET_PKRU */
+#define AMD64_PKRU_EXCL 0x0001
+#define AMD64_PKRU_PERSIST 0x0002
struct i386_ioperm_args {
unsigned int start;
@@ -94,12 +102,26 @@
int len;
};
+struct i386_set_pkru {
+ unsigned int addr;
+ unsigned int len;
+ unsigned int keyidx;
+ int flags;
+};
+
struct amd64_get_xfpustate {
void *addr;
int len;
};
#endif
+struct amd64_set_pkru {
+ void *addr;
+ unsigned long len;
+ unsigned int keyidx;
+ int flags;
+};
+
#ifndef _KERNEL
union descriptor;
struct dbreg;
@@ -120,6 +142,11 @@
int amd64_get_gsbase(void **);
int amd64_set_fsbase(void *);
int amd64_set_gsbase(void *);
+int x86_pkru_get_perm(unsigned int keyidx, int *access, int *modify);
+int x86_pkru_set_perm(unsigned int keyidx, int access, int modify);
+int x86_pkru_protect_range(void *addr, unsigned long len, unsigned int keyidx,
+ int flag);
+int x86_pkru_unprotect_range(void *addr, unsigned long len);
int sysarch(int, void *);
__END_DECLS
#else
