diff --git a/lib/libc/gen/Makefile.inc b/lib/libc/gen/Makefile.inc --- a/lib/libc/gen/Makefile.inc +++ b/lib/libc/gen/Makefile.inc @@ -120,6 +120,7 @@ readpassphrase.c \ recvmmsg.c \ rewinddir.c \ + rseq_abi.c \ scandir.c \ scandir_b.c \ scandir-compat11.c \ diff --git a/lib/libc/gen/Symbol.map b/lib/libc/gen/Symbol.map --- a/lib/libc/gen/Symbol.map +++ b/lib/libc/gen/Symbol.map @@ -436,6 +436,7 @@ }; FBSD_1.7 { + __rseq_abi; posix_spawn_file_actions_addchdir_np; posix_spawn_file_actions_addclosefrom_np; posix_spawn_file_actions_addfchdir_np; @@ -569,4 +570,6 @@ __fillcontextx; __fillcontextx2; __getcontextx_size; + + __rseq_abi_init; }; diff --git a/lib/libc/gen/rseq_abi.c b/lib/libc/gen/rseq_abi.c new file mode 100644 --- /dev/null +++ b/lib/libc/gen/rseq_abi.c @@ -0,0 +1,55 @@ +/*- + * Copyright (c) 2021 The FreeBSD Foundation + * + * This software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +#include +#include +#include +#include "libc_private.h" + +_Thread_local volatile struct rseq __rseq_abi __weak_symbol; + +static void __main_rseq_abi_init(void) __attribute__((__constructor__, + __used__)); +static void +__main_rseq_abi_init(void) +{ + __rseq_abi_init(); +} + +void +__rseq_abi_init(void) +{ + int bsdflags; + + if (_elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) != 0 || + (bsdflags & ELF_BSDF_RSEQ1) == 0) + return; + __rseq_abi.cpu_id_start = sched_getcpu(); + rseq(&__rseq_abi, sizeof(__rseq_abi), 0, 0/* XXXKIB */); +} diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h --- a/lib/libc/include/libc_private.h +++ b/lib/libc/include/libc_private.h @@ -435,4 +435,6 @@ struct __nl_cat_d *__catopen_l(const char *name, int type, struct _xlocale *locale); +void __rseq_abi_init(void); + #endif /* _LIBC_PRIVATE_H_ */ diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map --- a/lib/libc/sys/Symbol.map +++ b/lib/libc/sys/Symbol.map @@ -419,6 +419,8 @@ FBSD_1.7 { _Fork; fspacectl; + membarrier; + rseq; swapoff; }; diff --git a/lib/libthr/thread/thr_create.c b/lib/libthr/thread/thr_create.c --- a/lib/libthr/thread/thr_create.c +++ b/lib/libthr/thread/thr_create.c @@ -288,6 +288,8 @@ curthread->attr.stacksize_attr; #endif + __rseq_abi_init(); + /* Run the current thread's start routine with argument: */ _pthread_exit(curthread->start_routine(curthread->arg)); diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c --- a/sys/amd64/amd64/pmap.c +++ b/sys/amd64/amd64/pmap.c @@ -10051,6 +10051,12 @@ PCPU_SET(ucr3, PMAP_NO_CR3); } +void +pmap_active_cpus(pmap_t pmap, cpuset_t *res) +{ + *res = pmap->pm_active; +} + void pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz) { diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S --- a/sys/amd64/amd64/support.S +++ b/sys/amd64/amd64/support.S @@ -1977,3 +1977,24 @@ popq %rax retq END(mds_handler_silvermont) + +/* + * Do the same as Linux and execute IRET explicitly, despite IPI + * return does it as well. + */ +ENTRY(cpu_sync_core) +/* + * Can utilize SERIALIZE when instruction is moved from + * 'future extensions' to SDM. + */ + movq (%rsp), %rdx + movl %ss, %eax + pushq %rax + pushq %rsp + addq $16, (%rsp) + pushfq + movl %cs, %eax + pushq %rax + pushq %rdx + iretq +END(cpu_sync_core) diff --git a/sys/arm/arm/pmap-v6.c b/sys/arm/arm/pmap-v6.c --- a/sys/arm/arm/pmap-v6.c +++ b/sys/arm/arm/pmap-v6.c @@ -6214,6 +6214,12 @@ critical_exit(); } +void +pmap_active_cpus(pmap_t pmap, cpuset_t *res) +{ + *res = pmap->pm_active; +} + /* * Perform the pmap work for mincore(2). If the page is not both referenced and * modified by this pmap, returns its physical address so that the caller can diff --git a/sys/arm/arm/vm_machdep.c b/sys/arm/arm/vm_machdep.c --- a/sys/arm/arm/vm_machdep.c +++ b/sys/arm/arm/vm_machdep.c @@ -320,3 +320,8 @@ return (EINVAL); } + +void +cpu_sync_core(void) +{ +} diff --git a/sys/arm64/arm64/vm_machdep.c b/sys/arm64/arm64/vm_machdep.c --- a/sys/arm64/arm64/vm_machdep.c +++ b/sys/arm64/arm64/vm_machdep.c @@ -312,3 +312,14 @@ if (busdma_swi_pending != 0) busdma_swi(); } + +void +cpu_sync_core(void) +{ + /* + * Do nothing. According to ARM ARMv8 D1.11 Exception return + * If FEAT_ExS is not implemented, or if FEAT_ExS is + * implemented and the SCTLR_ELx.EOS field is set, exception + * return from ELx is a context synchronization event. 
+ */ +} diff --git a/sys/arm64/include/pmap.h b/sys/arm64/include/pmap.h --- a/sys/arm64/include/pmap.h +++ b/sys/arm64/include/pmap.h @@ -152,6 +152,8 @@ (uint64_t)(asid) << ASID_TO_OPERAND_SHIFT; \ }) +#define PMAP_WANT_ACTIVE_CPUS_NAIVE + extern vm_offset_t virtual_avail; extern vm_offset_t virtual_end; diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -3795,6 +3795,7 @@ kern/kern_loginclass.c standard kern/kern_malloc.c standard kern/kern_mbuf.c standard +kern/kern_membarrier.c standard kern/kern_mib.c standard kern/kern_module.c standard kern/kern_mtxpool.c standard @@ -3813,6 +3814,7 @@ kern/kern_rctl.c standard kern/kern_resource.c standard kern/kern_rmlock.c standard +kern/kern_rseq.c standard kern/kern_rwlock.c standard kern/kern_sdt.c optional kdtrace_hooks kern/kern_sema.c standard diff --git a/sys/i386/i386/pmap_base.c b/sys/i386/i386/pmap_base.c --- a/sys/i386/i386/pmap_base.c +++ b/sys/i386/i386/pmap_base.c @@ -946,6 +946,12 @@ pmap_methods_ptr->pm_kremove(va); } +void +pmap_active_cpus(pmap_t pmap, cpuset_t *res) +{ + *res = pmap->pm_active; +} + extern struct pmap_methods pmap_pae_methods, pmap_nopae_methods; int pae_mode; SYSCTL_INT(_vm_pmap, OID_AUTO, pae_mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, diff --git a/sys/i386/i386/support.s b/sys/i386/i386/support.s --- a/sys/i386/i386/support.s +++ b/sys/i386/i386/support.s @@ -580,3 +580,11 @@ movl %eax, %cr0 3: ret END(mds_handler_silvermont) + +ENTRY(cpu_sync_core) + popl %eax + pushfl + pushl %cs + pushl %eax + iretl +END(cpu_sync_core) diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -212,6 +212,11 @@ CTLFLAG_RWTUN, &__elfN(sigfastblock), 0, "enable sigfastblock for new processes"); +static int __elfN(rseq1) = 1; +SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, rseq1, + CTLFLAG_RWTUN, &__elfN(rseq1), 0, + "enable rseq v1 ABI for new processes"); + static bool __elfN(allow_wx) = true; SYSCTL_BOOL(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO, allow_wx, CTLFLAG_RWTUN, &__elfN(allow_wx), 0, @@ -1463,8 +1468,9 @@ AUXARGS_ENTRY(pos, AT_HWCAP, *imgp->sysent->sv_hwcap); if (imgp->sysent->sv_hwcap2 != NULL) AUXARGS_ENTRY(pos, AT_HWCAP2, *imgp->sysent->sv_hwcap2); - AUXARGS_ENTRY(pos, AT_BSDFLAGS, __elfN(sigfastblock) ? - ELF_BSDF_SIGFASTBLK : 0); + AUXARGS_ENTRY(pos, AT_BSDFLAGS, + (__elfN(sigfastblock) ? ELF_BSDF_SIGFASTBLK : 0) | + (__elfN(rseq1) ? ELF_BSDF_RSEQ1 : 0 )); AUXARGS_ENTRY(pos, AT_ARGC, imgp->args->argc); AUXARGS_ENTRY_PTR(pos, AT_ARGV, imgp->argv); AUXARGS_ENTRY(pos, AT_ENVC, imgp->args->envc); diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -826,12 +826,15 @@ p->p_flag2 &= ~P2_NOTRACE; if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0) p->p_flag2 &= ~P2_STKGAP_DISABLE; + p->p_flag2 &= ~(P2_MEMBAR_PRIVE | P2_MEMBAR_PRIVE_SYNCORE | + P2_MEMBAR_GLOBE); if (p->p_flag & P_PPWAIT) { p->p_flag &= ~(P_PPWAIT | P_PPTRACE); cv_broadcast(&p->p_pwait); /* STOPs are no longer ignored, arrange for AST */ signotify(td); } + td->td_rseq_abi = NULL; if ((imgp->sysent->sv_setid_allowed != NULL && !(*imgp->sysent->sv_setid_allowed)(td, imgp)) || diff --git a/sys/kern/kern_membarrier.c b/sys/kern/kern_membarrier.c new file mode 100644 --- /dev/null +++ b/sys/kern/kern_membarrier.c @@ -0,0 +1,276 @@ +/*- + * Copyright (c) 2021 The FreeBSD Foundation + * + * This software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MEMBARRIER_SUPPORTED_CMDS ( \ + MEMBARRIER_CMD_GLOBAL | \ + MEMBARRIER_CMD_GLOBAL_EXPEDITED | \ + MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED | \ + MEMBARRIER_CMD_PRIVATE_EXPEDITED | \ + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED | \ + MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE | \ + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE | \ + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ | \ + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ) + +static void +membarrier_action_rseq(void *arg __unused) +{ + struct thread *td; + + td = curthread; + thread_lock(td); + td->td_flags |= TDF_ASTPENDING; + td->td_flags2 |= TDF2_RSEQ_MB; + thread_unlock(td); +} + +static void +membarrier_action_seqcst(void *arg __unused) +{ + atomic_thread_fence_seq_cst(); +} + +static void +membarrier_action_seqcst_sync_core(void *arg __unused) +{ + atomic_thread_fence_seq_cst(); + cpu_sync_core(); +} + +static void +do_membarrier_ipi(cpuset_t *csp, void (*func)(void *)) +{ + atomic_thread_fence_seq_cst(); + smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func, + smp_no_rendezvous_barrier, NULL); + atomic_thread_fence_seq_cst(); +} + +static void +check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init) +{ + struct pcpu *pc; + uint64_t sw; + + if (CPU_ISSET(c, csp)) + return; + + pc = cpuid_to_pcpu[c]; + if (pc->pc_curthread == pc->pc_idlethread) { + CPU_SET(c, csp); + return; + } + + /* + * Sync with context switch to ensure that override of + * pc_curthread with non-idle thread pointer is visible before + * reading of pc_switchtime. + */ + atomic_thread_fence_acq(); + + sw = pc->pc_switchtime; + if (init) + swt[c] = sw; + else if (sw != swt[c]) + CPU_SET(c, csp); +} + +/* + * + * XXXKIB: We execute the requested action (seq_cst and possibly + * sync_core) on current CPU as well. There is no guarantee that + * current thread executes anything with the full fence semantics + * during syscall execution. Similarly, cpu_core_sync() semantics + * might be not provided by the syscall return. E.g. on amd64 we + * typically return without IRET. 
+ */ +int +kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id) +{ + struct proc *p, *p1; + struct thread *td1; + cpuset_t cs; + uint64_t *swt; + int c, error; + bool first; + + if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0) + return (EINVAL); + + if (cmd == MEMBARRIER_CMD_QUERY) { + td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS; + return (0); + } + + p = td->td_proc; + error = 0; + + switch (cmd) { + case MEMBARRIER_CMD_GLOBAL: + swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK); + CPU_ZERO(&cs); + sched_pin(); + CPU_SET(PCPU_GET(cpuid), &cs); + for (first = true; error == 0; first = false) { + CPU_FOREACH(c) + check_cpu_switched(c, &cs, swt, first); + if (CPU_CMP(&cs, &all_cpus) == 0) + break; + error = pause_sig("mmbr", 1); + if (error == EWOULDBLOCK) + error = 0; + } + sched_unpin(); + free(swt, M_TEMP); + atomic_thread_fence_seq_cst(); + break; + + case MEMBARRIER_CMD_GLOBAL_EXPEDITED: + if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) { + error = EPERM; + } else { + CPU_ZERO(&cs); + CPU_FOREACH(c) { + td1 = cpuid_to_pcpu[c]->pc_curthread; + p1 = td1->td_proc; + if (p1 != NULL && + (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0) + CPU_SET(c, &cs); + } + do_membarrier_ipi(&cs, membarrier_action_seqcst); + } + break; + + case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: + if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) { + PROC_LOCK(p); + p->p_flag2 |= P2_MEMBAR_GLOBE; + PROC_UNLOCK(p); + } + break; + + case MEMBARRIER_CMD_PRIVATE_EXPEDITED: + if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) { + error = EPERM; + } else { + pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs); + do_membarrier_ipi(&cs, membarrier_action_seqcst); + } + break; + + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: + if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) { + PROC_LOCK(p); + p->p_flag2 |= P2_MEMBAR_PRIVE; + PROC_UNLOCK(p); + } + break; + + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: + if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) { + error = EPERM; + } else { + /* + * Calculating the IPI multicast mask from + * pmap active mask means that we do not call + * cpu_sync_core() on CPUs that were missed + * from pmap active mask but could be switched + * from or to meantime. This is fine at least + * on amd64 because threads always use slow + * (IRETQ) path to return from syscall after + * context switch. 
+ */ + pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs); + + do_membarrier_ipi(&cs, + membarrier_action_seqcst_sync_core); + } + break; + + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: + if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) { + PROC_LOCK(p); + p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE; + PROC_UNLOCK(p); + } + break; + + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: + if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_RSEQ) == 0) { + error = EPERM; + break; + } + pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs); + if ((flags & MEMBARRIER_CMD_FLAG_CPU) != 0) { + if (!CPU_ISSET(cpu_id, &cs)) + break; + CPU_ZERO(&cs); + CPU_SET(cpu_id, &cs); + } + do_membarrier_ipi(&cs, membarrier_action_rseq); + break; + + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: + if ((p->p_flag2 & P2_MEMBAR_PRIVE_RSEQ) == 0) { + PROC_LOCK(p); + p->p_flag2 |= P2_MEMBAR_PRIVE_RSEQ; + PROC_UNLOCK(p); + } + break; + + default: + error = EINVAL; + break; + } + + return (error); +} + +int +sys_membarrier(struct thread *td, struct membarrier_args *uap) +{ + return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id)); +} diff --git a/sys/kern/kern_rseq.c b/sys/kern/kern_rseq.c new file mode 100644 --- /dev/null +++ b/sys/kern/kern_rseq.c @@ -0,0 +1,280 @@ +/*- + * Copyright (c) 2021 The FreeBSD Foundation + * + * This software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static void +rseq_inactivate(struct thread *td) +{ + td->td_rseq_abi = NULL; +} + +static void +rseq_inactivate_sig(struct thread *td, void *addr, int si_code) +{ + ksiginfo_t ksi; + + rseq_inactivate(td); + ksiginfo_init_trap(&ksi); + ksi.ksi_signo = SIGSEGV; + ksi.ksi_code = si_code; + ksi.ksi_trapno = 0; + ksi.ksi_addr = addr; + trapsignal(td, &ksi); +} + +void +rseq_ast(struct thread *td) +{ + struct rseq rs; + struct rseq_cs rc; + void *sig_addr; + register_t pc; + uint32_t usig; + int cpu, error; + bool clear_cs; + + if (td->td_rseq_abi == NULL) + return; + + /* + * We cannot enter critical section there to keep td_oncpu + * valid due to userspace access. We do not even want to + * sched_pin() for the same reason. 
+ * + * It is fine to get a context switch after reading td_oncpu, + * since this would cause new AST pending and we re-enter this + * function to update rseq cpu number. + * + * Microoptimize 64bit architectures by doing single 64bit + * write for cpu ids. For instance, on SMAP-enabled amd64 + * this saves two serialization instructions STAC/CLAC. + */ + cpu = td->td_oncpu; +#ifdef __LP64__ + rs.cpu_id_start = cpu; + rs.cpu_id = cpu; + error = suword64((char *)td->td_rseq_abi + offsetof(struct rseq, + cpu_id_start), *(uint64_t *)(char *)&rs.cpu_id_start); +#else + error = suword((char *)td->td_rseq_abi + offsetof(struct rseq, + cpu_id_start), cpu); + if (error == 0) { + error = suword((char *)td->td_rseq_abi + + offsetof(struct rseq, cpu_id), cpu); + } +#endif + if (error != 0) { + rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_W); + return; + } + + error = copyin(td->td_rseq_abi, &rs, sizeof(rs)); + if (error != 0) { + rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R); + return; + } + + if (rs.rseq_cs.ptr64 == 0) + return; + clear_cs = false; + + critical_enter(); + if ((td->td_flags2 & (TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB)) == 0 && + (rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) != 0 && + ((rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) != 0 || + td->td_oncpu == td->td_lastcpu)) + return; + critical_exit(); + + error = copyin((void *)rs.rseq_cs.ptr64, &rc, sizeof(rc)); + if (error != 0) { + rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R); + return; + } + if (rc.version != 0) { + rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R); + return; + } + + critical_enter(); + if ((td->td_flags2 & (TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB)) == 0 && + (rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) != 0 && + ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) != 0 || + td->td_oncpu == td->td_lastcpu)) + return; + critical_exit(); + + if ((td->td_flags2 & (TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB)) != 0) { + if ((td->td_flags2 & TDF2_RSEQ_CLRCS) != 0) + clear_cs = true; + thread_lock(td); + td->td_flags2 &= ~(TDF2_RSEQ_CLRCS | TDF2_RSEQ_MB); + thread_unlock(td); + } + pc = TRAPF_PC(td->td_frame); + if (!clear_cs && + pc >= rc.start_ip && pc < rc.start_ip + rc.post_commit_offset) { + /* check signature */ + sig_addr = (void *)(rc.abort_ip - sizeof(usig)); + error = copyin(sig_addr, &usig, sizeof(usig)); + if (error != 0) { + rseq_inactivate_sig(td, sig_addr, SEGV_RSEQ_R); + return; + } + if (usig != td->td_rseq_sig) { + rseq_inactivate_sig(td, sig_addr, SEGV_RSEQ_SIG); + return; + } + + TRAPF_PC(td->td_frame) = rc.abort_ip; + clear_cs = true; + } + if (clear_cs) { + if (suword64((char *)td->td_rseq_abi + offsetof(struct rseq, + rseq_cs.ptr), 0) == -1) { + rseq_inactivate_sig(td, (char *)td->td_rseq_abi + + offsetof(struct rseq, rseq_cs.ptr), + SEGV_RSEQ_W); + return; + } + } +} + +void +rseq_before_sig(struct thread *td) +{ + struct rseq rs; + struct rseq_cs rc; + uint32_t usig; + int error; + + td->td_pflags2 &= ~TDP2_RSEQ_SIG; + if (td->td_rseq_abi == NULL) + return; + + error = copyin(td->td_rseq_abi, &rs, sizeof(rs)); + if (error != 0) { + rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R); + return; + } + + if (rs.rseq_cs.ptr64 == 0 || + (rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0) + return; + + error = copyin((void *)rs.rseq_cs.ptr64, &rc, sizeof(rc)); + if (error != 0) { + rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R); + return; + } + + if ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0) + return; + + /* check signature */ + error = copyin((void 
*)(rc.start_ip - sizeof(usig)), &usig, + sizeof(usig)); + if (error != 0) { + rseq_inactivate_sig(td, (void *)(rc.start_ip - sizeof(usig)), + SEGV_RSEQ_R); + return; + } + if (usig != td->td_rseq_sig) { + rseq_inactivate_sig(td, (void *)(rc.start_ip - sizeof(usig)), + SEGV_RSEQ_SIG); + return; + } + + td->td_pflags2 |= TDP2_RSEQ_SIG; + td->td_rseq_start_ip = rc.start_ip; + td->td_rseq_end_ip = rc.start_ip + rc.post_commit_offset; + td->td_rseq_abort_ip = rc.abort_ip; +} + +void +rseq_on_sig(struct thread *td) +{ + register_t pc; + + if ((td->td_pflags2 & TDP2_RSEQ_SIG) == 0) + return; + td->td_pflags2 &= ~TDP2_RSEQ_SIG; + pc = TRAPF_PC(td->td_frame); + if (pc >= td->td_rseq_start_ip && pc < td->td_rseq_end_ip) { + TRAPF_PC(td->td_frame) = td->td_rseq_abort_ip; + thread_lock(td); + td->td_flags |= TDF_ASTPENDING | TDF_RSEQ; + td->td_flags2 |= TDF2_RSEQ_CLRCS; + thread_unlock(td); + } +} + +static int +kern_rseq(struct thread *td, uintptr_t rseq, uint32_t rseqlen, int flags, + uint32_t sig) +{ + if (rseqlen != sizeof(struct rseq)) + return (EINVAL); + + if (flags == RSEQ_FLAG_UNREGISTER) { + if (rseq != 0 || td->td_rseq_abi == NULL) + return (EINVAL); + if (sig != td->td_rseq_sig) + return (EPERM); + rseq_inactivate(td); + return (0); + } + + if (td->td_rseq_abi != NULL) + return (EBUSY); + if (flags != 0 || rseq == 0 || + trunc_page(rseq) != trunc_page(rseq + rseqlen)) + return (EINVAL); + + td->td_rseq_abi = (void *)rseq; + td->td_rseq_sig = sig; + thread_lock(td); + td->td_flags |= TDF_ASTPENDING | TDF_RSEQ; + thread_unlock(td); + return (0); +} + +int +sys_rseq(struct thread *td, struct rseq_args *uap) +{ + return (kern_rseq(td, (uintptr_t)uap->rseq, uap->rseqlen, + uap->flags, uap->sig)); +} diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -2029,6 +2030,7 @@ KASSERT(_SIG_VALID(sig), ("invalid signal")); sigfastblock_fetch(td); + rseq_before_sig(td); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); @@ -2042,6 +2044,7 @@ ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif + rseq_on_sig(td); (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); @@ -3253,6 +3256,7 @@ if (p->p_sig == sig) { p->p_sig = 0; } + rseq_on_sig(td); (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -491,7 +492,7 @@ mi_switch(int flags) { uint64_t runtime, new_switchtime; - struct thread *td; + struct thread *td, *td1; td = curthread; /* XXX */ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); @@ -549,9 +550,14 @@ /* * If the last thread was exiting, finish cleaning it up. 
*/ - if ((td = PCPU_GET(deadthread))) { + if ((td1 = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); - thread_stash(td); + thread_stash(td1); + } + if (td->td_rseq_abi != NULL) { + thread_lock(td); + td->td_flags |= TDF_ASTPENDING | TDF_RSEQ; + thread_unlock(td); } spinlock_exit(); } diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c --- a/sys/kern/kern_thread.c +++ b/sys/kern/kern_thread.c @@ -87,11 +87,11 @@ #ifdef __amd64__ _Static_assert(offsetof(struct thread, td_flags) == 0x108, "struct thread KBI td_flags"); -_Static_assert(offsetof(struct thread, td_pflags) == 0x110, +_Static_assert(offsetof(struct thread, td_pflags) == 0x114, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x4a8, +_Static_assert(offsetof(struct thread, td_frame) == 0x4d0, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x6e0, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0xb8, "struct proc KBI p_flag"); @@ -109,9 +109,9 @@ "struct thread KBI td_flags"); _Static_assert(offsetof(struct thread, td_pflags) == 0xa4, "struct thread KBI td_pflags"); -_Static_assert(offsetof(struct thread, td_frame) == 0x308, +_Static_assert(offsetof(struct thread, td_frame) == 0x31c, "struct thread KBI td_frame"); -_Static_assert(offsetof(struct thread, td_emuldata) == 0x34c, +_Static_assert(offsetof(struct thread, td_emuldata) == 0x360, "struct thread KBI td_emuldata"); _Static_assert(offsetof(struct proc, p_flag) == 0x6c, "struct proc KBI p_flag"); diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -243,7 +244,7 @@ flags = td->td_flags; td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK | TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND | - TDF_KQTICKLED); + TDF_KQTICKLED | TDF_RSEQ); thread_unlock(td); VM_CNT_INC(v_trap); @@ -332,6 +333,7 @@ if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 || !SIGISEMPTY(p->p_siglist)) { sigfastblock_fetch(td); + rseq_before_sig(td); PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { @@ -354,6 +356,9 @@ */ sigfastblock_setpend(td, resched_sigs); + if ((flags & TDF_RSEQ) != 0) + rseq_ast(td); + #ifdef KTRACE KTRUSERRET(td); #endif diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -3299,13 +3299,28 @@ 581 AUE_NULL STD|CAPENABLED { int sched_getcpu(void); } - 582 AUE_SWAPOFF STD { int swapoff( _In_z_ const char *name, u_int flags, ); } +583 AUE_NULL STD|CAPENABLED { + int membarrier( + int cmd, + unsigned flags, + int cpu_id + ); + } +584 AUE_NULL STD|CAPENABLED { + int rseq( + _Inout_updates_bytes_(rseqlen) void *rseq, + uint32_t rseqlen, + int flags, + uint32_t sig + ); + } + ; Please copy any additions and changes to the following compatability tables: ; sys/compat/freebsd32/syscalls.master diff --git a/sys/mips/mips/pmap.c b/sys/mips/mips/pmap.c --- a/sys/mips/mips/pmap.c +++ b/sys/mips/mips/pmap.c @@ -3763,3 +3763,9 @@ return (FALSE); } } + +void +pmap_active_cpus(pmap_t pmap, cpuset_t *res) +{ + *res = pmap->pm_active; +} diff --git a/sys/mips/mips/vm_machdep.c b/sys/mips/mips/vm_machdep.c --- a/sys/mips/mips/vm_machdep.c +++ b/sys/mips/mips/vm_machdep.c @@ -459,6 +459,11 @@ return (EINVAL); } +void +cpu_sync_core(void) +{ +} + /* * 
Software interrupt handler for queued VM system processing. */ diff --git a/sys/powerpc/powerpc/pmap_dispatch.c b/sys/powerpc/powerpc/pmap_dispatch.c --- a/sys/powerpc/powerpc/pmap_dispatch.c +++ b/sys/powerpc/powerpc/pmap_dispatch.c @@ -255,3 +255,9 @@ return (FALSE); } } + +void +pmap_active_cpus(pmap_t pmap, cpuset_t *res) +{ + *res = pmap->pm_active; +} diff --git a/sys/powerpc/powerpc/vm_machdep.c b/sys/powerpc/powerpc/vm_machdep.c --- a/sys/powerpc/powerpc/vm_machdep.c +++ b/sys/powerpc/powerpc/vm_machdep.c @@ -268,3 +268,9 @@ return (EINVAL); } + +void +cpu_sync_core(void) +{ + isync(); +} diff --git a/sys/riscv/riscv/pmap.c b/sys/riscv/riscv/pmap.c --- a/sys/riscv/riscv/pmap.c +++ b/sys/riscv/riscv/pmap.c @@ -4433,6 +4433,12 @@ PCPU_SET(curpmap, pmap); } +void +pmap_active_cpus(pmap_t pmap, cpuset_t *res) +{ + *res = pmap->pm_active; +} + void pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz) { diff --git a/sys/riscv/riscv/vm_machdep.c b/sys/riscv/riscv/vm_machdep.c --- a/sys/riscv/riscv/vm_machdep.c +++ b/sys/riscv/riscv/vm_machdep.c @@ -51,6 +51,7 @@ #include #include +#include #include #include #include @@ -276,3 +277,9 @@ /* Nothing to do here - busdma bounce buffers are not implemented. */ } + +void +cpu_sync_core(void) +{ + fence_i(); +} diff --git a/sys/sys/elf_common.h b/sys/sys/elf_common.h --- a/sys/sys/elf_common.h +++ b/sys/sys/elf_common.h @@ -1497,5 +1497,6 @@ #define R_X86_64_REX_GOTPCRELX 42 #define ELF_BSDF_SIGFASTBLK 0x0001 /* Kernel supports fast sigblock */ +#define ELF_BSDF_RSEQ1 0x0002 /* Kernel support for rseq v1 */ #endif /* !_SYS_ELF_COMMON_H_ */ diff --git a/sys/sys/membarrier.h b/sys/sys/membarrier.h new file mode 100644 --- /dev/null +++ b/sys/sys/membarrier.h @@ -0,0 +1,64 @@ +/*- + * Copyright (c) 2021 The FreeBSD Foundation + * + * This software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef __SYS_MEMBARRIER_H__ +#define __SYS_MEMBARRIER_H__ + +#include + +/* + * The enum membarrier_cmd values are bits. The MEMBARRIER_CMD_QUERY + * command returns a bitset indicating which commands are supported. + * Also the value of MEMBARRIER_CMD_QUERY is zero, so it is + * effectively not returned by the query. 
+ */ +enum membarrier_cmd { + MEMBARRIER_CMD_QUERY = 0x00000000, + MEMBARRIER_CMD_GLOBAL = 0x00000001, + MEMBARRIER_CMD_SHARED = MEMBARRIER_CMD_GLOBAL, + MEMBARRIER_CMD_GLOBAL_EXPEDITED = 0x00000002, + MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED = 0x00000004, + MEMBARRIER_CMD_PRIVATE_EXPEDITED = 0x00000008, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED = 0x00000010, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE = 0x00000020, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE = 0x00000040, + MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ = 0x00000080, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = 0x00000100, +}; + +enum membarrier_cmd_flag { + MEMBARRIER_CMD_FLAG_CPU = 0x00000001, +}; + +#ifndef _KERNEL +__BEGIN_DECLS +int membarrier(int, unsigned, int); +__END_DECLS +#endif /* _KERNEL */ + +#endif /* __SYS_MEMBARRIER_H__ */ diff --git a/sys/sys/proc.h b/sys/sys/proc.h --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -262,6 +262,7 @@ /* Cleared during fork1() */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ + int td_flags2; /* (t) TDF2_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_pflags2; /* (k) Private thread (TDP2_*) flags. */ @@ -322,6 +323,11 @@ size_t td_vslock_sz; /* (k) amount of vslock-ed space */ struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */ u_int td_ucredref; /* (k) references on td_realucred */ + uint32_t td_rseq_sig; /* (k) abort handler signature */ + void *td_rseq_abi; /* (k) usermode rseq */ + register_t td_rseq_start_ip;/* (k) */ + register_t td_rseq_end_ip; /* (k) */ + register_t td_rseq_abort_ip;/* (k) */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ @@ -468,7 +474,7 @@ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ -#define TDF_UNUSED23 0x00800000 /* --available-- */ +#define TDF_RSEQ 0x00800000 /* rseq active */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ @@ -477,6 +483,9 @@ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ +#define TDF2_RSEQ_CLRCS 0x00000001 /* rseq clear rc */ +#define TDF2_RSEQ_MB 0x00000002 /* MEMBARRIER_RSEQ requested */ + /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ @@ -537,6 +546,7 @@ #define TDP2_SBPAGES 0x00000001 /* Owns sbusy on some pages */ #define TDP2_COMPAT32RB 0x00000002 /* compat32 ABI for robust lists */ #define TDP2_ACCT 0x00000004 /* Doing accounting */ +#define TDP2_RSEQ_SIG 0x00000008 /* * Reasons that the current thread can not be run yet. 
@@ -847,6 +857,14 @@ #define P2_NO_NEW_PRIVS 0x00008000 /* Ignore setuid */ #define P2_WXORX_DISABLE 0x00010000 /* WX mappings enabled */ #define P2_WXORX_ENABLE_EXEC 0x00020000 /* WXORX enabled after exec */ +#define P2_MEMBAR_PRIVE 0x00040000 /* membar private expedited + registered */ +#define P2_MEMBAR_PRIVE_SYNCORE 0x00080000 /* membar private expedited + sync core registered */ +#define P2_MEMBAR_GLOBE 0x00100000 /* membar global expedited + registered */ +#define P2_MEMBAR_PRIVE_RSEQ 0x00200000 /* membar private expedited + rseq registered */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ @@ -1170,6 +1188,7 @@ int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); +void cpu_sync_core(void); void cpu_throw(struct thread *, struct thread *) __dead2; bool curproc_sigkilled(void); void userret(struct thread *, struct trapframe *); diff --git a/sys/sys/rseq.h b/sys/sys/rseq.h new file mode 100644 --- /dev/null +++ b/sys/sys/rseq.h @@ -0,0 +1,99 @@ +/*- + * Copyright (c) 2021 The FreeBSD Foundation + * + * This software were developed by Konstantin Belousov + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef __SYS_RSEQ_H__ +#define __SYS_RSEQ_H__ + +#include +#include +#include + +enum rseq_cpu_id_state { + RSEQ_CPU_ID_UNINITIALIZED = -1, + RSEQ_CPU_ID_REGISTRATION_FAILED = -2, +}; + +enum rseq_flags { + RSEQ_FLAG_UNREGISTER = 1, +}; + +enum rseq_cs_flags { + RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 0x00000001, + RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 0x00000002, + RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 0x00000004, +}; + +struct rseq_cs { + uint32_t version; + uint32_t flags; + uint64_t start_ip; + uint64_t post_commit_offset; + uint64_t abort_ip; +}; + +struct rseq { + uint32_t cpu_id_start; + uint32_t cpu_id; + union { + uint64_t ptr64; +#ifdef __LP64__ + uint64_t ptr; +#else + struct { +#if _BYTE_ORDER == _BIG_ENDIAN + uint32_t pad; + uint32_t ptr32; +#else /* BYTE_ORDER */ + uint32_t ptr32; + uint32_t pad; +#endif /* BYTE_ORDER */ + } ptr; +#endif /* LP64 */ + } rseq_cs; + uint32_t flags; +}; + +#ifdef _KERNEL + +#define TD_RSEQ_ACTIVE 0x00000001 + +void rseq_ast(struct thread *td); +void rseq_before_sig(struct thread *td); +void rseq_on_sig(struct thread *td); + +#else /* _KERNEL */ + +__BEGIN_DECLS +extern __thread volatile struct rseq __rseq_abi __weak_symbol; + +int rseq(volatile struct rseq *rseq, uint32_t rseqlen, int flags, uint32_t sig); +__END_DECLS + +#endif /* _KERNEL */ + +#endif /* __SYS_RSEQ_H__ */ diff --git a/sys/sys/signal.h b/sys/sys/signal.h --- a/sys/sys/signal.h +++ b/sys/sys/signal.h @@ -329,6 +329,9 @@ #define SEGV_ACCERR 2 /* Invalid permissions for mapped */ /* object. */ #define SEGV_PKUERR 100 /* x86: PKU violation */ +#define SEGV_RSEQ_R 101 /* rseq access read fault */ +#define SEGV_RSEQ_W 102 /* rseq access write fault */ +#define SEGV_RSEQ_SIG 103 /* rseq signature check fault */ /* codes for SIGFPE */ #define FPE_INTOVF 1 /* Integer overflow. */ diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -200,6 +200,8 @@ int inherit); int kern_mkdirat(struct thread *td, int fd, const char *path, enum uio_seg segflg, int mode); +int kern_membarrier(struct thread *td, int cmd, unsigned flags, + int cpu_id); int kern_mkfifoat(struct thread *td, int fd, const char *path, enum uio_seg pathseg, int mode); int kern_mknodat(struct thread *td, int fd, const char *path, diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h --- a/sys/vm/pmap.h +++ b/sys/vm/pmap.h @@ -92,6 +92,7 @@ #include #ifdef _KERNEL +#include struct thread; /* @@ -120,6 +121,7 @@ #define PMAP_TS_REFERENCED_MAX 5 void pmap_activate(struct thread *td); +void pmap_active_cpus(pmap_t pmap, cpuset_t *res); void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice); void pmap_align_superpage(vm_object_t, vm_ooffset_t, vm_offset_t *, diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -881,6 +882,31 @@ #endif } +#ifdef PMAP_WANT_ACTIVE_CPUS_NAIVE +void +pmap_active_cpus(pmap_t pmap, cpuset_t *res) +{ + struct thread *td; + struct proc *p; + struct vmspace *vm; + int c; + + CPU_ZERO(res); + CPU_FOREACH(c) { + td = cpuid_to_pcpu[c]->pc_curthread; + p = td->td_proc; + if (p == NULL) + continue; + vm = vmspace_acquire_ref(p); + if (vm == NULL) + continue; + if (pmap == vmspace_pmap(vm)) + CPU_SET(c, res); + vmspace_free(vm); + } +} +#endif + /* * Allow userspace to directly trigger the VM drain routine for testing * purposes.
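
As a usage reference for the interfaces added above, here is a minimal userspace sketch of consuming the rseq area that libc registers in rseq_abi.c. It is not part of the patch; it assumes AT_BSDFLAGS can be fetched through the public elf_aux_info(3) interface (libc itself uses the private _elf_aux_info()), and it relies only on __rseq_abi and ELF_BSDF_RSEQ1 as declared in the headers of this change.

#include <sys/types.h>
#include <sys/auxv.h>
#include <sys/elf_common.h>
#include <sys/rseq.h>
#include <stdio.h>

int
main(void)
{
	int bsdflags = 0;

	/*
	 * Same feature check as __rseq_abi_init() above, done through the
	 * public elf_aux_info(3) interface.
	 */
	if (elf_aux_info(AT_BSDFLAGS, &bsdflags, sizeof(bsdflags)) != 0 ||
	    (bsdflags & ELF_BSDF_RSEQ1) == 0) {
		printf("rseq v1 ABI not advertised by the kernel\n");
		return (1);
	}

	/*
	 * __rseq_abi is registered by libc's constructor (and per-thread by
	 * libthr); the kernel refreshes cpu_id on return to user mode after
	 * a context switch or migration.
	 */
	printf("running on cpu %u\n", (unsigned)__rseq_abi.cpu_id);
	return (0);
}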
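
A second sketch exercises the registration rules enforced by kern_rseq() above: the area may not cross a page boundary, a second registration for a thread fails with EBUSY (which is what happens when libc has already installed __rseq_abi), and unregistration must pass a null pointer together with the original signature. The signature value here is arbitrary and purely illustrative; no abort handlers are set up.

#include <sys/types.h>
#include <sys/rseq.h>
#include <err.h>
#include <errno.h>
#include <stdio.h>

#define	MY_RSEQ_SIG	0x53053053u	/* hypothetical abort-handler signature */

/* Aligned so that the area cannot straddle a page boundary. */
static volatile struct rseq my_area __attribute__((__aligned__(32)));

int
main(void)
{
	if (rseq(&my_area, sizeof(my_area), 0, MY_RSEQ_SIG) == -1) {
		if (errno == EBUSY) {
			/* libc already registered __rseq_abi for this thread. */
			printf("an rseq area is already registered\n");
			return (0);
		}
		err(1, "rseq register");
	}

	/* The rseq AST runs on syscall return, filling in the cpu ids. */
	printf("registered, cpu %u\n", (unsigned)my_area.cpu_id);

	/* Unregistration takes a null pointer and the matching signature. */
	if (rseq(NULL, sizeof(my_area), RSEQ_FLAG_UNREGISTER, MY_RSEQ_SIG) == -1)
		err(1, "rseq unregister");
	return (0);
}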
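
Finally, a corresponding sketch for the membarrier(2) side, using only the commands defined in sys/membarrier.h above. As in kern_membarrier(), the expedited commands fail with EPERM until the process has registered for them, and MEMBARRIER_CMD_QUERY reports the supported command bitmask in the return value.

#include <sys/types.h>
#include <sys/membarrier.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int cmds;

	/* MEMBARRIER_CMD_QUERY returns the bitmask of supported commands. */
	cmds = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);
	if (cmds == -1)
		err(1, "membarrier query");
	if ((cmds & MEMBARRIER_CMD_PRIVATE_EXPEDITED) == 0)
		errx(1, "private expedited membarrier unsupported");

	/* Registration is required once per process before use. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0) == -1)
		err(1, "membarrier register");

	/*
	 * Every thread of this process currently running on a CPU executes
	 * a full memory barrier before the call returns.
	 */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0) == -1)
		err(1, "membarrier");
	printf("expedited barrier completed\n");
	return (0);
}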