diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -421,6 +421,7 @@
 	_Fork;
 	fspacectl;
 	membarrier;
+	rseq;
 };
 
 FBSDprivate_1.0 {
diff --git a/sys/compat/freebsd32/capabilities.conf b/sys/compat/freebsd32/capabilities.conf
--- a/sys/compat/freebsd32/capabilities.conf
+++ b/sys/compat/freebsd32/capabilities.conf
@@ -552,6 +552,11 @@
 recvfrom
 recvmsg
 
+##
+## Allow rseq(2) to manage restartable sequences.
+##
+rseq
+
 ##
 ## Allow real-time scheduling primitives to be used.
 ##
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -1183,5 +1183,7 @@
 		    struct spacectl_range32 *rmsr); }
581	AUE_NULL	STD|NOPROTO	{ int membarrier(int cmd, unsigned flags, \
 		    int cpu_id); }
+582	AUE_NULL	STD|NOPROTO	{ int rseq(void *rseq, uint32_t rseqlen, \
+		    int flags, uint32_t sig); }
 
 ; vim: syntax=off
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3837,6 +3837,7 @@
 kern/kern_rctl.c		standard
 kern/kern_resource.c		standard
 kern/kern_rmlock.c		standard
+kern/kern_rseq.c		standard
 kern/kern_rwlock.c		standard
 kern/kern_sdt.c			optional kdtrace_hooks
 kern/kern_sema.c		standard
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -804,6 +804,7 @@
 		/* STOPs are no longer ignored, arrange for AST */
 		signotify(td);
 	}
+	td->td_rseq_abi = NULL;
 
 	if ((imgp->sysent->sv_setid_allowed != NULL &&
 	    !(*imgp->sysent->sv_setid_allowed)(td, imgp)) ||
diff --git a/sys/kern/kern_rseq.c b/sys/kern/kern_rseq.c
new file mode 100644
--- /dev/null
+++ b/sys/kern/kern_rseq.c
@@ -0,0 +1,230 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rseq.h>
+#include <sys/signalvar.h>
+#include <sys/sysproto.h>
+#include <machine/cpu.h>
+
+static void
+rseq_inactivate(struct thread *td)
+{
+	td->td_rseq_abi = NULL;
+}
+
+static void
+rseq_inactivate_sig(struct thread *td, void *addr, int si_code)
+{
+	ksiginfo_t ksi;
+
+	rseq_inactivate(td);
+	ksiginfo_init_trap(&ksi);
+	ksi.ksi_signo = SIGSEGV;
+	ksi.ksi_code = si_code;
+	ksi.ksi_trapno = 0;
+	ksi.ksi_addr = addr;
+	trapsignal(td, &ksi);
+}
+
+void
+rseq_ast(struct thread *td)
+{
+	struct rseq rs;
+	struct rseq_cs rc;
+	register_t pc;
+	int cpu, error;
+
+	if (td->td_rseq_abi == NULL)
+		return;
+
+	/*
+	 * We cannot enter a critical section here to keep td_oncpu
+	 * valid across the userspace access, and we do not even want
+	 * to sched_pin(), for the same reason.
+	 *
+	 * It is fine to get a context switch after reading td_oncpu,
+	 * since that would leave a new AST pending and we would
+	 * re-enter this function to update the rseq cpu number.
+	 *
+	 * Micro-optimize 64bit architectures by doing a single 64bit
+	 * write for both cpu ids.  For instance, on SMAP-enabled amd64
+	 * this saves two serialization instructions (STAC/CLAC).
+	 */
+	cpu = td->td_oncpu;
+#ifdef __LP64__
+	rs.cpu_id_start = cpu;
+	rs.cpu_id = cpu;
+	error = suword64((char *)td->td_rseq_abi + offsetof(struct rseq,
+	    cpu_id_start), *(uint64_t *)(char *)&rs.cpu_id_start);
+#else
+	error = suword((char *)td->td_rseq_abi + offsetof(struct rseq,
+	    cpu_id_start), cpu);
+	if (error == 0) {
+		error = suword((char *)td->td_rseq_abi +
+		    offsetof(struct rseq, cpu_id), cpu);
+	}
+#endif
+	if (error != 0) {
+		rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_W);
+		return;
+	}
+
+	error = copyin(td->td_rseq_abi, &rs, sizeof(rs));
+	if (error != 0) {
+		rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R);
+		return;
+	}
+
+	if (rs.rseq_cs.ptr64 == 0)
+		return;
+
+	critical_enter();
+	if ((rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) != 0 &&
+	    ((rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) != 0 ||
+	    td->td_oncpu == td->td_lastcpu)) {
+		critical_exit();
+		return;
+	}
+	critical_exit();
+
+	error = copyin((void *)rs.rseq_cs.ptr64, &rc, sizeof(rc));
+	if (error != 0) {
+		rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R);
+		return;
+	}
+	if (rc.version != 0) {
+		rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R);
+		return;
+	}
+
+	critical_enter();
+	if ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT) != 0 &&
+	    ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) != 0 ||
+	    td->td_oncpu == td->td_lastcpu)) {
+		critical_exit();
+		return;
+	}
+	critical_exit();
+
+	pc = TRAPF_PC(td->td_frame);
+	if (pc >= rc.start_ip && pc < rc.start_ip + rc.post_commit_offset) {
+		/* XXXKIB check signature */
+		TRAPF_PC(td->td_frame) = rc.abort_ip + 4;
+	}
+}
+
+void
+rseq_before_sig(struct thread *td)
+{
+	struct rseq rs;
+	struct rseq_cs rc;
+	int error;
+
+	td->td_pflags2 &= ~TDP2_RSEQ_SIG;
+	if (td->td_rseq_abi == NULL)
+		return;
+
+	error = copyin(td->td_rseq_abi, &rs, sizeof(rs));
+	if (error != 0) {
+		rseq_inactivate_sig(td, td->td_rseq_abi, SEGV_RSEQ_R);
+		return;
+	}
+
+	if (rs.rseq_cs.ptr64 == 0 ||
+	    (rs.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0)
+		return;
+
+	error = copyin((void *)rs.rseq_cs.ptr64, &rc, sizeof(rc));
+	if (error != 0) {
+		rseq_inactivate_sig(td, (void *)rs.rseq_cs.ptr64, SEGV_RSEQ_R);
+		return;
+	}
+
+	if ((rc.flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) != 0)
+		return;
+	/* XXXKIB check signature */
+
+	td->td_pflags2 |= TDP2_RSEQ_SIG;
+	td->td_rseq_start_ip = rc.start_ip;
+	td->td_rseq_end_ip = rc.start_ip + rc.post_commit_offset;
+	td->td_rseq_abort_ip = rc.abort_ip;
+}
+
+void
+rseq_on_sig(struct thread *td)
+{
+	register_t pc;
+
+	if ((td->td_pflags2 & TDP2_RSEQ_SIG) == 0)
+		return;
+	td->td_pflags2 &= ~TDP2_RSEQ_SIG;
+	pc = TRAPF_PC(td->td_frame);
+	if (pc >= td->td_rseq_start_ip && pc < td->td_rseq_end_ip)
+		TRAPF_PC(td->td_frame) = td->td_rseq_abort_ip;
+}
+
+static int
+kern_rseq(struct thread *td, uintptr_t rseq, uint32_t rseqlen, int flags,
+    uint32_t sig)
+{
+	if (rseqlen != sizeof(struct rseq))
+		return (EINVAL);
+
+	if (flags == RSEQ_FLAG_UNREGISTER) {
+		if (rseq != 0 || td->td_rseq_abi == NULL)
+			return (EINVAL);
+		if (sig != td->td_rseq_sig)
+			return (EPERM);
+		rseq_inactivate(td);
+		return (0);
+	}
+
+	if (td->td_rseq_abi != NULL)
+		return (EBUSY);
+	if (flags != 0 || rseq == 0 ||
+	    trunc_page(rseq) != trunc_page(rseq + rseqlen - 1))
+		return (EINVAL);
+
+	td->td_rseq_abi = (void *)rseq;
+	td->td_rseq_sig = sig;
+	thread_lock(td);
+	td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+	thread_unlock(td);
+	return (0);
+}
+
+int
+sys_rseq(struct thread *td, struct rseq_args *uap)
+{
+	return (kern_rseq(td, (uintptr_t)uap->rseq, uap->rseqlen,
+	    uap->flags, uap->sig));
+}
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -69,6 +69,7 @@
 #include
 #include
 #include
+#include <sys/rseq.h>
 #include
 #include
 #include
@@ -2017,6 +2018,7 @@
 	KASSERT(_SIG_VALID(sig), ("invalid signal"));
 
 	sigfastblock_fetch(td);
+	rseq_before_sig(td);
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
@@ -2030,6 +2032,7 @@
 		ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
 		    &td->td_sigmask, code);
 #endif
+	rseq_on_sig(td);
 	(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi,
 	    &td->td_sigmask);
 	postsig_done(sig, td, ps);
@@ -3204,6 +3207,7 @@
 		if (p->p_sig == sig) {
 			p->p_sig = 0;
 		}
+		rseq_on_sig(td);
 		(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 		postsig_done(sig, td, ps);
 	}
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -53,6 +53,7 @@
 #include
 #include
 #include
+#include <sys/rseq.h>
 #include
 #include
 #include
@@ -491,7 +492,7 @@
 mi_switch(int flags)
 {
 	uint64_t runtime, new_switchtime;
-	struct thread *td;
+	struct thread *td, *td1;
 
 	td = curthread;			/* XXX */
 	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
@@ -549,9 +550,14 @@
 	/*
 	 * If the last thread was exiting, finish cleaning it up.
 	 */
-	if ((td = PCPU_GET(deadthread))) {
+	if ((td1 = PCPU_GET(deadthread))) {
 		PCPU_SET(deadthread, NULL);
-		thread_stash(td);
+		thread_stash(td1);
+	}
+	if (td->td_rseq_abi != NULL) {
+		thread_lock(td);
+		td->td_flags |= TDF_ASTPENDING | TDF_RSEQ;
+		thread_unlock(td);
 	}
 	spinlock_exit();
 }
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -89,9 +89,9 @@
     "struct thread KBI td_flags");
 _Static_assert(offsetof(struct thread, td_pflags) == 0x110,
     "struct thread KBI td_pflags");
-_Static_assert(offsetof(struct thread, td_frame) == 0x4a8,
+_Static_assert(offsetof(struct thread, td_frame) == 0x4c8,
     "struct thread KBI td_frame");
-_Static_assert(offsetof(struct thread, td_emuldata) == 0x6b0,
+_Static_assert(offsetof(struct thread, td_emuldata) == 0x6d0,
     "struct thread KBI td_emuldata");
 _Static_assert(offsetof(struct proc, p_flag) == 0xb8,
     "struct proc KBI p_flag");
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
--- a/sys/kern/subr_trap.c
+++ b/sys/kern/subr_trap.c
@@ -64,6 +64,7 @@
 #include
 #include
 #include
+#include <sys/rseq.h>
 #include
 #include
 #include
@@ -243,7 +244,7 @@
 	flags = td->td_flags;
 	td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK |
 	    TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND |
-	    TDF_KQTICKLED);
+	    TDF_KQTICKLED | TDF_RSEQ);
 	thread_unlock(td);
 	VM_CNT_INC(v_trap);
 
@@ -332,6 +333,7 @@
 	if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
 	    !SIGISEMPTY(p->p_siglist)) {
 		sigfastblock_fetch(td);
+		rseq_before_sig(td);
 		PROC_LOCK(p);
 		mtx_lock(&p->p_sigacts->ps_mtx);
 		while ((sig = cursig(td)) != 0) {
@@ -354,6 +356,9 @@
 	 */
 	sigfastblock_setpend(td, resched_sigs);
 
+	if ((flags & TDF_RSEQ) != 0)
+		rseq_ast(td);
+
 #ifdef KTRACE
 	KTRUSERRET(td);
 #endif
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3268,6 +3268,14 @@
 		    int cpu_id
 		);
 	}
+582	AUE_NULL	STD|CAPENABLED {
+		int rseq(
+		    void *rseq,
+		    uint32_t rseqlen,
+		    int flags,
+		    uint32_t sig
+		);
+	}
 
 ; Please copy any additions and changes to the following compatability tables:
 ; sys/compat/freebsd32/syscalls.master
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -316,6 +316,11 @@
 	size_t		td_vslock_sz;	/* (k) amount of vslock-ed space */
 	struct kcov_info *td_kcov_info;	/* (*) Kernel code coverage data */
 	u_int		td_ucredref;	/* (k) references on td_realucred */
+	uint32_t	td_rseq_sig;	/* (k) abort handler signature */
+	void		*td_rseq_abi;	/* (k) usermode rseq */
+	register_t	td_rseq_start_ip; /* (k) rseq cs start IP */
+	register_t	td_rseq_end_ip;	/* (k) rseq cs end IP */
+	register_t	td_rseq_abort_ip; /* (k) rseq cs abort handler IP */
 #define	td_endzero td_sigmask
 
 /* Copied during fork1() or create_thread(). */
@@ -462,7 +467,7 @@
 #define	TDF_THRWAKEUP	0x00100000 /* Libthr thread must not suspend itself. */
 #define	TDF_SEINTR	0x00200000 /* EINTR on stop attempts. */
 #define	TDF_SWAPINREQ	0x00400000 /* Swapin request due to wakeup. */
-#define	TDF_UNUSED23	0x00800000 /* --available-- */
+#define	TDF_RSEQ	0x00800000 /* rseq update pending */
 #define	TDF_SCHED0	0x01000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED1	0x02000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED2	0x04000000 /* Reserved for scheduler private use */
@@ -531,6 +536,7 @@
 #define	TDP2_SBPAGES	0x00000001 /* Owns sbusy on some pages */
 #define	TDP2_COMPAT32RB	0x00000002 /* compat32 ABI for robust lists */
 #define	TDP2_ACCT	0x00000004 /* Doing accounting */
+#define	TDP2_RSEQ_SIG	0x00000008 /* rseq signal restart pending */
 
 /*
  * Reasons that the current thread can not be run yet.
diff --git a/sys/sys/rseq.h b/sys/sys/rseq.h
new file mode 100644
--- /dev/null
+++ b/sys/sys/rseq.h
@@ -0,0 +1,96 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __SYS_RSEQ_H__
+#define	__SYS_RSEQ_H__
+
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+enum rseq_cpu_id_state {
+	RSEQ_CPU_ID_UNINITIALIZED = -1,
+	RSEQ_CPU_ID_REGISTRATION_FAILED = -2,
+};
+
+enum rseq_flags {
+	RSEQ_FLAG_UNREGISTER = 1,
+};
+
+enum rseq_cs_flags {
+	RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT = 0x00000001,
+	RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL = 0x00000002,
+	RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE = 0x00000004,
+};
+
+struct rseq_cs {
+	uint32_t	version;
+	uint32_t	flags;
+	uint64_t	start_ip;
+	uint64_t	post_commit_offset;
+	uint64_t	abort_ip;
+};
+
+struct rseq {
+	uint32_t	cpu_id_start;
+	uint32_t	cpu_id;
+	union {
+		uint64_t	ptr64;
+#ifdef __LP64__
+		uint64_t	ptr;
+#else
+		struct {
+#if _BYTE_ORDER == _BIG_ENDIAN
+			uint32_t	pad;
+			uint32_t	ptr32;
+#else /* BYTE_ORDER */
+			uint32_t	ptr32;
+			uint32_t	pad;
+#endif /* BYTE_ORDER */
+		} ptr;
+#endif /* LP64 */
+	} rseq_cs;
+	uint32_t	flags;
+};
+
+#ifdef _KERNEL
+
+#define	TD_RSEQ_ACTIVE	0x00000001
+
+void	rseq_ast(struct thread *td);
+void	rseq_before_sig(struct thread *td);
+void	rseq_on_sig(struct thread *td);
+
+#else /* _KERNEL */
+
+__BEGIN_DECLS
+int	rseq(volatile struct rseq *rseq, uint32_t rseqlen, int flags, uint32_t sig);
+__END_DECLS
+
+#endif /* _KERNEL */
+
+#endif /* __SYS_RSEQ_H__ */
diff --git a/sys/sys/signal.h b/sys/sys/signal.h
--- a/sys/sys/signal.h
+++ b/sys/sys/signal.h
@@ -329,6 +329,8 @@
 #define	SEGV_ACCERR	2	/* Invalid permissions for mapped */
 				/* object. */
 #define	SEGV_PKUERR	100	/* x86: PKU violation */
+#define	SEGV_RSEQ_R	101	/* Read fault on rseq access */
+#define	SEGV_RSEQ_W	102	/* Write fault on rseq access */
 
 /* codes for SIGFPE */
 #define	FPE_INTOVF	1	/* Integer overflow. */
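
For reviewers who want to poke at the interface from userspace, here is a minimal sketch of registration and unregistration against the ABI added by this patch. It is illustrative only and not part of the diff: it assumes the libc system-call stub exported through the Symbol.map hunk above, and RSEQ_SIG is an arbitrary application-chosen constant (the kernel currently only records it and requires a match on unregister; the abort-handler signature check itself is still marked XXXKIB). Note that kern_rseq() expects a NULL rseq pointer for RSEQ_FLAG_UNREGISTER, while rseqlen must still equal sizeof(struct rseq).

#include <sys/rseq.h>

#include <stdio.h>

/* Arbitrary application-chosen abort-handler signature. */
#define	RSEQ_SIG	0x53053053

/* The registered area must stay mapped and writable while registered. */
static volatile struct rseq rs = {
	.cpu_id = (uint32_t)RSEQ_CPU_ID_UNINITIALIZED,
};

int
main(void)
{
	/* Register the per-thread rseq area. */
	if (rseq(&rs, sizeof(rs), 0, RSEQ_SIG) != 0) {
		perror("rseq register");
		return (1);
	}

	/* rseq_ast() has already filled in the cpu id fields on return. */
	printf("running on cpu %u\n", (unsigned)rs.cpu_id);

	/* Unregister: NULL pointer, matching length and signature. */
	if (rseq(NULL, sizeof(rs), RSEQ_FLAG_UNREGISTER, RSEQ_SIG) != 0)
		perror("rseq unregister");
	return (0);
}

The cpu_id read right after registration is valid because kern_rseq() raises TDF_ASTPENDING | TDF_RSEQ, so the AST path runs rseq_ast() before the syscall returns to usermode.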