diff --git a/sys/compat/linux/linux_fork.c b/sys/compat/linux/linux_fork.c index ff2d5dcfc957..bc7704795127 100644 --- a/sys/compat/linux/linux_fork.c +++ b/sys/compat/linux/linux_fork.c @@ -1,439 +1,446 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004 Tim J. Robbins * Copyright (c) 2002 Doug Rabson * Copyright (c) 2000 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #ifdef LINUX_LEGACY_SYSCALLS int linux_fork(struct thread *td, struct linux_fork_args *args) { struct fork_req fr; int error; struct proc *p2; struct thread *td2; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFSTOPPED; fr.fr_procp = &p2; if ((error = fork1(td, &fr)) != 0) return (error); td2 = FIRST_THREAD_IN_PROC(p2); linux_proc_init(td, td2, 0); td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. */ thread_lock(td2); TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); return (0); } int linux_vfork(struct thread *td, struct linux_vfork_args *args) { struct fork_req fr; int error; struct proc *p2; struct thread *td2; bzero(&fr, sizeof(fr)); fr.fr_flags = RFFDG | RFPROC | RFMEM | RFPPWAIT | RFSTOPPED; fr.fr_procp = &p2; if ((error = fork1(td, &fr)) != 0) return (error); td2 = FIRST_THREAD_IN_PROC(p2); linux_proc_init(td, td2, 0); td->td_retval[0] = p2->p_pid; /* * Make this runnable after we are finished with it. 
/*
 * Process-creating flavour of Linux clone(2); linux_clone() calls this when
 * LINUX_CLONE_THREAD is not set.
 *
 * Translates the Linux clone flags into fork1() request flags, forks a
 * stopped child, initializes its emulation data (tid pointers, TLS, stack,
 * exit signal, optional reparenting) and finally makes the child runnable.
 *
 * Returns EINVAL for a non-zero but invalid exit signal or for
 * CLONE_PARENT_SETTID with a NULL parent_tidptr, the fork1() error if the
 * fork fails, and 0 otherwise with the child's pid in td_retval[0].
 */
static int
linux_clone_proc(struct thread *td, struct linux_clone_args *args)
{
	struct fork_req fr;
	int error, ff = RFPROC | RFSTOPPED, f2;
	struct proc *p2;
	struct thread *td2;
	int exit_signal;
	struct linux_emuldata *em;

	f2 = 0;
	/* The low byte of the clone flags carries the child's exit signal. */
	exit_signal = args->flags & 0x000000ff;
	if (LINUX_SIG_VALID(exit_signal)) {
		exit_signal = linux_to_bsd_signal(exit_signal);
	} else if (exit_signal != 0)
		return (EINVAL);

	if (args->flags & LINUX_CLONE_VM)
		ff |= RFMEM;
	if (args->flags & LINUX_CLONE_SIGHAND)
		ff |= RFSIGSHARE;
	/*
	 * NOTE(review): FR2_SHARE_PATHS is set exactly when CLONE_FILES and
	 * CLONE_FS disagree; presumably it toggles path sharing relative to
	 * the descriptor-table default -- confirm against fork1().
	 */
	if (args->flags & LINUX_CLONE_FILES) {
		if (!(args->flags & LINUX_CLONE_FS))
			f2 |= FR2_SHARE_PATHS;
	} else {
		ff |= RFFDG;
		if (args->flags & LINUX_CLONE_FS)
			f2 |= FR2_SHARE_PATHS;
	}

	/* Reject a missing destination before any process is created. */
	if (args->flags & LINUX_CLONE_PARENT_SETTID)
		if (args->parent_tidptr == NULL)
			return (EINVAL);

	if (args->flags & LINUX_CLONE_VFORK)
		ff |= RFPPWAIT;

	bzero(&fr, sizeof(fr));
	fr.fr_flags = ff;
	fr.fr_flags2 = f2;
	fr.fr_procp = &p2;
	error = fork1(td, &fr);
	if (error)
		return (error);

	td2 = FIRST_THREAD_IN_PROC(p2);

	/* create the emuldata */
	linux_proc_init(td, td2, args->flags);

	em = em_find(td2);
	KASSERT(em != NULL, ("clone_proc: emuldata not found.\n"));

	/* Record the tid pointers only when the matching flag asks for them. */
	if (args->flags & LINUX_CLONE_CHILD_SETTID)
		em->child_set_tid = args->child_tidptr;
	else
		em->child_set_tid = NULL;

	if (args->flags & LINUX_CLONE_CHILD_CLEARTID)
		em->child_clear_tid = args->child_tidptr;
	else
		em->child_clear_tid = NULL;

	/*
	 * A copyout failure here is only logged; the clone itself still
	 * succeeds and 0 is returned below.
	 */
	if (args->flags & LINUX_CLONE_PARENT_SETTID) {
		error = copyout(&p2->p_pid, args->parent_tidptr,
		    sizeof(p2->p_pid));
		if (error)
			linux_msg(td, "copyout p_pid failed!");
	}

	/* Store the (already translated) exit signal in the child. */
	PROC_LOCK(p2);
	p2->p_sigparent = exit_signal;
	PROC_UNLOCK(p2);
	/*
	 * In a case of stack = NULL, we are supposed to COW calling process
	 * stack. This is what normal fork() does, so we just keep tf_rsp arg
	 * intact.
	 */
	linux_set_upcall(td2, PTROUT(args->stack));

	if (args->flags & LINUX_CLONE_SETTLS)
		linux_set_cloned_tls(td2, args->tls);

	/*
	 * If CLONE_PARENT is set, then the parent of the new process will be
	 * the same as that of the calling process.
	 */
	if (args->flags & LINUX_CLONE_PARENT) {
		sx_xlock(&proctree_lock);
		PROC_LOCK(p2);
		proc_reparent(p2, td->td_proc->p_pptr, true);
		PROC_UNLOCK(p2);
		sx_xunlock(&proctree_lock);
	}

	/*
	 * Make this runnable after we are finished with it.
	 */
	thread_lock(td2);
	TD_SET_CAN_RUN(td2);
	sched_add(td2, SRQ_BORING);

	td->td_retval[0] = p2->p_pid;

	return (0);
}
LINUX_CLONE_CHILD_CLEARTID) em->child_clear_tid = args->child_tidptr; else em->child_clear_tid = NULL; cpu_thread_clean(newtd); linux_set_upcall(newtd, PTROUT(args->stack)); PROC_LOCK(p); p->p_flag |= P_HADTHREADS; thread_link(newtd, p); bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name)); thread_lock(td); /* let the scheduler know about these things. */ sched_fork_thread(td, newtd); thread_unlock(td); if (P_SHOULDSTOP(p)) newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if (p->p_ptevents & PTRACE_LWP) newtd->td_dbgflags |= TDB_BORN; PROC_UNLOCK(p); tidhash_add(newtd); LINUX_CTR2(clone_thread, "thread(%d) successful clone to %d", td->td_tid, newtd->td_tid); if (args->flags & LINUX_CLONE_PARENT_SETTID) { error = copyout(&newtd->td_tid, args->parent_tidptr, sizeof(newtd->td_tid)); if (error) linux_msg(td, "clone_thread: copyout td_tid failed!"); } /* * Make this runnable after we are finished with it. */ thread_lock(newtd); TD_SET_CAN_RUN(newtd); sched_add(newtd, SRQ_BORING); td->td_retval[0] = newtd->td_tid; return (0); fail: #ifdef RACCT if (racct_enable) { PROC_LOCK(p); racct_sub(p, RACCT_NTHR, 1); PROC_UNLOCK(p); } #endif return (error); } int linux_clone(struct thread *td, struct linux_clone_args *args) { if (args->flags & LINUX_CLONE_THREAD) return (linux_clone_thread(td, args)); else return (linux_clone_proc(td, args)); } int linux_exit(struct thread *td, struct linux_exit_args *args) { struct linux_emuldata *em; em = em_find(td); KASSERT(em != NULL, ("exit: emuldata not found.\n")); LINUX_CTR2(exit, "thread(%d) (%d)", em->em_tid, args->rval); linux_thread_detach(td); /* * XXX. When the last two threads of a process * exit via pthread_exit() try thr_exit() first. 
*/ kern_thr_exit(td); exit1(td, args->rval, 0); /* NOTREACHED */ } int linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args) { struct linux_emuldata *em; em = em_find(td); KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n")); em->child_clear_tid = args->tidptr; td->td_retval[0] = em->em_tid; LINUX_CTR3(set_tid_address, "tidptr(%d) %p, returns %d", em->em_tid, args->tidptr, td->td_retval[0]); return (0); } void linux_thread_detach(struct thread *td) { struct linux_emuldata *em; int *child_clear_tid; int error; em = em_find(td); KASSERT(em != NULL, ("thread_detach: emuldata not found.\n")); LINUX_CTR1(thread_detach, "thread(%d)", em->em_tid); release_futexes(td, em); child_clear_tid = em->child_clear_tid; if (child_clear_tid != NULL) { LINUX_CTR2(thread_detach, "thread(%d) %p", em->em_tid, child_clear_tid); error = suword32(child_clear_tid, 0); if (error != 0) return; error = futex_wake(td, child_clear_tid, 1, false); /* * this cannot happen at the moment and if this happens it * probably means there is a user space bug */ if (error != 0) linux_msg(td, "futex stuff in thread_detach failed."); } + + /* + * Do not rely on the robust list which is maintained by userspace, + * cleanup remaining pi (if any) after release_futexes anyway. + */ + umtx_thread_exit(td); } diff --git a/sys/compat/linux/linux_futex.c b/sys/compat/linux/linux_futex.c index 9dab78a75af2..441a26d4d317 100644 --- a/sys/compat/linux/linux_futex.c +++ b/sys/compat/linux/linux_futex.c @@ -1,799 +1,1088 @@ /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2005 Emmanuel Dreyfus * All rights reserved. * Copyright (c) 2009-2016 Dmitry Chagin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if 0 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); #endif #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include +#include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /** * DTrace probes in this module. 
#define	FUTEX_SHARED	0x8     /* shared futex */

/*
 * Select the umtx key sharing mode for a futex operation.  `a' is a pointer
 * to an object with a `flags' member (struct linux_futex_args in this file):
 * AUTO_SHARE when FUTEX_SHARED is set, THREAD_SHARE otherwise.
 *
 * Both the argument and the whole expansion are parenthesized so the macro
 * behaves like an ordinary expression for any pointer argument and inside
 * any surrounding expression.
 */
#define	GET_SHARED(a)							\
	((((a)->flags & FUTEX_SHARED) != 0) ? AUTO_SHARE : THREAD_SHARE)
FUTEX_SHARED : 0; + + return (linux_futex_unlock_pi(td, true, &args)); +} + static int futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; int oparg = (encoded_op << 8) >> 20; int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg, cmparg); switch (op) { case FUTEX_OP_SET: ret = futex_xchgl(oparg, uaddr, &oldval); break; case FUTEX_OP_ADD: ret = futex_addl(oparg, uaddr, &oldval); break; case FUTEX_OP_OR: ret = futex_orl(oparg, uaddr, &oldval); break; case FUTEX_OP_ANDN: ret = futex_andl(~oparg, uaddr, &oldval); break; case FUTEX_OP_XOR: ret = futex_xorl(oparg, uaddr, &oldval); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op); ret = -ENOSYS; break; } if (ret) return (ret); switch (cmp) { case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp); ret = -ENOSYS; } return (ret); } static int linux_futex(struct thread *td, struct linux_futex_args *args) { struct linux_pemuldata *pem; struct proc *p; if (args->op & LINUX_FUTEX_PRIVATE_FLAG) { args->flags = 0; args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; } else args->flags = FUTEX_SHARED; args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; switch (args->op) { case LINUX_FUTEX_WAIT: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAIT_BITSET: LIN_SDT_PROBE3(futex, linux_futex, debug_wait, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x 
bitset 0x%x", args->uaddr, args->val, args->val3); return (linux_futex_wait(td, args)); case LINUX_FUTEX_WAKE: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAKE_BITSET: LIN_SDT_PROBE3(futex, linux_futex, debug_wake, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); return (linux_futex_wake(td, args)); case LINUX_FUTEX_REQUEUE: /* * Glibc does not use this operation since version 2.3.3, * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when * FUTEX_REQUEUE returned EINVAL. */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) { linux_msg(td, "unsupported FUTEX_REQUEUE"); pem->flags |= LINUX_XDEPR_REQUEUEOP; LIN_SDT_PROBE0(futex, linux_futex, deprecated_requeue); } /* * The above is true, however musl libc does make use of the * futex requeue operation, allow operation for brands which * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags. 
*/ p = td->td_proc; Elf_Brandinfo *bi = p->p_elf_brandinfo; if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0) return (EINVAL); args->val3_compare = false; /* FALLTHROUGH */ case LINUX_FUTEX_CMP_REQUEUE: LIN_SDT_PROBE5(futex, linux_futex, debug_cmp_requeue, args->uaddr, args->val, args->val3, args->uaddr2, args->ts); LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", args->uaddr, args->val, args->val3, args->uaddr2, args->ts); return (linux_futex_requeue(td, args)); case LINUX_FUTEX_WAKE_OP: LIN_SDT_PROBE5(futex, linux_futex, debug_wake_op, args->uaddr, args->op, args->val, args->uaddr2, args->val3); LINUX_CTR5(sys_futex, "WAKE_OP " "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", args->uaddr, args->val, args->uaddr2, args->val3, args->ts); return (linux_futex_wakeop(td, args)); case LINUX_FUTEX_LOCK_PI: - /* not yet implemented */ - pem = pem_find(td->td_proc); - if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { - linux_msg(td, "unsupported FUTEX_LOCK_PI"); - pem->flags |= LINUX_XUNSUP_FUTEXPIOP; - LIN_SDT_PROBE0(futex, linux_futex, - unimplemented_lock_pi); - } - return (ENOSYS); + args->clockrt = true; + LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x", + args->uaddr, args->val); + + return (linux_futex_lock_pi(td, false, args)); case LINUX_FUTEX_UNLOCK_PI: - /* not yet implemented */ - pem = pem_find(td->td_proc); - if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { - linux_msg(td, "unsupported FUTEX_UNLOCK_PI"); - pem->flags |= LINUX_XUNSUP_FUTEXPIOP; - LIN_SDT_PROBE0(futex, linux_futex, - unimplemented_unlock_pi); - } - return (ENOSYS); + LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p", + args->uaddr); + + return (linux_futex_unlock_pi(td, false, args)); case LINUX_FUTEX_TRYLOCK_PI: - /* not yet implemented */ - pem = pem_find(td->td_proc); - if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { - linux_msg(td, "unsupported FUTEX_TRYLOCK_PI"); - pem->flags |= LINUX_XUNSUP_FUTEXPIOP; - LIN_SDT_PROBE0(futex, 
linux_futex, - unimplemented_trylock_pi); - } - return (ENOSYS); + LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p", + args->uaddr); + + return (linux_futex_lock_pi(td, true, args)); case LINUX_FUTEX_WAIT_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_wait_requeue_pi); } return (ENOSYS); case LINUX_FUTEX_CMP_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_futex, unimplemented_cmp_requeue_pi); } return (ENOSYS); default: linux_msg(td, "unsupported futex op %d", args->op); LIN_SDT_PROBE1(futex, linux_futex, unknown_operation, args->op); return (ENOSYS); } } +/* + * pi protocol: + * - 0 futex word value means unlocked. + * - TID futex word value means locked. + * Userspace uses atomic ops to lock/unlock these futexes without entering the + * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails), + * then FUTEX_LOCK_PI is called. + * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no + * other waiters exists looks up the thread that owns the futex (it has put its + * own TID into the futex value) and made this thread the owner of the internal + * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock + * object, on which it blocks. Once it returns, it has the mutex acquired, and it + * sets the futex value to its own TID and returns (futex value contains + * FUTEX_WAITERS|TID). + * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and + * FUTEX_UNLOCK_PI will be called. 
+ * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED + * bit of the futex word and wakes up the next futex waiter (if any), WAITERS + * bit is preserved (if any). + * If OWNER_DIED bit is set the kernel sanity checks the futex word value against + * the internal futex state and if correct, acquire futex. + */ +static int +linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args) +{ + struct umtx_abs_timeout timo; + struct linux_emuldata *em; + struct umtx_pi *pi, *new_pi; + struct thread *td1; + struct umtx_q *uq; + int error, rv; + uint32_t owner, old_owner; + + em = em_find(td); + uq = td->td_umtxq; + error = umtx_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), + &uq->uq_key); + if (error != 0) + return (error); + if (args->ts != NULL) + linux_umtx_abs_timeout_init(&timo, args); + + umtxq_lock(&uq->uq_key); + pi = umtx_pi_lookup(&uq->uq_key); + if (pi == NULL) { + new_pi = umtx_pi_alloc(M_NOWAIT); + if (new_pi == NULL) { + umtxq_unlock(&uq->uq_key); + new_pi = umtx_pi_alloc(M_WAITOK); + umtxq_lock(&uq->uq_key); + pi = umtx_pi_lookup(&uq->uq_key); + if (pi != NULL) { + umtx_pi_free(new_pi); + new_pi = NULL; + } + } + if (new_pi != NULL) { + new_pi->pi_key = uq->uq_key; + umtx_pi_insert(new_pi); + pi = new_pi; + } + } + umtx_pi_ref(pi); + umtxq_unlock(&uq->uq_key); + for (;;) { + /* Try uncontested case first. */ + rv = casueword32(args->uaddr, 0, &owner, em->em_tid); + /* The acquire succeeded. */ + if (rv == 0) { + error = 0; + break; + } + if (rv == -1) { + error = EFAULT; + break; + } + + /* + * Avoid overwriting a possible error from sleep due + * to the pending signal with suspension check result. + */ + if (error == 0) { + error = thread_check_susp(td, true); + if (error != 0) + break; + } + + /* The futex word at *uaddr is already locked by the caller. 
*/ + if ((owner & FUTEX_TID_MASK) == em->em_tid) { + error = EDEADLK; + break; + } + + /* + * Futex owner died, handle_futex_death() set the OWNER_DIED bit + * and clear tid. Try to acquire it. + */ + if ((owner & FUTEX_TID_MASK) == 0) { + old_owner = owner; + owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED); + owner |= em->em_tid; + rv = casueword32(args->uaddr, old_owner, &owner, owner); + if (rv == -1) { + error = EFAULT; + break; + } + if (rv == 1) { + if (error == 0) { + error = thread_check_susp(td, true); + if (error != 0) + break; + } + + /* + * If this failed the lock could + * changed, restart. + */ + continue; + } + + umtxq_lock(&uq->uq_key); + umtxq_busy(&uq->uq_key); + error = umtx_pi_claim(pi, td); + umtxq_unbusy(&uq->uq_key); + umtxq_unlock(&uq->uq_key); + if (error != 0) { + /* + * Since we're going to return an + * error, restore the futex to its + * previous, unowned state to avoid + * compounding the problem. + */ + (void)casuword32(args->uaddr, owner, old_owner); + } + break; + } + + /* + * Inconsistent state: OWNER_DIED is set and tid is not 0. + * Linux does some checks of futex state, we return EINVAL, + * as the user space can take care of this. + */ + if ((owner & FUTEX_OWNER_DIED) != 0) { + error = EINVAL; + break; + } + + if (try != 0) { + error = EBUSY; + break; + } + + /* + * If we caught a signal, we have retried and now + * exit immediately. + */ + if (error != 0) + break; + + umtxq_lock(&uq->uq_key); + umtxq_busy(&uq->uq_key); + umtxq_unlock(&uq->uq_key); + + /* + * Set the contested bit so that a release in user space knows + * to use the system call for unlock. If this fails either some + * one else has acquired the lock or it has been released. 
+ */ + rv = casueword32(args->uaddr, owner, &owner, + owner | FUTEX_WAITERS); + if (rv == -1) { + umtxq_unbusy_unlocked(&uq->uq_key); + error = EFAULT; + break; + } + if (rv == 1) { + umtxq_unbusy_unlocked(&uq->uq_key); + error = thread_check_susp(td, true); + if (error != 0) + break; + + /* + * The lock changed and we need to retry or we + * lost a race to the thread unlocking the umtx. + */ + continue; + } + + /* + * Substitute Linux thread id by native thread id to + * avoid refactoring code of umtxq_sleep_pi(). + */ + td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1); + if (td1 != NULL) { + owner = td1->td_tid; + PROC_UNLOCK(td1->td_proc); + } else { + umtxq_unbusy_unlocked(&uq->uq_key); + error = EINVAL; + break; + } + + umtxq_lock(&uq->uq_key); + + /* We set the contested bit, sleep. */ + error = umtxq_sleep_pi(uq, pi, owner, "futexp", + args->ts == NULL ? NULL : &timo, + (args->flags & FUTEX_SHARED) != 0); + if (error != 0) + continue; + + error = thread_check_susp(td, false); + if (error != 0) + break; + } + + umtxq_lock(&uq->uq_key); + umtx_pi_unref(pi); + umtxq_unlock(&uq->uq_key); + umtx_key_release(&uq->uq_key); + return (error); +} + +static int +linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args) +{ + struct linux_emuldata *em; + struct umtx_key key; + uint32_t old, owner, new_owner; + int count, error; + + em = em_find(td); + + /* + * Make sure we own this mtx. 
+ */ + error = fueword32(args->uaddr, &owner); + if (error == -1) + return (EFAULT); + if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid) + return (EPERM); + + error = umtx_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key); + if (error != 0) + return (error); + umtxq_lock(&key); + umtxq_busy(&key); + error = umtx_pi_drop(td, &key, rb, &count); + if (error != 0 || rb) { + umtxq_unbusy(&key); + umtxq_unlock(&key); + umtx_key_release(&key); + return (error); + } + umtxq_unlock(&key); + + /* + * When unlocking the futex, it must be marked as unowned if + * there is zero or one thread only waiting for it. + * Otherwise, it must be marked as contested. + */ + if (count > 1) + new_owner = FUTEX_WAITERS; + else + new_owner = 0; + +again: + error = casueword32(args->uaddr, owner, &old, new_owner); + if (error == 1) { + error = thread_check_susp(td, false); + if (error == 0) + goto again; + } + umtxq_unbusy_unlocked(&key); + umtx_key_release(&key); + if (error == -1) + return (EFAULT); + if (error == 0 && old != owner) + return (EINVAL); + return (error); +} + static int linux_futex_wakeop(struct thread *td, struct linux_futex_args *args) { struct umtx_key key, key2; int nrwake, op_ret, ret; int error, count; if (args->uaddr == args->uaddr2) return (EINVAL); error = umtx_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); if (error != 0) return (error); error = umtx_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); if (error != 0) { umtx_key_release(&key); return (error); } umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); op_ret = futex_atomic_op(td, args->val3, args->uaddr2); if (op_ret < 0) { if (op_ret == -ENOSYS) error = ENOSYS; else error = EFAULT; } umtxq_lock(&key); umtxq_unbusy(&key); if (error != 0) goto out; ret = umtxq_signal_mask(&key, args->val, args->val3); if (op_ret > 0) { nrwake = (int)(unsigned long)args->ts; umtxq_lock(&key2); count = umtxq_count(&key2); if (count > 0) ret += umtxq_signal_mask(&key2, nrwake, args->val3); 
else ret += umtxq_signal_mask(&key, nrwake, args->val3); umtxq_unlock(&key2); } td->td_retval[0] = ret; out: umtxq_unlock(&key); umtx_key_release(&key2); umtx_key_release(&key); return (error); } static int linux_futex_requeue(struct thread *td, struct linux_futex_args *args) { int nrwake, nrrequeue; struct umtx_key key, key2; int error; uint32_t uval; /* * Linux allows this, we would not, it is an incorrect * usage of declared ABI, so return EINVAL. */ if (args->uaddr == args->uaddr2) return (EINVAL); nrrequeue = (int)(unsigned long)args->ts; nrwake = args->val; /* * Sanity check to prevent signed integer overflow, * see Linux CVE-2018-6927 */ if (nrwake < 0 || nrrequeue < 0) return (EINVAL); error = umtx_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); if (error != 0) return (error); error = umtx_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2); if (error != 0) { umtx_key_release(&key); return (error); } umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); error = fueword32(args->uaddr, &uval); if (error != 0) error = EFAULT; else if (args->val3_compare == true && uval != args->val3) error = EWOULDBLOCK; umtxq_lock(&key); umtxq_unbusy(&key); if (error == 0) { umtxq_lock(&key2); td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue); umtxq_unlock(&key2); } umtxq_unlock(&key); umtx_key_release(&key2); umtx_key_release(&key); return (error); } static int linux_futex_wake(struct thread *td, struct linux_futex_args *args) { struct umtx_key key; int error; if (args->val3 == 0) return (EINVAL); error = umtx_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key); if (error != 0) return (error); umtxq_lock(&key); td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3); umtxq_unlock(&key); umtx_key_release(&key); return (0); } static int linux_futex_wait(struct thread *td, struct linux_futex_args *args) { struct umtx_abs_timeout timo; struct umtx_q *uq; uint32_t uval; int error; if (args->val3 == 0) error = EINVAL; uq = 
td->td_umtxq; error = umtx_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &uq->uq_key); if (error != 0) return (error); if (args->ts != NULL) linux_umtx_abs_timeout_init(&timo, args); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); uq->uq_bitset = args->val3; umtxq_insert(uq); umtxq_unlock(&uq->uq_key); error = fueword32(args->uaddr, &uval); if (error != 0) error = EFAULT; else if (uval != args->val) error = EWOULDBLOCK; umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); if (error == 0) { error = umtxq_sleep(uq, "futex", args->ts == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else umtxq_remove(uq); } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { umtxq_remove(uq); } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo, struct linux_futex_args *args) { int clockid, absolute; /* * The FUTEX_CLOCK_REALTIME option bit can be employed only with the * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI. * For FUTEX_WAIT, timeout is interpreted as a relative value, for other * futex operations timeout is interpreted as an absolute value. * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures * the timeout against the CLOCK_REALTIME clock, otherwise the kernel * measures the timeout against the CLOCK_MONOTONIC clock. */ clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC; absolute = args->op == LINUX_FUTEX_WAIT ? 
false : true; umtx_abs_timeout_init(timo, clockid, absolute, args->ts); } int linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) { struct linux_futex_args fargs = { .uaddr = args->uaddr, .op = args->op, .val = args->val, .ts = NULL, .uaddr2 = args->uaddr2, .val3 = args->val3, .val3_compare = true, }; struct l_timespec lts; int error; switch (args->op & LINUX_FUTEX_CMD_MASK) { case LINUX_FUTEX_WAIT: case LINUX_FUTEX_WAIT_BITSET: + case LINUX_FUTEX_LOCK_PI: if (args->timeout != NULL) { error = copyin(args->timeout, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec(&fargs.kts, <s); if (error != 0) return (error); fargs.ts = &fargs.kts; } break; default: fargs.ts = PTRIN(args->timeout); } return (linux_futex(td, &fargs)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_sys_futex_time64(struct thread *td, struct linux_sys_futex_time64_args *args) { struct linux_futex_args fargs = { .uaddr = args->uaddr, .op = args->op, .val = args->val, .ts = NULL, .uaddr2 = args->uaddr2, .val3 = args->val3, }; struct l_timespec64 lts; int error; switch (args->op & LINUX_FUTEX_CMD_MASK) { case LINUX_FUTEX_WAIT: case LINUX_FUTEX_WAIT_BITSET: + case LINUX_FUTEX_LOCK_PI: if (args->timeout != NULL) { error = copyin(args->timeout, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec64(&fargs.kts, <s); if (error != 0) return (error); fargs.ts = &fargs.kts; } break; default: fargs.ts = PTRIN(args->timeout); } return (linux_futex(td, &fargs)); } #endif int linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) { struct linux_emuldata *em; if (args->len != sizeof(struct linux_robust_list_head)) { LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error); return (EINVAL); } em = em_find(td); em->robust_futexes = args->head; return (0); } int linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) { struct linux_emuldata *em; 
struct linux_robust_list_head *head; l_size_t len = sizeof(struct linux_robust_list_head); struct thread *td2; int error = 0; if (!args->pid) { em = em_find(td); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); head = em->robust_futexes; } else { td2 = tdfind(args->pid, -1); if (td2 == NULL) return (ESRCH); if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) { PROC_UNLOCK(td2->td_proc); return (EPERM); } em = em_find(td2); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); /* XXX: ptrace? */ if (priv_check(td, PRIV_CRED_SETUID) || priv_check(td, PRIV_CRED_SETEUID) || p_candebug(td, td2->td_proc)) { PROC_UNLOCK(td2->td_proc); return (EPERM); } head = em->robust_futexes; PROC_UNLOCK(td2->td_proc); } error = copyout(&len, args->len, sizeof(l_size_t)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); return (EFAULT); } error = copyout(&head, args->head, sizeof(head)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); } return (error); } static int handle_futex_death(struct linux_emuldata *em, uint32_t *uaddr, unsigned int pi) { uint32_t uval, nval, mval; int error; retry: error = fueword32(uaddr, &uval); if (error != 0) return (EFAULT); if ((uval & FUTEX_TID_MASK) == em->em_tid) { mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; nval = casuword32(uaddr, uval, mval); if (nval == -1) return (EFAULT); if (nval != uval) goto retry; if (!pi && (uval & FUTEX_WAITERS)) { error = futex_wake(curthread, uaddr, 1, true); if (error != 0) return (error); + } else if (pi && (uval & FUTEX_WAITERS)) { + error = futex_wake_pi(curthread, uaddr, true); + if (error != 0) + return (error); } } return (0); } static int fetch_robust_entry(struct linux_robust_list **entry, struct linux_robust_list **head, unsigned int *pi) { l_ulong uentry; int error; error = copyin((const void *)head, &uentry, sizeof(l_ulong)); if (error) { LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error); return (EFAULT); } 
*entry = (void *)(uentry & ~1UL); *pi = uentry & 1; return (0); } /* This walks the list of robust futexes releasing them. */ void release_futexes(struct thread *td, struct linux_emuldata *em) { struct linux_robust_list_head *head = NULL; struct linux_robust_list *entry, *next_entry, *pending; unsigned int limit = 2048, pi, next_pi, pip; l_long futex_offset; int rc, error; head = em->robust_futexes; if (head == NULL) return; if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) return; error = copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset)); if (error) { LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error); return; } if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) return; while (entry != &head->list) { rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi); if (entry != pending) if (handle_futex_death(em, (uint32_t *)((caddr_t)entry + futex_offset), pi)) { return; } if (rc) return; entry = next_entry; pi = next_pi; if (!--limit) break; sched_relinquish(curthread); } if (pending) handle_futex_death(em, (uint32_t *)((caddr_t)pending + futex_offset), pip); }