Index: head/sys/kern/kern_sx.c =================================================================== --- head/sys/kern/kern_sx.c (revision 334023) +++ head/sys/kern/kern_sx.c (revision 334024) @@ -1,1464 +1,1566 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Attilio Rao * Copyright (c) 2001 Jason Evans * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Shared/exclusive locks. This implementation attempts to ensure * deterministic lock granting behavior, so that slocks and xlocks are * interleaved. * * Priority propagation will not generally raise the priority of lock holders, * so should not be relied upon in combination with sx locks. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_sx.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #include #endif #ifdef DDB #include #endif #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #define ADAPTIVE_SX #endif CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE); #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* Handy macros for sleep queues. */ #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 /* * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We * drop Giant anytime we have to sleep or if we adaptively spin. */ #define GIANT_DECLARE \ int _giantcnt = 0; \ WITNESS_SAVE_DECL(Giant) \ #define GIANT_SAVE(work) do { \ if (__predict_false(mtx_owned(&Giant))) { \ work++; \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _giantcnt++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define GIANT_RESTORE() do { \ if (_giantcnt > 0) { \ mtx_assert(&Giant, MA_NOTOWNED); \ while (_giantcnt--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) /* * Returns true if an exclusive lock is recursed. It assumes * curthread currently has an exclusive lock. 
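 */

/*
 * Illustrative usage sketch (an editorial aside, not part of this
 * revision): recursive exclusive locking is only legal on locks
 * initialized with SX_RECURSE.  The lock and function are hypothetical.
 */
static struct sx example_lock;

static void
example_recursed_xlock(void)
{

	sx_init_flags(&example_lock, "example", SX_RECURSE);
	sx_xlock(&example_lock);
	sx_xlock(&example_lock);	/* Recurses instead of deadlocking. */
	sx_assert(&example_lock, SA_XLOCKED | SA_RECURSED);
	sx_xunlock(&example_lock);	/* Unrecurses one level... */
	sx_xunlock(&example_lock);	/* ...and this one releases the lock. */
	sx_destroy(&example_lock);
}

/*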
*/ #define sx_recursed(sx) ((sx)->sx_recurse != 0) static void assert_sx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_sx(const struct lock_object *lock); #endif static void lock_sx(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_sx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_sx(struct lock_object *lock); struct lock_class lock_class_sx = { .lc_name = "sx", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_sx, #ifdef DDB .lc_ddb_show = db_show_sx, #endif .lc_lock = lock_sx, .lc_unlock = unlock_sx, #ifdef KDTRACE_HOOKS .lc_owner = owner_sx, #endif }; #ifndef INVARIANTS #define _sx_assert(sx, what, file, line) #endif #ifdef ADAPTIVE_SX static __read_frequently u_int asx_retries; static __read_frequently u_int asx_loops; static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging"); SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, ""); SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, ""); static struct lock_delay_config __read_frequently sx_delay; SYSCTL_INT(_debug_sx, OID_AUTO, delay_base, CTLFLAG_RW, &sx_delay.base, 0, ""); SYSCTL_INT(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max, 0, ""); static void sx_lock_delay_init(void *arg __unused) { lock_delay_default_init(&sx_delay); asx_retries = 10; asx_loops = max(10000, sx_delay.max); } LOCK_DELAY_SYSINIT(sx_lock_delay_init); #endif void assert_sx(const struct lock_object *lock, int what) { sx_assert((const struct sx *)lock, what); } void lock_sx(struct lock_object *lock, uintptr_t how) { struct sx *sx; sx = (struct sx *)lock; if (how) sx_slock(sx); else sx_xlock(sx); } uintptr_t unlock_sx(struct lock_object *lock) { struct sx *sx; sx = (struct sx *)lock; sx_assert(sx, SA_LOCKED | SA_NOTRECURSED); if (sx_xlocked(sx)) { sx_xunlock(sx); return (0); } else { sx_sunlock(sx); return (1); } } #ifdef KDTRACE_HOOKS int owner_sx(const struct lock_object *lock, struct thread **owner) { const struct sx *sx; uintptr_t x; sx = (const struct sx *)lock; x = sx->sx_lock; *owner = NULL; return ((x & SX_LOCK_SHARED) != 0 ? 
(SX_SHARERS(x) != 0) : ((*owner = (struct thread *)SX_OWNER(x)) != NULL)); } #endif void sx_sysinit(void *arg) { struct sx_args *sargs = arg; sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags); } void sx_init_flags(struct sx *sx, const char *description, int opts) { int flags; MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK | SX_NOPROFILE | SX_NOADAPTIVE | SX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock, ("%s: sx_lock not aligned for %s: %p", __func__, description, &sx->sx_lock)); flags = LO_SLEEPABLE | LO_UPGRADABLE; if (opts & SX_DUPOK) flags |= LO_DUPOK; if (opts & SX_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & SX_NOWITNESS)) flags |= LO_WITNESS; if (opts & SX_RECURSE) flags |= LO_RECURSABLE; if (opts & SX_QUIET) flags |= LO_QUIET; if (opts & SX_NEW) flags |= LO_NEW; flags |= opts & SX_NOADAPTIVE; lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags); sx->sx_lock = SX_LOCK_UNLOCKED; sx->sx_recurse = 0; } void sx_destroy(struct sx *sx) { KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held")); KASSERT(sx->sx_recurse == 0, ("sx lock still recursed")); sx->sx_lock = SX_LOCK_DESTROYED; lock_destroy(&sx->lock_object); } int sx_try_slock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); x = sx->sx_lock; for (;;) { KASSERT(x != SX_LOCK_DESTROYED, ("sx_try_slock() of destroyed sx @ %s:%d", file, line)); if (!(x & SX_LOCK_SHARED)) break; if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, x + SX_ONE_SHARER)) { LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line); WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); + curthread->td_sx_slocks++; return (1); } } LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line); return (0); } int sx_try_slock_(struct sx *sx, const char *file, int line) { return (sx_try_slock_int(sx LOCK_FILE_LINE_ARG)); } int _sx_xlock(struct sx *sx, int opts, const char *file, int line) { uintptr_t tid, x; int error = 0; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("sx_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xlock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; x = SX_LOCK_UNLOCKED; if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) error = _sx_xlock_hard(sx, x, opts LOCK_FILE_LINE_ARG); else LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); if (!error) { LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } return (error); } int sx_try_xlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, x; int rval; bool recursed; td = curthread; tid = (uintptr_t)td; if (SCHEDULER_STOPPED_TD(td)) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_xlock() of destroyed sx @ %s:%d", file, line)); rval = 1; recursed = false; x = 
SX_LOCK_UNLOCKED;
	for (;;) {
		if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid))
			break;
		if (x == SX_LOCK_UNLOCKED)
			continue;
		if (x == tid && (sx->lock_object.lo_flags & LO_RECURSABLE)) {
			sx->sx_recurse++;
			atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
			break;
		}
		rval = 0;
		break;
	}
	LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line);
	if (rval) {
		WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
		    file, line);
		if (!recursed)
			LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire,
			    sx, 0, 0, file, line, LOCKSTAT_WRITER);
		TD_LOCKS_INC(curthread);
	}
	return (rval);
}

int
sx_try_xlock_(struct sx *sx, const char *file, int line)
{

	return (sx_try_xlock_int(sx LOCK_FILE_LINE_ARG));
}

void
_sx_xunlock(struct sx *sx, const char *file, int line)
{

	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
	    ("sx_xunlock() of destroyed sx @ %s:%d", file, line));
	_sx_assert(sx, SA_XLOCKED, file, line);
	WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file,
	    line);
#if LOCK_DEBUG > 0
	_sx_xunlock_hard(sx, (uintptr_t)curthread, file, line);
#else
	__sx_xunlock(sx, curthread, file, line);
#endif
	TD_LOCKS_DEC(curthread);
}

/*
 * Try to do a non-blocking upgrade from a shared lock to an exclusive lock.
 * This will only succeed if this thread holds a single shared lock.
 * Return 1 if the upgrade succeeded, 0 otherwise.
 */
int
sx_try_upgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF)
{
	uintptr_t x;
	uintptr_t waiters;
	int success;

	if (SCHEDULER_STOPPED())
		return (1);

	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
	    ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line));
	_sx_assert(sx, SA_SLOCKED, file, line);

	/*
	 * Try to switch from one shared lock to an exclusive lock.  We need
	 * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that
	 * we will wake up the exclusive waiters when we drop the lock.
	 */
	success = 0;
	x = SX_READ_VALUE(sx);
	for (;;) {
		if (SX_SHARERS(x) > 1)
			break;
-		waiters = (x & SX_LOCK_EXCLUSIVE_WAITERS);
+		waiters = (x & SX_LOCK_WAITERS);
		if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x,
		    (uintptr_t)curthread | waiters)) {
			success = 1;
			break;
		}
	}
	LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line);
	if (success) {
+		curthread->td_sx_slocks--;
		WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
		    file, line);
		LOCKSTAT_RECORD0(sx__upgrade, sx);
	}
	return (success);
}

int
sx_try_upgrade_(struct sx *sx, const char *file, int line)
{

	return (sx_try_upgrade_int(sx LOCK_FILE_LINE_ARG));
}

/*
 * Downgrade an unrecursed exclusive lock into a single shared lock.
 */
void
sx_downgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF)
{
	uintptr_t x;
	int wakeup_swapper;

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
	    ("sx_downgrade() of destroyed sx @ %s:%d", file, line));
	_sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line);
#ifndef INVARIANTS
	if (sx_recursed(sx))
		panic("downgrade of a recursed lock");
#endif

	WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line);

	/*
	 * Try to switch from an exclusive lock with no shared waiters
	 * to one sharer with no shared waiters.  If there are
	 * exclusive waiters, we don't need to lock the sleep queue so
	 * long as we preserve the flag.  We do one quick try and if
	 * that fails we grab the sleepq lock to keep the flags from
	 * changing and do it the slow way.
	 *
	 * We have to lock the sleep queue if there are shared waiters
	 * so we can wake them up.
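	 *
	 * A caller-side sketch of the pattern this serves (illustrative
	 * only; the lock and object are hypothetical):
	 *
	 *	sx_xlock(lock);
	 *	obj->state = NEWSTATE;	(mutate under the exclusive lock)
	 *	sx_downgrade(lock);	(readers may enter from here on)
	 *	consume(obj->state);	(we keep read access ourselves)
	 *	sx_sunlock(lock);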
*/ x = sx->sx_lock; if (!(x & SX_LOCK_SHARED_WAITERS) && atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS))) goto out; /* * Lock the sleep queue so we can read the waiters bits * without any races and wakeup any shared waiters. */ sleepq_lock(&sx->lock_object); /* * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single * shared lock. If there are any shared waiters, wake them up. */ wakeup_swapper = 0; x = sx->sx_lock; atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS)); if (x & SX_LOCK_SHARED_WAITERS) wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_SHARED_QUEUE); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); out: + curthread->td_sx_slocks++; LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(sx__downgrade, sx); } void sx_downgrade_(struct sx *sx, const char *file, int line) { sx_downgrade_int(sx LOCK_FILE_LINE_ARG); } +#ifdef ADAPTIVE_SX +static inline void +sx_drop_critical(uintptr_t x, bool *in_critical, int *extra_work) +{ + + if (x & SX_LOCK_WRITE_SPINNER) + return; + if (*in_critical) { + critical_exit(); + *in_critical = false; + (*extra_work)--; + } +} +#else +#define sx_drop_critical(x, in_critical, extra_work) do { } while(0) +#endif + /* * This function represents the so-called 'hard case' for sx_xlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ int _sx_xlock_hard(struct sx *sx, uintptr_t x, int opts LOCK_FILE_LINE_ARG_DEF) { GIANT_DECLARE; - uintptr_t tid; + uintptr_t tid, setx; #ifdef ADAPTIVE_SX volatile struct thread *owner; u_int i, n, spintries = 0; enum { READERS, WRITER } sleep_reason = READERS; bool adaptive; + bool in_critical = false; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state = 0; + int doing_lockprof = 0; #endif int extra_work = 0; tid = (uintptr_t)curthread; #ifdef KDTRACE_HOOKS if (LOCKSTAT_PROFILE_ENABLED(sx__acquire)) { while (x == SX_LOCK_UNLOCKED) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) goto out_lockstat; } extra_work = 1; + doing_lockprof = 1; all_time -= lockstat_nsecs(&sx->lock_object); state = x; } #endif #ifdef LOCK_PROFILING extra_work = 1; + doing_lockprof = 1; state = x; #endif if (SCHEDULER_STOPPED()) return (0); #if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif if (__predict_false(x == SX_LOCK_UNLOCKED)) x = SX_READ_VALUE(sx); /* If we already hold an exclusive lock, then recurse. 
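	 *
	 * E.g. (illustrative) a thread that already did sx_xlock(&l) on a
	 * lock initialized with SX_RECURSE and calls sx_xlock(&l) again
	 * lands here with lv_sx_owner(x) == curthread; sx_recurse is
	 * bumped and a later sx_xunlock(&l) only unrecurses one level.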
*/ if (__predict_false(lv_sx_owner(x) == (struct thread *)tid)) { KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0, ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n", sx->lock_object.lo_name, file, line)); sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx); return (0); } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, sx->lock_object.lo_name, (void *)sx->sx_lock, file, line); #ifdef ADAPTIVE_SX adaptive = ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifndef INVARIANTS GIANT_SAVE(extra_work); #endif for (;;) { if (x == SX_LOCK_UNLOCKED) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) break; continue; } #ifdef INVARIANTS GIANT_SAVE(extra_work); #endif #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_SX if (__predict_false(!adaptive)) goto sleepq; /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. */ if ((x & SX_LOCK_SHARED) == 0) { + sx_drop_critical(x, &in_critical, &extra_work); sleep_reason = WRITER; owner = lv_sx_owner(x); if (!TD_IS_RUNNING(owner)) goto sleepq; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); do { lock_delay(&lda); x = SX_READ_VALUE(sx); owner = lv_sx_owner(x); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } else if (SX_SHARERS(x) > 0) { sleep_reason = READERS; if (spintries == asx_retries) goto sleepq; + if (!(x & SX_LOCK_WRITE_SPINNER)) { + if (!in_critical) { + critical_enter(); + in_critical = true; + extra_work++; + } + if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, + x | SX_LOCK_WRITE_SPINNER)) { + critical_exit(); + in_critical = false; + extra_work--; + continue; + } + } spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); + n = SX_SHARERS(x); for (i = 0; i < asx_loops; i += n) { - n = SX_SHARERS(x); lock_delay_spin(n); x = SX_READ_VALUE(sx); - if ((x & SX_LOCK_SHARED) == 0 || - SX_SHARERS(x) == 0) + if (!(x & SX_LOCK_WRITE_SPINNER)) break; + if (!(x & SX_LOCK_SHARED)) + break; + n = SX_SHARERS(x); + if (n == 0) + break; } #ifdef KDTRACE_HOOKS lda.spin_cnt += i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i < asx_loops) continue; } sleepq: #endif sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); retry_sleepq: /* * If the lock was released while spinning on the * sleep queue chain lock, try again. */ if (x == SX_LOCK_UNLOCKED) { sleepq_release(&sx->lock_object); + sx_drop_critical(x, &in_critical, &extra_work); continue; } #ifdef ADAPTIVE_SX /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the sleep queue * chain lock. If so, drop the sleep queue lock and try * again. 
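	 *
	 * Condensed (illustrative) form of the revalidation done below:
	 *
	 *	ok_to_sleep = (x & SX_LOCK_SHARED) == 0 ?
	 *	    !TD_IS_RUNNING(owner) :
	 *	    (SX_SHARERS(x) == 0 || sleep_reason != WRITER)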
*/ if (adaptive) { if (!(x & SX_LOCK_SHARED)) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); + sx_drop_critical(x, &in_critical, + &extra_work); continue; } } else if (SX_SHARERS(x) > 0 && sleep_reason == WRITER) { sleepq_release(&sx->lock_object); + sx_drop_critical(x, &in_critical, &extra_work); continue; } } #endif /* * If an exclusive lock was released with both shared * and exclusive waiters and a shared waiter hasn't * woken up and acquired the lock yet, sx_lock will be * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS. * If we see that value, try to acquire it once. Note * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS * as there are other exclusive waiters still. If we * fail, restart the loop. */ - if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) { - if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, - tid | SX_LOCK_EXCLUSIVE_WAITERS)) + setx = x & (SX_LOCK_WAITERS | SX_LOCK_WRITE_SPINNER); + if ((x & ~setx) == SX_LOCK_SHARED) { + setx &= ~SX_LOCK_WRITE_SPINNER; + if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid | setx)) goto retry_sleepq; sleepq_release(&sx->lock_object); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, sx); break; } +#ifdef ADAPTIVE_SX /* - * Try to set the SX_LOCK_EXCLUSIVE_WAITERS. If we fail, - * than loop back and retry. + * It is possible we set the SX_LOCK_WRITE_SPINNER bit. + * It is an invariant that when the bit is set, there is + * a writer ready to grab the lock. Thus clear the bit since + * we are going to sleep. */ - if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { - if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, - x | SX_LOCK_EXCLUSIVE_WAITERS)) { - goto retry_sleepq; + if (in_critical) { + if ((x & SX_LOCK_WRITE_SPINNER) || + !((x & SX_LOCK_EXCLUSIVE_WAITERS))) { + setx = x & ~SX_LOCK_WRITE_SPINNER; + setx |= SX_LOCK_EXCLUSIVE_WAITERS; + if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, + setx)) { + goto retry_sleepq; + } } - if (LOCK_LOG_TEST(&sx->lock_object, 0)) - CTR2(KTR_LOCK, "%s: %p set excl waiters flag", - __func__, sx); + critical_exit(); + in_critical = false; + } else { +#endif + /* + * Try to set the SX_LOCK_EXCLUSIVE_WAITERS. If we fail, + * than loop back and retry. + */ + if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { + if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, + x | SX_LOCK_EXCLUSIVE_WAITERS)) { + goto retry_sleepq; + } + if (LOCK_LOG_TEST(&sx->lock_object, 0)) + CTR2(KTR_LOCK, "%s: %p set excl waiters flag", + __func__, sx); + } +#ifdef ADAPTIVE_SX } +#endif /* * Since we have been unable to acquire the exclusive * lock and the exclusive waiters flag is set, we have * to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? 
SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); x = SX_READ_VALUE(sx); } -#if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!extra_work)) return (error); +#ifdef ADAPTIVE_SX + if (in_critical) + critical_exit(); #endif +#if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) + if (__predict_true(!doing_lockprof)) + return (error); +#endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); out_lockstat: #endif if (!error) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_WRITER); GIANT_RESTORE(); return (error); } /* * This function represents the so-called 'hard case' for sx_xunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_xunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { uintptr_t tid, setx; int queue, wakeup_swapper; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; if (__predict_false(x == tid)) x = SX_READ_VALUE(sx); MPASS(!(x & SX_LOCK_SHARED)); if (__predict_false(x & SX_LOCK_RECURSED)) { /* The lock is recursed, unrecurse one level. */ if ((--sx->sx_recurse) == 0) atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx); return; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_WRITER); if (x == tid && atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED)) return; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, sx); sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); MPASS(x & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)); /* * The wake up algorithm here is quite simple and probably not * ideal. It gives precedence to shared waiters if they are * present. For this condition, we have to preserve the * state of the exclusive waiters flag. * If interruptible sleeps left the shared queue empty avoid a * starvation for the threads sleeping on the exclusive queue by giving * them precedence and cleaning up the shared waiters bit anyway. */ setx = SX_LOCK_UNLOCKED; - queue = SQ_EXCLUSIVE_QUEUE; - if ((x & SX_LOCK_SHARED_WAITERS) != 0 && - sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) { - queue = SQ_SHARED_QUEUE; - setx |= (x & SX_LOCK_EXCLUSIVE_WAITERS); + queue = SQ_SHARED_QUEUE; + if ((x & SX_LOCK_EXCLUSIVE_WAITERS) != 0 && + sleepq_sleepcnt(&sx->lock_object, SQ_EXCLUSIVE_QUEUE) != 0) { + queue = SQ_EXCLUSIVE_QUEUE; + setx |= (x & SX_LOCK_SHARED_WAITERS); } atomic_store_rel_ptr(&sx->sx_lock, setx); /* Wake up all the waiters for the specific queue. 
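	 *
	 * With this revision the choice reduces to (illustrative
	 * restatement of the selection above):
	 *
	 *	if ((x & SX_LOCK_EXCLUSIVE_WAITERS) != 0 &&
	 *	    sleepq_sleepcnt(..., SQ_EXCLUSIVE_QUEUE) != 0)
	 *		queue = SQ_EXCLUSIVE_QUEUE;	(writers preferred)
	 *	else
	 *		queue = SQ_SHARED_QUEUE;
	 *
	 * and SX_LOCK_SHARED_WAITERS is preserved only when a writer is
	 * woken, since the shared queue is drained whenever it is chosen.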
*/ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue", __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, queue); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); } static bool __always_inline -__sx_slock_try(struct sx *sx, uintptr_t *xp LOCK_FILE_LINE_ARG_DEF) +__sx_can_read(struct thread *td, uintptr_t x, bool fp) { + if ((x & (SX_LOCK_SHARED | SX_LOCK_EXCLUSIVE_WAITERS | SX_LOCK_WRITE_SPINNER)) + == SX_LOCK_SHARED) + return (true); + if (!fp && td->td_sx_slocks && (x & SX_LOCK_SHARED)) + return (true); + return (false); +} + +static bool __always_inline +__sx_slock_try(struct sx *sx, struct thread *td, uintptr_t *xp, bool fp + LOCK_FILE_LINE_ARG_DEF) +{ + /* * If no other thread has an exclusive lock then try to bump up * the count of sharers. Since we have to preserve the state * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the * shared lock loop back and retry. */ - while (*xp & SX_LOCK_SHARED) { - MPASS(!(*xp & SX_LOCK_SHARED_WAITERS)); + while (__sx_can_read(td, *xp, fp)) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, xp, *xp + SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, sx, (void *)*xp, (void *)(*xp + SX_ONE_SHARER)); + td->td_sx_slocks++; return (true); } } return (false); } static int __noinline _sx_slock_hard(struct sx *sx, int opts, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { GIANT_DECLARE; + struct thread *td; #ifdef ADAPTIVE_SX volatile struct thread *owner; + u_int i, n, spintries = 0; bool adaptive; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state = 0; #endif int extra_work = 0; + td = curthread; + #ifdef KDTRACE_HOOKS if (LOCKSTAT_PROFILE_ENABLED(sx__acquire)) { - if (__sx_slock_try(sx, &x LOCK_FILE_LINE_ARG)) + if (__sx_slock_try(sx, td, &x, false LOCK_FILE_LINE_ARG)) goto out_lockstat; extra_work = 1; all_time -= lockstat_nsecs(&sx->lock_object); state = x; } #endif #ifdef LOCK_PROFILING extra_work = 1; state = x; #endif if (SCHEDULER_STOPPED()) return (0); #if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init(&lda, NULL); #endif #ifdef ADAPTIVE_SX adaptive = ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifndef INVARIANTS GIANT_SAVE(extra_work); #endif /* * As with rwlocks, we don't make any attempt to try to block * shared locks once there is an exclusive waiter. */ for (;;) { - if (__sx_slock_try(sx, &x LOCK_FILE_LINE_ARG)) + if (__sx_slock_try(sx, td, &x, false LOCK_FILE_LINE_ARG)) break; #ifdef INVARIANTS GIANT_SAVE(extra_work); #endif #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_SX if (__predict_false(!adaptive)) goto sleepq; + /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. 
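	 *
	 * Whether this reader may take the lock at all is now governed by
	 * __sx_can_read() above; restated (illustrative):
	 *
	 *	(x & (SX_LOCK_SHARED | SX_LOCK_EXCLUSIVE_WAITERS |
	 *	    SX_LOCK_WRITE_SPINNER)) == SX_LOCK_SHARED
	 *
	 * with the slow path also admitting a thread whose td_sx_slocks
	 * count shows it already holds shared sx locks, so that a reader
	 * taking the same lock twice cannot deadlock against the new
	 * writer preference.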
*/ - owner = lv_sx_owner(x); - if (TD_IS_RUNNING(owner)) { - if (LOCK_LOG_TEST(&sx->lock_object, 0)) - CTR3(KTR_LOCK, - "%s: spinning on %p held by %p", - __func__, sx, owner); - KTR_STATE1(KTR_SCHED, "thread", - sched_tdname(curthread), "spinning", - "lockname:\"%s\"", sx->lock_object.lo_name); - do { - lock_delay(&lda); + if ((x & SX_LOCK_SHARED) == 0) { + owner = lv_sx_owner(x); + if (TD_IS_RUNNING(owner)) { + if (LOCK_LOG_TEST(&sx->lock_object, 0)) + CTR3(KTR_LOCK, + "%s: spinning on %p held by %p", + __func__, sx, owner); + KTR_STATE1(KTR_SCHED, "thread", + sched_tdname(curthread), "spinning", + "lockname:\"%s\"", sx->lock_object.lo_name); + do { + lock_delay(&lda); + x = SX_READ_VALUE(sx); + owner = lv_sx_owner(x); + } while (owner != NULL && TD_IS_RUNNING(owner)); + KTR_STATE0(KTR_SCHED, "thread", + sched_tdname(curthread), "running"); + continue; + } + } else { + if ((x & SX_LOCK_WRITE_SPINNER) && SX_SHARERS(x) == 0) { + MPASS(!__sx_can_read(td, x, false)); + lock_delay_spin(2); x = SX_READ_VALUE(sx); - owner = lv_sx_owner(x); - } while (owner != NULL && TD_IS_RUNNING(owner)); - KTR_STATE0(KTR_SCHED, "thread", - sched_tdname(curthread), "running"); - continue; + continue; + } + if (spintries < asx_retries) { + KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), + "spinning", "lockname:\"%s\"", + sx->lock_object.lo_name); + n = SX_SHARERS(x); + for (i = 0; i < asx_loops; i += n) { + lock_delay_spin(n); + x = SX_READ_VALUE(sx); + if (!(x & SX_LOCK_SHARED)) + break; + n = SX_SHARERS(x); + if (n == 0) + break; + if (__sx_can_read(td, x, false)) + break; + } +#ifdef KDTRACE_HOOKS + lda.spin_cnt += i; +#endif + KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), + "running"); + if (i < asx_loops) + continue; + } } sleepq: #endif /* * Some other thread already has an exclusive lock, so * start the process of blocking. */ sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); retry_sleepq: - /* - * The lock could have been released while we spun. - * In this case loop back and retry. - */ - if (x & SX_LOCK_SHARED) { + if (((x & SX_LOCK_WRITE_SPINNER) && SX_SHARERS(x) == 0) || + __sx_can_read(td, x, false)) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if (!(x & SX_LOCK_SHARED) && adaptive) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); x = SX_READ_VALUE(sx); continue; } } #endif /* * Try to set the SX_LOCK_SHARED_WAITERS flag. If we * fail to set it drop the sleep queue lock and loop * back. */ if (!(x & SX_LOCK_SHARED_WAITERS)) { if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, x | SX_LOCK_SHARED_WAITERS)) goto retry_sleepq; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set shared waiters flag", __func__, sx); } /* * Since we have been unable to acquire the shared lock, * we have to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? 
SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); x = SX_READ_VALUE(sx); } #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!extra_work)) return (error); #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); out_lockstat: #endif if (error == 0) { LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_READER); } GIANT_RESTORE(); return (error); } int _sx_slock_int(struct sx *sx, int opts LOCK_FILE_LINE_ARG_DEF) { + struct thread *td; uintptr_t x; int error; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("sx_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_slock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL); error = 0; + td = curthread; x = SX_READ_VALUE(sx); if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__acquire) || - !__sx_slock_try(sx, &x LOCK_FILE_LINE_ARG))) + !__sx_slock_try(sx, td, &x, true LOCK_FILE_LINE_ARG))) error = _sx_slock_hard(sx, opts, x LOCK_FILE_LINE_ARG); else lock_profile_obtain_lock_success(&sx->lock_object, 0, 0, file, line); if (error == 0) { LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line); WITNESS_LOCK(&sx->lock_object, 0, file, line); TD_LOCKS_INC(curthread); } return (error); } int _sx_slock(struct sx *sx, int opts, const char *file, int line) { return (_sx_slock_int(sx, opts LOCK_FILE_LINE_ARG)); } static bool __always_inline -_sx_sunlock_try(struct sx *sx, uintptr_t *xp) +_sx_sunlock_try(struct sx *sx, struct thread *td, uintptr_t *xp) { for (;;) { - /* - * We should never have sharers while at least one thread - * holds a shared lock. - */ - KASSERT(!(*xp & SX_LOCK_SHARED_WAITERS), - ("%s: waiting sharers", __func__)); - - /* - * See if there is more than one shared lock held. If - * so, just drop one and return. - */ - if (SX_SHARERS(*xp) > 1) { + if (SX_SHARERS(*xp) > 1 || !(*xp & SX_LOCK_WAITERS)) { if (atomic_fcmpset_rel_ptr(&sx->sx_lock, xp, *xp - SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, sx, (void *)*xp, (void *)(*xp - SX_ONE_SHARER)); + td->td_sx_slocks--; return (true); } continue; } - - /* - * If there aren't any waiters for an exclusive lock, - * then try to drop it quickly. 
- */ - if (!(*xp & SX_LOCK_EXCLUSIVE_WAITERS)) { - MPASS(*xp == SX_SHARERS_LOCK(1)); - *xp = SX_SHARERS_LOCK(1); - if (atomic_fcmpset_rel_ptr(&sx->sx_lock, - xp, SX_LOCK_UNLOCKED)) { - if (LOCK_LOG_TEST(&sx->lock_object, 0)) - CTR2(KTR_LOCK, "%s: %p last succeeded", - __func__, sx); - return (true); - } - continue; - } break; } return (false); } static void __noinline -_sx_sunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF) +_sx_sunlock_hard(struct sx *sx, struct thread *td, uintptr_t x + LOCK_FILE_LINE_ARG_DEF) { int wakeup_swapper = 0; - uintptr_t setx; + uintptr_t setx, queue; if (SCHEDULER_STOPPED()) return; - if (_sx_sunlock_try(sx, &x)) + if (_sx_sunlock_try(sx, td, &x)) goto out_lockstat; - /* - * At this point, there should just be one sharer with - * exclusive waiters. - */ - MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS)); - sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); for (;;) { - MPASS(x & SX_LOCK_EXCLUSIVE_WAITERS); - MPASS(!(x & SX_LOCK_SHARED_WAITERS)); - if (_sx_sunlock_try(sx, &x)) + if (_sx_sunlock_try(sx, td, &x)) break; /* * Wake up semantic here is quite simple: * Just wake up all the exclusive waiters. * Note that the state of the lock could have changed, * so if it fails loop back and retry. */ - setx = x - SX_ONE_SHARER; - setx &= ~SX_LOCK_EXCLUSIVE_WAITERS; + setx = SX_LOCK_UNLOCKED; + queue = SQ_SHARED_QUEUE; + if (x & SX_LOCK_EXCLUSIVE_WAITERS) { + setx |= (x & SX_LOCK_SHARED_WAITERS); + queue = SQ_EXCLUSIVE_QUEUE; + } + setx |= (x & SX_LOCK_WRITE_SPINNER); if (!atomic_fcmpset_rel_ptr(&sx->sx_lock, &x, setx)) continue; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p waking up all thread on" "exclusive queue", __func__, sx); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, - 0, SQ_EXCLUSIVE_QUEUE); + 0, queue); + td->td_sx_slocks--; break; } sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); out_lockstat: LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_READER); } void _sx_sunlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { + struct thread *td; uintptr_t x; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_sunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line); + td = curthread; x = SX_READ_VALUE(sx); if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__release) || - !_sx_sunlock_try(sx, &x))) - _sx_sunlock_hard(sx, x LOCK_FILE_LINE_ARG); + !_sx_sunlock_try(sx, td, &x))) + _sx_sunlock_hard(sx, td, x LOCK_FILE_LINE_ARG); else lock_profile_release_lock(&sx->lock_object); TD_LOCKS_DEC(curthread); } void _sx_sunlock(struct sx *sx, const char *file, int line) { _sx_sunlock_int(sx LOCK_FILE_LINE_ARG); } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef _sx_assert #endif /* * In the non-WITNESS case, sx_assert() can only detect that at least * *some* thread owns an slock, but it cannot guarantee that *this* * thread owns an slock. 
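 */

/*
 * Usage sketch (an editorial aside, not part of this revision): a helper
 * documenting its locking contract with sx_assert().  Per the comment
 * above, without WITNESS the SA_SLOCKED check can only prove that some
 * thread holds the lock shared, not that the caller does.
 */
static int
example_locked_read(struct sx *lock, int *data)
{

	sx_assert(lock, SA_SLOCKED);	/* Caller must hold the lock shared. */
	return (*data);
}

/*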
 */
void
_sx_assert(const struct sx *sx, int what, const char *file, int line)
{
#ifndef WITNESS
	int slocked = 0;
#endif

	if (panicstr != NULL)
		return;
	switch (what) {
	case SA_SLOCKED:
	case SA_SLOCKED | SA_NOTRECURSED:
	case SA_SLOCKED | SA_RECURSED:
#ifndef WITNESS
		slocked = 1;
		/* FALLTHROUGH */
#endif
	case SA_LOCKED:
	case SA_LOCKED | SA_NOTRECURSED:
	case SA_LOCKED | SA_RECURSED:
#ifdef WITNESS
		witness_assert(&sx->lock_object, what, file, line);
#else
		/*
		 * If some other thread has an exclusive lock or we
		 * have one and are asserting a shared lock, fail.
		 * Also, if no one has a lock at all, fail.
		 */
		if (sx->sx_lock == SX_LOCK_UNLOCKED ||
		    (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked ||
		    sx_xholder(sx) != curthread)))
			panic("Lock %s not %slocked @ %s:%d\n",
			    sx->lock_object.lo_name, slocked ? "share " : "",
			    file, line);

		if (!(sx->sx_lock & SX_LOCK_SHARED)) {
			if (sx_recursed(sx)) {
				if (what & SA_NOTRECURSED)
					panic("Lock %s recursed @ %s:%d\n",
					    sx->lock_object.lo_name, file,
					    line);
			} else if (what & SA_RECURSED)
				panic("Lock %s not recursed @ %s:%d\n",
				    sx->lock_object.lo_name, file, line);
		}
#endif
		break;
	case SA_XLOCKED:
	case SA_XLOCKED | SA_NOTRECURSED:
	case SA_XLOCKED | SA_RECURSED:
		if (sx_xholder(sx) != curthread)
			panic("Lock %s not exclusively locked @ %s:%d\n",
			    sx->lock_object.lo_name, file, line);
		if (sx_recursed(sx)) {
			if (what & SA_NOTRECURSED)
				panic("Lock %s recursed @ %s:%d\n",
				    sx->lock_object.lo_name, file, line);
		} else if (what & SA_RECURSED)
			panic("Lock %s not recursed @ %s:%d\n",
			    sx->lock_object.lo_name, file, line);
		break;
	case SA_UNLOCKED:
#ifdef WITNESS
		witness_assert(&sx->lock_object, what, file, line);
#else
		/*
		 * If we hold an exclusive lock, fail.  We can't
		 * reliably check to see if we hold a shared lock or
		 * not.
		 */
		if (sx_xholder(sx) == curthread)
			panic("Lock %s exclusively locked @ %s:%d\n",
			    sx->lock_object.lo_name, file, line);
#endif
		break;
	default:
		panic("Unknown sx lock assertion: %d @ %s:%d", what, file,
		    line);
	}
}
#endif	/* INVARIANT_SUPPORT */

#ifdef DDB
static void
db_show_sx(const struct lock_object *lock)
{
	struct thread *td;
	const struct sx *sx;

	sx = (const struct sx *)lock;

	db_printf(" state: ");
	if (sx->sx_lock == SX_LOCK_UNLOCKED)
		db_printf("UNLOCKED\n");
	else if (sx->sx_lock == SX_LOCK_DESTROYED) {
		db_printf("DESTROYED\n");
		return;
	} else if (sx->sx_lock & SX_LOCK_SHARED)
		db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock));
	else {
		td = sx_xholder(sx);
		db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
		    td->td_tid, td->td_proc->p_pid, td->td_name);
		if (sx_recursed(sx))
			db_printf(" recursed: %d\n", sx->sx_recurse);
	}

	db_printf(" waiters: ");
	switch(sx->sx_lock &
	    (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) {
	case SX_LOCK_SHARED_WAITERS:
		db_printf("shared\n");
		break;
	case SX_LOCK_EXCLUSIVE_WAITERS:
		db_printf("exclusive\n");
		break;
	case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS:
		db_printf("exclusive and shared\n");
		break;
	default:
		db_printf("none\n");
	}
}

/*
 * Check to see if a thread that is blocked on a sleep queue is actually
 * blocked on an sx lock.  If so, output some details and return true.
 * If the lock has an exclusive owner, return that in *ownerp.
 */
int
sx_chain(struct thread *td, struct thread **ownerp)
{
	struct sx *sx;

	/*
	 * Check to see if this thread is blocked on an sx lock.
	 * First, we check the lock class.  If that is ok, then we
	 * compare the lock name against the wait message.
*/ sx = td->td_wchan; if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx || sx->lock_object.lo_name != td->td_wmesg) return (0); /* We think we have an sx lock, so output some details. */ db_printf("blocked on sx \"%s\" ", td->td_wmesg); *ownerp = sx_xholder(sx); if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK (count %ju)\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else db_printf("XLOCK\n"); return (1); } #endif Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 334023) +++ head/sys/kern/subr_trap.c (revision 334024) @@ -1,353 +1,356 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007 The FreeBSD Foundation * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Portions of this software were developed by A. Joseph Koshy under * sponsorship from the FreeBSD Foundation and Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_ktrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #ifdef VIMAGE #include #endif #ifdef HWPMC_HOOKS #include #endif #include void (*softdep_ast_cleanup)(struct thread *); /* * Define the code needed before returning to user mode, for trap and * syscall. 
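 */

/*
 * Editorial sketch (not part of this revision) of the accounting behind
 * the new td_sx_slocks assertion in userret() below: every shared sx
 * acquire increments the per-thread counter and every release or upgrade
 * decrements it, so the count must be zero on return to user mode or a
 * shared sx lock was leaked.
 */
static void
example_balanced_slock(struct sx *lock)
{

	sx_slock(lock);		/* Bumps curthread->td_sx_slocks. */
	sx_sunlock(lock);	/* Drops it again. */
	MPASS(curthread->td_sx_slocks == 0);	/* Assumes no other slocks held. */
}

/*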
 */
void
userret(struct thread *td, struct trapframe *frame)
{
	struct proc *p = td->td_proc;

	CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid,
	    td->td_name);
	KASSERT((p->p_flag & P_WEXIT) == 0,
	    ("Exiting process returns to usermode"));
#ifdef DIAGNOSTIC
	/*
	 * Check that we called signotify() enough.  For
	 * multi-threaded processes, where signal distribution might
	 * change due to other threads changing sigmask, the check is
	 * racy and cannot be performed reliably.
	 * If current process is vfork child, indicated by P_PPWAIT, then
	 * issignal() ignores stops, so we block the check to avoid
	 * classifying pending signals.
	 */
	if (p->p_numthreads == 1) {
		PROC_LOCK(p);
		thread_lock(td);
		if ((p->p_flag & P_PPWAIT) == 0) {
			KASSERT(!SIGPENDING(td) || (td->td_flags &
			    (TDF_NEEDSIGCHK | TDF_ASTPENDING)) ==
			    (TDF_NEEDSIGCHK | TDF_ASTPENDING),
			    ("failed to set signal flags for ast p %p "
			    "td %p fl %x", p, td, td->td_flags));
		}
		thread_unlock(td);
		PROC_UNLOCK(p);
	}
#endif
#ifdef KTRACE
	KTRUSERRET(td);
#endif
	td_softdep_cleanup(td);
	MPASS(td->td_su == NULL);

	/*
	 * If this thread tickled GEOM, we need to wait for the giggling to
	 * stop before we return to userland.
	 */
	if (td->td_pflags & TDP_GEOM)
		g_waitidle();

	/*
	 * Charge system time if profiling.
	 */
	if (p->p_flag & P_PROFIL)
		addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio);

	/*
	 * Let the scheduler adjust our priority etc.
	 */
	sched_userret(td);

	/*
	 * Check for misbehavior.
	 *
	 * In case there is callchain tracing ongoing because of hwpmc(4),
	 * skip the scheduler pinning check.  The hwpmc(4) subsystem, in
	 * fact, will collect callchain information at the ast()
	 * checkpoint, which is past userret().
	 */
	WITNESS_WARN(WARN_PANIC, NULL, "userret: returning");
	KASSERT(td->td_critnest == 0,
	    ("userret: Returning in a critical section"));
	KASSERT(td->td_epochnest == 0,
	    ("userret: Returning in an epoch section"));
	KASSERT(td->td_locks == 0,
	    ("userret: Returning with %d locks held", td->td_locks));
	KASSERT(td->td_rw_rlocks == 0,
	    ("userret: Returning with %d rwlocks held in read mode",
	    td->td_rw_rlocks));
+	KASSERT(td->td_sx_slocks == 0,
+	    ("userret: Returning with %d sx locks held in shared mode",
+	    td->td_sx_slocks));
	KASSERT((td->td_pflags & TDP_NOFAULTING) == 0,
	    ("userret: Returning with pagefaults disabled"));
	KASSERT(td->td_no_sleeping == 0,
	    ("userret: Returning with sleep disabled"));
	KASSERT(td->td_pinned == 0 || (td->td_pflags & TDP_CALLCHAIN) != 0,
	    ("userret: Returning with pinned thread"));
	KASSERT(td->td_vp_reserv == 0,
	    ("userret: Returning while holding vnode reservation"));
	KASSERT((td->td_flags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) ==
	    0, ("userret: Returning with stop signals deferred"));
	KASSERT(td->td_su == NULL,
	    ("userret: Returning with SU cleanup request not handled"));
	KASSERT(td->td_vslock_sz == 0,
	    ("userret: Returning with vslock-wired space"));
#ifdef VIMAGE
	/* Unfortunately td_vnet_lpush needs VNET_DEBUG. */
	VNET_ASSERT(curvnet == NULL,
	    ("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s",
	    __func__, td, p->p_pid, td->td_name, curvnet,
	    (td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
#endif
#ifdef RACCT
	if (racct_enable && p->p_throttled != 0) {
		PROC_LOCK(p);
		while (p->p_throttled != 0) {
			msleep(p->p_racct, &p->p_mtx, 0, "racct",
			    p->p_throttled < 0 ? 0 : p->p_throttled);
			if (p->p_throttled > 0)
				p->p_throttled = 0;
		}
		PROC_UNLOCK(p);
	}
#endif
}

/*
 * Process an asynchronous software trap.
 * This is relatively easy.
 * This function will return with preemption disabled.
*/ void ast(struct trapframe *framep) { struct thread *td; struct proc *p; int flags; int sig; td = curthread; p = td->td_proc; CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); td->td_frame = framep; td->td_pticks = 0; /* * This updates the td_flag's for the checks below in one * "atomic" operation with turning off the astpending flag. * If another AST is triggered while we are handling the * AST's saved in flags, the astpending flag will be set and * ast() will be called again. */ thread_lock(td); flags = td->td_flags; td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK | TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND); thread_unlock(td); VM_CNT_INC(v_trap); if (td->td_cowgen != p->p_cowgen) thread_cow_update(td); if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) { addupc_task(td, td->td_profil_addr, td->td_profil_ticks); td->td_profil_ticks = 0; td->td_pflags &= ~TDP_OWEUPC; } #ifdef HWPMC_HOOKS /* Handle Software PMC callchain capture. */ if (PMC_IS_PENDING_CALLCHAIN(td)) PMC_CALL_HOOK_UNLOCKED(td, PMC_FN_USER_CALLCHAIN_SOFT, (void *) framep); #endif if (flags & TDF_ALRMPEND) { PROC_LOCK(p); kern_psignal(p, SIGVTALRM); PROC_UNLOCK(p); } if (flags & TDF_PROFPEND) { PROC_LOCK(p); kern_psignal(p, SIGPROF); PROC_UNLOCK(p); } #ifdef MAC if (flags & TDF_MACPEND) mac_thread_userret(td); #endif if (flags & TDF_NEEDRESCHED) { #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1, __func__); #endif thread_lock(td); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL); thread_unlock(td); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1, __func__); #endif } #ifdef DIAGNOSTIC if (p->p_numthreads == 1 && (flags & TDF_NEEDSIGCHK) == 0) { PROC_LOCK(p); thread_lock(td); /* * Note that TDF_NEEDSIGCHK should be re-read from * td_flags, since signal might have been delivered * after we cleared td_flags above. This is one of * the reason for looping check for AST condition. * See comment in userret() about P_PPWAIT. */ if ((p->p_flag & P_PPWAIT) == 0) { KASSERT(!SIGPENDING(td) || (td->td_flags & (TDF_NEEDSIGCHK | TDF_ASTPENDING)) == (TDF_NEEDSIGCHK | TDF_ASTPENDING), ("failed2 to set signal flags for ast p %p td %p " "fl %x %x", p, td, flags, td->td_flags)); } thread_unlock(td); PROC_UNLOCK(p); } #endif /* * Check for signals. Unlocked reads of p_pendingcnt or * p_siglist might cause process-directed signal to be handled * later. */ if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 || !SIGISEMPTY(p->p_siglist)) { PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } /* * We need to check to see if we have to exit or wait due to a * single threading requirement or some other STOP condition. 
	 */
*/ if (flags & TDF_NEEDSUSPCHK) { PROC_LOCK(p); thread_suspend_check(0); PROC_UNLOCK(p); } if (td->td_pflags & TDP_OLDMASK) { td->td_pflags &= ~TDP_OLDMASK; kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0); } userret(td, framep); } const char * syscallname(struct proc *p, u_int code) { static const char unknown[] = "unknown"; struct sysentvec *sv; sv = p->p_sysent; if (sv->sv_syscallnames == NULL || code >= sv->sv_size) return (unknown); return (sv->sv_syscallnames[code]); } Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 334023) +++ head/sys/sys/proc.h (revision 334024) @@ -1,1176 +1,1177 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #ifndef _KERNEL #include #endif #include #include #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #include #endif #include #include #include #include #include /* Machine-dependent proc substruct. */ #ifdef _KERNEL #include #endif /* * A section object may be passed to every begin-end pair to allow for * forward progress guarantees with-in prolonged active sections. * * We can't include ck_epoch.h so we define our own variant here and * then CTASSERT that it's the same size in subr_epoch.c */ struct epoch_section { unsigned int bucket; }; typedef struct epoch_section epoch_section_t; /* * One structure allocated per session. 
* * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { u_int s_count; /* Ref cnt; pgrps in session - atomic. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */ struct tty *s_ttyp; /* (e) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ int pg_jobc; /* (m) Job control process count. */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * kx- only accessed by curthread and by debugger * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * s - see sleepq_switch(), sleeping_on_old_rtc(), and sleep(9) * t - thread lock * u - process stat lock * w - process timer lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * z - zombie threads lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. 
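 *
 * For example (illustrative), p_pptr is keyed "c + e":
 *
 *	PROC_LOCK(p);		(either lock alone is enough to read)
 *	pp = p->p_pptr;
 *	PROC_UNLOCK(p);
 *
 * while changing p_pptr requires holding both the proc mutex and
 * proctree_lock exclusively.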
*/ struct cpuset; struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; struct kdtrace_proc; struct kdtrace_thread; struct mqueue_notifier; struct nlminfo; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct socket; struct syscall_args; struct td_sched; struct thread; struct trapframe; struct turnstile; struct vm_map; struct vm_map_entry; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cu) Real time. */ uint64_t rux_uticks; /* (cu) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cu) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cu) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. */ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct domainset_ref td_domain; /* (a) NUMA policy */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ /* Cleared during fork1() */ #define td_startzero td_flags u_char td_epochnest; /* (k) Private thread epoch nest counter */ int td_flags; /* (t) TDF_* flags. */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ short td_locks; /* (k) Debug: count of non-spin locks */ short td_rw_rlocks; /* (k) Count of rwlock read locks. */ + short td_sx_slocks; /* (k) Count of sx shared locks. */ short td_lk_slocks; /* (k) Count of lockmgr shared locks. */ short td_stopsched; /* (k) Scheduler stopped. */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. 
*/ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_ucred; /* (k) Reference to credentials. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ int td_swinvoltick; /* (t) Time at last SW_INVOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ siginfo_t td_si; /* (c) For debugger or core file */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ size_t td_vslock_sz; /* (k) amount of vslock-ed space */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ #define td_startcopy td_endzero sigset_t td_sigmask; /* (c) Current signal mask. */ u_char td_rqindex; /* (t) Run queue index. */ u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ u_char td_pre_epoch_prio; /* (k) User pri on entry to epoch */ uintptr_t td_rb_list; /* (k) Robust list head. */ uintptr_t td_rbp_list; /* (k) Robust priv list head. */ uintptr_t td_rb_inact; /* (k) Current in-action mutex loc. */ struct syscall_args td_sa; /* (kx) Syscall parameters. Copied on fork for child tracing. */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1() or create_thread() * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ union { register_t tdu_retval[2]; off_t tdu_off; } td_uretoff; /* (k) Syscall aux returns. 
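 *	   As an illustration (hypothetical handler, not part of this
 *	   file): a syscall that returns a 64-bit file offset stores it
 *	   via td->td_uretoff.tdu_off = off, while most syscalls set
 *	   td_retval[0]; the machine-dependent cpu_set_syscall_retval()
 *	   later copies these values into the return registers of the
 *	   trapframe.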
*/ #define td_retval td_uretoff.tdu_retval u_int td_cowgen; /* (k) Generation of COW pointers. */ /* LP64 hole */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ int td_errno; /* Error returned by last syscall. */ /* LP64 hole */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ int td_ma_cnt; /* (k) size of *td_ma */ /* LP64 hole */ void *td_emuldata; /* Emulator state data */ int td_lastcpu; /* (t) Last cpu we were on. */ int td_oncpu; /* (t) Which cpu we are on. */ void *td_lkpi_task; /* LinuxKPI task struct pointer */ TAILQ_ENTRY(thread) td_epochq; /* (t) Epoch queue. */ epoch_section_t td_epoch_section; /* (t) epoch section object */ }; struct thread0_storage { struct thread t0st_thread; uint64_t t0st_sched[10]; }; struct mtx *thread_lock_block(struct thread *); void thread_lock_unblock(struct thread *, struct mtx *); void thread_lock_set(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m = (td)->td_lock; \ KASSERT((__m == &blocked_lock || __m == (lock)), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) do { \ KASSERT(SCHEDULER_STOPPED_TD(td) || (td)->td_locks > 0, \ ("thread %p owns no locks", (td))); \ (td)->td_locks--; \ } while (0) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_CANSWAP 0x00000040 /* Thread can be swapped. */ #define TDF_SLEEPABORT 0x00000080 /* sleepq_abort was called. */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_ASTPENDING 0x00000800 /* Thread has some asynchronous events. */ #define TDF_UNUSED12 0x00001000 /* --available-- */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_NEEDSUSPCHK 0x00008000 /* Thread may need to suspend. */ #define TDF_NEEDRESCHED 0x00010000 /* Thread needs to yield. 
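 *	   Per the "scheduler lock" rule above, an illustrative sketch
 *	   of requesting a reschedule (not part of this file):
 *
 *		thread_lock(td);
 *		td->td_flags |= TDF_NEEDRESCHED;
 *		thread_unlock(td);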
*/ #define TDF_NEEDSIGCHK 0x00020000 /* Thread may need signal delivery. */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_SERESTART 0x00080000 /* ERESTART on stop attempts. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED23 0x00800000 /* --available-- */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_ALRMPEND 0x10000000 /* Pending SIGVTALRM needs to be posted. */ #define TDF_PROFPEND 0x20000000 /* Pending SIGPROF needs to be posted. */ #define TDF_MACPEND 0x40000000 /* AST-based MAC event pending. */ /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ #define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */ #define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock acquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_UNUSED9 0x00000100 /* --available-- */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. 
*/ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ #define TDP_FORKING 0x20000000 /* Thread is being created through fork() */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \ ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \ ((td)->td_inhibitors & TDI_IWAIT) != 0 ? 
"iwait" : "yielding") #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) (td)->td_state = TDS_RUNNING #define TD_SET_RUNQ(td) (td)->td_state = TDS_RUNQ #define TD_SET_CAN_RUN(td) (td)->td_state = TDS_CAN_RUN #define TD_SBDRY_INTR(td) \ (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0) #define TD_SBDRY_ERRNO(td) \ (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART) /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Resource limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct proc *p_reaper; /* (e) My reaper. */ LIST_HEAD(, proc) p_reaplist; /* (e) List of my descendants (if I am reaper). */ LIST_ENTRY(proc) p_reapsibling; /* (e) List of siblings - descendants of the same reaper. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct mtx p_statmtx; /* Lock for the stats */ struct mtx p_itimmtx; /* Lock for the virt/prof timers */ struct mtx p_profmtx; /* Lock for the profiling */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ u_int p_cowgen; /* (c) Generation of COW pointers. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cu) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. 
*/ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ u_int p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ u_int p_ptevents; /* (c) ptrace() event mask. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ int p_pendingexits; /* (c) Count of pending thread exits. */ struct filemon *p_filemon; /* (c) filemon-specific data. */ int p_pdeathsig; /* (c) Signal from parent on exit. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ pid_t p_reapsubtree; /* (e) Pid of the direct child of the reaper which spawned our subtree. */ uint16_t p_elf_machine; /* (x) ELF machine type */ uint64_t p_elf_flags; /* (x) ELF flags */ /* End area that is copied on creation. */ #define p_endcopy p_xexit u_int p_xexit; /* (c) Exit code. */ u_int p_xsig; /* (c) Stop/kill sig. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct knlist *p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ struct cv p_dbgwait; /* (*) wait cv for debugger attach after fork. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. 
*/ int p_throttled; /* (c) Flag for racct pcpu throttling */ /* * An orphan is the child that has been re-parented to the * debugger as a result of attaching to it. Need to keep * track of them for parent to be able to collect the exit * status of what used to be children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU (-1) /* For when we aren't on a CPU. */ #define NOCPU_OLD (255) #define MAXCPU_OLD (254) #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) #define PROC_STATLOCK(p) mtx_lock_spin(&(p)->p_statmtx) #define PROC_STATUNLOCK(p) mtx_unlock_spin(&(p)->p_statmtx) #define PROC_STATLOCK_ASSERT(p, type) mtx_assert(&(p)->p_statmtx, (type)) #define PROC_ITIMLOCK(p) mtx_lock_spin(&(p)->p_itimmtx) #define PROC_ITIMUNLOCK(p) mtx_unlock_spin(&(p)->p_itimmtx) #define PROC_ITIMLOCK_ASSERT(p, type) mtx_assert(&(p)->p_itimmtx, (type)) #define PROC_PROFLOCK(p) mtx_lock_spin(&(p)->p_profmtx) #define PROC_PROFUNLOCK(p) mtx_unlock_spin(&(p)->p_profmtx) #define PROC_PROFLOCK_ASSERT(p, type) mtx_assert(&(p)->p_profmtx, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KPROC 0x00004 /* Kernel process. */ #define P_UNUSED3 0x00008 /* --available-- */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ #define P_WKILLED 0x08000 /* Killed, go to kernel/user boundary ASAP. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x800000 /* Process is using HWPMCs */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_TOTAL_STOP 0x2000000 /* Stopped in stop_all_proc. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STATCHILD 0x8000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_PPTRACE 0x80000000 /* PT_TRACEME by vforked child. 
*/ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) #define P_KILLED(p) ((p)->p_flag & P_WKILLED) /* These flags are kept in p_flag2. */ #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ #define P2_NOTRACE 0x00000002 /* No ptrace(2) attach or coredumps. */ #define P2_NOTRACE_EXEC 0x00000004 /* Keep P2_NOPTRACE on exec(2). */ #define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ #define P2_PTRACE_FSTP 0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */ #define P2_TRAPCAP 0x00000020 /* SIGTRAP on ENOTCAPABLE */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan list */ #define P_TREE_REAPER 0x00000004 /* Reaper of subtree */ /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Types and flags for mi_switch(). */ #define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ #define SWT_NONE 0 /* Unspecified switch. */ #define SWT_PREEMPT 1 /* Switching due to preemption. */ #define SWT_OWEPREEMPT 2 /* Switching due to owepreempt. */ #define SWT_TURNSTILE 3 /* Turnstile contention. */ #define SWT_SLEEPQ 4 /* Sleepq wait. */ #define SWT_SLEEPQTIMO 5 /* Sleepq timeout wait. */ #define SWT_RELINQUISH 6 /* yield call. */ #define SWT_NEEDRESCHED 7 /* NEEDRESCHED was set. */ #define SWT_IDLE 8 /* Switching from the idle thread. */ #define SWT_IWAIT 9 /* Waiting for interrupts. */ #define SWT_SUSPEND 10 /* Thread suspended. */ #define SWT_REMOTEPREEMPT 11 /* Remote processor preempted. */ #define SWT_REMOTEWAKEIDLE 12 /* Remote processor preempted idle. */ #define SWT_COUNT 13 /* Number of switch types. */ /* Flags */ #define SW_VOL 0x0100 /* Voluntary switch. */ #define SW_INVOL 0x0200 /* Involuntary switch. */ #define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 #define SINGLE_ALLPROC 3 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit * in a pid_t, as it is used to represent "no process group". 
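 *
 * Concretely: PID_MAX below is 99999 and NO_PID is 100000, both well
 * within a 32-bit pid_t.  Thread (lwp) IDs are allocated above this
 * range, which is how an interface such as pget() with PGET_NOTID can
 * tell a tid apart from a pid by comparing it against PID_MAX.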
*/ #define PID_MAX 99999 #define NO_PID 100000 extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define STOPEVENT(p, e, v) do { \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) { \ PROC_LOCK(p); \ stopevent((p), (e), (v)); \ PROC_UNLOCK(p); \ } \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, \ "checking stopevent %d", (e)); \ if ((p)->p_stops & (e)) \ stopevent((p), (e), (v)); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* * Non-zero p_lock ensures that: * - exit1() is not performed until p_lock reaches zero; * - the process' threads stack are not swapped out if they are currently * not (P_INMEM). * * PHOLD() asserts that the process (except the current process) is * not exiting, increments p_lock and swaps threads stacks into memory, * if needed. * _PHOLD() is same as PHOLD(), it takes the process locked. * _PHOLD_LITE() also takes the process locked, but comparing with * _PHOLD(), it only guarantees that exit1() is not executed, * faultin() is not called. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define _PHOLD_LITE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process %p not held", p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ PROC_ASSERT_HELD(p); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process %p held", p)); \ } while (0) #define PROC_UPDATE_COW(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (p)->p_cowgen++; \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) ((td)->td_flags & TDF_CANSWAP) /* Control whether or not it is safe for curthread to sleep. 
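 *
 * td_no_sleeping is a nesting counter rather than a boolean, so such
 * sections may nest.  An illustrative sketch (not part of this file):
 *
 *	THREAD_NO_SLEEPING();
 *	... code that must not sleep ...
 *	KASSERT(!THREAD_CAN_SLEEP(), ("sleeping re-enabled too early"));
 *	THREAD_SLEEPING_OK();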
*/ #define THREAD_NO_SLEEPING() ((curthread)->td_no_sleeping++) #define THREAD_SLEEPING_OK() ((curthread)->td_no_sleeping--) #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) extern LIST_HEAD(tidhashhead, thread) *tidhashtbl; extern u_long tidhash; extern struct rwlock tidhash_lock; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern int allproc_gen; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread0_storage thread0_st; /* Primary thread in proc0. */ #define thread0 (thread0_st.t0st_thread) extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_any(pid_t); /* Find (zombie) process by id. */ struct proc *pfind_locked(pid_t pid); struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ struct fork_req { int fr_flags; int fr_pages; int *fr_pidp; struct proc **fr_procp; int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; }; /* * pget() flags. */ #define PGET_HOLD 0x00001 /* Hold the process. */ #define PGET_CANSEE 0x00002 /* Check against p_cansee(). */ #define PGET_CANDEBUG 0x00004 /* Check against p_candebug(). */ #define PGET_ISCURRENT 0x00008 /* Check that the found process is current. */ #define PGET_NOTWEXIT 0x00010 /* Check that the process is not in P_WEXIT. */ #define PGET_NOTINEXEC 0x00020 /* Check that the process is not in P_INEXEC. */ #define PGET_NOTID 0x00040 /* Do not assume tid if pid > PID_MAX. 
*/ #define PGET_WANTREAD (PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT) int pget(pid_t pid, int flags, struct proc **pp); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansee(struct ucred *u1, struct ucred *u2); int cr_canseesocket(struct ucred *cred, struct socket *so); int cr_canseeothergids(struct ucred *u1, struct ucred *u2); int cr_canseeotheruids(struct ucred *u1, struct ucred *u2); int cr_canseejailproc(struct ucred *u1, struct ucred *u2); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, struct fork_req *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry, int *resident_count, bool *super); void kern_yield(int); void kick_proc0(void); void killjobc(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void maybe_yield(void); void mi_switch(int flags, struct thread *newtd); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); void procinit(void); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent); void proc_set_traced(struct proc *p, bool stop); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); void reaper_abandon_children(struct proc *p, bool exiting); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sess_hold(struct session *); void sess_release(struct session *); int setrunnable(struct thread *); void setsugid(struct proc *p); int should_yield(void); int sigonstack(size_t sp); void stopevent(struct proc *, u_int, u_int); struct thread *tdfind(lwpid_t, pid_t); void threadinit(void); void tidhash_add(struct thread *); void tidhash_remove(struct thread *); void cpu_idle(int); int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. 
*/ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_throw(struct thread *, struct thread *) __dead2; void unsleep(struct thread *); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; void cpu_copy_thread(struct thread *td, struct thread *td0); int cpu_fetch_syscall_args(struct thread *td); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *); void cpu_set_syscall_retval(struct thread *, int); void cpu_set_upcall(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); void thread_cow_update(struct thread *td); int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap(void); int thread_single(struct proc *p, int how); void thread_single_end(struct proc *p, int how); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); int thread_suspend_check(int how); bool thread_suspend_check_needed(void); void thread_suspend_switch(struct thread *, struct proc *p); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_wait(struct proc *p); struct thread *thread_find(struct proc *p, lwpid_t tid); void stop_all_proc(void); void resume_all_proc(void); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } static __inline __pure2 struct td_sched * td_get_sched(struct thread *td) { return ((struct td_sched *)&td[1]); } extern void (*softdep_ast_cleanup)(struct thread *); static __inline void td_softdep_cleanup(struct thread *td) { if (td->td_su != NULL && softdep_ast_cleanup != NULL) softdep_ast_cleanup(td); } #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: head/sys/sys/sx.h =================================================================== --- head/sys/sys/sx.h (revision 334023) +++ head/sys/sys/sx.h (revision 334024) @@ -1,302 +1,304 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2007 Attilio Rao * Copyright (c) 2001 Jason Evans * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_SX_H_ #define _SYS_SX_H_ #include #include #ifdef _KERNEL #include #include #include #include #endif /* * In general, the sx locks and rwlocks use very similar algorithms. * The main difference in the implementations is how threads are * blocked when a lock is unavailable. For this, sx locks use sleep * queues which do not support priority propagation, and rwlocks use * turnstiles which do. * * The sx_lock field consists of several fields. The low bit * indicates if the lock is locked with a shared or exclusive lock. A * value of 0 indicates an exclusive lock, and a value of 1 indicates * a shared lock. Bit 1 is a boolean indicating if there are any * threads waiting for a shared lock. Bit 2 is a boolean indicating * if there are any threads waiting for an exclusive lock. Bit 3 is a * boolean indicating if an exclusive lock is recursively held. Bit 4 * is a boolean indicating that a writer is currently spinning to * acquire the lock. The rest of the variable's definition is * dependent on the value of the first bit. For an exclusive lock, it * is a pointer to the thread holding the lock, similar to the * mtx_lock field of mutexes. For shared locks, it is a count of read * locks that are held. * * When the lock is not locked by any thread, it is encoded as a * shared lock with zero waiters. */ #define SX_LOCK_SHARED 0x01 #define SX_LOCK_SHARED_WAITERS 0x02 #define SX_LOCK_EXCLUSIVE_WAITERS 0x04 #define SX_LOCK_RECURSED 0x08 +#define SX_LOCK_WRITE_SPINNER 0x10 #define SX_LOCK_FLAGMASK \ (SX_LOCK_SHARED | SX_LOCK_SHARED_WAITERS | \ - SX_LOCK_EXCLUSIVE_WAITERS | SX_LOCK_RECURSED) + SX_LOCK_EXCLUSIVE_WAITERS | SX_LOCK_RECURSED | SX_LOCK_WRITE_SPINNER) +#define SX_LOCK_WAITERS (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS) #define SX_OWNER(x) ((x) & ~SX_LOCK_FLAGMASK) -#define SX_SHARERS_SHIFT 4 +#define SX_SHARERS_SHIFT 5 #define SX_SHARERS(x) (SX_OWNER(x) >> SX_SHARERS_SHIFT) #define SX_SHARERS_LOCK(x) \ ((x) << SX_SHARERS_SHIFT | SX_LOCK_SHARED) #define SX_ONE_SHARER (1 << SX_SHARERS_SHIFT) #define SX_LOCK_UNLOCKED SX_SHARERS_LOCK(0) #define SX_LOCK_DESTROYED \ (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS) #ifdef _KERNEL #define sx_recurse lock_object.lo_data #define SX_READ_VALUE(sx) ((sx)->sx_lock) #define lv_sx_owner(v) \ ((v & SX_LOCK_SHARED) ? NULL : (struct thread *)SX_OWNER(v)) /* * Function prototypes.
Routines that start with an underscore are not part * of the public interface and are wrapped with a macro. */ void sx_sysinit(void *arg); #define sx_init(sx, desc) sx_init_flags((sx), (desc), 0) void sx_init_flags(struct sx *sx, const char *description, int opts); void sx_destroy(struct sx *sx); int sx_try_slock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); int sx_try_slock_(struct sx *sx, const char *file, int line); int sx_try_xlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); int sx_try_xlock_(struct sx *sx, const char *file, int line); int sx_try_upgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); int sx_try_upgrade_(struct sx *sx, const char *file, int line); void sx_downgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); void sx_downgrade_(struct sx *sx, const char *file, int line); int _sx_slock_int(struct sx *sx, int opts LOCK_FILE_LINE_ARG_DEF); int _sx_slock(struct sx *sx, int opts, const char *file, int line); int _sx_xlock(struct sx *sx, int opts, const char *file, int line); void _sx_sunlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF); void _sx_sunlock(struct sx *sx, const char *file, int line); void _sx_xunlock(struct sx *sx, const char *file, int line); int _sx_xlock_hard(struct sx *sx, uintptr_t x, int opts LOCK_FILE_LINE_ARG_DEF); void _sx_xunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void _sx_assert(const struct sx *sx, int what, const char *file, int line); #endif #ifdef DDB int sx_chain(struct thread *td, struct thread **ownerp); #endif struct sx_args { struct sx *sa_sx; const char *sa_desc; int sa_flags; }; #define SX_SYSINIT_FLAGS(name, sxa, desc, flags) \ static struct sx_args name##_args = { \ (sxa), \ (desc), \ (flags) \ }; \ SYSINIT(name##_sx_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ sx_sysinit, &name##_args); \ SYSUNINIT(name##_sx_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ sx_destroy, (sxa)) #define SX_SYSINIT(name, sxa, desc) SX_SYSINIT_FLAGS(name, sxa, desc, 0) /* * Full lock operations that are suitable to be inlined in non-debug kernels. * If the lock can't be acquired or released trivially then the work is * deferred to 'tougher' functions. */ #if (LOCK_DEBUG == 0) /* Acquire an exclusive lock. */ static __inline int __sx_xlock(struct sx *sx, struct thread *td, int opts, const char *file, int line) { uintptr_t tid = (uintptr_t)td; uintptr_t v = SX_LOCK_UNLOCKED; int error = 0; if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__acquire) || !atomic_fcmpset_acq_ptr(&sx->sx_lock, &v, tid))) error = _sx_xlock_hard(sx, v, opts); return (error); } /* Release an exclusive lock. */ static __inline void __sx_xunlock(struct sx *sx, struct thread *td, const char *file, int line) { uintptr_t x = (uintptr_t)td; if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__release) || !atomic_fcmpset_rel_ptr(&sx->sx_lock, &x, SX_LOCK_UNLOCKED))) _sx_xunlock_hard(sx, x); } #endif /* * Public interface for lock operations.
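 *
 * An illustrative sketch of typical use (data_lock is a hypothetical
 * lock, not part of this file):
 *
 *	struct sx data_lock;
 *
 *	sx_init(&data_lock, "example data");
 *	sx_slock(&data_lock);		-- shared (read) access
 *	...
 *	sx_sunlock(&data_lock);
 *	sx_xlock(&data_lock);		-- exclusive (write) access
 *	...
 *	sx_xunlock(&data_lock);
 *	sx_destroy(&data_lock);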
*/ #ifndef LOCK_DEBUG #error "LOCK_DEBUG not defined, include before " #endif #if (LOCK_DEBUG > 0) || defined(SX_NOINLINE) #define sx_xlock_(sx, file, line) \ (void)_sx_xlock((sx), 0, (file), (line)) #define sx_xlock_sig_(sx, file, line) \ _sx_xlock((sx), SX_INTERRUPTIBLE, (file), (line)) #define sx_xunlock_(sx, file, line) \ _sx_xunlock((sx), (file), (line)) #else #define sx_xlock_(sx, file, line) \ (void)__sx_xlock((sx), curthread, 0, (file), (line)) #define sx_xlock_sig_(sx, file, line) \ __sx_xlock((sx), curthread, SX_INTERRUPTIBLE, (file), (line)) #define sx_xunlock_(sx, file, line) \ __sx_xunlock((sx), curthread, (file), (line)) #endif /* LOCK_DEBUG > 0 || SX_NOINLINE */ #if (LOCK_DEBUG > 0) #define sx_slock_(sx, file, line) \ (void)_sx_slock((sx), 0, (file), (line)) #define sx_slock_sig_(sx, file, line) \ _sx_slock((sx), SX_INTERRUPTIBLE, (file) , (line)) #define sx_sunlock_(sx, file, line) \ _sx_sunlock((sx), (file), (line)) #define sx_try_slock(sx) sx_try_slock_((sx), LOCK_FILE, LOCK_LINE) #define sx_try_xlock(sx) sx_try_xlock_((sx), LOCK_FILE, LOCK_LINE) #define sx_try_upgrade(sx) sx_try_upgrade_((sx), LOCK_FILE, LOCK_LINE) #define sx_downgrade(sx) sx_downgrade_((sx), LOCK_FILE, LOCK_LINE) #else #define sx_slock_(sx, file, line) \ (void)_sx_slock_int((sx), 0) #define sx_slock_sig_(sx, file, line) \ _sx_slock_int((sx), SX_INTERRUPTIBLE) #define sx_sunlock_(sx, file, line) \ _sx_sunlock_int((sx)) #define sx_try_slock(sx) sx_try_slock_int((sx)) #define sx_try_xlock(sx) sx_try_xlock_int((sx)) #define sx_try_upgrade(sx) sx_try_upgrade_int((sx)) #define sx_downgrade(sx) sx_downgrade_int((sx)) #endif #ifdef INVARIANTS #define sx_assert_(sx, what, file, line) \ _sx_assert((sx), (what), (file), (line)) #else #define sx_assert_(sx, what, file, line) (void)0 #endif #define sx_xlock(sx) sx_xlock_((sx), LOCK_FILE, LOCK_LINE) #define sx_xlock_sig(sx) sx_xlock_sig_((sx), LOCK_FILE, LOCK_LINE) #define sx_xunlock(sx) sx_xunlock_((sx), LOCK_FILE, LOCK_LINE) #define sx_slock(sx) sx_slock_((sx), LOCK_FILE, LOCK_LINE) #define sx_slock_sig(sx) sx_slock_sig_((sx), LOCK_FILE, LOCK_LINE) #define sx_sunlock(sx) sx_sunlock_((sx), LOCK_FILE, LOCK_LINE) #define sx_assert(sx, what) sx_assert_((sx), (what), __FILE__, __LINE__) /* * Return a pointer to the owning thread if the lock is exclusively * locked. */ #define sx_xholder(sx) \ ((sx)->sx_lock & SX_LOCK_SHARED ? NULL : \ (struct thread *)SX_OWNER((sx)->sx_lock)) #define sx_xlocked(sx) \ (((sx)->sx_lock & ~(SX_LOCK_FLAGMASK & ~SX_LOCK_SHARED)) == \ (uintptr_t)curthread) #define sx_unlock_(sx, file, line) do { \ if (sx_xlocked(sx)) \ sx_xunlock_(sx, file, line); \ else \ sx_sunlock_(sx, file, line); \ } while (0) #define sx_unlock(sx) sx_unlock_((sx), LOCK_FILE, LOCK_LINE) #define sx_sleep(chan, sx, pri, wmesg, timo) \ _sleep((chan), &(sx)->lock_object, (pri), (wmesg), \ tick_sbt * (timo), 0, C_HARDCLOCK) /* * Options passed to sx_init_flags(). */ #define SX_DUPOK 0x01 #define SX_NOPROFILE 0x02 #define SX_NOWITNESS 0x04 #define SX_QUIET 0x08 #define SX_NOADAPTIVE 0x10 #define SX_RECURSE 0x20 #define SX_NEW 0x40 /* * Options passed to sx_*lock_hard(). */ #define SX_INTERRUPTIBLE 0x40 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) #define SA_LOCKED LA_LOCKED #define SA_SLOCKED LA_SLOCKED #define SA_XLOCKED LA_XLOCKED #define SA_UNLOCKED LA_UNLOCKED #define SA_RECURSED LA_RECURSED #define SA_NOTRECURSED LA_NOTRECURSED /* Backwards compatibility. 
*/ #define SX_LOCKED LA_LOCKED #define SX_SLOCKED LA_SLOCKED #define SX_XLOCKED LA_XLOCKED #define SX_UNLOCKED LA_UNLOCKED #define SX_RECURSED LA_RECURSED #define SX_NOTRECURSED LA_NOTRECURSED #endif #endif /* _KERNEL */ #endif /* !_SYS_SX_H_ */
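/*
 * Illustrative sketch, not part of the headers above: a self-contained
 * userland simulation of the sx lock-word encoding after this change,
 * showing how the sharer count coexists with the flag bits, including
 * the new SX_LOCK_WRITE_SPINNER.  The macro values mirror sys/sx.h;
 * the program itself is hypothetical.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	SX_LOCK_SHARED			0x01
#define	SX_LOCK_SHARED_WAITERS		0x02
#define	SX_LOCK_EXCLUSIVE_WAITERS	0x04
#define	SX_LOCK_RECURSED		0x08
#define	SX_LOCK_WRITE_SPINNER		0x10
#define	SX_LOCK_FLAGMASK						\
	(SX_LOCK_SHARED | SX_LOCK_SHARED_WAITERS |			\
	SX_LOCK_EXCLUSIVE_WAITERS | SX_LOCK_RECURSED |			\
	SX_LOCK_WRITE_SPINNER)
#define	SX_OWNER(x)		((x) & ~(uintptr_t)SX_LOCK_FLAGMASK)
#define	SX_SHARERS_SHIFT	5
#define	SX_SHARERS(x)		(SX_OWNER(x) >> SX_SHARERS_SHIFT)
#define	SX_SHARERS_LOCK(x)	\
	((uintptr_t)(x) << SX_SHARERS_SHIFT | SX_LOCK_SHARED)
#define	SX_ONE_SHARER		(1 << SX_SHARERS_SHIFT)
#define	SX_LOCK_UNLOCKED	SX_SHARERS_LOCK(0)

int
main(void)
{
	uintptr_t v;

	/* An unlocked sx is encoded as a shared lock with zero sharers. */
	v = SX_LOCK_UNLOCKED;
	assert((v & SX_LOCK_SHARED) != 0 && SX_SHARERS(v) == 0);

	/* Each shared acquire adds SX_ONE_SHARER; take three. */
	v += 3 * SX_ONE_SHARER;
	printf("lock word %#lx encodes %lu sharers\n",
	    (unsigned long)v, (unsigned long)SX_SHARERS(v));

	/*
	 * A spinning writer sets SX_LOCK_WRITE_SPINNER; the sharer
	 * count is unaffected because all flag bits sit below
	 * SX_SHARERS_SHIFT.
	 */
	v |= SX_LOCK_WRITE_SPINNER;
	assert(SX_SHARERS(v) == 3);
	return (0);
}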