diff --git a/sys/kern/kern_condvar.c b/sys/kern/kern_condvar.c index 2731f581a29f..0d470aeafcd5 100644 --- a/sys/kern/kern_condvar.c +++ b/sys/kern/kern_condvar.c @@ -1,484 +1,484 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2000 Jake Burkholder . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #include #endif /* * A bound below which cv_waiters is valid. Once cv_waiters reaches this bound, * cv_signal must manually check the wait queue for threads. */ #define CV_WAITERS_BOUND INT_MAX #define CV_WAITERS_INC(cvp) do { \ if ((cvp)->cv_waiters < CV_WAITERS_BOUND) \ (cvp)->cv_waiters++; \ } while (0) /* * Common sanity checks for cv_wait* functions. */ #define CV_ASSERT(cvp, lock, td) do { \ KASSERT((td) != NULL, ("%s: td NULL", __func__)); \ KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__)); \ KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__)); \ KASSERT((lock) != NULL, ("%s: lock NULL", __func__)); \ } while (0) /* * Initialize a condition variable. Must be called before use. */ void cv_init(struct cv *cvp, const char *desc) { cvp->cv_description = desc; cvp->cv_waiters = 0; } /* * Destroy a condition variable. The condition variable must be re-initialized * in order to be re-used. */ void cv_destroy(struct cv *cvp) { #ifdef INVARIANTS struct sleepqueue *sq; sleepq_lock(cvp); sq = sleepq_lookup(cvp); sleepq_release(cvp); KASSERT(sq == NULL, ("%s: associated sleep queue non-empty", __func__)); #endif } /* * Wait on a condition variable. The current thread is placed on the condition * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same * condition variable will resume the thread. The mutex is released before * sleeping and will be held on return. It is recommended that the mutex be * held when cv_signal or cv_broadcast are called. */ void _cv_wait(struct cv *cvp, struct lock_object *lock) { WITNESS_SAVE_DECL(lock_witness); #ifdef KTRACE char wmesg[WMESGLEN + 1]; #endif struct lock_class *class; struct thread *td; uintptr_t lock_state; td = curthread; CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { strlcpy(wmesg, cv_wmesg(cvp), sizeof(wmesg)); ktrcsw(1, 0, wmesg); } else { wmesg[0] = '\0'; } #endif class = LOCK_CLASS(lock); lock_state = 0; sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } sleepq_wait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } } /* * Wait on a condition variable. This function differs from cv_wait by * not acquiring the mutex after condition variable was signaled. */ void _cv_wait_unlock(struct cv *cvp, struct lock_object *lock) { #ifdef KTRACE char wmesg[WMESGLEN + 1]; #endif struct lock_class *class; struct thread *td; td = curthread; CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); KASSERT(lock != &Giant.lock_object, ("cv_wait_unlock cannot be used with Giant")); class = LOCK_CLASS(lock); - if (SCHEDULER_STOPPED_TD(td)) { + if (SCHEDULER_STOPPED()) { class->lc_unlock(lock); return; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { strlcpy(wmesg, cv_wmesg(cvp), sizeof(wmesg)); ktrcsw(1, 0, wmesg); } else { wmesg[0] = '\0'; } #endif sleepq_lock(cvp); CV_WAITERS_INC(cvp); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); sleepq_wait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); } /* * Wait on a condition variable, allowing interruption by signals. Return 0 if * the thread was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if * a signal was caught. If ERESTART is returned the system call should be * restarted if possible. */ int _cv_wait_sig(struct cv *cvp, struct lock_object *lock) { WITNESS_SAVE_DECL(lock_witness); #ifdef KTRACE char wmesg[WMESGLEN + 1]; #endif struct lock_class *class; struct thread *td; uintptr_t lock_state; int rval; td = curthread; CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return (0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { strlcpy(wmesg, cv_wmesg(cvp), sizeof(wmesg)); ktrcsw(1, 0, wmesg); } else { wmesg[0] = '\0'; } #endif class = LOCK_CLASS(lock); lock_state = 0; sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_wait_sig(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Wait on a condition variable for (at most) the value specified in sbt * argument. Returns 0 if the process was resumed by cv_signal or cv_broadcast, * EWOULDBLOCK if the timeout expires. */ int _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt, sbintime_t pr, int flags) { WITNESS_SAVE_DECL(lock_witness); #ifdef KTRACE char wmesg[WMESGLEN + 1]; #endif struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return (0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { strlcpy(wmesg, cv_wmesg(cvp), sizeof(wmesg)); ktrcsw(1, 0, wmesg); } else { wmesg[0] = '\0'; } #endif class = LOCK_CLASS(lock); lock_state = 0; sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_timedwait(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Wait on a condition variable for (at most) the value specified in sbt * argument, allowing interruption by signals. * Returns 0 if the thread was resumed by cv_signal or cv_broadcast, * EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if a signal * was caught. */ int _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock, sbintime_t sbt, sbintime_t pr, int flags) { WITNESS_SAVE_DECL(lock_witness); #ifdef KTRACE char wmesg[WMESGLEN + 1]; #endif struct lock_class *class; struct thread *td; int lock_state, rval; td = curthread; CV_ASSERT(cvp, lock, td); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Waiting on \"%s\"", cvp->cv_description); - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return (0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { strlcpy(wmesg, cv_wmesg(cvp), sizeof(wmesg)); ktrcsw(1, 0, wmesg); } else { wmesg[0] = '\0'; } #endif class = LOCK_CLASS(lock); lock_state = 0; sleepq_lock(cvp); CV_WAITERS_INC(cvp); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR | SLEEPQ_INTERRUPTIBLE, 0); sleepq_set_timeout_sbt(cvp, sbt, pr, flags); if (lock != &Giant.lock_object) { if (class->lc_flags & LC_SLEEPABLE) sleepq_release(cvp); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); if (class->lc_flags & LC_SLEEPABLE) sleepq_lock(cvp); } rval = sleepq_timedwait_sig(cvp, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != &Giant.lock_object) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } /* * Signal a condition variable, wakes up one waiting thread. Will also wakeup * the swapper if the process is not in memory, so that it can bring the * sleeping process in. Note that this may also result in additional threads * being made runnable. Should be called with the same mutex as was passed to * cv_wait held. */ void cv_signal(struct cv *cvp) { if (cvp->cv_waiters == 0) return; sleepq_lock(cvp); if (cvp->cv_waiters == 0) { sleepq_release(cvp); return; } if (cvp->cv_waiters == CV_WAITERS_BOUND && sleepq_lookup(cvp) == NULL) { cvp->cv_waiters = 0; sleepq_release(cvp); } else { if (cvp->cv_waiters < CV_WAITERS_BOUND) cvp->cv_waiters--; if (sleepq_signal(cvp, SLEEPQ_CONDVAR | SLEEPQ_DROP, 0, 0)) kick_proc0(); } } /* * Broadcast a signal to a condition variable. Wakes up all waiting threads. * Should be called with the same mutex as was passed to cv_wait held. */ void cv_broadcastpri(struct cv *cvp, int pri) { int wakeup_swapper; if (cvp->cv_waiters == 0) return; /* * XXX sleepq_broadcast pri argument changed from -1 meaning * no pri to 0 meaning no pri. */ wakeup_swapper = 0; if (pri == -1) pri = 0; sleepq_lock(cvp); if (cvp->cv_waiters > 0) { cvp->cv_waiters = 0; wakeup_swapper = sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0); } sleepq_release(cvp); if (wakeup_swapper) kick_proc0(); } diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index 6071ac7fd6f1..92be72546b46 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -1,1356 +1,1356 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ */ /* * Machine independent bits of mutex implementation. */ #include #include "opt_adaptive_mutexes.h" #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES) #define ADAPTIVE_MUTEXES #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DEFINE( , , lock, failed); #endif /* * Return the mutex address when the lock cookie address is provided. * This functionality assumes that struct mtx* have a member named mtx_lock. */ #define mtxlock2mtx(c) (__containerof(c, struct mtx, mtx_lock)) /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED) static void assert_mtx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_mtx(const struct lock_object *lock); #endif static void lock_mtx(struct lock_object *lock, uintptr_t how); static void lock_spin(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_mtx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_mtx(struct lock_object *lock); static uintptr_t unlock_spin(struct lock_object *lock); /* * Lock classes for sleep and spin mutexes. */ struct lock_class lock_class_mtx_sleep = { .lc_name = "sleep mutex", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE, .lc_assert = assert_mtx, #ifdef DDB .lc_ddb_show = db_show_mtx, #endif .lc_lock = lock_mtx, .lc_unlock = unlock_mtx, #ifdef KDTRACE_HOOKS .lc_owner = owner_mtx, #endif }; struct lock_class lock_class_mtx_spin = { .lc_name = "spin mutex", .lc_flags = LC_SPINLOCK | LC_RECURSABLE, .lc_assert = assert_mtx, #ifdef DDB .lc_ddb_show = db_show_mtx, #endif .lc_lock = lock_spin, .lc_unlock = unlock_spin, #ifdef KDTRACE_HOOKS .lc_owner = owner_mtx, #endif }; #ifdef ADAPTIVE_MUTEXES #ifdef MUTEX_CUSTOM_BACKOFF static SYSCTL_NODE(_debug, OID_AUTO, mtx, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "mtx debugging"); static struct lock_delay_config __read_frequently mtx_delay; SYSCTL_U16(_debug_mtx, OID_AUTO, delay_base, CTLFLAG_RW, &mtx_delay.base, 0, ""); SYSCTL_U16(_debug_mtx, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(mtx_delay); #else #define mtx_delay locks_delay #endif #endif #ifdef MUTEX_SPIN_CUSTOM_BACKOFF static SYSCTL_NODE(_debug, OID_AUTO, mtx_spin, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "mtx spin debugging"); static struct lock_delay_config __read_frequently mtx_spin_delay; SYSCTL_INT(_debug_mtx_spin, OID_AUTO, delay_base, CTLFLAG_RW, &mtx_spin_delay.base, 0, ""); SYSCTL_INT(_debug_mtx_spin, OID_AUTO, delay_max, CTLFLAG_RW, &mtx_spin_delay.max, 0, ""); LOCK_DELAY_SYSINIT_DEFAULT(mtx_spin_delay); #else #define mtx_spin_delay locks_delay #endif /* * System-wide mutexes */ struct mtx blocked_lock; struct mtx __exclusive_cache_line Giant; static void _mtx_lock_indefinite_check(struct mtx *, struct lock_delay_arg *); void assert_mtx(const struct lock_object *lock, int what) { /* * Treat LA_LOCKED as if LA_XLOCKED was asserted. * * Some callers of lc_assert uses LA_LOCKED to indicate that either * a shared lock or write lock was held, while other callers uses * the more strict LA_XLOCKED (used as MA_OWNED). * * Mutex is the only lock class that can not be shared, as a result, * we can reasonably consider the caller really intends to assert * LA_XLOCKED when they are asserting LA_LOCKED on a mutex object. */ if (what & LA_LOCKED) { what &= ~LA_LOCKED; what |= LA_XLOCKED; } mtx_assert((const struct mtx *)lock, what); } void lock_mtx(struct lock_object *lock, uintptr_t how) { mtx_lock((struct mtx *)lock); } void lock_spin(struct lock_object *lock, uintptr_t how) { mtx_lock_spin((struct mtx *)lock); } uintptr_t unlock_mtx(struct lock_object *lock) { struct mtx *m; m = (struct mtx *)lock; mtx_assert(m, MA_OWNED | MA_NOTRECURSED); mtx_unlock(m); return (0); } uintptr_t unlock_spin(struct lock_object *lock) { struct mtx *m; m = (struct mtx *)lock; mtx_assert(m, MA_OWNED | MA_NOTRECURSED); mtx_unlock_spin(m); return (0); } #ifdef KDTRACE_HOOKS int owner_mtx(const struct lock_object *lock, struct thread **owner) { const struct mtx *m; uintptr_t x; m = (const struct mtx *)lock; x = m->mtx_lock; *owner = (struct thread *)(x & ~MTX_FLAGMASK); return (*owner != NULL); } #endif /* * Function versions of the inlined __mtx_* macros. These are used by * modules and can also be called from assembly language if needed. */ void __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; uintptr_t tid, v; m = mtxlock2mtx(c); KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_lock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; v = MTX_UNOWNED; if (!_mtx_obtain_lock_fetch(m, &v, tid)) _mtx_lock_sleep(m, v, opts, file, line); else LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, 0, 0, file, line); LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } void __mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_unlock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file, line); mtx_assert(m, MA_OWNED); #ifdef LOCK_PROFILING __mtx_unlock_sleep(c, (uintptr_t)curthread, opts, file, line); #else __mtx_unlock(m, curthread, opts, file, line); #endif TD_LOCKS_DEC(curthread); } void __mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; #ifdef SMP uintptr_t tid, v; #endif m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_lock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); if (mtx_owned(m)) KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0, ("mtx_lock_spin: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); opts &= ~MTX_RECURSE; WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); #ifdef SMP spinlock_enter(); tid = (uintptr_t)curthread; v = MTX_UNOWNED; if (!_mtx_obtain_lock_fetch(m, &v, tid)) _mtx_lock_spin(m, v, opts, file, line); else LOCKSTAT_PROFILE_OBTAIN_SPIN_LOCK_SUCCESS(spin__acquire, m, 0, 0, file, line); #else __mtx_lock_spin(m, curthread, opts, file, line); #endif LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); } int __mtx_trylock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; if (SCHEDULER_STOPPED()) return (1); m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_trylock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_trylock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); KASSERT((opts & MTX_RECURSE) == 0, ("mtx_trylock_spin: unsupp. opt MTX_RECURSE on mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); if (__mtx_trylock_spin(m, curthread, opts, file, line)) { LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 1, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); return (1); } LOCK_LOG_TRY("LOCK", &m->lock_object, opts, 0, file, line); return (0); } void __mtx_unlock_spin_flags(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; m = mtxlock2mtx(c); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("mtx_unlock_spin() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file, line); mtx_assert(m, MA_OWNED); __mtx_unlock_spin(m); } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' If this function is called on a mutex that * is already owned, it will recursively acquire the lock. */ int _mtx_trylock_flags_int(struct mtx *m, int opts LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, v; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int rval; bool recursed; td = curthread; tid = (uintptr_t)td; - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d", curthread, m->lock_object.lo_name, file, line)); KASSERT(m->mtx_lock != MTX_DESTROYED, ("mtx_trylock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); rval = 1; recursed = false; v = MTX_UNOWNED; for (;;) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; if (v == MTX_UNOWNED) continue; if (v == tid && ((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0)) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); recursed = true; break; } rval = 0; break; } opts &= ~MTX_RECURSE; LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line); if (rval) { WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); TD_LOCKS_INC(curthread); if (!recursed) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, contested, waittime, file, line); } return (rval); } int _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line) { struct mtx *m; m = mtxlock2mtx(c); return (_mtx_trylock_flags_int(m, opts LOCK_FILE_LINE_ARG)); } /* * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. */ #if LOCK_DEBUG > 0 void __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t v, int opts, const char *file, int line) #else void __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t v) #endif { struct thread *td; struct mtx *m; struct turnstile *ts; uintptr_t tid; struct thread *owner; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #if defined(ADAPTIVE_MUTEXES) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) int doing_lockprof = 0; #endif td = curthread; tid = (uintptr_t)td; m = mtxlock2mtx(c); #ifdef KDTRACE_HOOKS if (LOCKSTAT_PROFILE_ENABLED(adaptive__acquire)) { while (v == MTX_UNOWNED) { if (_mtx_obtain_lock_fetch(m, &v, tid)) goto out_lockstat; } doing_lockprof = 1; all_time -= lockstat_nsecs(&m->lock_object); } #endif #ifdef LOCK_PROFILING doing_lockprof = 1; #endif - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return; if (__predict_false(v == MTX_UNOWNED)) v = MTX_READ_VALUE(m); if (__predict_false(lv_mtx_owner(v) == td)) { KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 || (opts & MTX_RECURSE) != 0, ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); #if LOCK_DEBUG > 0 opts &= ~MTX_RECURSE; #endif m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } #if LOCK_DEBUG > 0 opts &= ~MTX_RECURSE; #endif #if defined(ADAPTIVE_MUTEXES) lock_delay_arg_init(&lda, &mtx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init_noadapt(&lda); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, false, &contested, &waittime); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR4(KTR_LOCK, "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d", m->lock_object.lo_name, (void *)m->mtx_lock, file, line); THREAD_CONTENDS_ON_LOCK(&m->lock_object); for (;;) { if (v == MTX_UNOWNED) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; continue; } #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_MUTEXES /* * If the owner is running on another CPU, spin until the * owner stops running or the state of the lock changes. */ owner = lv_mtx_owner(v); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&m->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, m, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "spinning", "lockname:\"%s\"", m->lock_object.lo_name); do { lock_delay(&lda); v = MTX_READ_VALUE(m); owner = lv_mtx_owner(v); } while (v != MTX_UNOWNED && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "running"); continue; } #endif ts = turnstile_trywait(&m->lock_object); v = MTX_READ_VALUE(m); retry_turnstile: /* * Check if the lock has been released while spinning for * the turnstile chain lock. */ if (v == MTX_UNOWNED) { turnstile_cancel(ts); continue; } #ifdef ADAPTIVE_MUTEXES /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ owner = lv_mtx_owner(v); if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } #endif /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. */ if ((v & MTX_CONTESTED) == 0 && !atomic_fcmpset_ptr(&m->mtx_lock, &v, v | MTX_CONTESTED)) { goto retry_turnstile; } /* * We definitely must sleep for this lock. */ mtx_assert(m, MA_NOTOWNED); /* * Block on the turnstile. */ #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&m->lock_object); #endif #ifndef ADAPTIVE_MUTEXES owner = mtx_owner(m); #endif MPASS(owner == mtx_owner(m)); turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&m->lock_object); sleep_cnt++; #endif v = MTX_READ_VALUE(m); } THREAD_CONTENTION_DONE(&m->lock_object); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&m->lock_object); if (sleep_time) LOCKSTAT_RECORD1(adaptive__block, m, sleep_time); /* * Only record the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD1(adaptive__spin, m, all_time - sleep_time); out_lockstat: #endif LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(adaptive__acquire, m, contested, waittime, file, line); } #ifdef SMP /* * _mtx_lock_spin_cookie: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. */ #if LOCK_DEBUG > 0 void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t v, int opts, const char *file, int line) #else void _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t v) #endif { struct mtx *m; struct lock_delay_arg lda; uintptr_t tid; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) int doing_lockprof = 0; #endif tid = (uintptr_t)curthread; m = mtxlock2mtx(c); #ifdef KDTRACE_HOOKS if (LOCKSTAT_PROFILE_ENABLED(adaptive__acquire)) { while (v == MTX_UNOWNED) { if (_mtx_obtain_lock_fetch(m, &v, tid)) goto out_lockstat; } doing_lockprof = 1; spin_time -= lockstat_nsecs(&m->lock_object); } #endif #ifdef LOCK_PROFILING doing_lockprof = 1; #endif if (__predict_false(v == MTX_UNOWNED)) v = MTX_READ_VALUE(m); if (__predict_false(v == tid)) { m->mtx_recurse++; return; } if (SCHEDULER_STOPPED()) return; if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "spinning", "lockname:\"%s\"", m->lock_object.lo_name); lock_delay_arg_init(&lda, &mtx_spin_delay); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&m->lock_object, true, &contested, &waittime); for (;;) { if (v == MTX_UNOWNED) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; continue; } /* Give interrupts a chance while we spin. */ spinlock_exit(); do { if (__predict_true(lda.spin_cnt < 10000000)) { lock_delay(&lda); } else { _mtx_lock_indefinite_check(m, &lda); } v = MTX_READ_VALUE(m); } while (v != MTX_UNOWNED); spinlock_enter(); } if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid), "running"); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); if (lda.spin_cnt != 0) LOCKSTAT_RECORD1(spin__spin, m, spin_time); out_lockstat: #endif LOCKSTAT_PROFILE_OBTAIN_SPIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); } #endif /* SMP */ #ifdef INVARIANTS static void thread_lock_validate(struct mtx *m, int opts, const char *file, int line) { KASSERT(m->mtx_lock != MTX_DESTROYED, ("thread_lock() of destroyed mutex @ %s:%d", file, line)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("thread_lock() of sleep mutex %s @ %s:%d", m->lock_object.lo_name, file, line)); KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) == 0, ("thread_lock: got a recursive mutex %s @ %s:%d\n", m->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); } #else #define thread_lock_validate(m, opts, file, line) do { } while (0) #endif #ifndef LOCK_PROFILING #if LOCK_DEBUG > 0 void _thread_lock(struct thread *td, int opts, const char *file, int line) #else void _thread_lock(struct thread *td) #endif { struct mtx *m; uintptr_t tid; tid = (uintptr_t)curthread; if (__predict_false(LOCKSTAT_PROFILE_ENABLED(spin__acquire))) goto slowpath_noirq; spinlock_enter(); m = td->td_lock; thread_lock_validate(m, 0, file, line); if (__predict_false(m == &blocked_lock)) goto slowpath_unlocked; if (__predict_false(!_mtx_obtain_lock(m, tid))) goto slowpath_unlocked; if (__predict_true(m == td->td_lock)) { WITNESS_LOCK(&m->lock_object, LOP_EXCLUSIVE, file, line); return; } _mtx_release_lock_quick(m); slowpath_unlocked: spinlock_exit(); slowpath_noirq: #if LOCK_DEBUG > 0 thread_lock_flags_(td, opts, file, line); #else thread_lock_flags_(td, 0, 0, 0); #endif } #endif void thread_lock_flags_(struct thread *td, int opts, const char *file, int line) { struct mtx *m; uintptr_t tid, v; struct lock_delay_arg lda; #ifdef LOCK_PROFILING int contested = 0; uint64_t waittime = 0; #endif #ifdef KDTRACE_HOOKS int64_t spin_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) int doing_lockprof = 1; #endif tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) { /* * Ensure that spinlock sections are balanced even when the * scheduler is stopped, since we may otherwise inadvertently * re-enable interrupts while dumping core. */ spinlock_enter(); return; } lock_delay_arg_init(&lda, &mtx_spin_delay); #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif #ifdef LOCK_PROFILING doing_lockprof = 1; #elif defined(KDTRACE_HOOKS) doing_lockprof = lockstat_enabled; #endif #ifdef KDTRACE_HOOKS if (__predict_false(doing_lockprof)) spin_time -= lockstat_nsecs(&td->td_lock->lock_object); #endif spinlock_enter(); for (;;) { retry: m = td->td_lock; thread_lock_validate(m, opts, file, line); v = MTX_READ_VALUE(m); for (;;) { if (v == MTX_UNOWNED) { if (_mtx_obtain_lock_fetch(m, &v, tid)) break; continue; } MPASS(v != tid); lock_profile_obtain_lock_failed(&m->lock_object, true, &contested, &waittime); /* Give interrupts a chance while we spin. */ spinlock_exit(); do { if (__predict_true(lda.spin_cnt < 10000000)) { lock_delay(&lda); } else { _mtx_lock_indefinite_check(m, &lda); } if (m != td->td_lock) { spinlock_enter(); goto retry; } v = MTX_READ_VALUE(m); } while (v != MTX_UNOWNED); spinlock_enter(); } if (m == td->td_lock) break; _mtx_release_lock_quick(m); } LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file, line); WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS spin_time += lockstat_nsecs(&m->lock_object); #endif LOCKSTAT_PROFILE_OBTAIN_SPIN_LOCK_SUCCESS(spin__acquire, m, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (lda.spin_cnt != 0) LOCKSTAT_RECORD1(thread__spin, m, spin_time); #endif } struct mtx * thread_lock_block(struct thread *td) { struct mtx *lock; lock = td->td_lock; mtx_assert(lock, MA_OWNED); td->td_lock = &blocked_lock; return (lock); } void thread_lock_unblock(struct thread *td, struct mtx *new) { mtx_assert(new, MA_OWNED); KASSERT(td->td_lock == &blocked_lock, ("thread %p lock %p not blocked_lock %p", td, td->td_lock, &blocked_lock)); atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new); } void thread_lock_block_wait(struct thread *td) { while (td->td_lock == &blocked_lock) cpu_spinwait(); /* Acquire fence to be certain that all thread state is visible. */ atomic_thread_fence_acq(); } void thread_lock_set(struct thread *td, struct mtx *new) { struct mtx *lock; mtx_assert(new, MA_OWNED); lock = td->td_lock; mtx_assert(lock, MA_OWNED); td->td_lock = new; mtx_unlock_spin(lock); } /* * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed, contested (i.e. we * need to wake up a blocked thread) or lockstat probe is active. */ #if LOCK_DEBUG > 0 void __mtx_unlock_sleep(volatile uintptr_t *c, uintptr_t v, int opts, const char *file, int line) #else void __mtx_unlock_sleep(volatile uintptr_t *c, uintptr_t v) #endif { struct mtx *m; struct turnstile *ts; uintptr_t tid; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; m = mtxlock2mtx(c); if (__predict_false(v == tid)) v = MTX_READ_VALUE(m); if (__predict_false(v & MTX_RECURSED)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } LOCKSTAT_PROFILE_RELEASE_LOCK(adaptive__release, m); if (v == tid && _mtx_release_lock(m, tid)) return; /* * We have to lock the chain before the turnstile so this turnstile * can be removed from the hash list if it is empty. */ turnstile_chain_lock(&m->lock_object); _mtx_release_lock_quick(m); ts = turnstile_lookup(&m->lock_object); MPASS(ts != NULL); if (LOCK_LOG_TEST(&m->lock_object, opts)) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE); /* * This turnstile is now no longer associated with the mutex. We can * unlock the chain lock so a new turnstile may take it's place. */ turnstile_unpend(ts); turnstile_chain_unlock(&m->lock_object); } /* * All the unlocking of MTX_SPIN locks is done inline. * See the __mtx_unlock_spin() macro for the details. */ /* * The backing function for the INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANT_SUPPORT void __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct mtx *m; if (KERNEL_PANICKED() || dumping || SCHEDULER_STOPPED()) return; m = mtxlock2mtx(c); switch (what) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned(m)) panic("mutex %s not owned at %s:%d", m->lock_object.lo_name, file, line); if (mtx_recursed(m)) { if ((what & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", m->lock_object.lo_name, file, line); } else if ((what & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", m->lock_object.lo_name, file, line); } break; case MA_NOTOWNED: if (mtx_owned(m)) panic("mutex %s owned at %s:%d", m->lock_object.lo_name, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * General init routine used by the MTX_SYSINIT() macro. */ void mtx_sysinit(void *arg) { struct mtx_args *margs = arg; mtx_init((struct mtx *)margs->ma_mtx, margs->ma_desc, NULL, margs->ma_opts); } /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and name `name.' The optional * lock type `type' is used as a general lock category name for use with * witness. */ void _mtx_init(volatile uintptr_t *c, const char *name, const char *type, int opts) { struct mtx *m; struct lock_class *class; int flags; m = mtxlock2mtx(c); MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE | MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE | MTX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(m->mtx_lock, ("%s: mtx_lock not aligned for %s: %p", __func__, name, &m->mtx_lock)); /* Determine lock class and lock flags. */ if (opts & MTX_SPIN) class = &lock_class_mtx_spin; else class = &lock_class_mtx_sleep; flags = 0; if (opts & MTX_QUIET) flags |= LO_QUIET; if (opts & MTX_RECURSE) flags |= LO_RECURSABLE; if ((opts & MTX_NOWITNESS) == 0) flags |= LO_WITNESS; if (opts & MTX_DUPOK) flags |= LO_DUPOK; if (opts & MTX_NOPROFILE) flags |= LO_NOPROFILE; if (opts & MTX_NEW) flags |= LO_NEW; /* Initialize mutex. */ lock_init(&m->lock_object, class, name, type, flags); m->mtx_lock = MTX_UNOWNED; m->mtx_recurse = 0; } /* * Remove lock `m' from all_mtx queue. We don't allow MTX_QUIET to be * passed in as a flag here because if the corresponding mtx_init() was * called with MTX_QUIET set, then it will already be set in the mutex's * flags. */ void _mtx_destroy(volatile uintptr_t *c) { struct mtx *m; m = mtxlock2mtx(c); if (!mtx_owned(m)) MPASS(mtx_unowned(m)); else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); /* Perform the non-mtx related part of mtx_unlock_spin(). */ if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin) { lock_profile_release_lock(&m->lock_object, true); spinlock_exit(); } else { TD_LOCKS_DEC(curthread); lock_profile_release_lock(&m->lock_object, false); } /* Tell witness this isn't locked to make it happy. */ WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__, __LINE__); } m->mtx_lock = MTX_DESTROYED; lock_destroy(&m->lock_object); } /* * Intialize the mutex code and system mutexes. This is called from the MD * startup code prior to mi_startup(). The per-CPU data space needs to be * setup before this is called. */ void mutex_init(void) { /* Setup turnstiles so that sleep mutexes work. */ init_turnstiles(); /* * Initialize mutexes. */ mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN); blocked_lock.mtx_lock = 0xdeadc0de; /* Always blocked. */ mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK); mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN); mtx_init(&proc0.p_statmtx, "pstatl", NULL, MTX_SPIN); mtx_init(&proc0.p_itimmtx, "pitiml", NULL, MTX_SPIN); mtx_init(&proc0.p_profmtx, "pprofl", NULL, MTX_SPIN); mtx_init(&devmtx, "cdev", NULL, MTX_DEF); mtx_lock(&Giant); } static void __noinline _mtx_lock_indefinite_check(struct mtx *m, struct lock_delay_arg *ldap) { struct thread *td; ldap->spin_cnt++; if (ldap->spin_cnt < 60000000 || kdb_active || KERNEL_PANICKED()) cpu_lock_delay(); else { td = mtx_owner(m); /* If the mutex is unlocked, try again. */ if (td == NULL) return; printf( "spin lock %p (%s) held by %p (tid %d) too long\n", m, m->lock_object.lo_name, td, td->td_tid); #ifdef WITNESS witness_display_spinlock(&m->lock_object, td, printf); #endif panic("spin lock held too long"); } cpu_spinwait(); } void mtx_spin_wait_unlocked(struct mtx *m) { struct lock_delay_arg lda; KASSERT(m->mtx_lock != MTX_DESTROYED, ("%s() of destroyed mutex %p", __func__, m)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin, ("%s() of sleep mutex %p (%s)", __func__, m, m->lock_object.lo_name)); KASSERT(!mtx_owned(m), ("%s() waiting on myself on lock %p (%s)", __func__, m, m->lock_object.lo_name)); lda.spin_cnt = 0; while (atomic_load_acq_ptr(&m->mtx_lock) != MTX_UNOWNED) { if (__predict_true(lda.spin_cnt < 10000000)) { cpu_spinwait(); lda.spin_cnt++; } else { _mtx_lock_indefinite_check(m, &lda); } } } void mtx_wait_unlocked(struct mtx *m) { struct thread *owner; uintptr_t v; KASSERT(m->mtx_lock != MTX_DESTROYED, ("%s() of destroyed mutex %p", __func__, m)); KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep, ("%s() not a sleep mutex %p (%s)", __func__, m, m->lock_object.lo_name)); KASSERT(!mtx_owned(m), ("%s() waiting on myself on lock %p (%s)", __func__, m, m->lock_object.lo_name)); for (;;) { v = atomic_load_acq_ptr(&m->mtx_lock); if (v == MTX_UNOWNED) { break; } owner = lv_mtx_owner(v); if (!TD_IS_RUNNING(owner)) { mtx_lock(m); mtx_unlock(m); break; } cpu_spinwait(); } } #ifdef DDB void db_show_mtx(const struct lock_object *lock) { struct thread *td; const struct mtx *m; m = (const struct mtx *)lock; db_printf(" flags: {"); if (LOCK_CLASS(lock) == &lock_class_mtx_spin) db_printf("SPIN"); else db_printf("DEF"); if (m->lock_object.lo_flags & LO_RECURSABLE) db_printf(", RECURSE"); if (m->lock_object.lo_flags & LO_DUPOK) db_printf(", DUPOK"); db_printf("}\n"); db_printf(" state: {"); if (mtx_unowned(m)) db_printf("UNOWNED"); else if (mtx_destroyed(m)) db_printf("DESTROYED"); else { db_printf("OWNED"); if (m->mtx_lock & MTX_CONTESTED) db_printf(", CONTESTED"); if (m->mtx_lock & MTX_RECURSED) db_printf(", RECURSED"); } db_printf("}\n"); if (!mtx_unowned(m) && !mtx_destroyed(m)) { td = mtx_owner(m); db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (mtx_recursed(m)) db_printf(" recursed: %d\n", m->mtx_recurse); } } #endif diff --git a/sys/kern/kern_rwlock.c b/sys/kern/kern_rwlock.c index 83d5862a6667..28dddb950966 100644 --- a/sys/kern/kern_rwlock.c +++ b/sys/kern/kern_rwlock.c @@ -1,1568 +1,1568 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2006 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Machine independent bits of reader/writer lock implementation. */ #include #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_rwlocks.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS) #define ADAPTIVE_RWLOCKS #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* * Return the rwlock address when the lock cookie address is provided. * This functionality assumes that struct rwlock* have a member named rw_lock. */ #define rwlock2rw(c) (__containerof(c, struct rwlock, rw_lock)) #ifdef DDB #include static void db_show_rwlock(const struct lock_object *lock); #endif static void assert_rw(const struct lock_object *lock, int what); static void lock_rw(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_rw(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_rw(struct lock_object *lock); struct lock_class lock_class_rw = { .lc_name = "rw", .lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_rw, #ifdef DDB .lc_ddb_show = db_show_rwlock, #endif .lc_lock = lock_rw, .lc_unlock = unlock_rw, #ifdef KDTRACE_HOOKS .lc_owner = owner_rw, #endif }; #ifdef ADAPTIVE_RWLOCKS #ifdef RWLOCK_CUSTOM_BACKOFF static u_short __read_frequently rowner_retries; static u_short __read_frequently rowner_loops; static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "rwlock debugging"); SYSCTL_U16(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, ""); SYSCTL_U16(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, ""); static struct lock_delay_config __read_frequently rw_delay; SYSCTL_U16(_debug_rwlock, OID_AUTO, delay_base, CTLFLAG_RW, &rw_delay.base, 0, ""); SYSCTL_U16(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max, 0, ""); static void rw_lock_delay_init(void *arg __unused) { lock_delay_default_init(&rw_delay); rowner_retries = 10; rowner_loops = max(10000, rw_delay.max); } LOCK_DELAY_SYSINIT(rw_lock_delay_init); #else #define rw_delay locks_delay #define rowner_retries locks_delay_retries #define rowner_loops locks_delay_loops #endif #endif /* * Return a pointer to the owning thread if the lock is write-locked or * NULL if the lock is unlocked or read-locked. */ #define lv_rw_wowner(v) \ ((v) & RW_LOCK_READ ? NULL : \ (struct thread *)RW_OWNER((v))) #define rw_wowner(rw) lv_rw_wowner(RW_READ_VALUE(rw)) /* * Returns if a write owner is recursed. Write ownership is not assured * here and should be previously checked. */ #define rw_recursed(rw) ((rw)->rw_recurse != 0) /* * Return true if curthread helds the lock. */ #define rw_wlocked(rw) (rw_wowner((rw)) == curthread) /* * Return a pointer to the owning thread for this lock who should receive * any priority lent by threads that block on this lock. Currently this * is identical to rw_wowner(). */ #define rw_owner(rw) rw_wowner(rw) #ifndef INVARIANTS #define __rw_assert(c, what, file, line) #endif void assert_rw(const struct lock_object *lock, int what) { rw_assert((const struct rwlock *)lock, what); } void lock_rw(struct lock_object *lock, uintptr_t how) { struct rwlock *rw; rw = (struct rwlock *)lock; if (how) rw_rlock(rw); else rw_wlock(rw); } uintptr_t unlock_rw(struct lock_object *lock) { struct rwlock *rw; rw = (struct rwlock *)lock; rw_assert(rw, RA_LOCKED | LA_NOTRECURSED); if (rw->rw_lock & RW_LOCK_READ) { rw_runlock(rw); return (1); } else { rw_wunlock(rw); return (0); } } #ifdef KDTRACE_HOOKS int owner_rw(const struct lock_object *lock, struct thread **owner) { const struct rwlock *rw = (const struct rwlock *)lock; uintptr_t x = rw->rw_lock; *owner = rw_wowner(rw); return ((x & RW_LOCK_READ) != 0 ? (RW_READERS(x) != 0) : (*owner != NULL)); } #endif void _rw_init_flags(volatile uintptr_t *c, const char *name, int opts) { struct rwlock *rw; int flags; rw = rwlock2rw(c); MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET | RW_RECURSE | RW_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock, ("%s: rw_lock not aligned for %s: %p", __func__, name, &rw->rw_lock)); flags = LO_UPGRADABLE; if (opts & RW_DUPOK) flags |= LO_DUPOK; if (opts & RW_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & RW_NOWITNESS)) flags |= LO_WITNESS; if (opts & RW_RECURSE) flags |= LO_RECURSABLE; if (opts & RW_QUIET) flags |= LO_QUIET; if (opts & RW_NEW) flags |= LO_NEW; lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags); rw->rw_lock = RW_UNLOCKED; rw->rw_recurse = 0; } void _rw_destroy(volatile uintptr_t *c) { struct rwlock *rw; rw = rwlock2rw(c); KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw)); KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw)); rw->rw_lock = RW_DESTROYED; lock_destroy(&rw->lock_object); } void rw_sysinit(void *arg) { struct rw_args *args; args = arg; rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc, args->ra_flags); } int _rw_wowned(const volatile uintptr_t *c) { return (rw_wowner(rwlock2rw(c)) == curthread); } void _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; uintptr_t tid, v; rw = rwlock2rw(c); KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_wlock() of destroyed rwlock @ %s:%d", file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; v = RW_UNLOCKED; if (!_rw_write_lock_fetch(rw, &v, tid)) _rw_wlock_hard(rw, v, file, line); else LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_WRITER); LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line); WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } int __rw_try_wlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, v; int rval; bool recursed; td = curthread; tid = (uintptr_t)td; - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line)); rval = 1; recursed = false; v = RW_UNLOCKED; for (;;) { if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid)) break; if (v == RW_UNLOCKED) continue; if (v == tid && (rw->lock_object.lo_flags & LO_RECURSABLE)) { rw->rw_recurse++; atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED); break; } rval = 0; break; } LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!recursed) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } int __rw_try_wlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); return (__rw_try_wlock_int(rw LOCK_FILE_LINE_ARG)); } void _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(c, RA_WLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line); #ifdef LOCK_PROFILING _rw_wunlock_hard(rw, (uintptr_t)curthread, file, line); #else __rw_wunlock(rw, curthread, file, line); #endif TD_LOCKS_DEC(curthread); } /* * Determines whether a new reader can acquire a lock. Succeeds if the * reader already owns a read lock and the lock is locked for read to * prevent deadlock from reader recursion. Also succeeds if the lock * is unlocked and has no writer waiters or spinners. Failing otherwise * prioritizes writers before readers. */ static bool __always_inline __rw_can_read(struct thread *td, uintptr_t v, bool fp) { if ((v & (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) == RW_LOCK_READ) return (true); if (!fp && td->td_rw_rlocks && (v & RW_LOCK_READ)) return (true); return (false); } static bool __always_inline __rw_rlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp, bool fp LOCK_FILE_LINE_ARG_DEF) { /* * Handle the easy case. If no other thread has a write * lock, then try to bump up the count of read locks. Note * that we have to preserve the current state of the * RW_LOCK_WRITE_WAITERS flag. If we fail to acquire a * read lock, then rw_lock must have changed, so restart * the loop. Note that this handles the case of a * completely unlocked rwlock since such a lock is encoded * as a read lock with no waiters. */ while (__rw_can_read(td, *vp, fp)) { if (atomic_fcmpset_acq_ptr(&rw->rw_lock, vp, *vp + RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, rw, (void *)*vp, (void *)(*vp + RW_ONE_READER)); td->td_rw_rlocks++; return (true); } } return (false); } static void __noinline __rw_rlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { struct turnstile *ts; struct thread *owner; #ifdef ADAPTIVE_RWLOCKS int spintries = 0; int i, n; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state = 0; int doing_lockprof = 0; #endif #ifdef KDTRACE_HOOKS if (LOCKSTAT_PROFILE_ENABLED(rw__acquire)) { if (__rw_rlock_try(rw, td, &v, false LOCK_FILE_LINE_ARG)) goto out_lockstat; doing_lockprof = 1; all_time -= lockstat_nsecs(&rw->lock_object); state = v; } #endif #ifdef LOCK_PROFILING doing_lockprof = 1; state = v; #endif if (SCHEDULER_STOPPED()) return; #if defined(ADAPTIVE_RWLOCKS) lock_delay_arg_init(&lda, &rw_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init_noadapt(&lda); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, false, &contested, &waittime); THREAD_CONTENDS_ON_LOCK(&rw->lock_object); for (;;) { if (__rw_rlock_try(rw, td, &v, false LOCK_FILE_LINE_ARG)) break; #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_RWLOCKS /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if ((v & RW_LOCK_READ) == 0) { owner = (struct thread *)RW_OWNER(v); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); do { lock_delay(&lda); v = RW_READ_VALUE(rw); owner = lv_rw_wowner(v); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else { if ((v & RW_LOCK_WRITE_SPINNER) && RW_READERS(v) == 0) { MPASS(!__rw_can_read(td, v, false)); lock_delay_spin(2); v = RW_READ_VALUE(rw); continue; } if (spintries < rowner_retries) { spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); n = RW_READERS(v); for (i = 0; i < rowner_loops; i += n) { lock_delay_spin(n); v = RW_READ_VALUE(rw); if (!(v & RW_LOCK_READ)) break; n = RW_READERS(v); if (n == 0) break; if (__rw_can_read(td, v, false)) break; } #ifdef KDTRACE_HOOKS lda.spin_cnt += rowner_loops - i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i < rowner_loops) continue; } } #endif /* * Okay, now it's the hard case. Some other thread already * has a write lock or there are write waiters present, * acquire the turnstile lock so we can begin the process * of blocking. */ ts = turnstile_trywait(&rw->lock_object); /* * The lock might have been released while we spun, so * recheck its state and restart the loop if needed. */ v = RW_READ_VALUE(rw); retry_ts: if (((v & RW_LOCK_WRITE_SPINNER) && RW_READERS(v) == 0) || __rw_can_read(td, v, false)) { turnstile_cancel(ts); continue; } owner = lv_rw_wowner(v); #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if (owner != NULL) { if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); continue; } } #endif /* * The lock is held in write mode or it already has waiters. */ MPASS(!__rw_can_read(td, v, false)); /* * If the RW_LOCK_READ_WAITERS flag is already set, then * we can go ahead and block. If it is not set then try * to set it. If we fail to set it drop the turnstile * lock and restart the loop. */ if (!(v & RW_LOCK_READ_WAITERS)) { if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, v | RW_LOCK_READ_WAITERS)) goto retry_ts; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set read waiters flag", __func__, rw); } /* * We were unable to acquire the lock and the read waiters * flag is set, so we must block on the turnstile. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif MPASS(owner == rw_owner(rw)); turnstile_wait(ts, owner, TS_SHARED_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); v = RW_READ_VALUE(rw); } THREAD_CONTENTION_DONE(&rw->lock_object); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_READER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); out_lockstat: #endif /* * TODO: acquire "owner of record" here. Here be turnstile dragons * however. turnstiles don't like owners changing between calls to * turnstile_wait() currently. */ LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_READER); } void __rw_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t v; td = curthread; - KASSERT(kdb_active != 0 || SCHEDULER_STOPPED_TD(td) || + KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(td), ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d", td, rw->lock_object.lo_name, file, line)); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_rlock() of destroyed rwlock @ %s:%d", file, line)); KASSERT(rw_wowner(rw) != td, ("rw_rlock: wlock already held for %s @ %s:%d", rw->lock_object.lo_name, file, line)); WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL); v = RW_READ_VALUE(rw); if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__acquire) || !__rw_rlock_try(rw, td, &v, true LOCK_FILE_LINE_ARG))) __rw_rlock_hard(rw, td, v LOCK_FILE_LINE_ARG); else lock_profile_obtain_lock_success(&rw->lock_object, false, 0, 0, file, line); LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line); WITNESS_LOCK(&rw->lock_object, 0, file, line); TD_LOCKS_INC(curthread); } void __rw_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); __rw_rlock_int(rw LOCK_FILE_LINE_ARG); } int __rw_try_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d", curthread, rw->lock_object.lo_name, file, line)); x = rw->rw_lock; for (;;) { KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line)); if (!(x & RW_LOCK_READ)) break; if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &x, x + RW_ONE_READER)) { LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file, line); WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); curthread->td_rw_rlocks++; return (1); } } LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line); return (0); } int __rw_try_rlock(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); return (__rw_try_rlock_int(rw LOCK_FILE_LINE_ARG)); } static bool __always_inline __rw_runlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp) { for (;;) { if (RW_READERS(*vp) > 1 || !(*vp & RW_LOCK_WAITERS)) { if (atomic_fcmpset_rel_ptr(&rw->rw_lock, vp, *vp - RW_ONE_READER)) { if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, rw, (void *)*vp, (void *)(*vp - RW_ONE_READER)); td->td_rw_rlocks--; return (true); } continue; } break; } return (false); } static void __noinline __rw_runlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { struct turnstile *ts; uintptr_t setv, queue; if (SCHEDULER_STOPPED()) return; if (__rw_runlock_try(rw, td, &v)) goto out_lockstat; /* * Ok, we know we have waiters and we think we are the * last reader, so grab the turnstile lock. */ turnstile_chain_lock(&rw->lock_object); v = RW_READ_VALUE(rw); for (;;) { if (__rw_runlock_try(rw, td, &v)) break; MPASS(v & RW_LOCK_WAITERS); /* * Try to drop our lock leaving the lock in a unlocked * state. * * If you wanted to do explicit lock handoff you'd have to * do it here. You'd also want to use turnstile_signal() * and you'd have to handle the race where a higher * priority thread blocks on the write lock before the * thread you wakeup actually runs and have the new thread * "steal" the lock. For now it's a lot simpler to just * wakeup all of the waiters. * * As above, if we fail, then another thread might have * acquired a read lock, so drop the turnstile lock and * restart. */ setv = RW_UNLOCKED; queue = TS_SHARED_QUEUE; if (v & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; setv |= (v & RW_LOCK_READ_WAITERS); } setv |= (v & RW_LOCK_WRITE_SPINNER); if (!atomic_fcmpset_rel_ptr(&rw->rw_lock, &v, setv)) continue; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded with waiters", __func__, rw); /* * Ok. The lock is released and all that's left is to * wake up the waiters. Note that the lock might not be * free anymore, but in that case the writers will just * block again if they run before the new lock holder(s) * release the lock. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); turnstile_broadcast(ts, queue); turnstile_unpend(ts); td->td_rw_rlocks--; break; } turnstile_chain_unlock(&rw->lock_object); out_lockstat: LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_READER); } void _rw_runlock_cookie_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t v; KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_runlock() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(&rw->rw_lock, RA_RLOCKED, file, line); WITNESS_UNLOCK(&rw->lock_object, 0, file, line); LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line); td = curthread; v = RW_READ_VALUE(rw); if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__release) || !__rw_runlock_try(rw, td, &v))) __rw_runlock_hard(rw, td, v LOCK_FILE_LINE_ARG); else lock_profile_release_lock(&rw->lock_object, false); TD_LOCKS_DEC(curthread); } void _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); _rw_runlock_cookie_int(rw LOCK_FILE_LINE_ARG); } #ifdef ADAPTIVE_RWLOCKS static inline void rw_drop_critical(uintptr_t v, bool *in_critical, int *extra_work) { if (v & RW_LOCK_WRITE_SPINNER) return; if (*in_critical) { critical_exit(); *in_critical = false; (*extra_work)--; } } #else #define rw_drop_critical(v, in_critical, extra_work) do { } while (0) #endif /* * This function is called when we are unable to obtain a write lock on the * first try. This means that at least one other thread holds either a * read or write lock. */ void __rw_wlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { uintptr_t tid; struct rwlock *rw; struct turnstile *ts; struct thread *owner; #ifdef ADAPTIVE_RWLOCKS int spintries = 0; int i, n; enum { READERS, WRITER } sleep_reason = READERS; bool in_critical = false; #endif uintptr_t setv; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif #if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state = 0; int doing_lockprof = 0; #endif int extra_work = 0; tid = (uintptr_t)curthread; rw = rwlock2rw(c); #ifdef KDTRACE_HOOKS if (LOCKSTAT_PROFILE_ENABLED(rw__acquire)) { while (v == RW_UNLOCKED) { if (_rw_write_lock_fetch(rw, &v, tid)) goto out_lockstat; } extra_work = 1; doing_lockprof = 1; all_time -= lockstat_nsecs(&rw->lock_object); state = v; } #endif #ifdef LOCK_PROFILING extra_work = 1; doing_lockprof = 1; state = v; #endif if (SCHEDULER_STOPPED()) return; if (__predict_false(v == RW_UNLOCKED)) v = RW_READ_VALUE(rw); if (__predict_false(lv_rw_wowner(v) == (struct thread *)tid)) { KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE, ("%s: recursing but non-recursive rw %s @ %s:%d\n", __func__, rw->lock_object.lo_name, file, line)); rw->rw_recurse++; atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw); return; } if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, rw->lock_object.lo_name, (void *)rw->rw_lock, file, line); #if defined(ADAPTIVE_RWLOCKS) lock_delay_arg_init(&lda, &rw_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init_noadapt(&lda); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&rw->lock_object, false, &contested, &waittime); THREAD_CONTENDS_ON_LOCK(&rw->lock_object); for (;;) { if (v == RW_UNLOCKED) { if (_rw_write_lock_fetch(rw, &v, tid)) break; continue; } #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_RWLOCKS if (v == (RW_LOCK_READ | RW_LOCK_WRITE_SPINNER)) { if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid)) break; continue; } /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. */ if (!(v & RW_LOCK_READ)) { rw_drop_critical(v, &in_critical, &extra_work); sleep_reason = WRITER; owner = lv_rw_wowner(v); if (!TD_IS_RUNNING(owner)) goto ts; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, rw, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); do { lock_delay(&lda); v = RW_READ_VALUE(rw); owner = lv_rw_wowner(v); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } else if (RW_READERS(v) > 0) { sleep_reason = READERS; if (spintries == rowner_retries) goto ts; if (!(v & RW_LOCK_WRITE_SPINNER)) { if (!in_critical) { critical_enter(); in_critical = true; extra_work++; } if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, v | RW_LOCK_WRITE_SPINNER)) { critical_exit(); in_critical = false; extra_work--; continue; } } spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", rw->lock_object.lo_name); n = RW_READERS(v); for (i = 0; i < rowner_loops; i += n) { lock_delay_spin(n); v = RW_READ_VALUE(rw); if (!(v & RW_LOCK_WRITE_SPINNER)) break; if (!(v & RW_LOCK_READ)) break; n = RW_READERS(v); if (n == 0) break; } #ifdef KDTRACE_HOOKS lda.spin_cnt += i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i < rowner_loops) continue; } ts: #endif ts = turnstile_trywait(&rw->lock_object); v = RW_READ_VALUE(rw); retry_ts: owner = lv_rw_wowner(v); #ifdef ADAPTIVE_RWLOCKS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if (owner != NULL) { if (TD_IS_RUNNING(owner)) { turnstile_cancel(ts); rw_drop_critical(v, &in_critical, &extra_work); continue; } } else if (RW_READERS(v) > 0 && sleep_reason == WRITER) { turnstile_cancel(ts); rw_drop_critical(v, &in_critical, &extra_work); continue; } #endif /* * Check for the waiters flags about this rwlock. * If the lock was released, without maintain any pending * waiters queue, simply try to acquire it. * If a pending waiters queue is present, claim the lock * ownership and maintain the pending queue. */ setv = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER); if ((v & ~setv) == RW_UNLOCKED) { setv &= ~RW_LOCK_WRITE_SPINNER; if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid | setv)) { if (setv) turnstile_claim(ts); else turnstile_cancel(ts); break; } goto retry_ts; } #ifdef ADAPTIVE_RWLOCKS if (in_critical) { if ((v & RW_LOCK_WRITE_SPINNER) || !((v & RW_LOCK_WRITE_WAITERS))) { setv = v & ~RW_LOCK_WRITE_SPINNER; setv |= RW_LOCK_WRITE_WAITERS; if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, setv)) goto retry_ts; } critical_exit(); in_critical = false; extra_work--; } else { #endif /* * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to * set it. If we fail to set it, then loop back and try * again. */ if (!(v & RW_LOCK_WRITE_WAITERS)) { if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, v | RW_LOCK_WRITE_WAITERS)) goto retry_ts; if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set write waiters flag", __func__, rw); } #ifdef ADAPTIVE_RWLOCKS } #endif /* * We were unable to acquire the lock and the write waiters * flag is set, so we must block on the turnstile. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__, rw); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&rw->lock_object); #endif MPASS(owner == rw_owner(rw)); turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&rw->lock_object); sleep_cnt++; #endif if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from turnstile", __func__, rw); #ifdef ADAPTIVE_RWLOCKS spintries = 0; #endif v = RW_READ_VALUE(rw); } THREAD_CONTENTION_DONE(&rw->lock_object); if (__predict_true(!extra_work)) return; #ifdef ADAPTIVE_RWLOCKS if (in_critical) critical_exit(); #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return; #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&rw->lock_object); if (sleep_time) LOCKSTAT_RECORD4(rw__block, rw, sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); /* Record only the loops spinning and not sleeping. */ if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time, LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0, (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state)); out_lockstat: #endif LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested, waittime, file, line, LOCKSTAT_WRITER); } /* * This function is called if lockstat is active or the first try at releasing * a write lock failed. The latter means that the lock is recursed or one of * the 2 waiter bits must be set indicating that at least one thread is waiting * on this lock. */ void __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF) { struct rwlock *rw; struct turnstile *ts; uintptr_t tid, setv; int queue; tid = (uintptr_t)curthread; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); if (__predict_false(v == tid)) v = RW_READ_VALUE(rw); if (v & RW_LOCK_WRITER_RECURSED) { if (--(rw->rw_recurse) == 0) atomic_clear_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw); return; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_WRITER); if (v == tid && _rw_write_unlock(rw, tid)) return; KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS), ("%s: neither of the waiter flags are set", __func__)); if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, rw); turnstile_chain_lock(&rw->lock_object); /* * Use the same algo as sx locks for now. Prefer waking up shared * waiters if we have any over writers. This is probably not ideal. * * 'v' is the value we are going to write back to rw_lock. If we * have waiters on both queues, we need to preserve the state of * the waiter flag for the queue we don't wake up. For now this is * hardcoded for the algorithm mentioned above. * * In the case of both readers and writers waiting we wakeup the * readers but leave the RW_LOCK_WRITE_WAITERS flag set. If a * new writer comes in before a reader it will claim the lock up * above. There is probably a potential priority inversion in * there that could be worked around either by waking both queues * of waiters or doing some complicated lock handoff gymnastics. */ setv = RW_UNLOCKED; v = RW_READ_VALUE(rw); queue = TS_SHARED_QUEUE; if (v & RW_LOCK_WRITE_WAITERS) { queue = TS_EXCLUSIVE_QUEUE; setv |= (v & RW_LOCK_READ_WAITERS); } atomic_store_rel_ptr(&rw->rw_lock, setv); /* Wake up all waiters for the specific queue. */ if (LOCK_LOG_TEST(&rw->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw, queue == TS_SHARED_QUEUE ? "read" : "write"); ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); turnstile_broadcast(ts, queue); turnstile_unpend(ts); turnstile_chain_unlock(&rw->lock_object); } /* * Attempt to do a non-blocking upgrade from a read lock to a write * lock. This will only succeed if this thread holds a single read * lock. Returns true if the upgrade succeeded and false otherwise. */ int __rw_try_upgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { uintptr_t v, setv, tid; struct turnstile *ts; int success; if (SCHEDULER_STOPPED()) return (1); KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(&rw->rw_lock, RA_RLOCKED, file, line); /* * Attempt to switch from one reader to a writer. If there * are any write waiters, then we will have to lock the * turnstile first to prevent races with another writer * calling turnstile_wait() before we have claimed this * turnstile. So, do the simple case of no waiters first. */ tid = (uintptr_t)curthread; success = 0; v = RW_READ_VALUE(rw); for (;;) { if (RW_READERS(v) > 1) break; if (!(v & RW_LOCK_WAITERS)) { success = atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid); if (!success) continue; break; } /* * Ok, we think we have waiters, so lock the turnstile. */ ts = turnstile_trywait(&rw->lock_object); v = RW_READ_VALUE(rw); retry_ts: if (RW_READERS(v) > 1) { turnstile_cancel(ts); break; } /* * Try to switch from one reader to a writer again. This time * we honor the current state of the waiters flags. * If we obtain the lock with the flags set, then claim * ownership of the turnstile. */ setv = tid | (v & RW_LOCK_WAITERS); success = atomic_fcmpset_ptr(&rw->rw_lock, &v, setv); if (success) { if (v & RW_LOCK_WAITERS) turnstile_claim(ts); else turnstile_cancel(ts); break; } goto retry_ts; } LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line); if (success) { curthread->td_rw_rlocks--; WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(rw__upgrade, rw); } return (success); } int __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); return (__rw_try_upgrade_int(rw LOCK_FILE_LINE_ARG)); } /* * Downgrade a write lock into a single read lock. */ void __rw_downgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF) { struct turnstile *ts; uintptr_t tid, v; int rwait, wwait; if (SCHEDULER_STOPPED()) return; KASSERT(rw->rw_lock != RW_DESTROYED, ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line)); __rw_assert(&rw->rw_lock, RA_WLOCKED | RA_NOTRECURSED, file, line); #ifndef INVARIANTS if (rw_recursed(rw)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line); /* * Convert from a writer to a single reader. First we handle * the easy case with no waiters. If there are any waiters, we * lock the turnstile and "disown" the lock. */ tid = (uintptr_t)curthread; if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1))) goto out; /* * Ok, we think we have waiters, so lock the turnstile so we can * read the waiter flags without any races. */ turnstile_chain_lock(&rw->lock_object); v = rw->rw_lock & RW_LOCK_WAITERS; rwait = v & RW_LOCK_READ_WAITERS; wwait = v & RW_LOCK_WRITE_WAITERS; MPASS(rwait | wwait); /* * Downgrade from a write lock while preserving waiters flag * and give up ownership of the turnstile. */ ts = turnstile_lookup(&rw->lock_object); MPASS(ts != NULL); if (!wwait) v &= ~RW_LOCK_READ_WAITERS; atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v); /* * Wake other readers if there are no writers pending. Otherwise they * won't be able to acquire the lock anyway. */ if (rwait && !wwait) { turnstile_broadcast(ts, TS_SHARED_QUEUE); turnstile_unpend(ts); } else turnstile_disown(ts); turnstile_chain_unlock(&rw->lock_object); out: curthread->td_rw_rlocks++; LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(rw__downgrade, rw); } void __rw_downgrade(volatile uintptr_t *c, const char *file, int line) { struct rwlock *rw; rw = rwlock2rw(c); __rw_downgrade_int(rw LOCK_FILE_LINE_ARG); } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef __rw_assert #endif /* * In the non-WITNESS case, rw_assert() can only detect that at least * *some* thread owns an rlock, but it cannot guarantee that *this* * thread owns an rlock. */ void __rw_assert(const volatile uintptr_t *c, int what, const char *file, int line) { const struct rwlock *rw; if (SCHEDULER_STOPPED()) return; rw = rwlock2rw(c); switch (what) { case RA_LOCKED: case RA_LOCKED | RA_RECURSED: case RA_LOCKED | RA_NOTRECURSED: case RA_RLOCKED: case RA_RLOCKED | RA_RECURSED: case RA_RLOCKED | RA_NOTRECURSED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If some other thread has a write lock or we have one * and are asserting a read lock, fail. Also, if no one * has a lock at all, fail. */ if (rw->rw_lock == RW_UNLOCKED || (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED || rw_wowner(rw) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", rw->lock_object.lo_name, (what & RA_RLOCKED) ? "read " : "", file, line); if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) { if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } #endif break; case RA_WLOCKED: case RA_WLOCKED | RA_RECURSED: case RA_WLOCKED | RA_NOTRECURSED: if (rw_wowner(rw) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); if (rw_recursed(rw)) { if (what & RA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); } else if (what & RA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", rw->lock_object.lo_name, file, line); break; case RA_UNLOCKED: #ifdef WITNESS witness_assert(&rw->lock_object, what, file, line); #else /* * If we hold a write lock fail. We can't reliably check * to see if we hold a read lock or not. */ if (rw_wowner(rw) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", rw->lock_object.lo_name, file, line); #endif break; default: panic("Unknown rw lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB void db_show_rwlock(const struct lock_object *lock) { const struct rwlock *rw; struct thread *td; rw = (const struct rwlock *)lock; db_printf(" state: "); if (rw->rw_lock == RW_UNLOCKED) db_printf("UNLOCKED\n"); else if (rw->rw_lock == RW_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (rw->rw_lock & RW_LOCK_READ) db_printf("RLOCK: %ju locks\n", (uintmax_t)(RW_READERS(rw->rw_lock))); else { td = rw_wowner(rw); db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (rw_recursed(rw)) db_printf(" recursed: %u\n", rw->rw_recurse); } db_printf(" waiters: "); switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) { case RW_LOCK_READ_WAITERS: db_printf("readers\n"); break; case RW_LOCK_WRITE_WAITERS: db_printf("writers\n"); break; case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS: db_printf("readers and writers\n"); break; default: db_printf("none\n"); break; } } #endif diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c index 17d40ff0429c..ee666281418f 100644 --- a/sys/kern/kern_shutdown.c +++ b/sys/kern/kern_shutdown.c @@ -1,1835 +1,1836 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ddb.h" #include "opt_ekcd.h" #include "opt_kdb.h" #include "opt_panic.h" #include "opt_printf.h" #include "opt_sched.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_DUMPER, "dumper", "dumper block buffer"); #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif static int panic_reboot_wait_time = PANIC_REBOOT_WAIT_TIME; SYSCTL_INT(_kern, OID_AUTO, panic_reboot_wait_time, CTLFLAG_RWTUN, &panic_reboot_wait_time, 0, "Seconds to wait before rebooting after a panic"); static int reboot_wait_time = 0; SYSCTL_INT(_kern, OID_AUTO, reboot_wait_time, CTLFLAG_RWTUN, &reboot_wait_time, 0, "Seconds to wait before rebooting"); /* * Note that stdarg.h and the ANSI style va_start macro is used for both * ANSI and traditional C compilers. */ #include #ifdef KDB #ifdef KDB_UNATTENDED int debugger_on_panic = 0; #else int debugger_on_panic = 1; #endif SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RWTUN, &debugger_on_panic, 0, "Run debugger on kernel panic"); static bool debugger_on_recursive_panic = false; SYSCTL_BOOL(_debug, OID_AUTO, debugger_on_recursive_panic, CTLFLAG_RWTUN, &debugger_on_recursive_panic, 0, "Run debugger on recursive kernel panic"); int debugger_on_trap = 0; SYSCTL_INT(_debug, OID_AUTO, debugger_on_trap, CTLFLAG_RWTUN, &debugger_on_trap, 0, "Run debugger on kernel trap before panic"); #ifdef KDB_TRACE static int trace_on_panic = 1; static bool trace_all_panics = true; #else static int trace_on_panic = 0; static bool trace_all_panics = false; #endif SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RWTUN | CTLFLAG_SECURE, &trace_on_panic, 0, "Print stack trace on kernel panic"); SYSCTL_BOOL(_debug, OID_AUTO, trace_all_panics, CTLFLAG_RWTUN, &trace_all_panics, 0, "Print stack traces on secondary kernel panics"); #endif /* KDB */ static int sync_on_panic = 0; SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RWTUN, &sync_on_panic, 0, "Do a sync before rebooting from a panic"); static bool poweroff_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, poweroff_on_panic, CTLFLAG_RWTUN, &poweroff_on_panic, 0, "Do a power off instead of a reboot on a panic"); static bool powercycle_on_panic = 0; SYSCTL_BOOL(_kern, OID_AUTO, powercycle_on_panic, CTLFLAG_RWTUN, &powercycle_on_panic, 0, "Do a power cycle instead of a reboot on a panic"); static SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Shutdown environment"); #ifndef DIAGNOSTIC static int show_busybufs; #else static int show_busybufs = 1; #endif SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW, &show_busybufs, 0, "Show busy buffers during shutdown"); int suspend_blocked = 0; SYSCTL_INT(_kern, OID_AUTO, suspend_blocked, CTLFLAG_RW, &suspend_blocked, 0, "Block suspend due to a pending shutdown"); #ifdef EKCD FEATURE(ekcd, "Encrypted kernel crash dumps support"); MALLOC_DEFINE(M_EKCD, "ekcd", "Encrypted kernel crash dumps data"); struct kerneldumpcrypto { uint8_t kdc_encryption; uint8_t kdc_iv[KERNELDUMP_IV_MAX_SIZE]; union { struct { keyInstance aes_ki; cipherInstance aes_ci; } u_aes; struct chacha_ctx u_chacha; } u; #define kdc_ki u.u_aes.aes_ki #define kdc_ci u.u_aes.aes_ci #define kdc_chacha u.u_chacha uint32_t kdc_dumpkeysize; struct kerneldumpkey kdc_dumpkey[]; }; #endif struct kerneldumpcomp { uint8_t kdc_format; struct compressor *kdc_stream; uint8_t *kdc_buf; size_t kdc_resid; }; static struct kerneldumpcomp *kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression); static void kerneldumpcomp_destroy(struct dumperinfo *di); static int kerneldumpcomp_write_cb(void *base, size_t len, off_t off, void *arg); static int kerneldump_gzlevel = 6; SYSCTL_INT(_kern, OID_AUTO, kerneldump_gzlevel, CTLFLAG_RWTUN, &kerneldump_gzlevel, 0, "Kernel crash dump compression level"); /* * Variable panicstr contains argument to first call to panic; used as flag * to indicate that the kernel has already called panic. */ const char *panicstr __read_mostly; +bool scheduler_stopped __read_frequently; int dumping __read_mostly; /* system is dumping */ int rebooting __read_mostly; /* system is rebooting */ /* * Used to serialize between sysctl kern.shutdown.dumpdevname and list * modifications via ioctl. */ static struct mtx dumpconf_list_lk; MTX_SYSINIT(dumper_configs, &dumpconf_list_lk, "dumper config list", MTX_DEF); /* Our selected dumper(s). */ static TAILQ_HEAD(dumpconflist, dumperinfo) dumper_configs = TAILQ_HEAD_INITIALIZER(dumper_configs); /* Context information for dump-debuggers, saved by the dump_savectx() macro. */ struct pcb dumppcb; /* Registers. */ lwpid_t dumptid; /* Thread ID. */ static struct cdevsw reroot_cdevsw = { .d_version = D_VERSION, .d_name = "reroot", }; static void poweroff_wait(void *, int); static void shutdown_halt(void *junk, int howto); static void shutdown_panic(void *junk, int howto); static void shutdown_reset(void *junk, int howto); static int kern_reroot(void); /* register various local shutdown events */ static void shutdown_conf(void *unused) { EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST); EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100); EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100); } SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL); /* * The only reason this exists is to create the /dev/reroot/ directory, * used by reroot code in init(8) as a mountpoint for tmpfs. */ static void reroot_conf(void *unused) { int error; struct cdev *cdev; error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &cdev, &reroot_cdevsw, NULL, UID_ROOT, GID_WHEEL, 0600, "reroot/reroot"); if (error != 0) { printf("%s: failed to create device node, error %d", __func__, error); } } SYSINIT(reroot_conf, SI_SUB_DEVFS, SI_ORDER_ANY, reroot_conf, NULL); /* * The system call that results in a reboot. */ /* ARGSUSED */ int sys_reboot(struct thread *td, struct reboot_args *uap) { int error; error = 0; #ifdef MAC error = mac_system_check_reboot(td->td_ucred, uap->opt); #endif if (error == 0) error = priv_check(td, PRIV_REBOOT); if (error == 0) { if (uap->opt & RB_REROOT) error = kern_reroot(); else kern_reboot(uap->opt); } return (error); } static void shutdown_nice_task_fn(void *arg, int pending __unused) { int howto; howto = (uintptr_t)arg; /* Send a signal to init(8) and have it shutdown the world. */ PROC_LOCK(initproc); if ((howto & RB_POWEROFF) != 0) { BOOTTRACE("SIGUSR2 to init(8)"); kern_psignal(initproc, SIGUSR2); } else if ((howto & RB_POWERCYCLE) != 0) { BOOTTRACE("SIGWINCH to init(8)"); kern_psignal(initproc, SIGWINCH); } else if ((howto & RB_HALT) != 0) { BOOTTRACE("SIGUSR1 to init(8)"); kern_psignal(initproc, SIGUSR1); } else { BOOTTRACE("SIGINT to init(8)"); kern_psignal(initproc, SIGINT); } PROC_UNLOCK(initproc); } static struct task shutdown_nice_task = TASK_INITIALIZER(0, &shutdown_nice_task_fn, NULL); /* * Called by events that want to shut down.. e.g on a PC */ void shutdown_nice(int howto) { if (initproc != NULL && !SCHEDULER_STOPPED()) { BOOTTRACE("shutdown initiated"); shutdown_nice_task.ta_context = (void *)(uintptr_t)howto; taskqueue_enqueue(taskqueue_fast, &shutdown_nice_task); } else { /* * No init(8) running, or scheduler would not allow it * to run, so simply reboot. */ kern_reboot(howto | RB_NOSYNC); } } static void print_uptime(void) { int f; struct timespec ts; getnanouptime(&ts); printf("Uptime: "); f = 0; if (ts.tv_sec >= 86400) { printf("%ldd", (long)ts.tv_sec / 86400); ts.tv_sec %= 86400; f = 1; } if (f || ts.tv_sec >= 3600) { printf("%ldh", (long)ts.tv_sec / 3600); ts.tv_sec %= 3600; f = 1; } if (f || ts.tv_sec >= 60) { printf("%ldm", (long)ts.tv_sec / 60); ts.tv_sec %= 60; f = 1; } printf("%lds\n", (long)ts.tv_sec); } int doadump(boolean_t textdump) { boolean_t coredump; int error; error = 0; if (dumping) return (EBUSY); if (TAILQ_EMPTY(&dumper_configs)) return (ENXIO); dump_savectx(); dumping++; coredump = TRUE; #ifdef DDB if (textdump && textdump_pending) { coredump = FALSE; textdump_dumpsys(TAILQ_FIRST(&dumper_configs)); } #endif if (coredump) { struct dumperinfo *di; TAILQ_FOREACH(di, &dumper_configs, di_next) { error = dumpsys(di); if (error == 0) break; } } dumping--; return (error); } /* * Trace the shutdown reason. */ static void reboottrace(int howto) { if ((howto & RB_DUMP) != 0) { if ((howto & RB_HALT) != 0) BOOTTRACE("system panic: halting..."); if ((howto & RB_POWEROFF) != 0) BOOTTRACE("system panic: powering off..."); if ((howto & (RB_HALT|RB_POWEROFF)) == 0) BOOTTRACE("system panic: rebooting..."); } else { if ((howto & RB_HALT) != 0) BOOTTRACE("system halting..."); if ((howto & RB_POWEROFF) != 0) BOOTTRACE("system powering off..."); if ((howto & (RB_HALT|RB_POWEROFF)) == 0) BOOTTRACE("system rebooting..."); } } /* * kern_reboot(9): Shut down the system cleanly to prepare for reboot, halt, or * power off. */ void kern_reboot(int howto) { static int once = 0; if (initproc != NULL && curproc != initproc) BOOTTRACE("kernel shutdown (dirty) started"); else BOOTTRACE("kernel shutdown (clean) started"); /* * Normal paths here don't hold Giant, but we can wind up here * unexpectedly with it held. Drop it now so we don't have to * drop and pick it up elsewhere. The paths it is locking will * never be returned to, and it is preferable to preclude * deadlock than to lock against code that won't ever * continue. */ while (!SCHEDULER_STOPPED() && mtx_owned(&Giant)) mtx_unlock(&Giant); #if defined(SMP) /* * Bind us to the first CPU so that all shutdown code runs there. Some * systems don't shutdown properly (i.e., ACPI power off) if we * run on another processor. */ if (!SCHEDULER_STOPPED()) { thread_lock(curthread); sched_bind(curthread, CPU_FIRST()); thread_unlock(curthread); KASSERT(PCPU_GET(cpuid) == CPU_FIRST(), ("%s: not running on cpu 0", __func__)); } #endif /* We're in the process of rebooting. */ rebooting = 1; reboottrace(howto); /* * Do any callouts that should be done BEFORE syncing the filesystems. */ EVENTHANDLER_INVOKE(shutdown_pre_sync, howto); BOOTTRACE("shutdown pre sync complete"); /* * Now sync filesystems */ if (!cold && (howto & RB_NOSYNC) == 0 && once == 0) { once = 1; BOOTTRACE("bufshutdown begin"); bufshutdown(show_busybufs); BOOTTRACE("bufshutdown end"); } print_uptime(); cngrab(); /* * Ok, now do things that assume all filesystem activity has * been completed. */ EVENTHANDLER_INVOKE(shutdown_post_sync, howto); BOOTTRACE("shutdown post sync complete"); if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping) doadump(TRUE); /* Now that we're going to really halt the system... */ BOOTTRACE("shutdown final begin"); if (shutdown_trace) boottrace_dump_console(); EVENTHANDLER_INVOKE(shutdown_final, howto); /* * Call this directly so that reset is attempted even if shutdown * handlers are not yet registered. */ shutdown_reset(NULL, howto); for(;;) ; /* safety against shutdown_reset not working */ /* NOTREACHED */ } /* * The system call that results in changing the rootfs. */ static int kern_reroot(void) { struct vnode *oldrootvnode, *vp; struct mount *mp, *devmp; int error; if (curproc != initproc) return (EPERM); /* * Mark the filesystem containing currently-running executable * (the temporary copy of init(8)) busy. */ vp = curproc->p_textvp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (error); mp = vp->v_mount; error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp); error = vfs_busy(mp, 0); vn_lock(vp, LK_SHARED | LK_RETRY); vfs_rel(mp); if (error != 0) { VOP_UNLOCK(vp); return (ENOENT); } if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp); /* * Remove the filesystem containing currently-running executable * from the mount list, to prevent it from being unmounted * by vfs_unmountall(), and to avoid confusing vfs_mountroot(). * * Also preserve /dev - forcibly unmounting it could cause driver * reinitialization. */ vfs_ref(rootdevmp); devmp = rootdevmp; rootdevmp = NULL; mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); TAILQ_REMOVE(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); oldrootvnode = rootvnode; /* * Unmount everything except for the two filesystems preserved above. */ vfs_unmountall(); /* * Add /dev back; vfs_mountroot() will move it into its new place. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_HEAD(&mountlist, devmp, mnt_list); mtx_unlock(&mountlist_mtx); rootdevmp = devmp; vfs_rel(rootdevmp); /* * Mount the new rootfs. */ vfs_mountroot(); /* * Update all references to the old rootvnode. */ mountcheckdirs(oldrootvnode, rootvnode); /* * Add the temporary filesystem back and unbusy it. */ mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); vfs_unbusy(mp); return (0); } /* * If the shutdown was a clean halt, behave accordingly. */ static void shutdown_halt(void *junk, int howto) { if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); wdog_kern_pat(WD_TO_NEVER); switch (cngetc()) { case -1: /* No console, just die */ cpu_halt(); /* NOTREACHED */ default: break; } } } /* * Check to see if the system panicked, pause and then reboot * according to the specified delay. */ static void shutdown_panic(void *junk, int howto) { int loop; if (howto & RB_DUMP) { if (panic_reboot_wait_time != 0) { if (panic_reboot_wait_time != -1) { printf("Automatic reboot in %d seconds - " "press a key on the console to abort\n", panic_reboot_wait_time); for (loop = panic_reboot_wait_time * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ /* Did user type a key? */ if (cncheckc() != -1) break; } if (!loop) return; } } else { /* zero time specified - reboot NOW */ return; } printf("--> Press a key on the console to reboot,\n"); printf("--> or switch off the system now.\n"); cngetc(); } } /* * Everything done, now reset */ static void shutdown_reset(void *junk, int howto) { printf("Rebooting...\n"); DELAY(reboot_wait_time * 1000000); /* * Acquiring smp_ipi_mtx here has a double effect: * - it disables interrupts avoiding CPU0 preemption * by fast handlers (thus deadlocking against other CPUs) * - it avoids deadlocks against smp_rendezvous() or, more * generally, threads busy-waiting, with this spinlock held, * and waiting for responses by threads on other CPUs * (ie. smp_tlb_shootdown()). * * For the !SMP case it just needs to handle the former problem. */ #ifdef SMP mtx_lock_spin(&smp_ipi_mtx); #else spinlock_enter(); #endif cpu_reset(); /* NOTREACHED */ /* assuming reset worked */ } #if defined(WITNESS) || defined(INVARIANT_SUPPORT) static int kassert_warn_only = 0; #ifdef KDB static int kassert_do_kdb = 0; #endif #ifdef KTR static int kassert_do_ktr = 0; #endif static int kassert_do_log = 1; static int kassert_log_pps_limit = 4; static int kassert_log_mute_at = 0; static int kassert_log_panic_at = 0; static int kassert_suppress_in_panic = 0; static int kassert_warnings = 0; SYSCTL_NODE(_debug, OID_AUTO, kassert, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "kassert options"); #ifdef KASSERT_PANIC_OPTIONAL #define KASSERT_RWTUN CTLFLAG_RWTUN #else #define KASSERT_RWTUN CTLFLAG_RDTUN #endif SYSCTL_INT(_debug_kassert, OID_AUTO, warn_only, KASSERT_RWTUN, &kassert_warn_only, 0, "KASSERT triggers a panic (0) or just a warning (1)"); #ifdef KDB SYSCTL_INT(_debug_kassert, OID_AUTO, do_kdb, KASSERT_RWTUN, &kassert_do_kdb, 0, "KASSERT will enter the debugger"); #endif #ifdef KTR SYSCTL_UINT(_debug_kassert, OID_AUTO, do_ktr, KASSERT_RWTUN, &kassert_do_ktr, 0, "KASSERT does a KTR, set this to the KTRMASK you want"); #endif SYSCTL_INT(_debug_kassert, OID_AUTO, do_log, KASSERT_RWTUN, &kassert_do_log, 0, "If warn_only is enabled, log (1) or do not log (0) assertion violations"); SYSCTL_INT(_debug_kassert, OID_AUTO, warnings, CTLFLAG_RD | CTLFLAG_STATS, &kassert_warnings, 0, "number of KASSERTs that have been triggered"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_panic_at, KASSERT_RWTUN, &kassert_log_panic_at, 0, "max number of KASSERTS before we will panic"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_pps_limit, KASSERT_RWTUN, &kassert_log_pps_limit, 0, "limit number of log messages per second"); SYSCTL_INT(_debug_kassert, OID_AUTO, log_mute_at, KASSERT_RWTUN, &kassert_log_mute_at, 0, "max number of KASSERTS to log"); SYSCTL_INT(_debug_kassert, OID_AUTO, suppress_in_panic, KASSERT_RWTUN, &kassert_suppress_in_panic, 0, "KASSERTs will be suppressed while handling a panic"); #undef KASSERT_RWTUN static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug_kassert, OID_AUTO, kassert, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, NULL, 0, kassert_sysctl_kassert, "I", "set to trigger a test kassert"); static int kassert_sysctl_kassert(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); KASSERT(0, ("kassert_sysctl_kassert triggered kassert %d", i)); return (0); } #ifdef KASSERT_PANIC_OPTIONAL /* * Called by KASSERT, this decides if we will panic * or if we will log via printf and/or ktr. */ void kassert_panic(const char *fmt, ...) { static char buf[256]; va_list ap; va_start(ap, fmt); (void)vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* * If we are suppressing secondary panics, log the warning but do not * re-enter panic/kdb. */ if (KERNEL_PANICKED() && kassert_suppress_in_panic) { if (kassert_do_log) { printf("KASSERT failed: %s\n", buf); #ifdef KDB if (trace_all_panics && trace_on_panic) kdb_backtrace(); #endif } return; } /* * panic if we're not just warning, or if we've exceeded * kassert_log_panic_at warnings. */ if (!kassert_warn_only || (kassert_log_panic_at > 0 && kassert_warnings >= kassert_log_panic_at)) { va_start(ap, fmt); vpanic(fmt, ap); /* NORETURN */ } #ifdef KTR if (kassert_do_ktr) CTR0(ktr_mask, buf); #endif /* KTR */ /* * log if we've not yet met the mute limit. */ if (kassert_do_log && (kassert_log_mute_at == 0 || kassert_warnings < kassert_log_mute_at)) { static struct timeval lasterr; static int curerr; if (ppsratecheck(&lasterr, &curerr, kassert_log_pps_limit)) { printf("KASSERT failed: %s\n", buf); kdb_backtrace(); } } #ifdef KDB if (kassert_do_kdb) { kdb_enter(KDB_WHY_KASSERT, buf); } #endif atomic_add_int(&kassert_warnings, 1); } #endif /* KASSERT_PANIC_OPTIONAL */ #endif /* * Panic is called on unresolvable fatal errors. It prints "panic: mesg", * and then reboots. If we are called twice, then we avoid trying to sync * the disks as this often leads to recursive panics. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); vpanic(fmt, ap); } void vpanic(const char *fmt, va_list ap) { #ifdef SMP cpuset_t other_cpus; #endif struct thread *td = curthread; int bootopt, newpanic; static char buf[256]; /* * 'fmt' must not be NULL as it is put into 'panicstr' which is then * used as a flag to detect if the kernel has panicked. Also, although * vsnprintf() supports a NULL 'fmt' argument, use a more informative * message. */ if (fmt == NULL) fmt = ""; spinlock_enter(); #ifdef SMP /* * stop_cpus_hard(other_cpus) should prevent multiple CPUs from * concurrently entering panic. Only the winner will proceed * further. */ if (!KERNEL_PANICKED() && !kdb_active) { other_cpus = all_cpus; CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); } #endif /* * Ensure that the scheduler is stopped while panicking, even if panic * has been entered from kdb. */ - td->td_stopsched = 1; + scheduler_stopped = true; bootopt = RB_AUTOBOOT; newpanic = 0; if (KERNEL_PANICKED()) bootopt |= RB_NOSYNC; else { bootopt |= RB_DUMP; panicstr = fmt; newpanic = 1; } if (newpanic) { (void)vsnprintf(buf, sizeof(buf), fmt, ap); panicstr = buf; cngrab(); printf("panic: %s\n", buf); } else { printf("panic: "); vprintf(fmt, ap); printf("\n"); } #ifdef SMP printf("cpuid = %d\n", PCPU_GET(cpuid)); #endif printf("time = %jd\n", (intmax_t )time_second); #ifdef KDB if ((newpanic || trace_all_panics) && trace_on_panic) kdb_backtrace(); if (debugger_on_panic) kdb_enter(KDB_WHY_PANIC, "panic"); else if (!newpanic && debugger_on_recursive_panic) kdb_enter(KDB_WHY_PANIC, "re-panic"); #endif /*thread_lock(td); */ td->td_flags |= TDF_INPANIC; /* thread_unlock(td); */ if (!sync_on_panic) bootopt |= RB_NOSYNC; if (poweroff_on_panic) bootopt |= RB_POWEROFF; if (powercycle_on_panic) bootopt |= RB_POWERCYCLE; kern_reboot(bootopt); } /* * Support for poweroff delay. * * Please note that setting this delay too short might power off your machine * before the write cache on your hard disk has been flushed, leading to * soft-updates inconsistencies. */ #ifndef POWEROFF_DELAY # define POWEROFF_DELAY 5000 #endif static int poweroff_delay = POWEROFF_DELAY; SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW, &poweroff_delay, 0, "Delay before poweroff to write disk caches (msec)"); static void poweroff_wait(void *junk, int howto) { if ((howto & (RB_POWEROFF | RB_POWERCYCLE)) == 0 || poweroff_delay <= 0) return; DELAY(poweroff_delay * 1000); } /* * Some system processes (e.g. syncer) need to be stopped at appropriate * points in their main loops prior to a system shutdown, so that they * won't interfere with the shutdown process (e.g. by holding a disk buf * to cause sync to fail). For each of these system processes, register * shutdown_kproc() as a handler for one of shutdown events. */ static int kproc_shutdown_wait = 60; SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW, &kproc_shutdown_wait, 0, "Max wait time (sec) to stop for each process"); void kproc_shutdown(void *arg, int howto) { struct proc *p; int error; if (SCHEDULER_STOPPED()) return; p = (struct proc *)arg; printf("Waiting (max %d seconds) for system process `%s' to stop... ", kproc_shutdown_wait, p->p_comm); error = kproc_suspend(p, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } void kthread_shutdown(void *arg, int howto) { struct thread *td; int error; if (SCHEDULER_STOPPED()) return; td = (struct thread *)arg; printf("Waiting (max %d seconds) for system thread `%s' to stop... ", kproc_shutdown_wait, td->td_name); error = kthread_suspend(td, kproc_shutdown_wait * hz); if (error == EWOULDBLOCK) printf("timed out\n"); else printf("done\n"); } static int dumpdevname_sysctl_handler(SYSCTL_HANDLER_ARGS) { char buf[256]; struct dumperinfo *di; struct sbuf sb; int error; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sbuf_new_for_sysctl(&sb, buf, sizeof(buf), req); mtx_lock(&dumpconf_list_lk); TAILQ_FOREACH(di, &dumper_configs, di_next) { if (di != TAILQ_FIRST(&dumper_configs)) sbuf_putc(&sb, ','); sbuf_cat(&sb, di->di_devname); } mtx_unlock(&dumpconf_list_lk); error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } SYSCTL_PROC(_kern_shutdown, OID_AUTO, dumpdevname, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, &dumper_configs, 0, dumpdevname_sysctl_handler, "A", "Device(s) for kernel dumps"); static int _dump_append(struct dumperinfo *di, void *virtual, size_t length); #ifdef EKCD static struct kerneldumpcrypto * kerneldumpcrypto_create(size_t blocksize, uint8_t encryption, const uint8_t *key, uint32_t encryptedkeysize, const uint8_t *encryptedkey) { struct kerneldumpcrypto *kdc; struct kerneldumpkey *kdk; uint32_t dumpkeysize; dumpkeysize = roundup2(sizeof(*kdk) + encryptedkeysize, blocksize); kdc = malloc(sizeof(*kdc) + dumpkeysize, M_EKCD, M_WAITOK | M_ZERO); arc4rand(kdc->kdc_iv, sizeof(kdc->kdc_iv), 0); kdc->kdc_encryption = encryption; switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_makeKey(&kdc->kdc_ki, DIR_ENCRYPT, 256, key) <= 0) goto failed; break; case KERNELDUMP_ENC_CHACHA20: chacha_keysetup(&kdc->kdc_chacha, key, 256); break; default: goto failed; } kdc->kdc_dumpkeysize = dumpkeysize; kdk = kdc->kdc_dumpkey; kdk->kdk_encryption = kdc->kdc_encryption; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); kdk->kdk_encryptedkeysize = htod32(encryptedkeysize); memcpy(kdk->kdk_encryptedkey, encryptedkey, encryptedkeysize); return (kdc); failed: zfree(kdc, M_EKCD); return (NULL); } static int kerneldumpcrypto_init(struct kerneldumpcrypto *kdc) { uint8_t hash[SHA256_DIGEST_LENGTH]; SHA256_CTX ctx; struct kerneldumpkey *kdk; int error; error = 0; if (kdc == NULL) return (0); /* * When a user enters ddb it can write a crash dump multiple times. * Each time it should be encrypted using a different IV. */ SHA256_Init(&ctx); SHA256_Update(&ctx, kdc->kdc_iv, sizeof(kdc->kdc_iv)); SHA256_Final(hash, &ctx); bcopy(hash, kdc->kdc_iv, sizeof(kdc->kdc_iv)); switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, kdc->kdc_iv) <= 0) { error = EINVAL; goto out; } break; case KERNELDUMP_ENC_CHACHA20: chacha_ivsetup(&kdc->kdc_chacha, kdc->kdc_iv, NULL); break; default: error = EINVAL; goto out; } kdk = kdc->kdc_dumpkey; memcpy(kdk->kdk_iv, kdc->kdc_iv, sizeof(kdk->kdk_iv)); out: explicit_bzero(hash, sizeof(hash)); return (error); } static uint32_t kerneldumpcrypto_dumpkeysize(const struct kerneldumpcrypto *kdc) { if (kdc == NULL) return (0); return (kdc->kdc_dumpkeysize); } #endif /* EKCD */ static struct kerneldumpcomp * kerneldumpcomp_create(struct dumperinfo *di, uint8_t compression) { struct kerneldumpcomp *kdcomp; int format; switch (compression) { case KERNELDUMP_COMP_GZIP: format = COMPRESS_GZIP; break; case KERNELDUMP_COMP_ZSTD: format = COMPRESS_ZSTD; break; default: return (NULL); } kdcomp = malloc(sizeof(*kdcomp), M_DUMPER, M_WAITOK | M_ZERO); kdcomp->kdc_format = compression; kdcomp->kdc_stream = compressor_init(kerneldumpcomp_write_cb, format, di->maxiosize, kerneldump_gzlevel, di); if (kdcomp->kdc_stream == NULL) { free(kdcomp, M_DUMPER); return (NULL); } kdcomp->kdc_buf = malloc(di->maxiosize, M_DUMPER, M_WAITOK | M_NODUMP); return (kdcomp); } static void kerneldumpcomp_destroy(struct dumperinfo *di) { struct kerneldumpcomp *kdcomp; kdcomp = di->kdcomp; if (kdcomp == NULL) return; compressor_fini(kdcomp->kdc_stream); zfree(kdcomp->kdc_buf, M_DUMPER); free(kdcomp, M_DUMPER); } /* * Free a dumper. Must not be present on global list. */ void dumper_destroy(struct dumperinfo *di) { if (di == NULL) return; zfree(di->blockbuf, M_DUMPER); kerneldumpcomp_destroy(di); #ifdef EKCD zfree(di->kdcrypto, M_EKCD); #endif zfree(di, M_DUMPER); } /* * Allocate and set up a new dumper from the provided template. */ int dumper_create(const struct dumperinfo *di_template, const char *devname, const struct diocskerneldump_arg *kda, struct dumperinfo **dip) { struct dumperinfo *newdi; int error = 0; if (dip == NULL) return (EINVAL); /* Allocate a new dumper */ newdi = malloc(sizeof(*newdi) + strlen(devname) + 1, M_DUMPER, M_WAITOK | M_ZERO); memcpy(newdi, di_template, sizeof(*newdi)); newdi->blockbuf = NULL; newdi->kdcrypto = NULL; newdi->kdcomp = NULL; strcpy(newdi->di_devname, devname); if (kda->kda_encryption != KERNELDUMP_ENC_NONE) { #ifdef EKCD newdi->kdcrypto = kerneldumpcrypto_create(newdi->blocksize, kda->kda_encryption, kda->kda_key, kda->kda_encryptedkeysize, kda->kda_encryptedkey); if (newdi->kdcrypto == NULL) { error = EINVAL; goto cleanup; } #else error = EOPNOTSUPP; goto cleanup; #endif } if (kda->kda_compression != KERNELDUMP_COMP_NONE) { #ifdef EKCD /* * We can't support simultaneous unpadded block cipher * encryption and compression because there is no guarantee the * length of the compressed result is exactly a multiple of the * cipher block size. */ if (kda->kda_encryption == KERNELDUMP_ENC_AES_256_CBC) { error = EOPNOTSUPP; goto cleanup; } #endif newdi->kdcomp = kerneldumpcomp_create(newdi, kda->kda_compression); if (newdi->kdcomp == NULL) { error = EINVAL; goto cleanup; } } newdi->blockbuf = malloc(newdi->blocksize, M_DUMPER, M_WAITOK | M_ZERO); *dip = newdi; return (0); cleanup: dumper_destroy(newdi); return (error); } /* * Create a new dumper and register it in the global list. */ int dumper_insert(const struct dumperinfo *di_template, const char *devname, const struct diocskerneldump_arg *kda) { struct dumperinfo *newdi, *listdi; bool inserted; uint8_t index; int error; index = kda->kda_index; MPASS(index != KDA_REMOVE && index != KDA_REMOVE_DEV && index != KDA_REMOVE_ALL); error = priv_check(curthread, PRIV_SETDUMPER); if (error != 0) return (error); error = dumper_create(di_template, devname, kda, &newdi); if (error != 0) return (error); /* Add the new configuration to the queue */ mtx_lock(&dumpconf_list_lk); inserted = false; TAILQ_FOREACH(listdi, &dumper_configs, di_next) { if (index == 0) { TAILQ_INSERT_BEFORE(listdi, newdi, di_next); inserted = true; break; } index--; } if (!inserted) TAILQ_INSERT_TAIL(&dumper_configs, newdi, di_next); mtx_unlock(&dumpconf_list_lk); return (0); } #ifdef DDB void dumper_ddb_insert(struct dumperinfo *newdi) { TAILQ_INSERT_HEAD(&dumper_configs, newdi, di_next); } void dumper_ddb_remove(struct dumperinfo *di) { TAILQ_REMOVE(&dumper_configs, di, di_next); } #endif static bool dumper_config_match(const struct dumperinfo *di, const char *devname, const struct diocskerneldump_arg *kda) { if (kda->kda_index == KDA_REMOVE_ALL) return (true); if (strcmp(di->di_devname, devname) != 0) return (false); /* * Allow wildcard removal of configs matching a device on g_dev_orphan. */ if (kda->kda_index == KDA_REMOVE_DEV) return (true); if (di->kdcomp != NULL) { if (di->kdcomp->kdc_format != kda->kda_compression) return (false); } else if (kda->kda_compression != KERNELDUMP_COMP_NONE) return (false); #ifdef EKCD if (di->kdcrypto != NULL) { if (di->kdcrypto->kdc_encryption != kda->kda_encryption) return (false); /* * Do we care to verify keys match to delete? It seems weird * to expect multiple fallback dump configurations on the same * device that only differ in crypto key. */ } else #endif if (kda->kda_encryption != KERNELDUMP_ENC_NONE) return (false); return (true); } /* * Remove and free the requested dumper(s) from the global list. */ int dumper_remove(const char *devname, const struct diocskerneldump_arg *kda) { struct dumperinfo *di, *sdi; bool found; int error; error = priv_check(curthread, PRIV_SETDUMPER); if (error != 0) return (error); /* * Try to find a matching configuration, and kill it. * * NULL 'kda' indicates remove any configuration matching 'devname', * which may remove multiple configurations in atypical configurations. */ found = false; mtx_lock(&dumpconf_list_lk); TAILQ_FOREACH_SAFE(di, &dumper_configs, di_next, sdi) { if (dumper_config_match(di, devname, kda)) { found = true; TAILQ_REMOVE(&dumper_configs, di, di_next); dumper_destroy(di); } } mtx_unlock(&dumpconf_list_lk); /* Only produce ENOENT if a more targeted match didn't match. */ if (!found && kda->kda_index == KDA_REMOVE) return (ENOENT); return (0); } static int dump_check_bounds(struct dumperinfo *di, off_t offset, size_t length) { if (di->mediasize > 0 && length != 0 && (offset < di->mediaoffset || offset - di->mediaoffset + length > di->mediasize)) { if (di->kdcomp != NULL && offset >= di->mediaoffset) { printf( "Compressed dump failed to fit in device boundaries.\n"); return (E2BIG); } printf("Attempt to write outside dump device boundaries.\n" "offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n", (intmax_t)offset, (intmax_t)di->mediaoffset, (uintmax_t)length, (intmax_t)di->mediasize); return (ENOSPC); } if (length % di->blocksize != 0) { printf("Attempt to write partial block of length %ju.\n", (uintmax_t)length); return (EINVAL); } if (offset % di->blocksize != 0) { printf("Attempt to write at unaligned offset %jd.\n", (intmax_t)offset); return (EINVAL); } return (0); } #ifdef EKCD static int dump_encrypt(struct kerneldumpcrypto *kdc, uint8_t *buf, size_t size) { switch (kdc->kdc_encryption) { case KERNELDUMP_ENC_AES_256_CBC: if (rijndael_blockEncrypt(&kdc->kdc_ci, &kdc->kdc_ki, buf, 8 * size, buf) <= 0) { return (EIO); } if (rijndael_cipherInit(&kdc->kdc_ci, MODE_CBC, buf + size - 16 /* IV size for AES-256-CBC */) <= 0) { return (EIO); } break; case KERNELDUMP_ENC_CHACHA20: chacha_encrypt_bytes(&kdc->kdc_chacha, buf, buf, size); break; default: return (EINVAL); } return (0); } /* Encrypt data and call dumper. */ static int dump_encrypted_write(struct dumperinfo *di, void *virtual, off_t offset, size_t length) { static uint8_t buf[KERNELDUMP_BUFFER_SIZE]; struct kerneldumpcrypto *kdc; int error; size_t nbytes; kdc = di->kdcrypto; while (length > 0) { nbytes = MIN(length, sizeof(buf)); bcopy(virtual, buf, nbytes); if (dump_encrypt(kdc, buf, nbytes) != 0) return (EIO); error = dump_write(di, buf, offset, nbytes); if (error != 0) return (error); offset += nbytes; virtual = (void *)((uint8_t *)virtual + nbytes); length -= nbytes; } return (0); } #endif /* EKCD */ static int kerneldumpcomp_write_cb(void *base, size_t length, off_t offset, void *arg) { struct dumperinfo *di; size_t resid, rlength; int error; di = arg; if (length % di->blocksize != 0) { /* * This must be the final write after flushing the compression * stream. Write as many full blocks as possible and stash the * residual data in the dumper's block buffer. It will be * padded and written in dump_finish(). */ rlength = rounddown(length, di->blocksize); if (rlength != 0) { error = _dump_append(di, base, rlength); if (error != 0) return (error); } resid = length - rlength; memmove(di->blockbuf, (uint8_t *)base + rlength, resid); bzero((uint8_t *)di->blockbuf + resid, di->blocksize - resid); di->kdcomp->kdc_resid = resid; return (EAGAIN); } return (_dump_append(di, base, length)); } /* * Write kernel dump headers at the beginning and end of the dump extent. * Write the kernel dump encryption key after the leading header if we were * configured to do so. */ static int dump_write_headers(struct dumperinfo *di, struct kerneldumpheader *kdh) { #ifdef EKCD struct kerneldumpcrypto *kdc; #endif void *buf; size_t hdrsz; uint64_t extent; uint32_t keysize; int error; hdrsz = sizeof(*kdh); if (hdrsz > di->blocksize) return (ENOMEM); #ifdef EKCD kdc = di->kdcrypto; keysize = kerneldumpcrypto_dumpkeysize(kdc); #else keysize = 0; #endif /* * If the dump device has special handling for headers, let it take care * of writing them out. */ if (di->dumper_hdr != NULL) return (di->dumper_hdr(di, kdh)); if (hdrsz == di->blocksize) buf = kdh; else { buf = di->blockbuf; memset(buf, 0, di->blocksize); memcpy(buf, kdh, hdrsz); } extent = dtoh64(kdh->dumpextent); #ifdef EKCD if (kdc != NULL) { error = dump_write(di, kdc->kdc_dumpkey, di->mediaoffset + di->mediasize - di->blocksize - extent - keysize, keysize); if (error != 0) return (error); } #endif error = dump_write(di, buf, di->mediaoffset + di->mediasize - 2 * di->blocksize - extent - keysize, di->blocksize); if (error == 0) error = dump_write(di, buf, di->mediaoffset + di->mediasize - di->blocksize, di->blocksize); return (error); } /* * Don't touch the first SIZEOF_METADATA bytes on the dump device. This is to * protect us from metadata and metadata from us. */ #define SIZEOF_METADATA (64 * 1024) /* * Do some preliminary setup for a kernel dump: initialize state for encryption, * if requested, and make sure that we have enough space on the dump device. * * We set things up so that the dump ends before the last sector of the dump * device, at which the trailing header is written. * * +-----------+------+-----+----------------------------+------+ * | | lhdr | key | ... kernel dump ... | thdr | * +-----------+------+-----+----------------------------+------+ * 1 blk opt <------- dump extent --------> 1 blk * * Dumps written using dump_append() start at the beginning of the extent. * Uncompressed dumps will use the entire extent, but compressed dumps typically * will not. The true length of the dump is recorded in the leading and trailing * headers once the dump has been completed. * * The dump device may provide a callback, in which case it will initialize * dumpoff and take care of laying out the headers. */ int dump_start(struct dumperinfo *di, struct kerneldumpheader *kdh) { #ifdef EKCD struct kerneldumpcrypto *kdc; #endif void *key; uint64_t dumpextent, span; uint32_t keysize; int error; #ifdef EKCD /* Send the key before the dump so a partial dump is still usable. */ kdc = di->kdcrypto; error = kerneldumpcrypto_init(kdc); if (error != 0) return (error); keysize = kerneldumpcrypto_dumpkeysize(kdc); key = keysize > 0 ? kdc->kdc_dumpkey : NULL; #else error = 0; keysize = 0; key = NULL; #endif if (di->dumper_start != NULL) { error = di->dumper_start(di, key, keysize); } else { dumpextent = dtoh64(kdh->dumpextent); span = SIZEOF_METADATA + dumpextent + 2 * di->blocksize + keysize; if (di->mediasize < span) { if (di->kdcomp == NULL) return (E2BIG); /* * We don't yet know how much space the compressed dump * will occupy, so try to use the whole swap partition * (minus the first 64KB) in the hope that the * compressed dump will fit. If that doesn't turn out to * be enough, the bounds checking in dump_write() * will catch us and cause the dump to fail. */ dumpextent = di->mediasize - span + dumpextent; kdh->dumpextent = htod64(dumpextent); } /* * The offset at which to begin writing the dump. */ di->dumpoff = di->mediaoffset + di->mediasize - di->blocksize - dumpextent; } di->origdumpoff = di->dumpoff; return (error); } static int _dump_append(struct dumperinfo *di, void *virtual, size_t length) { int error; #ifdef EKCD if (di->kdcrypto != NULL) error = dump_encrypted_write(di, virtual, di->dumpoff, length); else #endif error = dump_write(di, virtual, di->dumpoff, length); if (error == 0) di->dumpoff += length; return (error); } /* * Write to the dump device starting at dumpoff. When compression is enabled, * writes to the device will be performed using a callback that gets invoked * when the compression stream's output buffer is full. */ int dump_append(struct dumperinfo *di, void *virtual, size_t length) { void *buf; if (di->kdcomp != NULL) { /* Bounce through a buffer to avoid CRC errors. */ if (length > di->maxiosize) return (EINVAL); buf = di->kdcomp->kdc_buf; memmove(buf, virtual, length); return (compressor_write(di->kdcomp->kdc_stream, buf, length)); } return (_dump_append(di, virtual, length)); } /* * Write to the dump device at the specified offset. */ int dump_write(struct dumperinfo *di, void *virtual, off_t offset, size_t length) { int error; error = dump_check_bounds(di, offset, length); if (error != 0) return (error); return (di->dumper(di->priv, virtual, offset, length)); } /* * Perform kernel dump finalization: flush the compression stream, if necessary, * write the leading and trailing kernel dump headers now that we know the true * length of the dump, and optionally write the encryption key following the * leading header. */ int dump_finish(struct dumperinfo *di, struct kerneldumpheader *kdh) { int error; if (di->kdcomp != NULL) { error = compressor_flush(di->kdcomp->kdc_stream); if (error == EAGAIN) { /* We have residual data in di->blockbuf. */ error = _dump_append(di, di->blockbuf, di->blocksize); if (error == 0) /* Compensate for _dump_append()'s adjustment. */ di->dumpoff -= di->blocksize - di->kdcomp->kdc_resid; di->kdcomp->kdc_resid = 0; } if (error != 0) return (error); /* * We now know the size of the compressed dump, so update the * header accordingly and recompute parity. */ kdh->dumplength = htod64(di->dumpoff - di->origdumpoff); kdh->parity = 0; kdh->parity = kerneldump_parity(kdh); compressor_reset(di->kdcomp->kdc_stream); } error = dump_write_headers(di, kdh); if (error != 0) return (error); (void)dump_write(di, NULL, 0, 0); return (0); } void dump_init_header(const struct dumperinfo *di, struct kerneldumpheader *kdh, const char *magic, uint32_t archver, uint64_t dumplen) { size_t dstsize; bzero(kdh, sizeof(*kdh)); strlcpy(kdh->magic, magic, sizeof(kdh->magic)); strlcpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture)); kdh->version = htod32(KERNELDUMPVERSION); kdh->architectureversion = htod32(archver); kdh->dumplength = htod64(dumplen); kdh->dumpextent = kdh->dumplength; kdh->dumptime = htod64(time_second); #ifdef EKCD kdh->dumpkeysize = htod32(kerneldumpcrypto_dumpkeysize(di->kdcrypto)); #else kdh->dumpkeysize = 0; #endif kdh->blocksize = htod32(di->blocksize); strlcpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname)); dstsize = sizeof(kdh->versionstring); if (strlcpy(kdh->versionstring, version, dstsize) >= dstsize) kdh->versionstring[dstsize - 2] = '\n'; if (panicstr != NULL) strlcpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring)); if (di->kdcomp != NULL) kdh->compression = di->kdcomp->kdc_format; kdh->parity = kerneldump_parity(kdh); } #ifdef DDB DB_SHOW_COMMAND_FLAGS(panic, db_show_panic, DB_CMD_MEMSAFE) { if (panicstr == NULL) db_printf("panicstr not set\n"); else db_printf("panic: %s\n", panicstr); } #endif diff --git a/sys/kern/kern_sx.c b/sys/kern/kern_sx.c index 706ae90ef9af..d302fa45161e 100644 --- a/sys/kern/kern_sx.c +++ b/sys/kern/kern_sx.c @@ -1,1575 +1,1575 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2007 Attilio Rao * Copyright (c) 2001 Jason Evans * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Shared/exclusive locks. This implementation attempts to ensure * deterministic lock granting behavior, so that slocks and xlocks are * interleaved. * * Priority propagation will not generally raise the priority of lock holders, * so should not be relied upon in combination with sx locks. */ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_no_adaptive_sx.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #include #endif #ifdef DDB #include #endif #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #define ADAPTIVE_SX #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif /* Handy macros for sleep queues. */ #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 /* * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We * drop Giant anytime we have to sleep or if we adaptively spin. */ #define GIANT_DECLARE \ int _giantcnt = 0; \ WITNESS_SAVE_DECL(Giant) \ #define GIANT_SAVE(work) do { \ if (__predict_false(mtx_owned(&Giant))) { \ work++; \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _giantcnt++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define GIANT_RESTORE() do { \ if (_giantcnt > 0) { \ mtx_assert(&Giant, MA_NOTOWNED); \ while (_giantcnt--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) /* * Returns true if an exclusive lock is recursed. It assumes * curthread currently has an exclusive lock. */ #define sx_recursed(sx) ((sx)->sx_recurse != 0) static void assert_sx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_sx(const struct lock_object *lock); #endif static void lock_sx(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_sx(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_sx(struct lock_object *lock); struct lock_class lock_class_sx = { .lc_name = "sx", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_sx, #ifdef DDB .lc_ddb_show = db_show_sx, #endif .lc_lock = lock_sx, .lc_unlock = unlock_sx, #ifdef KDTRACE_HOOKS .lc_owner = owner_sx, #endif }; #ifndef INVARIANTS #define _sx_assert(sx, what, file, line) #endif #ifdef ADAPTIVE_SX #ifdef SX_CUSTOM_BACKOFF static u_short __read_frequently asx_retries; static u_short __read_frequently asx_loops; static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "sxlock debugging"); SYSCTL_U16(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, ""); SYSCTL_U16(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, ""); static struct lock_delay_config __read_frequently sx_delay; SYSCTL_U16(_debug_sx, OID_AUTO, delay_base, CTLFLAG_RW, &sx_delay.base, 0, ""); SYSCTL_U16(_debug_sx, OID_AUTO, delay_max, CTLFLAG_RW, &sx_delay.max, 0, ""); static void sx_lock_delay_init(void *arg __unused) { lock_delay_default_init(&sx_delay); asx_retries = 10; asx_loops = max(10000, sx_delay.max); } LOCK_DELAY_SYSINIT(sx_lock_delay_init); #else #define sx_delay locks_delay #define asx_retries locks_delay_retries #define asx_loops locks_delay_loops #endif #endif void assert_sx(const struct lock_object *lock, int what) { sx_assert((const struct sx *)lock, what); } void lock_sx(struct lock_object *lock, uintptr_t how) { struct sx *sx; sx = (struct sx *)lock; if (how) sx_slock(sx); else sx_xlock(sx); } uintptr_t unlock_sx(struct lock_object *lock) { struct sx *sx; sx = (struct sx *)lock; sx_assert(sx, SA_LOCKED | SA_NOTRECURSED); if (sx_xlocked(sx)) { sx_xunlock(sx); return (0); } else { sx_sunlock(sx); return (1); } } #ifdef KDTRACE_HOOKS int owner_sx(const struct lock_object *lock, struct thread **owner) { const struct sx *sx; uintptr_t x; sx = (const struct sx *)lock; x = sx->sx_lock; *owner = NULL; return ((x & SX_LOCK_SHARED) != 0 ? (SX_SHARERS(x) != 0) : ((*owner = (struct thread *)SX_OWNER(x)) != NULL)); } #endif void sx_sysinit(void *arg) { struct sx_args *sargs = arg; sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags); } void sx_init_flags(struct sx *sx, const char *description, int opts) { int flags; MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK | SX_NOPROFILE | SX_NEW)) == 0); ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock, ("%s: sx_lock not aligned for %s: %p", __func__, description, &sx->sx_lock)); flags = LO_SLEEPABLE | LO_UPGRADABLE; if (opts & SX_DUPOK) flags |= LO_DUPOK; if (opts & SX_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & SX_NOWITNESS)) flags |= LO_WITNESS; if (opts & SX_RECURSE) flags |= LO_RECURSABLE; if (opts & SX_QUIET) flags |= LO_QUIET; if (opts & SX_NEW) flags |= LO_NEW; lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags); sx->sx_lock = SX_LOCK_UNLOCKED; sx->sx_recurse = 0; } void sx_destroy(struct sx *sx) { KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held")); KASSERT(sx->sx_recurse == 0, ("sx lock still recursed")); sx->sx_lock = SX_LOCK_DESTROYED; lock_destroy(&sx->lock_object); } int sx_try_slock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("sx_try_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); x = sx->sx_lock; for (;;) { KASSERT(x != SX_LOCK_DESTROYED, ("sx_try_slock() of destroyed sx @ %s:%d", file, line)); if (!(x & SX_LOCK_SHARED)) break; if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, x + SX_ONE_SHARER)) { LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line); WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line); LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_READER); TD_LOCKS_INC(curthread); curthread->td_sx_slocks++; return (1); } } LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line); return (0); } int sx_try_slock_(struct sx *sx, const char *file, int line) { return (sx_try_slock_int(sx LOCK_FILE_LINE_ARG)); } int _sx_xlock(struct sx *sx, int opts, const char *file, int line) { uintptr_t tid, x; int error = 0; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("sx_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xlock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; x = SX_LOCK_UNLOCKED; if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) error = _sx_xlock_hard(sx, x, opts LOCK_FILE_LINE_ARG); else LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); if (!error) { LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_INC(curthread); } return (error); } int sx_try_xlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t tid, x; int rval; bool recursed; td = curthread; tid = (uintptr_t)td; - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return (1); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td), ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_xlock() of destroyed sx @ %s:%d", file, line)); rval = 1; recursed = false; x = SX_LOCK_UNLOCKED; for (;;) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) break; if (x == SX_LOCK_UNLOCKED) continue; if (x == tid && (sx->lock_object.lo_flags & LO_RECURSABLE)) { sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); break; } rval = 0; break; } LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); if (!recursed) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, 0, 0, file, line, LOCKSTAT_WRITER); TD_LOCKS_INC(curthread); } return (rval); } int sx_try_xlock_(struct sx *sx, const char *file, int line) { return (sx_try_xlock_int(sx LOCK_FILE_LINE_ARG)); } void _sx_xunlock(struct sx *sx, const char *file, int line) { KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); #if LOCK_DEBUG > 0 _sx_xunlock_hard(sx, (uintptr_t)curthread, file, line); #else __sx_xunlock(sx, curthread, file, line); #endif TD_LOCKS_DEC(curthread); } /* * Try to do a non-blocking upgrade from a shared lock to an exclusive lock. * This will only succeed if this thread holds a single shared lock. * Return 1 if if the upgrade succeed, 0 otherwise. */ int sx_try_upgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; uintptr_t waiters; int success; if (SCHEDULER_STOPPED()) return (1); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); /* * Try to switch from one shared lock to an exclusive lock. We need * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that * we will wake up the exclusive waiters when we drop the lock. */ success = 0; x = SX_READ_VALUE(sx); for (;;) { if (SX_SHARERS(x) > 1) break; waiters = (x & SX_LOCK_WAITERS); if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, (uintptr_t)curthread | waiters)) { success = 1; break; } } LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line); if (success) { curthread->td_sx_slocks--; WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(sx__upgrade, sx); } return (success); } int sx_try_upgrade_(struct sx *sx, const char *file, int line) { return (sx_try_upgrade_int(sx LOCK_FILE_LINE_ARG)); } /* * Downgrade an unrecursed exclusive lock into a single shared lock. */ void sx_downgrade_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { uintptr_t x; int wakeup_swapper; if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_downgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line); #ifndef INVARIANTS if (sx_recursed(sx)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line); /* * Try to switch from an exclusive lock with no shared waiters * to one sharer with no shared waiters. If there are * exclusive waiters, we don't need to lock the sleep queue so * long as we preserve the flag. We do one quick try and if * that fails we grab the sleepq lock to keep the flags from * changing and do it the slow way. * * We have to lock the sleep queue if there are shared waiters * so we can wake them up. */ x = sx->sx_lock; if (!(x & SX_LOCK_SHARED_WAITERS) && atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS))) goto out; /* * Lock the sleep queue so we can read the waiters bits * without any races and wakeup any shared waiters. */ sleepq_lock(&sx->lock_object); /* * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single * shared lock. If there are any shared waiters, wake them up. */ wakeup_swapper = 0; x = sx->sx_lock; atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS)); if (x & SX_LOCK_SHARED_WAITERS) wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_SHARED_QUEUE); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); out: curthread->td_sx_slocks++; LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(sx__downgrade, sx); } void sx_downgrade_(struct sx *sx, const char *file, int line) { sx_downgrade_int(sx LOCK_FILE_LINE_ARG); } #ifdef ADAPTIVE_SX static inline void sx_drop_critical(uintptr_t x, bool *in_critical, int *extra_work) { if (x & SX_LOCK_WRITE_SPINNER) return; if (*in_critical) { critical_exit(); *in_critical = false; (*extra_work)--; } } #else #define sx_drop_critical(x, in_critical, extra_work) do { } while (0) #endif /* * This function represents the so-called 'hard case' for sx_xlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ int _sx_xlock_hard(struct sx *sx, uintptr_t x, int opts LOCK_FILE_LINE_ARG_DEF) { GIANT_DECLARE; uintptr_t tid, setx; #ifdef ADAPTIVE_SX struct thread *owner; u_int i, n, spintries = 0; enum { READERS, WRITER } sleep_reason = READERS; bool in_critical = false; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state = 0; int doing_lockprof = 0; #endif int extra_work = 0; tid = (uintptr_t)curthread; #ifdef KDTRACE_HOOKS if (LOCKSTAT_PROFILE_ENABLED(sx__acquire)) { while (x == SX_LOCK_UNLOCKED) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) goto out_lockstat; } extra_work = 1; doing_lockprof = 1; all_time -= lockstat_nsecs(&sx->lock_object); state = x; } #endif #ifdef LOCK_PROFILING extra_work = 1; doing_lockprof = 1; state = x; #endif if (SCHEDULER_STOPPED()) return (0); if (__predict_false(x == SX_LOCK_UNLOCKED)) x = SX_READ_VALUE(sx); /* If we already hold an exclusive lock, then recurse. */ if (__predict_false(lv_sx_owner(x) == (struct thread *)tid)) { KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0, ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n", sx->lock_object.lo_name, file, line)); sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx); return (0); } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, sx->lock_object.lo_name, (void *)sx->sx_lock, file, line); #if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init_noadapt(&lda); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, false, &contested, &waittime); #ifndef INVARIANTS GIANT_SAVE(extra_work); #endif THREAD_CONTENDS_ON_LOCK(&sx->lock_object); for (;;) { if (x == SX_LOCK_UNLOCKED) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) break; continue; } #ifdef INVARIANTS GIANT_SAVE(extra_work); #endif #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_SX if (x == (SX_LOCK_SHARED | SX_LOCK_WRITE_SPINNER)) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid)) break; continue; } /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. */ if ((x & SX_LOCK_SHARED) == 0) { sx_drop_critical(x, &in_critical, &extra_work); sleep_reason = WRITER; owner = lv_sx_owner(x); if (!TD_IS_RUNNING(owner)) goto sleepq; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); do { lock_delay(&lda); x = SX_READ_VALUE(sx); owner = lv_sx_owner(x); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } else if (SX_SHARERS(x) > 0) { sleep_reason = READERS; if (spintries == asx_retries) goto sleepq; if (!(x & SX_LOCK_WRITE_SPINNER)) { if (!in_critical) { critical_enter(); in_critical = true; extra_work++; } if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, x | SX_LOCK_WRITE_SPINNER)) { critical_exit(); in_critical = false; extra_work--; continue; } } spintries++; KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); n = SX_SHARERS(x); for (i = 0; i < asx_loops; i += n) { lock_delay_spin(n); x = SX_READ_VALUE(sx); if (!(x & SX_LOCK_WRITE_SPINNER)) break; if (!(x & SX_LOCK_SHARED)) break; n = SX_SHARERS(x); if (n == 0) break; } #ifdef KDTRACE_HOOKS lda.spin_cnt += i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i < asx_loops) continue; } sleepq: #endif sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); retry_sleepq: /* * If the lock was released while spinning on the * sleep queue chain lock, try again. */ if (x == SX_LOCK_UNLOCKED) { sleepq_release(&sx->lock_object); sx_drop_critical(x, &in_critical, &extra_work); continue; } #ifdef ADAPTIVE_SX /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the sleep queue * chain lock. If so, drop the sleep queue lock and try * again. */ if (!(x & SX_LOCK_SHARED)) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); sx_drop_critical(x, &in_critical, &extra_work); continue; } } else if (SX_SHARERS(x) > 0 && sleep_reason == WRITER) { sleepq_release(&sx->lock_object); sx_drop_critical(x, &in_critical, &extra_work); continue; } #endif /* * If an exclusive lock was released with both shared * and exclusive waiters and a shared waiter hasn't * woken up and acquired the lock yet, sx_lock will be * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS. * If we see that value, try to acquire it once. Note * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS * as there are other exclusive waiters still. If we * fail, restart the loop. */ setx = x & (SX_LOCK_WAITERS | SX_LOCK_WRITE_SPINNER); if ((x & ~setx) == SX_LOCK_SHARED) { setx &= ~SX_LOCK_WRITE_SPINNER; if (!atomic_fcmpset_acq_ptr(&sx->sx_lock, &x, tid | setx)) goto retry_sleepq; sleepq_release(&sx->lock_object); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, sx); break; } #ifdef ADAPTIVE_SX /* * It is possible we set the SX_LOCK_WRITE_SPINNER bit. * It is an invariant that when the bit is set, there is * a writer ready to grab the lock. Thus clear the bit since * we are going to sleep. */ if (in_critical) { if ((x & SX_LOCK_WRITE_SPINNER) || !((x & SX_LOCK_EXCLUSIVE_WAITERS))) { setx = x & ~SX_LOCK_WRITE_SPINNER; setx |= SX_LOCK_EXCLUSIVE_WAITERS; if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, setx)) { goto retry_sleepq; } } critical_exit(); in_critical = false; } else { #endif /* * Try to set the SX_LOCK_EXCLUSIVE_WAITERS. If we fail, * than loop back and retry. */ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, x | SX_LOCK_EXCLUSIVE_WAITERS)) { goto retry_sleepq; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set excl waiters flag", __func__, sx); } #ifdef ADAPTIVE_SX } #endif /* * Since we have been unable to acquire the exclusive * lock and the exclusive waiters flag is set, we have * to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE); /* * Hack: this can land in thread_suspend_check which will * conditionally take a mutex, tripping over an assert if a * lock we are waiting for is set. */ THREAD_CONTENTION_DONE(&sx->lock_object); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); THREAD_CONTENDS_ON_LOCK(&sx->lock_object); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); x = SX_READ_VALUE(sx); } THREAD_CONTENTION_DONE(&sx->lock_object); if (__predict_true(!extra_work)) return (error); #ifdef ADAPTIVE_SX if (in_critical) critical_exit(); #endif GIANT_RESTORE(); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!doing_lockprof)) return (error); #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_WRITER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); out_lockstat: #endif if (!error) LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_WRITER); return (error); } /* * This function represents the so-called 'hard case' for sx_xunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_xunlock_hard(struct sx *sx, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { uintptr_t tid, setx; int queue, wakeup_swapper; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; if (__predict_false(x == tid)) x = SX_READ_VALUE(sx); MPASS(!(x & SX_LOCK_SHARED)); if (__predict_false(x & SX_LOCK_RECURSED)) { /* The lock is recursed, unrecurse one level. */ if ((--sx->sx_recurse) == 0) atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx); return; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_WRITER); if (x == tid && atomic_cmpset_rel_ptr(&sx->sx_lock, tid, SX_LOCK_UNLOCKED)) return; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, sx); sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); MPASS(x & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)); /* * The wake up algorithm here is quite simple and probably not * ideal. It gives precedence to shared waiters if they are * present. For this condition, we have to preserve the * state of the exclusive waiters flag. * If interruptible sleeps left the shared queue empty avoid a * starvation for the threads sleeping on the exclusive queue by giving * them precedence and cleaning up the shared waiters bit anyway. */ setx = SX_LOCK_UNLOCKED; queue = SQ_SHARED_QUEUE; if ((x & SX_LOCK_EXCLUSIVE_WAITERS) != 0 && sleepq_sleepcnt(&sx->lock_object, SQ_EXCLUSIVE_QUEUE) != 0) { queue = SQ_EXCLUSIVE_QUEUE; setx |= (x & SX_LOCK_SHARED_WAITERS); } atomic_store_rel_ptr(&sx->sx_lock, setx); /* Wake up all the waiters for the specific queue. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue", __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, queue); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); } static bool __always_inline __sx_can_read(struct thread *td, uintptr_t x, bool fp) { if ((x & (SX_LOCK_SHARED | SX_LOCK_EXCLUSIVE_WAITERS | SX_LOCK_WRITE_SPINNER)) == SX_LOCK_SHARED) return (true); if (!fp && td->td_sx_slocks && (x & SX_LOCK_SHARED)) return (true); return (false); } static bool __always_inline __sx_slock_try(struct sx *sx, struct thread *td, uintptr_t *xp, bool fp LOCK_FILE_LINE_ARG_DEF) { /* * If no other thread has an exclusive lock then try to bump up * the count of sharers. Since we have to preserve the state * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the * shared lock loop back and retry. */ while (__sx_can_read(td, *xp, fp)) { if (atomic_fcmpset_acq_ptr(&sx->sx_lock, xp, *xp + SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, sx, (void *)*xp, (void *)(*xp + SX_ONE_SHARER)); td->td_sx_slocks++; return (true); } } return (false); } static int __noinline _sx_slock_hard(struct sx *sx, int opts, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { GIANT_DECLARE; struct thread *td; #ifdef ADAPTIVE_SX struct thread *owner; u_int i, n, spintries = 0; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #if defined(ADAPTIVE_SX) || defined(KDTRACE_HOOKS) struct lock_delay_arg lda; #endif #ifdef KDTRACE_HOOKS u_int sleep_cnt = 0; int64_t sleep_time = 0; int64_t all_time = 0; #endif #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) uintptr_t state = 0; #endif int extra_work __sdt_used = 0; td = curthread; #ifdef KDTRACE_HOOKS if (LOCKSTAT_PROFILE_ENABLED(sx__acquire)) { if (__sx_slock_try(sx, td, &x, false LOCK_FILE_LINE_ARG)) goto out_lockstat; extra_work = 1; all_time -= lockstat_nsecs(&sx->lock_object); state = x; } #endif #ifdef LOCK_PROFILING extra_work = 1; state = x; #endif if (SCHEDULER_STOPPED()) return (0); #if defined(ADAPTIVE_SX) lock_delay_arg_init(&lda, &sx_delay); #elif defined(KDTRACE_HOOKS) lock_delay_arg_init_noadapt(&lda); #endif #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&sx->lock_object, false, &contested, &waittime); #ifndef INVARIANTS GIANT_SAVE(extra_work); #endif THREAD_CONTENDS_ON_LOCK(&sx->lock_object); /* * As with rwlocks, we don't make any attempt to try to block * shared locks once there is an exclusive waiter. */ for (;;) { if (__sx_slock_try(sx, td, &x, false LOCK_FILE_LINE_ARG)) break; #ifdef INVARIANTS GIANT_SAVE(extra_work); #endif #ifdef KDTRACE_HOOKS lda.spin_cnt++; #endif #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if ((x & SX_LOCK_SHARED) == 0) { owner = lv_sx_owner(x); if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); do { lock_delay(&lda); x = SX_READ_VALUE(sx); owner = lv_sx_owner(x); } while (owner != NULL && TD_IS_RUNNING(owner)); KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); continue; } } else { if ((x & SX_LOCK_WRITE_SPINNER) && SX_SHARERS(x) == 0) { MPASS(!__sx_can_read(td, x, false)); lock_delay_spin(2); x = SX_READ_VALUE(sx); continue; } if (spintries < asx_retries) { KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread), "spinning", "lockname:\"%s\"", sx->lock_object.lo_name); n = SX_SHARERS(x); for (i = 0; i < asx_loops; i += n) { lock_delay_spin(n); x = SX_READ_VALUE(sx); if (!(x & SX_LOCK_SHARED)) break; n = SX_SHARERS(x); if (n == 0) break; if (__sx_can_read(td, x, false)) break; } #ifdef KDTRACE_HOOKS lda.spin_cnt += i; #endif KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread), "running"); if (i < asx_loops) continue; } } #endif /* * Some other thread already has an exclusive lock, so * start the process of blocking. */ sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); retry_sleepq: if (((x & SX_LOCK_WRITE_SPINNER) && SX_SHARERS(x) == 0) || __sx_can_read(td, x, false)) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if (!(x & SX_LOCK_SHARED)) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); x = SX_READ_VALUE(sx); continue; } } #endif /* * Try to set the SX_LOCK_SHARED_WAITERS flag. If we * fail to set it drop the sleep queue lock and loop * back. */ if (!(x & SX_LOCK_SHARED_WAITERS)) { if (!atomic_fcmpset_ptr(&sx->sx_lock, &x, x | SX_LOCK_SHARED_WAITERS)) goto retry_sleepq; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set shared waiters flag", __func__, sx); } /* * Since we have been unable to acquire the shared lock, * we have to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&sx->lock_object); #endif sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE); /* * Hack: this can land in thread_suspend_check which will * conditionally take a mutex, tripping over an assert if a * lock we are waiting for is set. */ THREAD_CONTENTION_DONE(&sx->lock_object); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); THREAD_CONTENDS_ON_LOCK(&sx->lock_object); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&sx->lock_object); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); x = SX_READ_VALUE(sx); } THREAD_CONTENTION_DONE(&sx->lock_object); #if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING) if (__predict_true(!extra_work)) return (error); #endif #ifdef KDTRACE_HOOKS all_time += lockstat_nsecs(&sx->lock_object); if (sleep_time) LOCKSTAT_RECORD4(sx__block, sx, sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); if (lda.spin_cnt > sleep_cnt) LOCKSTAT_RECORD4(sx__spin, sx, all_time - sleep_time, LOCKSTAT_READER, (state & SX_LOCK_SHARED) == 0, (state & SX_LOCK_SHARED) == 0 ? 0 : SX_SHARERS(state)); out_lockstat: #endif if (error == 0) { LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(sx__acquire, sx, contested, waittime, file, line, LOCKSTAT_READER); } GIANT_RESTORE(); return (error); } int _sx_slock_int(struct sx *sx, int opts LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t x; int error; KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() || !TD_IS_IDLETHREAD(curthread), ("sx_slock() by idle thread %p on sx %s @ %s:%d", curthread, sx->lock_object.lo_name, file, line)); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_slock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL); error = 0; td = curthread; x = SX_READ_VALUE(sx); if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__acquire) || !__sx_slock_try(sx, td, &x, true LOCK_FILE_LINE_ARG))) error = _sx_slock_hard(sx, opts, x LOCK_FILE_LINE_ARG); else lock_profile_obtain_lock_success(&sx->lock_object, false, 0, 0, file, line); if (error == 0) { LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line); WITNESS_LOCK(&sx->lock_object, 0, file, line); TD_LOCKS_INC(curthread); } return (error); } int _sx_slock(struct sx *sx, int opts, const char *file, int line) { return (_sx_slock_int(sx, opts LOCK_FILE_LINE_ARG)); } static bool __always_inline _sx_sunlock_try(struct sx *sx, struct thread *td, uintptr_t *xp) { for (;;) { if (SX_SHARERS(*xp) > 1 || !(*xp & SX_LOCK_WAITERS)) { if (atomic_fcmpset_rel_ptr(&sx->sx_lock, xp, *xp - SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, sx, (void *)*xp, (void *)(*xp - SX_ONE_SHARER)); td->td_sx_slocks--; return (true); } continue; } break; } return (false); } static void __noinline _sx_sunlock_hard(struct sx *sx, struct thread *td, uintptr_t x LOCK_FILE_LINE_ARG_DEF) { int wakeup_swapper = 0; uintptr_t setx, queue; if (SCHEDULER_STOPPED()) return; if (_sx_sunlock_try(sx, td, &x)) goto out_lockstat; sleepq_lock(&sx->lock_object); x = SX_READ_VALUE(sx); for (;;) { if (_sx_sunlock_try(sx, td, &x)) break; /* * Wake up semantic here is quite simple: * Just wake up all the exclusive waiters. * Note that the state of the lock could have changed, * so if it fails loop back and retry. */ setx = SX_LOCK_UNLOCKED; queue = SQ_SHARED_QUEUE; if (x & SX_LOCK_EXCLUSIVE_WAITERS) { setx |= (x & SX_LOCK_SHARED_WAITERS); queue = SQ_EXCLUSIVE_QUEUE; } setx |= (x & SX_LOCK_WRITE_SPINNER); if (!atomic_fcmpset_rel_ptr(&sx->sx_lock, &x, setx)) continue; if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p waking up all thread on" "exclusive queue", __func__, sx); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, queue); td->td_sx_slocks--; break; } sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); out_lockstat: LOCKSTAT_PROFILE_RELEASE_RWLOCK(sx__release, sx, LOCKSTAT_READER); } void _sx_sunlock_int(struct sx *sx LOCK_FILE_LINE_ARG_DEF) { struct thread *td; uintptr_t x; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_sunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); WITNESS_UNLOCK(&sx->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line); td = curthread; x = SX_READ_VALUE(sx); if (__predict_false(LOCKSTAT_PROFILE_ENABLED(sx__release) || !_sx_sunlock_try(sx, td, &x))) _sx_sunlock_hard(sx, td, x LOCK_FILE_LINE_ARG); else lock_profile_release_lock(&sx->lock_object, false); TD_LOCKS_DEC(curthread); } void _sx_sunlock(struct sx *sx, const char *file, int line) { _sx_sunlock_int(sx LOCK_FILE_LINE_ARG); } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef _sx_assert #endif /* * In the non-WITNESS case, sx_assert() can only detect that at least * *some* thread owns an slock, but it cannot guarantee that *this* * thread owns an slock. */ void _sx_assert(const struct sx *sx, int what, const char *file, int line) { #ifndef WITNESS int slocked = 0; #endif if (SCHEDULER_STOPPED()) return; switch (what) { case SA_SLOCKED: case SA_SLOCKED | SA_NOTRECURSED: case SA_SLOCKED | SA_RECURSED: #ifndef WITNESS slocked = 1; /* FALLTHROUGH */ #endif case SA_LOCKED: case SA_LOCKED | SA_NOTRECURSED: case SA_LOCKED | SA_RECURSED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If some other thread has an exclusive lock or we * have one and are asserting a shared lock, fail. * Also, if no one has a lock at all, fail. */ if (sx->sx_lock == SX_LOCK_UNLOCKED || (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked || sx_xholder(sx) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", sx->lock_object.lo_name, slocked ? "share " : "", file, line); if (!(sx->sx_lock & SX_LOCK_SHARED)) { if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } #endif break; case SA_XLOCKED: case SA_XLOCKED | SA_NOTRECURSED: case SA_XLOCKED | SA_RECURSED: if (sx_xholder(sx) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); break; case SA_UNLOCKED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If we hold an exclusve lock fail. We can't * reliably check to see if we hold a shared lock or * not. */ if (sx_xholder(sx) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); #endif break; default: panic("Unknown sx lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB static void db_show_sx(const struct lock_object *lock) { struct thread *td; const struct sx *sx; sx = (const struct sx *)lock; db_printf(" state: "); if (sx->sx_lock == SX_LOCK_UNLOCKED) db_printf("UNLOCKED\n"); else if (sx->sx_lock == SX_LOCK_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else { td = sx_xholder(sx); db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (sx_recursed(sx)) db_printf(" recursed: %d\n", sx->sx_recurse); } db_printf(" waiters: "); switch(sx->sx_lock & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) { case SX_LOCK_SHARED_WAITERS: db_printf("shared\n"); break; case SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive and shared\n"); break; default: db_printf("none\n"); } } /* * Check to see if a thread that is blocked on a sleep queue is actually * blocked on an sx lock. If so, output some details and return true. * If the lock has an exclusive owner, return that in *ownerp. */ int sx_chain(struct thread *td, struct thread **ownerp) { const struct sx *sx; /* * Check to see if this thread is blocked on an sx lock. * First, we check the lock class. If that is ok, then we * compare the lock name against the wait message. */ sx = td->td_wchan; if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx || sx->lock_object.lo_name != td->td_wmesg) return (0); /* We think we have an sx lock, so output some details. */ db_printf("blocked on sx \"%s\" ", td->td_wmesg); *ownerp = sx_xholder(sx); if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK (count %ju)\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else db_printf("XLOCK\n"); return (1); } #endif diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 8cb847fe2a2d..f12054a04b23 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -1,713 +1,713 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ktrace.h" #include "opt_sched.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #ifdef EPOCH_TRACE #include #endif #include static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL); int hogticks; static const char pause_wchan[MAXCPU]; static struct callout loadav_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static uint64_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FSCALE, "Fixed-point scale factor used for calculating load average values"); static void loadav(void *arg); SDT_PROVIDER_DECLARE(sched); SDT_PROBE_DEFINE(sched, , , preempt); static void sleepinit(void *unused) { hogticks = (hz / 10) * 2; /* Default only. */ init_sleepqueues(); } /* * vmem tries to lock the sleepq mutexes when free'ing kva, so make sure * it is available. */ SYSINIT(sleepinit, SI_SUB_KMEM, SI_ORDER_ANY, sleepinit, NULL); /* * General sleep call. Suspends the current thread until a wakeup is * performed on the specified identifier. The thread will then be made * runnable with the specified priority. Sleeps at most sbt units of time * (0 means no timeout). If pri includes the PCATCH flag, let signals * interrupt the sleep, otherwise ignore them while sleeping. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal becomes pending, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The lock argument is unlocked before the caller is suspended, and * re-locked before _sleep() returns. If priority includes the PDROP * flag the lock is not re-locked before returning. */ int _sleep(const void *ident, struct lock_object *lock, int priority, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; struct lock_class *class; uintptr_t lock_state; int catch, pri, rval, sleepq_flags; WITNESS_SAVE_DECL(lock_witness); TSENTER(); td = curthread; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0, wmesg); #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Sleeping on \"%s\"", wmesg); KASSERT(sbt != 0 || mtx_owned(&Giant) || lock != NULL || (priority & PNOLOCK) != 0, ("sleeping without a lock")); KASSERT(ident != NULL, ("_sleep: NULL ident")); KASSERT(TD_IS_RUNNING(td), ("_sleep: curthread not running")); if (priority & PDROP) KASSERT(lock != NULL && lock != &Giant.lock_object, ("PDROP requires a non-Giant lock")); if (lock != NULL) class = LOCK_CLASS(lock); else class = NULL; - if (SCHEDULER_STOPPED_TD(td)) { + if (SCHEDULER_STOPPED()) { if (lock != NULL && priority & PDROP) class->lc_unlock(lock); return (0); } catch = priority & PCATCH; pri = priority & PRIMASK; KASSERT(!TD_ON_SLEEPQ(td), ("recursive sleep")); if ((uintptr_t)ident >= (uintptr_t)&pause_wchan[0] && (uintptr_t)ident <= (uintptr_t)&pause_wchan[MAXCPU - 1]) sleepq_flags = SLEEPQ_PAUSE; else sleepq_flags = SLEEPQ_SLEEP; if (catch) sleepq_flags |= SLEEPQ_INTERRUPTIBLE; sleepq_lock(ident); CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident); if (lock == &Giant.lock_object) mtx_assert(&Giant, MA_OWNED); DROP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(class->lc_flags & LC_SLEEPABLE)) { KASSERT(!(class->lc_flags & LC_SPINLOCK), ("spin locks can only use msleep_spin")); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); } else /* GCC needs to follow the Yellow Brick Road */ lock_state = -1; /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, * and a wakeup or a SIGCONT (or both) could occur while we were * stopped without resuming us. Thus, we must be ready for sleep * when cursig() is called. If the wakeup happens while we're * stopped, then td will no longer be on a sleep queue upon * return from cursig(). */ sleepq_add(ident, lock, wmesg, sleepq_flags, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); sleepq_lock(ident); } if (sbt != 0 && catch) rval = sleepq_timedwait_sig(ident, pri); else if (sbt != 0) rval = sleepq_timedwait(ident, pri); else if (catch) rval = sleepq_wait_sig(ident, pri); else { sleepq_wait(ident, pri); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } TSEXIT(); return (rval); } int msleep_spin_sbt(const void *ident, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { struct thread *td; int rval; WITNESS_SAVE_DECL(mtx); td = curthread; KASSERT(mtx != NULL, ("sleeping without a mutex")); KASSERT(ident != NULL, ("msleep_spin_sbt: NULL ident")); KASSERT(TD_IS_RUNNING(td), ("msleep_spin_sbt: curthread not running")); - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return (0); sleepq_lock(ident); CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, td->td_proc->p_pid, td->td_name, wmesg, ident); DROP_GIANT(); mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->lock_object, mtx); mtx_unlock_spin(mtx); /* * We put ourselves on the sleep queue and start our timeout. */ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0); if (sbt != 0) sleepq_set_timeout_sbt(ident, sbt, pr, flags); /* * Can't call ktrace with any spin locks held so it can lock the * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold * any spin lock. Thus, we have to drop the sleepq spin lock while * we handle those requests. This is safe since we have placed our * thread on the sleep queue already. */ #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { sleepq_release(ident); ktrcsw(1, 0, wmesg); sleepq_lock(ident); } #endif #ifdef WITNESS sleepq_release(ident); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"", wmesg); sleepq_lock(ident); #endif if (sbt != 0) rval = sleepq_timedwait(ident, 0); else { sleepq_wait(ident, 0); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0, wmesg); #endif PICKUP_GIANT(); mtx_lock_spin(mtx); WITNESS_RESTORE(&mtx->lock_object, mtx); return (rval); } /* * pause_sbt() delays the calling thread by the given signed binary * time. During cold bootup, pause_sbt() uses the DELAY() function * instead of the _sleep() function to do the waiting. The "sbt" * argument must be greater than or equal to zero. A "sbt" value of * zero is equivalent to a "sbt" value of one tick. */ int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) { KASSERT(sbt >= 0, ("pause_sbt: timeout must be >= 0")); /* silently convert invalid timeouts */ if (sbt == 0) sbt = tick_sbt; if ((cold && curthread == &thread0) || kdb_active || SCHEDULER_STOPPED()) { /* * We delay one second at a time to avoid overflowing the * system specific DELAY() function(s): */ while (sbt >= SBT_1S) { DELAY(1000000); sbt -= SBT_1S; } /* Do the delay remainder, if any */ sbt = howmany(sbt, SBT_1US); if (sbt > 0) DELAY(sbt); return (EWOULDBLOCK); } return (_sleep(&pause_wchan[curcpu], NULL, (flags & C_CATCH) ? PCATCH : 0, wmesg, sbt, pr, flags)); } /* * Make all threads sleeping on the specified identifier runnable. */ void wakeup(const void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0); sleepq_release(ident); if (wakeup_swapper) { KASSERT(ident != &proc0, ("wakeup and wakeup_swapper and proc0")); kick_proc0(); } } /* * Make a thread sleeping on the specified identifier runnable. * May wake more than one thread if a target thread is currently * swapped out. */ void wakeup_one(const void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_DROP, 0, 0); if (wakeup_swapper) kick_proc0(); } void wakeup_any(const void *ident) { int wakeup_swapper; sleepq_lock(ident); wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP | SLEEPQ_UNFAIR | SLEEPQ_DROP, 0, 0); if (wakeup_swapper) kick_proc0(); } /* * Signal sleeping waiters after the counter has reached zero. */ void _blockcount_wakeup(blockcount_t *bc, u_int old) { KASSERT(_BLOCKCOUNT_WAITERS(old), ("%s: no waiters on %p", __func__, bc)); if (atomic_cmpset_int(&bc->__count, _BLOCKCOUNT_WAITERS_FLAG, 0)) wakeup(bc); } /* * Wait for a wakeup or a signal. This does not guarantee that the count is * still zero on return. Callers wanting a precise answer should use * blockcount_wait() with an interlock. * * If there is no work to wait for, return 0. If the sleep was interrupted by a * signal, return EINTR or ERESTART, and return EAGAIN otherwise. */ int _blockcount_sleep(blockcount_t *bc, struct lock_object *lock, const char *wmesg, int prio) { void *wchan; uintptr_t lock_state; u_int old; int ret; bool catch, drop; KASSERT(lock != &Giant.lock_object, ("%s: cannot use Giant as the interlock", __func__)); catch = (prio & PCATCH) != 0; drop = (prio & PDROP) != 0; prio &= PRIMASK; /* * Synchronize with the fence in blockcount_release(). If we end up * waiting, the sleepqueue lock acquisition will provide the required * side effects. * * If there is no work to wait for, but waiters are present, try to put * ourselves to sleep to avoid jumping ahead. */ if (atomic_load_acq_int(&bc->__count) == 0) { if (lock != NULL && drop) LOCK_CLASS(lock)->lc_unlock(lock); return (0); } lock_state = 0; wchan = bc; sleepq_lock(wchan); DROP_GIANT(); if (lock != NULL) lock_state = LOCK_CLASS(lock)->lc_unlock(lock); old = blockcount_read(bc); ret = 0; do { if (_BLOCKCOUNT_COUNT(old) == 0) { sleepq_release(wchan); goto out; } if (_BLOCKCOUNT_WAITERS(old)) break; } while (!atomic_fcmpset_int(&bc->__count, &old, old | _BLOCKCOUNT_WAITERS_FLAG)); sleepq_add(wchan, NULL, wmesg, catch ? SLEEPQ_INTERRUPTIBLE : 0, 0); if (catch) ret = sleepq_wait_sig(wchan, prio); else sleepq_wait(wchan, prio); if (ret == 0) ret = EAGAIN; out: PICKUP_GIANT(); if (lock != NULL && !drop) LOCK_CLASS(lock)->lc_lock(lock, lock_state); return (ret); } static void kdb_switch(void) { thread_unlock(curthread); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } /* * mi_switch(9): The machine-independent parts of context switching. * * The thread lock is required on entry and is no longer held on return. */ void mi_switch(int flags) { uint64_t runtime, new_switchtime; struct thread *td; td = curthread; /* XXX */ THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif /* thread_lock() performs spinlock_enter(). */ KASSERT(td->td_critnest == 1 || KERNEL_PANICKED(), ("mi_switch: switch in a critical section")); KASSERT((flags & (SW_INVOL | SW_VOL)) != 0, ("mi_switch: switch must be voluntary or involuntary")); KASSERT((flags & SW_TYPE_MASK) != 0, ("mi_switch: a switch reason (type) must be specified")); KASSERT((flags & SW_TYPE_MASK) < SWT_COUNT, ("mi_switch: invalid switch reason %d", (flags & SW_TYPE_MASK))); /* * Don't perform context switches from the debugger. */ if (kdb_active) kdb_switch(); - if (SCHEDULER_STOPPED_TD(td)) + if (SCHEDULER_STOPPED()) return; if (flags & SW_VOL) { td->td_ru.ru_nvcsw++; td->td_swvoltick = ticks; } else { td->td_ru.ru_nivcsw++; td->td_swinvoltick = ticks; } #ifdef SCHED_STATS SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]); #endif /* * Compute the amount of time during which the current * thread was running, and add that to its total so far. */ new_switchtime = cpu_ticks(); runtime = new_switchtime - PCPU_GET(switchtime); td->td_runtime += runtime; td->td_incruntime += runtime; PCPU_SET(switchtime, new_switchtime); td->td_generation++; /* bump preempt-detect counter */ VM_CNT_INC(v_swtch); PCPU_SET(switchticks, ticks); CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); #ifdef KDTRACE_HOOKS if (SDT_PROBES_ENABLED() && ((flags & SW_PREEMPT) != 0 || ((flags & SW_INVOL) != 0 && (flags & SW_TYPE_MASK) == SWT_NEEDRESCHED))) SDT_PROBE0(sched, , , preempt); #endif sched_switch(td, flags); CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)", td->td_tid, td_get_sched(td), td->td_proc->p_pid, td->td_name); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } spinlock_exit(); } /* * Change thread state to be runnable, placing it on the run queue if * it is in memory. If it is swapped out, return true so our caller * will know to awaken the swapper. * * Requires the thread lock on entry, drops on exit. */ int setrunnable(struct thread *td, int srqflags) { int swapin; THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(td->td_proc->p_state != PRS_ZOMBIE, ("setrunnable: pid %d is a zombie", td->td_proc->p_pid)); swapin = 0; switch (TD_GET_STATE(td)) { case TDS_RUNNING: case TDS_RUNQ: break; case TDS_CAN_RUN: KASSERT((td->td_flags & TDF_INMEM) != 0, ("setrunnable: td %p not in mem, flags 0x%X inhibit 0x%X", td, td->td_flags, td->td_inhibitors)); /* unlocks thread lock according to flags */ sched_wakeup(td, srqflags); return (0); case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * arrange to swap in this process. */ if (td->td_inhibitors == TDI_SWAPPED && (td->td_flags & TDF_SWAPINREQ) == 0) { td->td_flags |= TDF_SWAPINREQ; swapin = 1; } break; default: panic("setrunnable: state 0x%x", TD_GET_STATE(td)); } if ((srqflags & (SRQ_HOLD | SRQ_HOLDTD)) == 0) thread_unlock(td); return (swapin); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. */ static void loadav(void *arg) { int i; uint64_t nrun; struct loadavg *avg; nrun = (uint64_t)sched_load(); avg = &averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * (uint64_t)avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset_sbt(&loadav_callout, SBT_1US * (4000000 + (int)(random() % 2000001)), SBT_1US, loadav, NULL, C_DIRECT_EXEC | C_PREL(32)); } static void ast_scheduler(struct thread *td, int tda __unused) { #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1, __func__); #endif thread_lock(td); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL | SWT_NEEDRESCHED); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1, __func__); #endif } static void synch_setup(void *dummy __unused) { callout_init(&loadav_callout, 1); ast_register(TDA_SCHED, ASTR_ASTF_REQUIRED, 0, ast_scheduler); /* Kick off timeout driven events by calling first time. */ loadav(NULL); } bool should_yield(void) { return ((u_int)ticks - (u_int)curthread->td_swvoltick >= hogticks); } void maybe_yield(void) { if (should_yield()) kern_yield(PRI_USER); } void kern_yield(int prio) { struct thread *td; td = curthread; DROP_GIANT(); thread_lock(td); if (prio == PRI_USER) prio = td->td_user_pri; if (prio >= 0) sched_prio(td, prio); mi_switch(SW_VOL | SWT_RELINQUISH); PICKUP_GIANT(); } /* * General purpose yield system call. */ int sys_yield(struct thread *td, struct yield_args *uap) { thread_lock(td); if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(SW_VOL | SWT_RELINQUISH); td->td_retval[0] = 0; return (0); } int sys_sched_getcpu(struct thread *td, struct sched_getcpu_args *uap) { td->td_retval[0] = td->td_oncpu; return (0); } diff --git a/sys/kern/subr_kdb.c b/sys/kern/subr_kdb.c index 86f392485a4b..a7fc2284cbcf 100644 --- a/sys/kern/subr_kdb.c +++ b/sys/kern/subr_kdb.c @@ -1,814 +1,814 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2004 The FreeBSD Project * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include "opt_kdb.h" #include "opt_stack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include u_char __read_frequently kdb_active = 0; static void *kdb_jmpbufp = NULL; struct kdb_dbbe *kdb_dbbe = NULL; static struct pcb kdb_pcb; struct pcb *kdb_thrctx = NULL; struct thread *kdb_thread = NULL; struct trapframe *kdb_frame = NULL; #ifdef BREAK_TO_DEBUGGER #define KDB_BREAK_TO_DEBUGGER 1 #else #define KDB_BREAK_TO_DEBUGGER 0 #endif #ifdef ALT_BREAK_TO_DEBUGGER #define KDB_ALT_BREAK_TO_DEBUGGER 1 #else #define KDB_ALT_BREAK_TO_DEBUGGER 0 #endif static int kdb_break_to_debugger = KDB_BREAK_TO_DEBUGGER; static int kdb_alt_break_to_debugger = KDB_ALT_BREAK_TO_DEBUGGER; static int kdb_enter_securelevel = 0; KDB_BACKEND(null, NULL, NULL, NULL, NULL); static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_panic_str(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS); static int kdb_sysctl_stack_overflow(SYSCTL_HANDLER_ARGS); static SYSCTL_NODE(_debug, OID_AUTO, kdb, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "KDB nodes"); SYSCTL_PROC(_debug_kdb, OID_AUTO, available, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, kdb_sysctl_available, "A", "list of available KDB backends"); SYSCTL_PROC(_debug_kdb, OID_AUTO, current, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, kdb_sysctl_current, "A", "currently selected KDB backend"); SYSCTL_PROC(_debug_kdb, OID_AUTO, enter, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, kdb_sysctl_enter, "I", "set to enter the debugger"); SYSCTL_PROC(_debug_kdb, OID_AUTO, panic, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, NULL, 0, kdb_sysctl_panic, "I", "set to panic the kernel"); SYSCTL_PROC(_debug_kdb, OID_AUTO, panic_str, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, NULL, 0, kdb_sysctl_panic_str, "A", "trigger a kernel panic, using the provided string as the panic message"); SYSCTL_PROC(_debug_kdb, OID_AUTO, trap, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, NULL, 0, kdb_sysctl_trap, "I", "set to cause a page fault via data access"); SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, NULL, 0, kdb_sysctl_trap_code, "I", "set to cause a page fault via code access"); SYSCTL_PROC(_debug_kdb, OID_AUTO, stack_overflow, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE | CTLFLAG_MPSAFE, NULL, 0, kdb_sysctl_stack_overflow, "I", "set to cause a stack overflow"); SYSCTL_INT(_debug_kdb, OID_AUTO, break_to_debugger, CTLFLAG_RWTUN, &kdb_break_to_debugger, 0, "Enable break to debugger"); SYSCTL_INT(_debug_kdb, OID_AUTO, alt_break_to_debugger, CTLFLAG_RWTUN, &kdb_alt_break_to_debugger, 0, "Enable alternative break to debugger"); SYSCTL_INT(_debug_kdb, OID_AUTO, enter_securelevel, CTLFLAG_RWTUN | CTLFLAG_SECURE, &kdb_enter_securelevel, 0, "Maximum securelevel to enter a KDB backend"); /* * Flag to indicate to debuggers why the debugger was entered. */ const char * volatile kdb_why = KDB_WHY_UNSET; static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS) { struct kdb_dbbe **iter; struct sbuf sbuf; int error; sbuf_new_for_sysctl(&sbuf, NULL, 64, req); SET_FOREACH(iter, kdb_dbbe_set) { if ((*iter)->dbbe_active == 0) sbuf_printf(&sbuf, "%s ", (*iter)->dbbe_name); } error = sbuf_finish(&sbuf); sbuf_delete(&sbuf); return (error); } static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS) { char buf[16]; int error; if (kdb_dbbe != NULL) strlcpy(buf, kdb_dbbe->dbbe_name, sizeof(buf)); else *buf = '\0'; error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); if (kdb_active) return (EBUSY); return (kdb_dbbe_select(buf)); } static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); if (kdb_active) return (EBUSY); kdb_enter(KDB_WHY_SYSCTL, "sysctl debug.kdb.enter"); return (0); } static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS) { int error, i; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); panic("kdb_sysctl_panic"); return (0); } static int kdb_sysctl_panic_str(SYSCTL_HANDLER_ARGS) { int error; static char buf[256]; /* static buffer to limit mallocs when panicing */ *buf = '\0'; error = sysctl_handle_string(oidp, buf, sizeof(buf), req); if (error != 0 || req->newptr == NULL) return (error); panic("kdb_sysctl_panic: %s", buf); return (0); } static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS) { int error, i; int *addr = (int *)0x10; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); return (*addr); } static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS) { int error, i; void (*fp)(u_int, u_int, u_int) = (void *)0xdeadc0de; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); (*fp)(0x11111111, 0x22222222, 0x33333333); return (0); } static void kdb_stack_overflow(volatile int *x) __noinline; static void kdb_stack_overflow(volatile int *x) { if (*x > 10000000) return; kdb_stack_overflow(x); *x += PCPU_GET(cpuid) / 1000000; } static int kdb_sysctl_stack_overflow(SYSCTL_HANDLER_ARGS) { int error, i; volatile int x; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error == 0) { i = 0; error = sysctl_handle_int(oidp, &i, 0, req); } if (error != 0 || req->newptr == NULL) return (error); x = 0; kdb_stack_overflow(&x); return (0); } void kdb_panic(const char *msg) { kdb_why = KDB_WHY_PANIC; printf("KDB: panic\n"); panic("%s", msg); } void kdb_reboot(void) { kdb_why = KDB_WHY_REBOOT; printf("KDB: reboot requested\n"); shutdown_nice(0); } /* * Solaris implements a new BREAK which is initiated by a character sequence * CR ~ ^b which is similar to a familiar pattern used on Sun servers by the * Remote Console. * * Note that this function may be called from almost anywhere, with interrupts * disabled and with unknown locks held, so it must not access data other than * its arguments. Its up to the caller to ensure that the state variable is * consistent. */ #define KEY_CR 13 /* CR '\r' */ #define KEY_TILDE 126 /* ~ */ #define KEY_CRTLB 2 /* ^B */ #define KEY_CRTLP 16 /* ^P */ #define KEY_CRTLR 18 /* ^R */ /* States of th KDB "alternate break sequence" detecting state machine. */ enum { KDB_ALT_BREAK_SEEN_NONE, KDB_ALT_BREAK_SEEN_CR, KDB_ALT_BREAK_SEEN_CR_TILDE, }; int kdb_break(void) { if (!kdb_break_to_debugger) return (0); kdb_enter(KDB_WHY_BREAK, "Break to debugger"); return (KDB_REQ_DEBUGGER); } static int kdb_alt_break_state(int key, int *state) { int brk; /* All states transition to KDB_ALT_BREAK_SEEN_CR on a CR. */ if (key == KEY_CR) { *state = KDB_ALT_BREAK_SEEN_CR; return (0); } brk = 0; switch (*state) { case KDB_ALT_BREAK_SEEN_CR: *state = KDB_ALT_BREAK_SEEN_NONE; if (key == KEY_TILDE) *state = KDB_ALT_BREAK_SEEN_CR_TILDE; break; case KDB_ALT_BREAK_SEEN_CR_TILDE: *state = KDB_ALT_BREAK_SEEN_NONE; if (key == KEY_CRTLB) brk = KDB_REQ_DEBUGGER; else if (key == KEY_CRTLP) brk = KDB_REQ_PANIC; else if (key == KEY_CRTLR) brk = KDB_REQ_REBOOT; break; case KDB_ALT_BREAK_SEEN_NONE: default: *state = KDB_ALT_BREAK_SEEN_NONE; break; } return (brk); } static int kdb_alt_break_internal(int key, int *state, int force_gdb) { int brk; if (!kdb_alt_break_to_debugger) return (0); brk = kdb_alt_break_state(key, state); switch (brk) { case KDB_REQ_DEBUGGER: if (force_gdb) kdb_dbbe_select("gdb"); kdb_enter(KDB_WHY_BREAK, "Break to debugger"); break; case KDB_REQ_PANIC: if (force_gdb) kdb_dbbe_select("gdb"); kdb_panic("Panic sequence on console"); break; case KDB_REQ_REBOOT: kdb_reboot(); break; } return (0); } int kdb_alt_break(int key, int *state) { return (kdb_alt_break_internal(key, state, 0)); } /* * This variation on kdb_alt_break() is used only by dcons, which has its own * configuration flag to force GDB use regardless of the global KDB * configuration. */ int kdb_alt_break_gdb(int key, int *state) { return (kdb_alt_break_internal(key, state, 1)); } /* * Print a backtrace of the calling thread. The backtrace is generated by * the selected debugger, provided it supports backtraces. If no debugger * is selected or the current debugger does not support backtraces, this * function silently returns. */ void kdb_backtrace(void) { if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace != NULL) { printf("KDB: stack backtrace:\n"); kdb_dbbe->dbbe_trace(); } #ifdef STACK else { struct stack st; printf("KDB: stack backtrace:\n"); stack_save(&st); stack_print_ddb(&st); } #endif } /* * Similar to kdb_backtrace() except that it prints a backtrace of an * arbitrary thread rather than the calling thread. */ void kdb_backtrace_thread(struct thread *td) { if (kdb_dbbe != NULL && kdb_dbbe->dbbe_trace_thread != NULL) { printf("KDB: stack backtrace of thread %d:\n", td->td_tid); kdb_dbbe->dbbe_trace_thread(td); } #ifdef STACK else { struct stack st; printf("KDB: stack backtrace of thread %d:\n", td->td_tid); if (stack_save_td(&st, td) == 0) stack_print_ddb(&st); } #endif } /* * Set/change the current backend. */ int kdb_dbbe_select(const char *name) { struct kdb_dbbe *be, **iter; int error; error = priv_check(curthread, PRIV_KDB_SET_BACKEND); if (error) return (error); SET_FOREACH(iter, kdb_dbbe_set) { be = *iter; if (be->dbbe_active == 0 && strcmp(be->dbbe_name, name) == 0) { kdb_dbbe = be; return (0); } } return (EINVAL); } static bool kdb_backend_permitted(struct kdb_dbbe *be, struct thread *td) { struct ucred *cred; int error; cred = td->td_ucred; if (cred == NULL) { KASSERT(td == &thread0 && cold, ("%s: missing cred for %p", __func__, td)); error = 0; } else { error = securelevel_gt(cred, kdb_enter_securelevel); } #ifdef MAC /* * Give MAC a chance to weigh in on the policy: if the securelevel is * not raised, then MAC may veto the backend, otherwise MAC may * explicitly grant access. */ if (error == 0) { error = mac_kdb_check_backend(be); if (error != 0) { printf("MAC prevented execution of KDB backend: %s\n", be->dbbe_name); return (false); } } else if (mac_kdb_grant_backend(be) == 0) { error = 0; } #endif if (error != 0) printf("refusing to enter KDB with elevated securelevel\n"); return (error == 0); } /* * Enter the currently selected debugger. If a message has been provided, * it is printed first. If the debugger does not support the enter method, * it is entered by using breakpoint(), which enters the debugger through * kdb_trap(). The 'why' argument will contain a more mechanically usable * string than 'msg', and is relied upon by DDB scripting to identify the * reason for entering the debugger so that the right script can be run. */ void kdb_enter(const char *why, const char *msg) { if (kdb_dbbe != NULL && kdb_active == 0) { kdb_why = why; if (msg != NULL) printf("KDB: enter: %s\n", msg); breakpoint(); kdb_why = KDB_WHY_UNSET; } } /* * Initialize the kernel debugger interface. */ void kdb_init(void) { struct kdb_dbbe *be, **iter; int cur_pri, pri; TSENTER(); kdb_active = 0; kdb_dbbe = NULL; cur_pri = -1; SET_FOREACH(iter, kdb_dbbe_set) { be = *iter; pri = (be->dbbe_init != NULL) ? be->dbbe_init() : -1; be->dbbe_active = (pri >= 0) ? 0 : -1; if (pri > cur_pri) { cur_pri = pri; kdb_dbbe = be; } } if (kdb_dbbe != NULL) { printf("KDB: debugger backends:"); SET_FOREACH(iter, kdb_dbbe_set) { be = *iter; if (be->dbbe_active == 0) printf(" %s", be->dbbe_name); } printf("\n"); printf("KDB: current backend: %s\n", kdb_dbbe->dbbe_name); } TSEXIT(); } /* * Handle contexts. */ void * kdb_jmpbuf(jmp_buf new) { void *old; old = kdb_jmpbufp; kdb_jmpbufp = new; return (old); } void kdb_reenter(void) { if (!kdb_active || kdb_jmpbufp == NULL) return; printf("KDB: reentering\n"); kdb_backtrace(); longjmp(kdb_jmpbufp, 1); /* NOTREACHED */ } void kdb_reenter_silent(void) { if (!kdb_active || kdb_jmpbufp == NULL) return; longjmp(kdb_jmpbufp, 1); /* NOTREACHED */ } /* * Thread-related support functions. */ struct pcb * kdb_thr_ctx(struct thread *thr) { #ifdef SMP struct pcpu *pc; #endif if (thr == curthread) return (&kdb_pcb); #ifdef SMP STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc->pc_curthread == thr && CPU_ISSET(pc->pc_cpuid, &stopped_cpus)) return (&stoppcbs[pc->pc_cpuid]); } #endif return (thr->td_pcb); } struct thread * kdb_thr_first(void) { struct proc *p; struct thread *thr; u_int i; /* This function may be called early. */ if (pidhashtbl == NULL) return (&thread0); for (i = 0; i <= pidhash; i++) { LIST_FOREACH(p, &pidhashtbl[i], p_hash) { thr = FIRST_THREAD_IN_PROC(p); if (thr != NULL) return (thr); } } return (NULL); } struct thread * kdb_thr_from_pid(pid_t pid) { struct proc *p; LIST_FOREACH(p, PIDHASH(pid), p_hash) { if (p->p_pid == pid) return (FIRST_THREAD_IN_PROC(p)); } return (NULL); } struct thread * kdb_thr_lookup(lwpid_t tid) { struct thread *thr; thr = kdb_thr_first(); while (thr != NULL && thr->td_tid != tid) thr = kdb_thr_next(thr); return (thr); } struct thread * kdb_thr_next(struct thread *thr) { struct proc *p; u_int hash; p = thr->td_proc; thr = TAILQ_NEXT(thr, td_plist); if (thr != NULL) return (thr); if (pidhashtbl == NULL) return (NULL); hash = p->p_pid & pidhash; for (;;) { p = LIST_NEXT(p, p_hash); while (p == NULL) { if (++hash > pidhash) return (NULL); p = LIST_FIRST(&pidhashtbl[hash]); } thr = FIRST_THREAD_IN_PROC(p); if (thr != NULL) return (thr); } } int kdb_thr_select(struct thread *thr) { if (thr == NULL) return (EINVAL); kdb_thread = thr; kdb_thrctx = kdb_thr_ctx(thr); return (0); } /* * Enter the debugger due to a trap. */ int kdb_trap(int type, int code, struct trapframe *tf) { #ifdef SMP cpuset_t other_cpus; #endif struct kdb_dbbe *be; register_t intr; int handled; int did_stop_cpus; be = kdb_dbbe; if (be == NULL || be->dbbe_trap == NULL) return (0); /* We reenter the debugger through kdb_reenter(). */ if (kdb_active) return (0); intr = intr_disable(); if (!SCHEDULER_STOPPED()) { #ifdef SMP other_cpus = all_cpus; CPU_ANDNOT(&other_cpus, &other_cpus, &stopped_cpus); CPU_CLR(PCPU_GET(cpuid), &other_cpus); stop_cpus_hard(other_cpus); #endif - curthread->td_stopsched = 1; + scheduler_stopped = true; did_stop_cpus = 1; } else did_stop_cpus = 0; kdb_active++; kdb_frame = tf; /* Let MD code do its thing first... */ kdb_cpu_trap(type, code); makectx(tf, &kdb_pcb); kdb_thr_select(curthread); cngrab(); for (;;) { if (!kdb_backend_permitted(be, curthread)) { /* Unhandled breakpoint traps are fatal. */ handled = 1; break; } handled = be->dbbe_trap(type, code); if (be == kdb_dbbe) break; be = kdb_dbbe; if (be == NULL || be->dbbe_trap == NULL) break; printf("Switching to %s back-end\n", be->dbbe_name); } cnungrab(); kdb_active--; if (did_stop_cpus) { - curthread->td_stopsched = 0; + scheduler_stopped = false; #ifdef SMP CPU_AND(&other_cpus, &other_cpus, &stopped_cpus); restart_cpus(other_cpus); #endif } intr_restore(intr); return (handled); } diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 33a878dc46aa..b08226c89dfd 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -1,1372 +1,1372 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #ifdef _KERNEL #include #endif #include #ifndef _KERNEL #include #endif #include #include #include #include #include #include #include /* XXX. */ #include #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #include #endif #include #include #include #include #include /* Machine-dependent proc substruct. */ #ifdef _KERNEL #include #endif /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { u_int s_count; /* Ref cnt; pgrps in session - atomic. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct cdev_priv *s_ttydp; /* (m) Device of controlling tty. */ struct tty *s_ttyp; /* (e) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members. */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Process group id. */ struct mtx pg_mtx; /* Mutex to protect members */ int pg_flags; /* (m) PGRP_ flags */ struct sx pg_killsx; /* Mutual exclusion between group member * fork() and killpg() */ }; #define PGRP_ORPHANED 0x00000001 /* Group is orphaned */ /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by proc slock * k - only accessed by curthread * k*- only accessed by curthread and from an interrupt * kx- only accessed by curthread and by debugger * l - the attaching proc or attaching proc parent * n - not locked, lazy * o - ktrace lock * q - td_contested lock * r - p_peers lock * s - see sleepq_switch(), sleeping_on_old_rtc(), and sleep(9) * t - thread lock * u - process stat lock * w - process timer lock * x - created at fork, only changes during single threading in exec * y - created at first aio, doesn't change until exit or exec at which * point we are single-threaded and only curthread changes it * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct cpuset; struct filecaps; struct filemon; struct kaioinfo; struct kaudit_record; struct kcov_info; struct kdtrace_proc; struct kdtrace_thread; struct kmsan_td; struct kq_timer_cb_data; struct mqueue_notifier; struct p_sched; struct proc; struct procdesc; struct racct; struct sbuf; struct sleepqueue; struct socket; struct td_sched; struct thread; struct trapframe; struct turnstile; struct vm_map; struct vm_map_entry; struct epoch_tracker; struct syscall_args { u_int code; u_int original_code; struct sysent *callp; register_t args[8]; }; /* * XXX: Does this belong in resource.h or resourcevar.h instead? * Resource usage extension. The times in rusage structs in the kernel are * never up to date. The actual times are kept as runtimes and tick counts * (with control info in the "previous" times), and are converted when * userland asks for rusage info. Backwards compatibility prevents putting * this directly in the user-visible rusage struct. * * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux. * Locking for td_rux: (t) for all fields. */ struct rusage_ext { uint64_t rux_runtime; /* (cu) Real time. */ uint64_t rux_uticks; /* (cu) Statclock hits in user mode. */ uint64_t rux_sticks; /* (cu) Statclock hits in sys mode. */ uint64_t rux_iticks; /* (cu) Statclock hits in intr mode. */ uint64_t rux_uu; /* (c) Previous user time in usec. */ uint64_t rux_su; /* (c) Previous sys time in usec. */ uint64_t rux_tu; /* (c) Previous total time in usec. */ }; /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * Thread context. Processes may have multiple threads. */ struct thread { struct mtx *volatile td_lock; /* replaces sched lock */ struct proc *td_proc; /* (*) Associated process. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc. */ TAILQ_ENTRY(thread) td_runq; /* (t) Run queue. */ union { TAILQ_ENTRY(thread) td_slpq; /* (t) Sleep queue. */ struct thread *td_zombie; /* Zombie list linkage */ }; TAILQ_ENTRY(thread) td_lockq; /* (t) Lock queue. */ LIST_ENTRY(thread) td_hash; /* (d) Hash chain. */ struct cpuset *td_cpuset; /* (t) CPU affinity mask. */ struct domainset_ref td_domain; /* (a) NUMA policy */ struct seltd *td_sel; /* Select queue/channel. */ struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */ struct turnstile *td_turnstile; /* (k) Associated turnstile. */ struct rl_q_entry *td_rlqe; /* (k) Associated range lock entry. */ struct umtx_q *td_umtxq; /* (c?) Link for when we're blocked. */ lwpid_t td_tid; /* (b) Thread ID. */ sigqueue_t td_sigqueue; /* (c) Sigs arrived, not delivered. */ #define td_siglist td_sigqueue.sq_signals u_char td_lend_user_pri; /* (t) Lend user pri. */ u_char td_allocdomain; /* (b) NUMA domain backing this struct thread. */ u_char td_base_ithread_pri; /* (t) Base ithread pri */ struct kmsan_td *td_kmsan; /* (k) KMSAN state */ /* Cleared during fork1(), thread_create(), or kthread_add(). */ #define td_startzero td_flags int td_flags; /* (t) TDF_* flags. */ int td_ast; /* (t) TDA_* indicators */ int td_inhibitors; /* (t) Why can not run. */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ int td_pflags2; /* (k) Private thread (TDP2_*) flags. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ int td_sqqueue; /* (t) Sleepqueue queue blocked on. */ const void *td_wchan; /* (t) Sleep address. */ const char *td_wmesg; /* (t) Reason for sleep. */ volatile u_char td_owepreempt; /* (k*) Preempt on last critical_exit */ u_char td_tsqueue; /* (t) Turnstile queue blocked on. */ - u_char td_stopsched; /* (k) Scheduler stopped. */ + u_char _td_pad0[2]; /* Available. */ int td_locks; /* (k) Debug: count of non-spin locks */ int td_rw_rlocks; /* (k) Count of rwlock read locks. */ int td_sx_slocks; /* (k) Count of sx shared locks. */ int td_lk_slocks; /* (k) Count of lockmgr shared locks. */ struct lock_object *td_wantedlock; /* (k) Lock we are contending on */ struct turnstile *td_blocked; /* (t) Lock thread is blocked on. */ const char *td_lockname; /* (t) Name of lock blocked on. */ LIST_HEAD(, turnstile) td_contested; /* (q) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ int td_pinned; /* (k) Temporary cpu pin count. */ struct ucred *td_realucred; /* (k) Reference to credentials. */ struct ucred *td_ucred; /* (k) Used credentials, temporarily switchable. */ struct plimit *td_limit; /* (k) Resource limits. */ int td_slptick; /* (t) Time at sleep. */ int td_blktick; /* (t) Time spent blocked. */ int td_swvoltick; /* (t) Time at last SW_VOL switch. */ int td_swinvoltick; /* (t) Time at last SW_INVOL switch. */ u_int td_cow; /* (*) Number of copy-on-write faults */ struct rusage td_ru; /* (t) rusage information. */ struct rusage_ext td_rux; /* (t) Internal rusage information. */ uint64_t td_incruntime; /* (t) Cpu ticks to transfer to proc. */ uint64_t td_runtime; /* (t) How many cpu ticks we've run. */ u_int td_pticks; /* (t) Statclock hits for profiling */ u_int td_sticks; /* (t) Statclock hits in system mode. */ u_int td_iticks; /* (t) Statclock hits in intr mode. */ u_int td_uticks; /* (t) Statclock hits in user mode. */ int td_intrval; /* (t) Return value for sleepq. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ volatile u_int td_generation; /* (k) For detection of preemption */ stack_t td_sigstk; /* (k) Stack ptr and on-stack flag. */ int td_xsig; /* (c) Signal for ptrace */ u_long td_profil_addr; /* (k) Temporary addr until AST. */ u_int td_profil_ticks; /* (k) Temporary ticks until AST. */ char td_name[MAXCOMLEN + 1]; /* (*) Thread name. */ struct file *td_fpop; /* (k) file referencing cdev under op */ int td_dbgflags; /* (c) Userland debugger flags */ siginfo_t td_si; /* (c) For debugger or core file */ int td_ng_outbound; /* (k) Thread entered ng from above. */ struct osd td_osd; /* (k) Object specific data. */ struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */ pid_t td_dbg_forked; /* (c) Child pid for debugger. */ u_int td_no_sleeping; /* (k) Sleeping disabled count. */ struct vnode *td_vp_reserved;/* (k) Preallocated vnode. */ void *td_su; /* (k) FFS SU private */ sbintime_t td_sleeptimo; /* (t) Sleep timeout. */ int td_rtcgen; /* (s) rtc_generation of abs. sleep */ int td_errno; /* (k) Error from last syscall. */ size_t td_vslock_sz; /* (k) amount of vslock-ed space */ struct kcov_info *td_kcov_info; /* (*) Kernel code coverage data */ long td_ucredref; /* (k) references on td_realucred */ #define td_endzero td_sigmask /* Copied during fork1(), thread_create(), or kthread_add(). */ #define td_startcopy td_endzero sigset_t td_sigmask; /* (c) Current signal mask. */ u_char td_rqindex; /* (t) Run queue index. */ u_char td_base_pri; /* (t) Thread base kernel priority. */ u_char td_priority; /* (t) Thread active priority. */ u_char td_pri_class; /* (t) Scheduling class. */ u_char td_user_pri; /* (t) User pri from estcpu and nice. */ u_char td_base_user_pri; /* (t) Base user pri */ uintptr_t td_rb_list; /* (k) Robust list head. */ uintptr_t td_rbp_list; /* (k) Robust priv list head. */ uintptr_t td_rb_inact; /* (k) Current in-action mutex loc. */ struct syscall_args td_sa; /* (kx) Syscall parameters. Copied on fork for child tracing. */ void *td_sigblock_ptr; /* (k) uptr for fast sigblock. */ uint32_t td_sigblock_val; /* (k) fast sigblock value read at td_sigblock_ptr on kern entry */ #define td_endcopy td_pcb /* * Fields that must be manually set in fork1(), thread_create(), kthread_add(), * or already have been set in the allocator, constructor, etc. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum td_states { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; /* (t) thread state */ /* Note: td_state must be accessed using TD_{GET,SET}_STATE(). */ union { syscallarg_t tdu_retval[2]; off_t tdu_off; } td_uretoff; /* (k) Syscall aux returns. */ #define td_retval td_uretoff.tdu_retval u_int td_cowgen; /* (k) Generation of COW pointers. */ /* LP64 hole */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack. */ volatile u_int td_critnest; /* (k*) Critical section nest level. */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct kaudit_record *td_ar; /* (k) Active audit record, if any. */ struct lpohead td_lprof[2]; /* (a) lock profiling objects. */ struct kdtrace_thread *td_dtrace; /* (*) DTrace-specific data. */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ struct proc *td_rfppwait_p; /* (k) The vforked child */ struct vm_page **td_ma; /* (k) uio pages held */ int td_ma_cnt; /* (k) size of *td_ma */ /* LP64 hole */ void *td_emuldata; /* Emulator state data */ int td_lastcpu; /* (t) Last cpu we were on. */ int td_oncpu; /* (t) Which cpu we are on. */ void *td_lkpi_task; /* LinuxKPI task struct pointer */ int td_pmcpend; void *td_remotereq; /* (c) dbg remote request. */ off_t td_ktr_io_lim; /* (k) limit for ktrace file size */ #ifdef EPOCH_TRACE SLIST_HEAD(, epoch_tracker) td_epochs; #endif }; struct thread0_storage { struct thread t0st_thread; uint64_t t0st_sched[10]; }; struct mtx *thread_lock_block(struct thread *); void thread_lock_block_wait(struct thread *); void thread_lock_set(struct thread *, struct mtx *); void thread_lock_unblock(struct thread *, struct mtx *); #define THREAD_LOCK_ASSERT(td, type) \ mtx_assert((td)->td_lock, (type)) #define THREAD_LOCK_BLOCKED_ASSERT(td, type) \ do { \ struct mtx *__m = (td)->td_lock; \ if (__m != &blocked_lock) \ mtx_assert(__m, (type)); \ } while (0) #ifdef INVARIANTS #define THREAD_LOCKPTR_ASSERT(td, lock) \ do { \ struct mtx *__m; \ __m = (td)->td_lock; \ KASSERT(__m == (lock), \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define THREAD_LOCKPTR_BLOCKED_ASSERT(td, lock) \ do { \ struct mtx *__m; \ __m = (td)->td_lock; \ KASSERT(__m == (lock) || __m == &blocked_lock, \ ("Thread %p lock %p does not match %p", td, __m, (lock))); \ } while (0) #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) do { \ - KASSERT(SCHEDULER_STOPPED_TD(td) || (td)->td_locks > 0, \ + KASSERT(SCHEDULER_STOPPED() || (td)->td_locks > 0, \ ("Thread %p owns no locks", (td))); \ (td)->td_locks--; \ } while (0) #else #define THREAD_LOCKPTR_ASSERT(td, lock) #define THREAD_LOCKPTR_BLOCKED_ASSERT(td, lock) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #endif /* * Flags kept in td_flags: * To change these you MUST have the scheduler lock. */ #define TDF_BORROWING 0x00000001 /* Thread is borrowing pri from another. */ #define TDF_INPANIC 0x00000002 /* Caused a panic, let it drive crashdump. */ #define TDF_INMEM 0x00000004 /* Thread's stack is in memory. */ #define TDF_SINTR 0x00000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x00000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x00000020 /* This is a per-CPU idle thread. */ #define TDF_CANSWAP 0x00000040 /* Thread can be swapped. */ #define TDF_SIGWAIT 0x00000080 /* Ignore ignored signals */ #define TDF_KTH_SUSP 0x00000100 /* kthread is suspended */ #define TDF_ALLPROCSUSP 0x00000200 /* suspended by SINGLE_ALLPROC */ #define TDF_BOUNDARY 0x00000400 /* Thread suspended at user boundary */ #define TDF_UNUSED1 0x00000800 /* Available */ #define TDF_UNUSED2 0x00001000 /* Available */ #define TDF_SBDRY 0x00002000 /* Stop only on usermode boundary. */ #define TDF_UPIBLOCKED 0x00004000 /* Thread blocked on user PI mutex. */ #define TDF_UNUSED3 0x00008000 /* Available */ #define TDF_UNUSED4 0x00010000 /* Available */ #define TDF_UNUSED5 0x00020000 /* Available */ #define TDF_NOLOAD 0x00040000 /* Ignore during load avg calculations. */ #define TDF_SERESTART 0x00080000 /* ERESTART on stop attempts. */ #define TDF_THRWAKEUP 0x00100000 /* Libthr thread must not suspend itself. */ #define TDF_SEINTR 0x00200000 /* EINTR on stop attempts. */ #define TDF_SWAPINREQ 0x00400000 /* Swapin request due to wakeup. */ #define TDF_UNUSED6 0x00800000 /* Available */ #define TDF_SCHED0 0x01000000 /* Reserved for scheduler private use */ #define TDF_SCHED1 0x02000000 /* Reserved for scheduler private use */ #define TDF_SCHED2 0x04000000 /* Reserved for scheduler private use */ #define TDF_SCHED3 0x08000000 /* Reserved for scheduler private use */ #define TDF_UNUSED7 0x10000000 /* Available */ #define TDF_UNUSED8 0x20000000 /* Available */ #define TDF_UNUSED9 0x40000000 /* Available */ #define TDF_UNUSED10 0x80000000 /* Available */ enum { TDA_AST = 0, /* Special: call all non-flagged AST handlers */ TDA_OWEUPC, TDA_HWPMC, TDA_VFORK, TDA_ALRM, TDA_PROF, TDA_MAC, TDA_SCHED, TDA_UFS, TDA_GEOM, TDA_KQUEUE, TDA_RACCT, TDA_MOD1, /* For third party use, before signals are */ TAD_MOD2, /* processed .. */ TDA_SIG, TDA_KTRACE, TDA_SUSPEND, TDA_SIGSUSPEND, TDA_MOD3, /* .. and after */ TAD_MOD4, TDA_MAX, }; #define TDAI(tda) (1U << (tda)) #define td_ast_pending(td, tda) ((td->td_ast & TDAI(tda)) != 0) /* Userland debug flags */ #define TDB_SUSPEND 0x00000001 /* Thread is suspended by debugger */ #define TDB_XSIG 0x00000002 /* Thread is exchanging signal under trace */ #define TDB_USERWR 0x00000004 /* Debugger modified memory or registers */ #define TDB_SCE 0x00000008 /* Thread performs syscall enter */ #define TDB_SCX 0x00000010 /* Thread performs syscall exit */ #define TDB_EXEC 0x00000020 /* TDB_SCX from exec(2) family */ #define TDB_FORK 0x00000040 /* TDB_SCX from fork(2) that created new process */ #define TDB_STOPATFORK 0x00000080 /* Stop at the return from fork (child only) */ #define TDB_CHILD 0x00000100 /* New child indicator for ptrace() */ #define TDB_BORN 0x00000200 /* New LWP indicator for ptrace() */ #define TDB_EXIT 0x00000400 /* Exiting LWP indicator for ptrace() */ #define TDB_VFORK 0x00000800 /* vfork indicator for ptrace() */ #define TDB_FSTP 0x00001000 /* The thread is PT_ATTACH leader */ #define TDB_STEP 0x00002000 /* (x86) PSL_T set for PT_STEP */ #define TDB_SSWITCH 0x00004000 /* Suspended in ptracestop */ #define TDB_BOUNDARY 0x00008000 /* ptracestop() at boundary */ #define TDB_COREDUMPREQ 0x00010000 /* Coredump request */ #define TDB_SCREMOTEREQ 0x00020000 /* Remote syscall request */ /* * "Private" flags kept in td_pflags: * These are only written by curthread and thus need no locking. */ #define TDP_OLDMASK 0x00000001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x00000002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x00000004 /* Thread is currently in KTRACE code. */ #define TDP_BUFNEED 0x00000008 /* Do not recurse into the buf flush */ #define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */ #define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */ #define TDP_DEADLKTREAT 0x00000040 /* Lock acquisition - deadlock treatment. */ #define TDP_NOFAULTING 0x00000080 /* Do not handle page faults. */ #define TDP_SIGFASTBLOCK 0x00000100 /* Fast sigblock active */ #define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */ #define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */ #define TDP_SYNCIO 0x00000800 /* Local override, disable async i/o. */ #define TDP_SCHED1 0x00001000 /* Reserved for scheduler private use */ #define TDP_SCHED2 0x00002000 /* Reserved for scheduler private use */ #define TDP_SCHED3 0x00004000 /* Reserved for scheduler private use */ #define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */ #define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */ #define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */ #define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */ #define TDP_WAKEUP 0x00080000 /* Don't sleep in umtx cond_wait */ #define TDP_INBDFLUSH 0x00100000 /* Already in BO_BDFLUSH, do not recurse */ #define TDP_KTHREAD 0x00200000 /* This is an official kernel thread */ #define TDP_CALLCHAIN 0x00400000 /* Capture thread's callchain */ #define TDP_IGNSUSP 0x00800000 /* Permission to ignore the MNTK_SUSPEND* */ #define TDP_AUDITREC 0x01000000 /* Audit record pending on thread */ #define TDP_RFPPWAIT 0x02000000 /* Handle RFPPWAIT on syscall exit */ #define TDP_RESETSPUR 0x04000000 /* Reset spurious page fault history. */ #define TDP_NERRNO 0x08000000 /* Last errno is already in td_errno */ #define TDP_UIOHELD 0x10000000 /* Current uio has pages held in td_ma */ #define TDP_INTCPCALLOUT 0x20000000 /* used by netinet/tcp_timer.c */ #define TDP_EXECVMSPC 0x40000000 /* Execve destroyed old vmspace */ #define TDP_SIGFASTPENDING 0x80000000 /* Pending signal due to sigfastblock */ #define TDP2_SBPAGES 0x00000001 /* Owns sbusy on some pages */ #define TDP2_COMPAT32RB 0x00000002 /* compat32 ABI for robust lists */ #define TDP2_ACCT 0x00000004 /* Doing accounting */ /* * Reasons that the current thread can not be run yet. * More than one may apply. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem. Bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. */ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #ifdef _KERNEL #define TD_GET_STATE(td) atomic_load_int(&(td)->td_state) #else #define TD_GET_STATE(td) ((td)->td_state) #endif #define TD_IS_RUNNING(td) (TD_GET_STATE(td) == TDS_RUNNING) #define TD_ON_RUNQ(td) (TD_GET_STATE(td) == TDS_RUNQ) #define TD_CAN_RUN(td) (TD_GET_STATE(td) == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) (TD_GET_STATE(td) == TDS_INHIBITED) #define TD_ON_UPILOCK(td) ((td)->td_flags & TDF_UPIBLOCKED) #define TD_IS_IDLETHREAD(td) ((td)->td_flags & TDF_IDLETD) #define TD_CAN_ABORT(td) (TD_ON_SLEEPQ((td)) && \ ((td)->td_flags & TDF_SINTR) != 0) #define KTDSTATE(td) \ (((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \ ((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \ ((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \ ((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \ ((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding") #define TD_SET_INHIB(td, inhib) do { \ TD_SET_STATE(td, TDS_INHIBITED); \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ TD_SET_STATE(td, TDS_CAN_RUN); \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #ifdef _KERNEL #define TD_SET_STATE(td, state) atomic_store_int(&(td)->td_state, state) #else #define TD_SET_STATE(td, state) (td)->td_state = state #endif #define TD_SET_RUNNING(td) TD_SET_STATE(td, TDS_RUNNING) #define TD_SET_RUNQ(td) TD_SET_STATE(td, TDS_RUNQ) #define TD_SET_CAN_RUN(td) TD_SET_STATE(td, TDS_CAN_RUN) #define TD_SBDRY_INTR(td) \ (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0) #define TD_SBDRY_ERRNO(td) \ (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART) /* * Process structure. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, thread) p_threads; /* (c) all threads. */ struct mtx p_slock; /* process spin lock */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Open files. */ struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */ struct pwddesc *p_pd; /* (b) Cwd, chroot, jail, umask */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c) Resource limits. */ struct callout p_limco; /* (c) Limit callout handle */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ int p_flag; /* (c) P_* flags. */ int p_flag2; /* (c) P2_* flags. */ enum p_states { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* threads can be run. */ PRS_ZOMBIE } p_state; /* (j/c) Process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct proc *p_reaper; /* (e) My reaper. */ LIST_HEAD(, proc) p_reaplist; /* (e) List of my descendants (if I am reaper). */ LIST_ENTRY(proc) p_reapsibling; /* (e) List of siblings - descendants of the same reaper. */ struct mtx p_mtx; /* (n) Lock for this struct. */ struct mtx p_statmtx; /* Lock for the stats */ struct mtx p_itimmtx; /* Lock for the virt/prof timers */ struct mtx p_profmtx; /* Lock for the profiling */ struct ksiginfo *p_ksi; /* Locked by parent proc lock */ sigqueue_t p_sigqueue; /* (c) Sigs not delivered to a td. */ #define p_siglist p_sigqueue.sq_signals pid_t p_oppid; /* (c + e) Real parent pid. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_vmspace struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtick; /* (c) Tick when swapped in or out. */ u_int p_cowgen; /* (c) Generation of COW pointers. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct rusage p_ru; /* (a) Exit information. */ struct rusage_ext p_rux; /* (cu) Internal resource usage. */ struct rusage_ext p_crux; /* (c) Internal child resource usage. */ int p_profthreads; /* (c) Num threads in addupc_task. */ volatile int p_exitthreads; /* (j) Number of threads exiting */ int p_traceflag; /* (o) Kernel trace points. */ struct ktr_io_params *p_ktrioparms; /* (c + o) Params for ktrace. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ struct vnode *p_textdvp; /* (b) Dir containing textvp. */ char *p_binname; /* (b) Binary hardlink name. */ u_int p_lock; /* (c) Proclock (prevent swap) count. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_int p_ptevents; /* (c + e) ptrace() event mask. */ struct kaioinfo *p_aioinfo; /* (y) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (j) Num threads in suspended mode. */ struct thread *p_xthread; /* (c) Trap thread */ int p_boundary_count;/* (j) Num threads at user boundary */ int p_pendingcnt; /* how many signals are pending */ struct itimers *p_itimers; /* (c) POSIX interval timers. */ struct procdesc *p_procdesc; /* (e) Process descriptor, if any. */ u_int p_treeflag; /* (e) P_TREE flags */ int p_pendingexits; /* (c) Count of pending thread exits. */ struct filemon *p_filemon; /* (c) filemon-specific data. */ int p_pdeathsig; /* (c) Signal from parent on exit. */ /* End area that is zeroed on creation. */ #define p_endzero p_magic /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero u_int p_magic; /* (b) Magic number. */ int p_osrel; /* (x) osreldate for the binary (from ELF note, if any) */ uint32_t p_fctl0; /* (x) ABI feature control, ELF note */ char p_comm[MAXCOMLEN + 1]; /* (x) Process name. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (c) Current CPU limit in seconds. */ signed char p_nice; /* (c) Process "nice" value. */ int p_fibnum; /* in this routing domain XXX MRT */ pid_t p_reapsubtree; /* (e) Pid of the direct child of the reaper which spawned our subtree. */ uint64_t p_elf_flags; /* (x) ELF flags */ void *p_elf_brandinfo; /* (x) Elf_Brandinfo, NULL for non ELF binaries. */ sbintime_t p_umtx_min_timeout; /* End area that is copied on creation. */ #define p_endcopy p_xexit u_int p_xexit; /* (c) Exit code. */ u_int p_xsig; /* (c) Stop/kill sig. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct knlist *p_klist; /* (c) Knotes attached to this proc. */ int p_numthreads; /* (c) Number of threads. */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ u_short p_acflag; /* (c) Accounting flags. */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. */ struct label *p_label; /* (*) Proc (not subject) MAC label. */ STAILQ_HEAD(, ktr_request) p_ktr; /* (o) KTR event queue. */ LIST_HEAD(, mqueue_notifier) p_mqnotifier; /* (c) mqueue notifiers.*/ struct kdtrace_proc *p_dtrace; /* (*) DTrace-specific data. */ struct cv p_pwait; /* (*) wait cv for exit/exec. */ uint64_t p_prev_runtime; /* (c) Resource usage accounting. */ struct racct *p_racct; /* (b) Resource accounting. */ int p_throttled; /* (c) Flag for racct pcpu throttling */ /* * An orphan is the child that has been re-parented to the * debugger as a result of attaching to it. Need to keep * track of them for parent to be able to collect the exit * status of what used to be children. */ LIST_ENTRY(proc) p_orphan; /* (e) List of orphan processes. */ LIST_HEAD(, proc) p_orphans; /* (e) Pointer to list of orphans. */ TAILQ_HEAD(, kq_timer_cb_data) p_kqtim_stop; /* (c) */ LIST_ENTRY(proc) p_jaillist; /* (d) Jail process linkage. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU (-1) /* For when we aren't on a CPU. */ #define NOCPU_OLD (255) #define MAXCPU_OLD (254) #define PROC_SLOCK(p) mtx_lock_spin(&(p)->p_slock) #define PROC_SUNLOCK(p) mtx_unlock_spin(&(p)->p_slock) #define PROC_SLOCK_ASSERT(p, type) mtx_assert(&(p)->p_slock, (type)) #define PROC_STATLOCK(p) mtx_lock_spin(&(p)->p_statmtx) #define PROC_STATUNLOCK(p) mtx_unlock_spin(&(p)->p_statmtx) #define PROC_STATLOCK_ASSERT(p, type) mtx_assert(&(p)->p_statmtx, (type)) #define PROC_ITIMLOCK(p) mtx_lock_spin(&(p)->p_itimmtx) #define PROC_ITIMUNLOCK(p) mtx_unlock_spin(&(p)->p_itimmtx) #define PROC_ITIMLOCK_ASSERT(p, type) mtx_assert(&(p)->p_itimmtx, (type)) #define PROC_PROFLOCK(p) mtx_lock_spin(&(p)->p_profmtx) #define PROC_PROFUNLOCK(p) mtx_unlock_spin(&(p)->p_profmtx) #define PROC_PROFLOCK_ASSERT(p, type) mtx_assert(&(p)->p_profmtx, (type)) /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00000001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00000002 /* Has a controlling terminal. */ #define P_KPROC 0x00000004 /* Kernel process. */ #define P_UNUSED3 0x00000008 /* --available-- */ #define P_PPWAIT 0x00000010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00000020 /* Has started profiling. */ #define P_STOPPROF 0x00000040 /* Has thread requesting to stop profiling. */ #define P_HADTHREADS 0x00000080 /* Has had threads (no cleanup shortcuts) */ #define P_SUGID 0x00000100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00000200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00000400 /* Threads suspending should exit, not wait. */ #define P_TRACED 0x00000800 /* Debugged process being traced. */ #define P_WAITED 0x00001000 /* Someone is waiting for us. */ #define P_WEXIT 0x00002000 /* Working on exiting. */ #define P_EXEC 0x00004000 /* Process called exec. */ #define P_WKILLED 0x00008000 /* Killed, go to kernel/user boundary ASAP. */ #define P_CONTINUED 0x00010000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x00020000 /* Stopped due to SIGSTOP/SIGTSTP. */ #define P_STOPPED_TRACE 0x00040000 /* Stopped because of tracing. */ #define P_STOPPED_SINGLE 0x00080000 /* Only 1 thread can continue (not to user). */ #define P_PROTECTED 0x00100000 /* Do not kill on memory overcommit. */ #define P_SIGEVENT 0x00200000 /* Process pending signals changed. */ #define P_SINGLE_BOUNDARY 0x00400000 /* Threads should suspend at user boundary. */ #define P_HWPMC 0x00800000 /* Process is using HWPMCs */ #define P_JAILED 0x01000000 /* Process is in jail. */ #define P_TOTAL_STOP 0x02000000 /* Stopped in stop_all_proc. */ #define P_INEXEC 0x04000000 /* Process is in execve(). */ #define P_STATCHILD 0x08000000 /* Child process stopped or exited. */ #define P_INMEM 0x10000000 /* Loaded into memory. */ #define P_SWAPPINGOUT 0x20000000 /* Process is being swapped out. */ #define P_SWAPPINGIN 0x40000000 /* Process is being swapped in. */ #define P_PPTRACE 0x80000000 /* PT_TRACEME by vforked child. */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) #define P_KILLED(p) ((p)->p_flag & P_WKILLED) /* These flags are kept in p_flag2. */ #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ #define P2_NOTRACE 0x00000002 /* No ptrace(2) attach or coredumps. */ #define P2_NOTRACE_EXEC 0x00000004 /* Keep P2_NOPTRACE on exec(2). */ #define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ #define P2_PTRACE_FSTP 0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */ #define P2_TRAPCAP 0x00000020 /* SIGTRAP on ENOTCAPABLE */ #define P2_ASLR_ENABLE 0x00000040 /* Force enable ASLR. */ #define P2_ASLR_DISABLE 0x00000080 /* Force disable ASLR. */ #define P2_ASLR_IGNSTART 0x00000100 /* Enable ASLR to consume sbrk area. */ #define P2_PROTMAX_ENABLE 0x00000200 /* Force enable implied PROT_MAX. */ #define P2_PROTMAX_DISABLE 0x00000400 /* Force disable implied PROT_MAX. */ #define P2_STKGAP_DISABLE 0x00000800 /* Disable stack gap for MAP_STACK */ #define P2_STKGAP_DISABLE_EXEC 0x00001000 /* Stack gap disabled after exec */ #define P2_ITSTOPPED 0x00002000 #define P2_PTRACEREQ 0x00004000 /* Active ptrace req */ #define P2_NO_NEW_PRIVS 0x00008000 /* Ignore setuid */ #define P2_WXORX_DISABLE 0x00010000 /* WX mappings enabled */ #define P2_WXORX_ENABLE_EXEC 0x00020000 /* WXORX enabled after exec */ #define P2_WEXIT 0x00040000 /* exit just started, no external thread_single() is permitted */ #define P2_REAPKILLED 0x00080000 #define P2_MEMBAR_PRIVE 0x00100000 /* membar private expedited registered */ #define P2_MEMBAR_PRIVE_SYNCORE 0x00200000 /* membar private expedited sync core registered */ #define P2_MEMBAR_GLOBE 0x00400000 /* membar global expedited registered */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ #define P_TREE_FIRST_ORPHAN 0x00000002 /* First element of orphan list */ #define P_TREE_REAPER 0x00000004 /* Reaper of subtree */ #define P_TREE_GRPEXITED 0x00000008 /* exit1() done with job ctl */ /* * These were process status values (p_stat), now they are only used in * legacy conversion code. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. */ #define P_MAGIC 0xbeefface #ifdef _KERNEL /* Types and flags for mi_switch(9). */ #define SW_TYPE_MASK 0xff /* First 8 bits are switch type */ #define SWT_OWEPREEMPT 1 /* Switching due to owepreempt. */ #define SWT_TURNSTILE 2 /* Turnstile contention. */ #define SWT_SLEEPQ 3 /* Sleepq wait. */ #define SWT_RELINQUISH 4 /* yield call. */ #define SWT_NEEDRESCHED 5 /* NEEDRESCHED was set. */ #define SWT_IDLE 6 /* Switching from the idle thread. */ #define SWT_IWAIT 7 /* Waiting for interrupts. */ #define SWT_SUSPEND 8 /* Thread suspended. */ #define SWT_REMOTEPREEMPT 9 /* Remote processor preempted. */ #define SWT_REMOTEWAKEIDLE 10 /* Remote processor preempted idle. */ #define SWT_BIND 11 /* Thread bound to a new CPU. */ #define SWT_COUNT 12 /* Number of switch types. */ /* Flags */ #define SW_VOL 0x0100 /* Voluntary switch. */ #define SW_INVOL 0x0200 /* Involuntary switch. */ #define SW_PREEMPT 0x0400 /* The invol switch is a preemption */ /* How values for thread_single(). */ #define SINGLE_NO_EXIT 0 #define SINGLE_EXIT 1 #define SINGLE_BOUNDARY 2 #define SINGLE_ALLPROC 3 #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&(p)->p_threads) /* * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit * in a pid_t, as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID (PID_MAX + 1) #define THREAD0_TID NO_PID extern pid_t pid_max; #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_WAIT_UNLOCKED(p) mtx_wait_unlocked(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0) #define PGRP_UNLOCK_PGSIGNAL(pg) do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0) /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* * Non-zero p_lock ensures that: * - exit1() is not performed until p_lock reaches zero; * - the process' threads stack are not swapped out if they are currently * not (P_INMEM). * * PHOLD() asserts that the process (except the current process) is * not exiting, increments p_lock and swaps threads stacks into memory, * if needed. * _PHOLD() is same as PHOLD(), it takes the process locked. * _PHOLD_LITE() also takes the process locked, but comparing with * _PHOLD(), it only guarantees that exit1() is not executed, * faultin() is not called. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ if (((p)->p_flag & P_INMEM) == 0) \ faultin((p)); \ } while (0) #define _PHOLD_LITE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc, \ ("PHOLD of exiting process %p", p)); \ (p)->p_lock++; \ } while (0) #define PROC_ASSERT_HELD(p) do { \ KASSERT((p)->p_lock > 0, ("process %p not held", p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ PROC_ASSERT_HELD(p); \ (--(p)->p_lock); \ if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0) \ wakeup(&(p)->p_lock); \ } while (0) #define PROC_ASSERT_NOT_HELD(p) do { \ KASSERT((p)->p_lock == 0, ("process %p held", p)); \ } while (0) #define PROC_UPDATE_COW(p) do { \ struct proc *_p = (p); \ PROC_LOCK_ASSERT((_p), MA_OWNED); \ atomic_store_int(&_p->p_cowgen, _p->p_cowgen + 1); \ } while (0) #define PROC_COW_CHANGECOUNT(td, p) ({ \ struct thread *_td = (td); \ struct proc *_p = (p); \ MPASS(_td == curthread); \ PROC_LOCK_ASSERT(_p, MA_OWNED); \ _p->p_cowgen - _td->td_cowgen; \ }) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) ((td)->td_flags & TDF_CANSWAP) /* Control whether or not it is safe for curthread to sleep. */ #define THREAD_NO_SLEEPING() do { \ curthread->td_no_sleeping++; \ MPASS(curthread->td_no_sleeping > 0); \ } while (0) #define THREAD_SLEEPING_OK() do { \ MPASS(curthread->td_no_sleeping > 0); \ curthread->td_no_sleeping--; \ } while (0) #define THREAD_CAN_SLEEP() ((curthread)->td_no_sleeping == 0) #define THREAD_CONTENDS_ON_LOCK(lo) do { \ MPASS(curthread->td_wantedlock == NULL); \ curthread->td_wantedlock = lo; \ } while (0) #define THREAD_CONTENTION_DONE(lo) do { \ MPASS(curthread->td_wantedlock == lo); \ curthread->td_wantedlock = NULL; \ } while (0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) #define PIDHASHLOCK(pid) (&pidhashtbl_lock[((pid) & pidhashlock)]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern struct sx *pidhashtbl_lock; extern u_long pidhash; extern u_long pidhashlock; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern int allproc_gen; extern struct sx proctree_lock; extern struct mtx ppeers_lock; extern struct mtx procid_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread0_storage thread0_st; /* Primary thread in proc0. */ #define thread0 (thread0_st.t0st_thread) extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int lastpid; extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct uma_zone *proc_zone; extern struct uma_zone *pgrp_zone; struct proc *pfind(pid_t); /* Find process by id. */ struct proc *pfind_any(pid_t); /* Find (zombie) process by id. */ struct proc *pfind_any_locked(pid_t pid); /* Find process by id, locked. */ struct pgrp *pgfind(pid_t); /* Find process group by id. */ void pidhash_slockall(void); /* Shared lock all pid hash lists. */ void pidhash_sunlockall(void); /* Shared unlock all pid hash lists. */ struct fork_req { int fr_flags; int fr_pages; int *fr_pidp; struct proc **fr_procp; int *fr_pd_fd; int fr_pd_flags; struct filecaps *fr_pd_fcaps; int fr_flags2; #define FR2_DROPSIG_CAUGHT 0x00000001 /* Drop caught non-DFL signals */ #define FR2_SHARE_PATHS 0x00000002 /* Invert sense of RFFDG for paths */ #define FR2_KPROC 0x00000004 /* Create a kernel process */ }; /* * pget() flags. */ #define PGET_HOLD 0x00001 /* Hold the process. */ #define PGET_CANSEE 0x00002 /* Check against p_cansee(). */ #define PGET_CANDEBUG 0x00004 /* Check against p_candebug(). */ #define PGET_ISCURRENT 0x00008 /* Check that the found process is current. */ #define PGET_NOTWEXIT 0x00010 /* Check that the process is not in P_WEXIT. */ #define PGET_NOTINEXEC 0x00020 /* Check that the process is not in P_INEXEC. */ #define PGET_NOTID 0x00040 /* Do not assume tid if pid > PID_MAX. */ #define PGET_WANTREAD (PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT) int pget(pid_t pid, int flags, struct proc **pp); /* ast_register() flags */ #define ASTR_ASTF_REQUIRED 0x0001 /* td_ast TDAI(TDA_X) flag set is required for call */ #define ASTR_TDP 0x0002 /* td_pflags flag set is required */ #define ASTR_KCLEAR 0x0004 /* call me on ast_kclear() */ #define ASTR_UNCOND 0x0008 /* call me always */ void ast(struct trapframe *framep); void ast_kclear(struct thread *td); void ast_register(int ast, int ast_flags, int tdp, void (*f)(struct thread *td, int asts)); void ast_deregister(int tda); void ast_sched_locked(struct thread *td, int tda); void ast_sched_mask(struct thread *td, int ast); void ast_sched(struct thread *td, int tda); void ast_unsched_locked(struct thread *td, int tda); struct thread *choosethread(void); int cr_bsd_visible(struct ucred *u1, struct ucred *u2); int cr_cansee(struct ucred *u1, struct ucred *u2); int cr_canseesocket(struct ucred *cred, struct socket *so); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); int fork1(struct thread *, struct fork_req *); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); void itimer_proc_continue(struct proc *p); void kqtimer_proc_continue(struct proc *p); void kern_proc_vmmap_resident(struct vm_map *map, struct vm_map_entry *entry, int *resident_count, bool *super); void kern_yield(int); void kick_proc0(void); void killjobc(void); int leavepgrp(struct proc *p); int maybe_preempt(struct thread *td); void maybe_yield(void); void mi_switch(int flags); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); int p_canwait(struct thread *td, struct proc *p); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_hold(struct pargs *pa); void proc_add_orphan(struct proc *child, struct proc *parent); int proc_get_binpath(struct proc *p, char *binname, char **fullpath, char **freepath); int proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb); int proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb); void procinit(void); int proc_iterate(int (*cb)(struct proc *, void *), void *cbarg); void proc_linkup0(struct proc *p, struct thread *td); void proc_linkup(struct proc *p, struct thread *td); struct proc *proc_realparent(struct proc *child); void proc_reap(struct thread *td, struct proc *p, int *status, int options); void proc_reparent(struct proc *child, struct proc *newparent, bool set_oppid); void proc_set_p2_wexit(struct proc *p); void proc_set_traced(struct proc *p, bool stop); void proc_wkilled(struct proc *p); struct pstats *pstats_alloc(void); void pstats_fork(struct pstats *src, struct pstats *dst); void pstats_free(struct pstats *ps); void proc_clear_orphan(struct proc *p); void reaper_abandon_children(struct proc *p, bool exiting); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void sess_hold(struct session *); void sess_release(struct session *); int setrunnable(struct thread *, int); void setsugid(struct proc *p); bool should_yield(void); int sigonstack(size_t sp); void stopevent(struct proc *, u_int, u_int); struct thread *tdfind(lwpid_t, pid_t); void threadinit(void); void tidhash_add(struct thread *); void tidhash_remove(struct thread *); void cpu_idle(int); int cpu_idle_wakeup(int); extern void (*cpu_idle_hook)(sbintime_t); /* Hook to machdep CPU idler. */ void cpu_switch(struct thread *, struct thread *, struct mtx *); void cpu_sync_core(void); void cpu_throw(struct thread *, struct thread *) __dead2; bool curproc_sigkilled(void); void userret(struct thread *, struct trapframe *); void cpu_exit(struct thread *); void exit1(struct thread *, int, int) __dead2; void cpu_copy_thread(struct thread *td, struct thread *td0); bool cpu_exec_vmspace_reuse(struct proc *p, struct vm_map *map); int cpu_fetch_syscall_args(struct thread *td); void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *); int cpu_procctl(struct thread *td, int idtype, id_t id, int com, void *data); void cpu_set_syscall_retval(struct thread *, int); int cpu_set_upcall(struct thread *, void (*)(void *), void *, stack_t *); int cpu_set_user_tls(struct thread *, void *tls_base); void cpu_thread_alloc(struct thread *); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_free(struct thread *); void cpu_thread_swapin(struct thread *); void cpu_thread_swapout(struct thread *); struct thread *thread_alloc(int pages); int thread_alloc_stack(struct thread *, int pages); int thread_check_susp(struct thread *td, bool sleep); void thread_cow_get_proc(struct thread *newtd, struct proc *p); void thread_cow_get(struct thread *newtd, struct thread *td); void thread_cow_free(struct thread *td); void thread_cow_update(struct thread *td); void thread_cow_synced(struct thread *td); int thread_create(struct thread *td, struct rtprio *rtp, int (*initialize_thread)(struct thread *, void *), void *thunk); void thread_exit(void) __dead2; void thread_free(struct thread *td); void thread_link(struct thread *td, struct proc *p); void thread_reap_barrier(void); int thread_single(struct proc *p, int how); void thread_single_end(struct proc *p, int how); void thread_stash(struct thread *td); void thread_stopped(struct proc *p); void childproc_stopped(struct proc *child, int reason); void childproc_continued(struct proc *child); void childproc_exited(struct proc *child); void thread_run_flash(struct thread *td); int thread_suspend_check(int how); bool thread_suspend_check_needed(void); void thread_suspend_switch(struct thread *, struct proc *p); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_wait(struct proc *p); bool stop_all_proc_block(void); void stop_all_proc_unblock(void); void stop_all_proc(void); void resume_all_proc(void); static __inline int curthread_pflags_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags & flags); td->td_pflags |= flags; return (save); } static __inline void curthread_pflags_restore(int save) { curthread->td_pflags &= save; } static __inline int curthread_pflags2_set(int flags) { struct thread *td; int save; td = curthread; save = ~flags | (td->td_pflags2 & flags); td->td_pflags2 |= flags; return (save); } static __inline void curthread_pflags2_restore(int save) { curthread->td_pflags2 &= save; } static __inline __pure2 struct td_sched * td_get_sched(struct thread *td) { return ((struct td_sched *)&td[1]); } #define PROC_ID_PID 0 #define PROC_ID_GROUP 1 #define PROC_ID_SESSION 2 #define PROC_ID_REAP 3 void proc_id_set(int type, pid_t id); void proc_id_set_cond(int type, pid_t id); void proc_id_clear(int type, pid_t id); EVENTHANDLER_LIST_DECLARE(process_ctor); EVENTHANDLER_LIST_DECLARE(process_dtor); EVENTHANDLER_LIST_DECLARE(process_init); EVENTHANDLER_LIST_DECLARE(process_fini); EVENTHANDLER_LIST_DECLARE(process_exit); EVENTHANDLER_LIST_DECLARE(process_fork); EVENTHANDLER_LIST_DECLARE(process_exec); EVENTHANDLER_LIST_DECLARE(thread_ctor); EVENTHANDLER_LIST_DECLARE(thread_dtor); EVENTHANDLER_LIST_DECLARE(thread_init); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ diff --git a/sys/sys/systm.h b/sys/sys/systm.h index 0d3f9fe98893..29c8bfc3c768 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -1,585 +1,583 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #include #include #include #include #include /* for people using printf mainly */ #include #include __NULLABILITY_PRAGMA_PUSH #ifdef _KERNEL extern int cold; /* nonzero if we are doing a cold boot */ extern int suspend_blocked; /* block suspend due to pending shutdown */ extern int rebooting; /* kern_reboot() has been called. */ extern char version[]; /* system version */ extern char compiler_version[]; /* compiler version */ extern char copyright[]; /* system copyright */ extern int kstack_pages; /* number of kernel stack pages */ extern u_long pagesizes[]; /* supported page sizes */ extern long physmem; /* physical memory */ extern long realmem; /* 'real' memory */ extern char *rootdevnames[2]; /* names of possible root devices */ extern int boothowto; /* reboot flags, from console subsystem */ extern int bootverbose; /* nonzero to print verbose messages */ extern int maxusers; /* system tune hint */ extern int ngroups_max; /* max # of supplemental groups */ extern int vm_guest; /* Running as virtual machine guest? */ extern u_long maxphys; /* max raw I/O transfer size */ /* * Detected virtual machine guest types. The intention is to expand * and/or add to the VM_GUEST_VM type if specific VM functionality is * ever implemented (e.g. vendor-specific paravirtualization features). * Keep in sync with vm_guest_sysctl_names[]. */ enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV, VM_GUEST_VMWARE, VM_GUEST_KVM, VM_GUEST_BHYVE, VM_GUEST_VBOX, VM_GUEST_PARALLELS, VM_LAST }; #endif /* KERNEL */ /* * Align variables. */ #define __read_mostly __section(".data.read_mostly") #define __read_frequently __section(".data.read_frequently") #define __exclusive_cache_line __aligned(CACHE_LINE_SIZE) \ __section(".data.exclusive_cache_line") #if defined(_STANDALONE) struct ucred; #endif #ifdef _KERNEL #include /* MAXCPU */ #include /* curthread */ #include +extern bool scheduler_stopped; + /* * If we have already panic'd and this is the thread that called * panic(), then don't block on any mutexes but silently succeed. * Otherwise, the kernel will deadlock since the scheduler isn't * going to run the thread that holds any lock we need. */ -#define SCHEDULER_STOPPED_TD(td) ({ \ - MPASS((td) == curthread); \ - __predict_false((td)->td_stopsched); \ -}) -#define SCHEDULER_STOPPED() SCHEDULER_STOPPED_TD(curthread) +#define SCHEDULER_STOPPED() __predict_false(scheduler_stopped) extern int osreldate; extern const void *zero_region; /* address space maps to a zeroed page */ extern int unmapped_buf_allowed; #ifdef __LP64__ #define IOSIZE_MAX iosize_max() #define DEVFS_IOSIZE_MAX devfs_iosize_max() #else #define IOSIZE_MAX SSIZE_MAX #define DEVFS_IOSIZE_MAX SSIZE_MAX #endif /* * General function declarations. */ struct inpcb; struct lock_object; struct malloc_type; struct mtx; struct proc; struct socket; struct thread; struct tty; struct ucred; struct uio; struct _jmp_buf; struct trapframe; struct eventtimer; int setjmp(struct _jmp_buf *) __returns_twice; void longjmp(struct _jmp_buf *, int) __dead2; int dumpstatus(vm_offset_t addr, off_t count); int nullop(void); int eopnotsupp(void); int ureadc(int, struct uio *); void hashdestroy(void *, struct malloc_type *, u_long); void *hashinit(int count, struct malloc_type *type, u_long *hashmask); void *hashinit_flags(int count, struct malloc_type *type, u_long *hashmask, int flags); #define HASH_NOWAIT 0x00000001 #define HASH_WAITOK 0x00000002 void *phashinit(int count, struct malloc_type *type, u_long *nentries); void *phashinit_flags(int count, struct malloc_type *type, u_long *nentries, int flags); void cpu_flush_dcache(void *, size_t); void cpu_rootconf(void); void critical_enter_KBI(void); void critical_exit_KBI(void); void critical_exit_preempt(void); void init_param1(void); void init_param2(long physpages); void init_static_kenv(char *, size_t); void tablefull(const char *); /* * Allocate per-thread "current" state in the linuxkpi */ extern int (*lkpi_alloc_current)(struct thread *, int); int linux_alloc_current_noop(struct thread *, int); #if (defined(KLD_MODULE) && !defined(KLD_TIED)) || defined(KTR_CRITICAL) || !defined(_KERNEL) || defined(GENOFFSET) #define critical_enter() critical_enter_KBI() #define critical_exit() critical_exit_KBI() #else static __inline void critical_enter(void) { struct thread_lite *td; td = (struct thread_lite *)curthread; td->td_critnest++; atomic_interrupt_fence(); } static __inline void critical_exit(void) { struct thread_lite *td; td = (struct thread_lite *)curthread; KASSERT(td->td_critnest != 0, ("critical_exit: td_critnest == 0")); atomic_interrupt_fence(); td->td_critnest--; atomic_interrupt_fence(); if (__predict_false(td->td_owepreempt)) critical_exit_preempt(); } #endif #ifdef EARLY_PRINTF typedef void early_putc_t(int ch); extern early_putc_t *early_putc; #endif int kvprintf(char const *, void (*)(int, void*), void *, int, __va_list) __printflike(1, 0); void log(int, const char *, ...) __printflike(2, 3); void log_console(struct uio *); void vlog(int, const char *, __va_list) __printflike(2, 0); int asprintf(char **ret, struct malloc_type *mtp, const char *format, ...) __printflike(3, 4); int printf(const char *, ...) __printflike(1, 2); int snprintf(char *, size_t, const char *, ...) __printflike(3, 4); int sprintf(char *buf, const char *, ...) __printflike(2, 3); int uprintf(const char *, ...) __printflike(1, 2); int vprintf(const char *, __va_list) __printflike(1, 0); int vasprintf(char **ret, struct malloc_type *mtp, const char *format, __va_list ap) __printflike(3, 0); int vsnprintf(char *, size_t, const char *, __va_list) __printflike(3, 0); int vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0); int vsprintf(char *buf, const char *, __va_list) __printflike(2, 0); int sscanf(const char *, char const * _Nonnull, ...) __scanflike(2, 3); int vsscanf(const char * _Nonnull, char const * _Nonnull, __va_list) __scanflike(2, 0); long strtol(const char *, char **, int); u_long strtoul(const char *, char **, int); quad_t strtoq(const char *, char **, int); u_quad_t strtouq(const char *, char **, int); void tprintf(struct proc *p, int pri, const char *, ...) __printflike(3, 4); void vtprintf(struct proc *, int, const char *, __va_list) __printflike(3, 0); void hexdump(const void *ptr, int length, const char *hdr, int flags); #define HD_COLUMN_MASK 0xff #define HD_DELIM_MASK 0xff00 #define HD_OMIT_COUNT (1 << 16) #define HD_OMIT_HEX (1 << 17) #define HD_OMIT_CHARS (1 << 18) #define ovbcopy(f, t, l) bcopy((f), (t), (l)) void explicit_bzero(void * _Nonnull, size_t); void *memset(void * _Nonnull buf, int c, size_t len); void *memcpy(void * _Nonnull to, const void * _Nonnull from, size_t len); void *memmove(void * _Nonnull dest, const void * _Nonnull src, size_t n); int memcmp(const void *b1, const void *b2, size_t len); #ifdef SAN_NEEDS_INTERCEPTORS #define SAN_INTERCEPTOR(func) \ __CONCAT(SAN_INTERCEPTOR_PREFIX, __CONCAT(_, func)) void *SAN_INTERCEPTOR(memset)(void *, int, size_t); void *SAN_INTERCEPTOR(memcpy)(void *, const void *, size_t); void *SAN_INTERCEPTOR(memmove)(void *, const void *, size_t); int SAN_INTERCEPTOR(memcmp)(const void *, const void *, size_t); #ifndef SAN_RUNTIME #define bcopy(from, to, len) SAN_INTERCEPTOR(memmove)((to), (from), (len)) #define bzero(buf, len) SAN_INTERCEPTOR(memset)((buf), 0, (len)) #define bcmp(b1, b2, len) SAN_INTERCEPTOR(memcmp)((b1), (b2), (len)) #define memset(buf, c, len) SAN_INTERCEPTOR(memset)((buf), (c), (len)) #define memcpy(to, from, len) SAN_INTERCEPTOR(memcpy)((to), (from), (len)) #define memmove(dest, src, n) SAN_INTERCEPTOR(memmove)((dest), (src), (n)) #define memcmp(b1, b2, len) SAN_INTERCEPTOR(memcmp)((b1), (b2), (len)) #endif /* !SAN_RUNTIME */ #else /* !SAN_NEEDS_INTERCEPTORS */ #define bcopy(from, to, len) __builtin_memmove((to), (from), (len)) #define bzero(buf, len) __builtin_memset((buf), 0, (len)) #define bcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len)) #define memset(buf, c, len) __builtin_memset((buf), (c), (len)) #define memcpy(to, from, len) __builtin_memcpy((to), (from), (len)) #define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) #define memcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len)) #endif /* SAN_NEEDS_INTERCEPTORS */ void *memset_early(void * _Nonnull buf, int c, size_t len); #define bzero_early(buf, len) memset_early((buf), 0, (len)) void *memcpy_early(void * _Nonnull to, const void * _Nonnull from, size_t len); void *memmove_early(void * _Nonnull dest, const void * _Nonnull src, size_t n); #define bcopy_early(from, to, len) memmove_early((to), (from), (len)) #define copystr(src, dst, len, outlen) ({ \ size_t __r, __len, *__outlen; \ \ __len = (len); \ __outlen = (outlen); \ __r = strlcpy((dst), (src), __len); \ if (__outlen != NULL) \ *__outlen = ((__r >= __len) ? __len : __r + 1); \ ((__r >= __len) ? ENAMETOOLONG : 0); \ }) int __result_use_check copyinstr(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len, size_t * __restrict lencopied); int __result_use_check copyin(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int __result_use_check copyin_nofault(const void * __restrict udaddr, void * _Nonnull __restrict kaddr, size_t len); int copyout(const void * _Nonnull __restrict kaddr, void * __restrict udaddr, size_t len); int copyout_nofault(const void * _Nonnull __restrict kaddr, void * __restrict udaddr, size_t len); #ifdef SAN_NEEDS_INTERCEPTORS int SAN_INTERCEPTOR(copyin)(const void *, void *, size_t); int SAN_INTERCEPTOR(copyinstr)(const void *, void *, size_t, size_t *); int SAN_INTERCEPTOR(copyout)(const void *, void *, size_t); #ifndef SAN_RUNTIME #define copyin(u, k, l) SAN_INTERCEPTOR(copyin)((u), (k), (l)) #define copyinstr(u, k, l, lc) SAN_INTERCEPTOR(copyinstr)((u), (k), (l), (lc)) #define copyout(k, u, l) SAN_INTERCEPTOR(copyout)((k), (u), (l)) #endif /* !SAN_RUNTIME */ #endif /* SAN_NEEDS_INTERCEPTORS */ int fubyte(volatile const void *base); long fuword(volatile const void *base); int fuword16(volatile const void *base); int32_t fuword32(volatile const void *base); int64_t fuword64(volatile const void *base); int __result_use_check fueword(volatile const void *base, long *val); int __result_use_check fueword32(volatile const void *base, int32_t *val); int __result_use_check fueword64(volatile const void *base, int64_t *val); int subyte(volatile void *base, int byte); int suword(volatile void *base, long word); int suword16(volatile void *base, int word); int suword32(volatile void *base, int32_t word); int suword64(volatile void *base, int64_t word); uint32_t casuword32(volatile uint32_t *base, uint32_t oldval, uint32_t newval); u_long casuword(volatile u_long *p, u_long oldval, u_long newval); int casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval); int casueword(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval); #if defined(SAN_NEEDS_INTERCEPTORS) && !defined(KCSAN) int SAN_INTERCEPTOR(fubyte)(volatile const void *base); int SAN_INTERCEPTOR(fuword16)(volatile const void *base); int SAN_INTERCEPTOR(fueword)(volatile const void *base, long *val); int SAN_INTERCEPTOR(fueword32)(volatile const void *base, int32_t *val); int SAN_INTERCEPTOR(fueword64)(volatile const void *base, int64_t *val); int SAN_INTERCEPTOR(subyte)(volatile void *base, int byte); int SAN_INTERCEPTOR(suword)(volatile void *base, long word); int SAN_INTERCEPTOR(suword16)(volatile void *base, int word); int SAN_INTERCEPTOR(suword32)(volatile void *base, int32_t word); int SAN_INTERCEPTOR(suword64)(volatile void *base, int64_t word); int SAN_INTERCEPTOR(casueword32)(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval); int SAN_INTERCEPTOR(casueword)(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval); #ifndef SAN_RUNTIME #define fubyte(b) SAN_INTERCEPTOR(fubyte)((b)) #define fuword16(b) SAN_INTERCEPTOR(fuword16)((b)) #define fueword(b, v) SAN_INTERCEPTOR(fueword)((b), (v)) #define fueword32(b, v) SAN_INTERCEPTOR(fueword32)((b), (v)) #define fueword64(b, v) SAN_INTERCEPTOR(fueword64)((b), (v)) #define subyte(b, w) SAN_INTERCEPTOR(subyte)((b), (w)) #define suword(b, w) SAN_INTERCEPTOR(suword)((b), (w)) #define suword16(b, w) SAN_INTERCEPTOR(suword16)((b), (w)) #define suword32(b, w) SAN_INTERCEPTOR(suword32)((b), (w)) #define suword64(b, w) SAN_INTERCEPTOR(suword64)((b), (w)) #define casueword32(b, o, p, n) SAN_INTERCEPTOR(casueword32)((b), (o), (p), (n)) #define casueword(b, o, p, n) SAN_INTERCEPTOR(casueword)((b), (o), (p), (n)) #endif /* !SAN_RUNTIME */ #endif /* SAN_NEEDS_INTERCEPTORS && !KCSAN */ int sysbeep(int hertz, sbintime_t duration); void hardclock(int cnt, int usermode); void hardclock_sync(int cpu); void statclock(int cnt, int usermode); void profclock(int cnt, int usermode, uintfptr_t pc); int hardclockintr(void); void startprofclock(struct proc *); void stopprofclock(struct proc *); void cpu_startprofclock(void); void cpu_stopprofclock(void); void suspendclock(void); void resumeclock(void); sbintime_t cpu_idleclock(void); void cpu_activeclock(void); void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt); void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq); extern int cpu_disable_c2_sleep; extern int cpu_disable_c3_sleep; extern void (*tcp_hpts_softclock)(void); #define tcp_hpts_softclock() do { \ if (tcp_hpts_softclock != NULL) \ tcp_hpts_softclock(); \ } while (0) char *kern_getenv(const char *name); void freeenv(char *env); int getenv_int(const char *name, int *data); int getenv_uint(const char *name, unsigned int *data); int getenv_long(const char *name, long *data); int getenv_ulong(const char *name, unsigned long *data); int getenv_string(const char *name, char *data, int size); int getenv_int64(const char *name, int64_t *data); int getenv_uint64(const char *name, uint64_t *data); int getenv_quad(const char *name, quad_t *data); int getenv_bool(const char *name, bool *data); bool getenv_is_true(const char *name); bool getenv_is_false(const char *name); int kern_setenv(const char *name, const char *value); int kern_unsetenv(const char *name); int testenv(const char *name); int getenv_array(const char *name, void *data, int size, int *psize, int type_size, bool allow_signed); #define GETENV_UNSIGNED false /* negative numbers not allowed */ #define GETENV_SIGNED true /* negative numbers allowed */ typedef uint64_t (cpu_tick_f)(void); void set_cputicker(cpu_tick_f *func, uint64_t freq, bool isvariable); extern cpu_tick_f *cpu_ticks; uint64_t cpu_tickrate(void); uint64_t cputick2usec(uint64_t tick); #include /* Initialize the world */ void consinit(void); void cpu_initclocks(void); void cpu_initclocks_bsp(void); void cpu_initclocks_ap(void); void usrinfoinit(void); /* Finalize the world */ void kern_reboot(int) __dead2; void shutdown_nice(int); /* Stubs for obsolete functions that used to be for interrupt management */ static __inline intrmask_t splhigh(void) { return 0; } static __inline intrmask_t splimp(void) { return 0; } static __inline intrmask_t splnet(void) { return 0; } static __inline intrmask_t spltty(void) { return 0; } static __inline void splx(intrmask_t ipl __unused) { return; } /* * Common `proc' functions are declared here so that proc.h can be included * less often. */ int _sleep(const void * _Nonnull chan, struct lock_object *lock, int pri, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep(chan, mtx, pri, wmesg, timo) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), \ tick_sbt * (timo), 0, C_HARDCLOCK) #define msleep_sbt(chan, mtx, pri, wmesg, bt, pr, flags) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (bt), (pr), \ (flags)) int msleep_spin_sbt(const void * _Nonnull chan, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define msleep_spin(chan, mtx, wmesg, timo) \ msleep_spin_sbt((chan), (mtx), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); static __inline int pause(const char *wmesg, int timo) { return (pause_sbt(wmesg, tick_sbt * timo, 0, C_HARDCLOCK)); } #define pause_sig(wmesg, timo) \ pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK | C_CATCH) #define tsleep(chan, pri, wmesg, timo) \ _sleep((chan), NULL, (pri), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) #define tsleep_sbt(chan, pri, wmesg, bt, pr, flags) \ _sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags)) void wakeup(const void *chan); void wakeup_one(const void *chan); void wakeup_any(const void *chan); /* * Common `struct cdev *' stuff are declared here to avoid #include poisoning */ struct cdev; dev_t dev2udev(struct cdev *x); const char *devtoname(struct cdev *cdev); #ifdef __LP64__ size_t devfs_iosize_max(void); size_t iosize_max(void); #endif int poll_no_poll(int events); /* XXX: Should be void nanodelay(u_int nsec); */ void DELAY(int usec); int kcmp_cmp(uintptr_t a, uintptr_t b); /* Root mount holdback API */ struct root_hold_token { int flags; const char *who; TAILQ_ENTRY(root_hold_token) list; }; struct root_hold_token *root_mount_hold(const char *identifier); void root_mount_hold_token(const char *identifier, struct root_hold_token *h); void root_mount_rel(struct root_hold_token *h); int root_mounted(void); /* * Unit number allocation API. (kern/subr_unit.c) */ struct unrhdr; #define UNR_NO_MTX ((void *)(uintptr_t)-1) struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex); void delete_unrhdr(struct unrhdr *uh); void clear_unrhdr(struct unrhdr *uh); void clean_unrhdr(struct unrhdr *uh); void clean_unrhdrl(struct unrhdr *uh); int alloc_unr(struct unrhdr *uh); int alloc_unr_specific(struct unrhdr *uh, u_int item); int alloc_unrl(struct unrhdr *uh); void free_unr(struct unrhdr *uh, u_int item); void *create_iter_unr(struct unrhdr *uh); int next_iter_unr(void *handle); void free_iter_unr(void *handle); struct unrhdr64 { uint64_t counter; }; static __inline void new_unrhdr64(struct unrhdr64 *unr64, uint64_t low) { unr64->counter = low; } static __inline uint64_t alloc_unr64(struct unrhdr64 *unr64) { return (atomic_fetchadd_64(&unr64->counter, 1)); } void intr_prof_stack_use(struct thread *td, struct trapframe *frame); void counted_warning(unsigned *counter, const char *msg); /* * APIs to manage deprecation and obsolescence. */ void _gone_in(int major, const char *msg); void _gone_in_dev(device_t dev, int major, const char *msg); #ifdef NO_OBSOLETE_CODE #define __gone_ok(m, msg) \ _Static_assert(m < P_OSREL_MAJOR(__FreeBSD_version)), \ "Obsolete code: " msg); #else #define __gone_ok(m, msg) #endif #define gone_in(major, msg) __gone_ok(major, msg) _gone_in(major, msg) #define gone_in_dev(dev, major, msg) __gone_ok(major, msg) _gone_in_dev(dev, major, msg) #ifdef INVARIANTS #define __diagused #else #define __diagused __unused #endif #ifdef WITNESS #define __witness_used #else #define __witness_used __unused #endif #endif /* _KERNEL */ __NULLABILITY_PRAGMA_POP #endif /* !_SYS_SYSTM_H_ */