diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c index 091abcda2a1e..5cefcf7a597b 100644 --- a/sys/kern/kern_lock.c +++ b/sys/kern/kern_lock.c @@ -1,1853 +1,1861 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Attilio Rao * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. 
*/ #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include #include #include #ifdef DEBUG_LOCKS #include #endif #include #include #include #ifdef DDB #include #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif +/* + * Hack. There should be prio_t or similar so that this is not necessary. + */ +_Static_assert((PRILASTFLAG * 2) - 1 <= USHRT_MAX, + "prio flags wont fit in u_short pri in struct lock"); + CTASSERT(LK_UNLOCKED == (LK_UNLOCKED & ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS))); /* Sleepqueue indices used in this file for exclusive and shared waiters. */ #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 #ifndef INVARIANTS #define _lockmgr_assert(lk, what, file, line) #endif #define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++) #define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--) #ifndef DEBUG_LOCKS #define STACK_PRINT(lk) #define STACK_SAVE(lk) #define STACK_ZERO(lk) #else #define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack) #define STACK_SAVE(lk) stack_save(&(lk)->lk_stack) #define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack) #endif #define LOCK_LOG2(lk, string, arg1, arg2) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR2(KTR_LOCK, (string), (arg1), (arg2)) #define LOCK_LOG3(lk, string, arg1, arg2, arg3) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3)) /* GIANT_SAVE() unlocks Giant once per iteration for as long as the caller still owns it, counting the unlocks in _i; GIANT_RESTORE() locks it again the same number of times. GIANT_DECLARE provides the _i counter. */ #define GIANT_DECLARE \ int _i = 0; \ WITNESS_SAVE_DECL(Giant) #define GIANT_RESTORE() do { \ if (__predict_false(_i > 0)) { \ while (_i--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) #define GIANT_SAVE() do { \ if (__predict_false(mtx_owned(&Giant))) { \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _i++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) /* Decide whether lock word x allows one more shared reference. True at once when, of LK_SHARE, LK_EXCLUSIVE_WAITERS and LK_EXCLUSIVE_SPINNERS, only LK_SHARE is set. Otherwise false when fp is set or LK_SHARE is clear. Otherwise true only if curthread has a nonzero td_lk_slocks count and LK_NODDLKTREAT is not in flags, or curthread has TDP_DEADLKTREAT set. */ static bool __always_inline LK_CAN_SHARE(uintptr_t x, int flags, bool fp) { if ((x & (LK_SHARE | LK_EXCLUSIVE_WAITERS | LK_EXCLUSIVE_SPINNERS)) == LK_SHARE) return (true); if (fp || (!(x & 
LK_SHARE))) return (false); if ((curthread->td_lk_slocks != 0 && !(flags & LK_NODDLKTREAT)) || (curthread->td_pflags & TDP_DEADLKTREAT)) return (true); return (false); } /* Request-flag helpers: LK_TRYOP() is nonzero for LK_NOWAIT requests, LK_CAN_WITNESS() is true when neither LK_NOWITNESS nor LK_NOWAIT is set, and LK_TRYWIT() yields LOP_TRYLOCK for try operations and 0 otherwise. */ #define LK_TRYOP(x) \ ((x) & LK_NOWAIT) #define LK_CAN_WITNESS(x) \ (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x)) #define LK_TRYWIT(x) \ (LK_TRYOP(x) ? LOP_TRYLOCK : 0) #define lockmgr_disowned(lk) \ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC) #define lockmgr_xlocked_v(v) \ (((v) & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread) #define lockmgr_xlocked(lk) lockmgr_xlocked_v(lockmgr_read_value(lk)) static void assert_lockmgr(const struct lock_object *lock, int how); #ifdef DDB static void db_show_lockmgr(const struct lock_object *lock); #endif static void lock_lockmgr(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_lockmgr(struct lock_object *lock); struct lock_class lock_class_lockmgr = { .lc_name = "lockmgr", .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE, .lc_assert = assert_lockmgr, #ifdef DDB .lc_ddb_show = db_show_lockmgr, #endif .lc_lock = lock_lockmgr, .lc_unlock = unlock_lockmgr, #ifdef KDTRACE_HOOKS .lc_owner = owner_lockmgr, #endif }; /* Run-time switch exposed through the sysctl below; the hard lock paths clear LK_ADAPTIVE from their flags when it is false. */ static __read_mostly bool lk_adaptive = true; static SYSCTL_NODE(_debug, OID_AUTO, lockmgr, CTLFLAG_RD, NULL, "lockmgr debugging"); SYSCTL_BOOL(_debug_lockmgr, OID_AUTO, adaptive_spinning, CTLFLAG_RW, &lk_adaptive, 0, ""); #define lockmgr_delay locks_delay /* Wait message, sleep priority and timeout handed to the hard lock paths. */ struct lockmgr_wait { const char *iwmesg; int ipri; int itimo; }; static bool __always_inline lockmgr_slock_try(struct lock *lk, uintptr_t *xp, int flags, bool fp); static bool __always_inline lockmgr_sunlock_try(struct lock *lk, uintptr_t *xp); /* Common exit step: when LK_INTERLOCK is set in flags, unlock ilk through its lock class; then call kick_proc0() if wakeup_swapper is nonzero. */ static void lockmgr_exit(u_int flags, struct lock_object *ilk, int wakeup_swapper) { struct lock_class *class; if (flags & LK_INTERLOCK) { class = LOCK_CLASS(ilk); class->lc_unlock(ilk); } if 
(__predict_false(wakeup_swapper)) kick_proc0(); } /* Bookkeeping after a shared acquire: lockstat probe, lock log entry, WITNESS, the thread's lock and shared-lock counters, and STACK_SAVE(). */ static void lockmgr_note_shared_acquire(struct lock *lk, int contested, uint64_t waittime, const char *file, int line, int flags) { LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(lockmgr__acquire, lk, contested, waittime, file, line, LOCKSTAT_READER); LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file, line); WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); TD_SLOCKS_INC(curthread); STACK_SAVE(lk); } /* Bookkeeping for a shared release: WITNESS, lock log entry, and the thread's lock and shared-lock counters. */ static void lockmgr_note_shared_release(struct lock *lk, const char *file, int line) { WITNESS_UNLOCK(&lk->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line); TD_LOCKS_DEC(curthread); TD_SLOCKS_DEC(curthread); } /* Bookkeeping after an exclusive acquire: lockstat probe, lock log entry, WITNESS, the thread's lock counter, and STACK_SAVE(). */ static void lockmgr_note_exclusive_acquire(struct lock *lk, int contested, uint64_t waittime, const char *file, int line, int flags) { LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(lockmgr__acquire, lk, contested, waittime, file, line, LOCKSTAT_WRITER); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } /* Bookkeeping for an exclusive release; the WITNESS call and the thread lock counter are skipped when the holder read from the lock word is LK_KERNPROC. */ static void lockmgr_note_exclusive_release(struct lock *lk, const char *file, int line) { if (LK_HOLDER(lockmgr_read_value(lk)) != LK_KERNPROC) { WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); } LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); } /* Return the holder encoded in the lock word, or NULL when LK_SHARE is set. */ static __inline struct thread * lockmgr_xholder(const struct lock *lk) { uintptr_t x; x = lockmgr_read_value(lk); return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x)); } /* * It assumes sleepq_lock held and returns with this one unheld. * It also assumes the generic interlock is sane and previously checked. * If LK_INTERLOCK is specified the interlock is not reacquired after the * sleep. 
*/ static __inline int sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, int queue) { GIANT_DECLARE; struct lock_class *class; int catch, error; class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; /* Separate the PCATCH bit from the priority bits kept by PRIMASK. */ catch = pri & PCATCH; pri &= PRIMASK; error = 0; LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk, (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared"); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); - if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0) - lk->lk_exslpfail++; + if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0) { + if (lk->lk_exslpfail < USHRT_MAX) + lk->lk_exslpfail++; + } /* Giant is released before queueing the sleep and taken back by GIANT_RESTORE() afterwards. */ GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? SLEEPQ_INTERRUPTIBLE : 0), queue); if ((flags & LK_TIMELOCK) && timo) sleepq_set_timeout(&lk->lock_object, timo); /* * Decisional switch for real sleeping. */ if ((flags & LK_TIMELOCK) && timo && catch) error = sleepq_timedwait_sig(&lk->lock_object, pri); else if ((flags & LK_TIMELOCK) && timo) error = sleepq_timedwait(&lk->lock_object, pri); else if (catch) error = sleepq_wait_sig(&lk->lock_object, pri); else sleepq_wait(&lk->lock_object, pri); GIANT_RESTORE(); /* With LK_SLEEPFAIL a sleep that ended without error is still reported as ENOLCK. */ if ((flags & LK_SLEEPFAIL) && error == 0) error = ENOLCK; return (error); } /* Drop one shared reference. lockmgr_sunlock_try() is attempted first; when it fails the sleepqueue lock is taken, a queue to wake is chosen, the lock word is replaced and that queue is broadcast. Returns the OR of the sleepq_broadcast() results, which the caller hands to lockmgr_exit(). */ static __inline int wakeupshlk(struct lock *lk, const char *file, int line) { uintptr_t v, x, orig_x; u_int realexslp; int queue, wakeup_swapper; wakeup_swapper = 0; for (;;) { x = lockmgr_read_value(lk); if (lockmgr_sunlock_try(lk, &x)) break; /* * We should have a sharer with waiters, so enter the hard * path in order to handle wakeups correctly. */ sleepq_lock(&lk->lock_object); orig_x = lockmgr_read_value(lk); retry_sleepq: x = orig_x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them preference in * order to avoid deadlock with shared runners up. 
* If interruptible sleeps left the exclusive queue empty * avoid a starvation for the threads sleeping on the shared * queue by giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying about * the real number of waiters with the LK_SLEEPFAIL flag on * because they may be used in conjunction with interruptible * sleeps so lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { - if (lk->lk_exslpfail < realexslp) { + if (lk->lk_exslpfail != USHRT_MAX && lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } - } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL on * and using interruptible sleeps/timeout may have * left spurious lk_exslpfail counts on, so clean * it up anyway. */ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } /* Retry the plain shared unlock against orig_x; on success release the sleepqueue lock and finish without waking the selected queue. */ if (lockmgr_sunlock_try(lk, &orig_x)) { sleepq_release(&lk->lock_object); break; } /* Expect one sharer plus the waiter/spinner bits and swap that for v; on failure restart from the value left in x. */ x |= LK_SHARERS_LOCK(1); if (!atomic_fcmpset_rel_ptr(&lk->lk_lock, &x, v)) { orig_x = x; goto retry_sleepq; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? 
"shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; } LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, LOCKSTAT_READER); return (wakeup_swapper); } /* The lock_class callbacks that follow are not supported for lockmgr locks; each one simply panics. */ static void assert_lockmgr(const struct lock_object *lock, int what) { panic("lockmgr locks do not support assertions"); } static void lock_lockmgr(struct lock_object *lock, uintptr_t how) { panic("lockmgr locks do not support sleep interlocking"); } static uintptr_t unlock_lockmgr(struct lock_object *lock) { panic("lockmgr locks do not support sleep interlocking"); } #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner) { panic("lockmgr locks do not support owner inquiring"); } #endif /* Initialize lk: derive the lock_object flags from the LK_* init flags, register the lock with lock_init() under the name wmesg, reset the lock word and counters, and store pri and timo in the lock. */ void lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags) { int iflags; MPASS((flags & ~LK_INIT_MASK) == 0); ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock, ("%s: lockmgr not aligned for %s: %p", __func__, wmesg, &lk->lk_lock)); iflags = LO_SLEEPABLE | LO_UPGRADABLE; if (flags & LK_CANRECURSE) iflags |= LO_RECURSABLE; if ((flags & LK_NODUP) == 0) iflags |= LO_DUPOK; if (flags & LK_NOPROFILE) iflags |= LO_NOPROFILE; if ((flags & LK_NOWITNESS) == 0) iflags |= LO_WITNESS; if (flags & LK_QUIET) iflags |= LO_QUIET; if (flags & LK_IS_VNODE) iflags |= LO_IS_VNODE; if (flags & LK_NEW) iflags |= LO_NEW; iflags |= flags & LK_NOSHARE; lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags); lk->lk_lock = LK_UNLOCKED; lk->lk_recurse = 0; lk->lk_exslpfail = 0; lk->lk_timo = timo; lk->lk_pri = pri; STACK_ZERO(lk); } /* * XXX: Gross hacks to manipulate external lock flags after * initialization. Used for certain vnode and buf locks. 
*/ void lockallowshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LK_NOSHARE; } /* Set LK_NOSHARE in the lock_object flags after asserting KA_XLOCKED. */ void lockdisableshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LK_NOSHARE; } /* Set LO_RECURSABLE in the lock_object flags after asserting KA_XLOCKED. */ void lockallowrecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LO_RECURSABLE; } /* Clear LO_RECURSABLE in the lock_object flags after asserting KA_XLOCKED. */ void lockdisablerecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LO_RECURSABLE; } /* Destroy lk; asserts that it is unlocked, not recursed and has a zero lk_exslpfail count before calling lock_destroy(). */ void lockdestroy(struct lock *lk) { KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held")); KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed")); KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters")); lock_destroy(&lk->lock_object); } /* Try to add one sharer to the lock word *xp. Returns true on success and false once LK_CAN_SHARE() rejects the current value. */ static bool __always_inline lockmgr_slock_try(struct lock *lk, uintptr_t *xp, int flags, bool fp) { /* * If no other thread has an exclusive lock, or * no exclusive waiter is present, bump the count of * sharers. Since we have to preserve the state of * waiters, if we fail to acquire the shared lock * loop back and retry. 
*/ while (LK_CAN_SHARE(*xp, flags, fp)) { if (atomic_fcmpset_acq_ptr(&lk->lk_lock, xp, *xp + LK_ONE_SHARER)) { return (true); } } return (false); } /* Try to drop one sharer from *xp without a wakeup; attempted only while more than one sharer is recorded or no LK_ALL_WAITERS bit is set. Returns false when neither holds, leaving the release to the caller's slow path. */ static bool __always_inline lockmgr_sunlock_try(struct lock *lk, uintptr_t *xp) { for (;;) { if (LK_SHARERS(*xp) > 1 || !(*xp & LK_ALL_WAITERS)) { if (atomic_fcmpset_rel_ptr(&lk->lk_lock, xp, *xp - LK_ONE_SHARER)) return (true); continue; } break; } return (false); } /* Spin with lock_delay() for a shared request. Returns true with *xp updated once LK_CAN_SHARE() accepts the lock word; returns false when the holder is LK_KERNPROC, the lock is held shared, there is no owner, the owner is not running, or a waiter bit is set. */ static bool lockmgr_slock_adaptive(struct lock_delay_arg *lda, struct lock *lk, uintptr_t *xp, int flags) { struct thread *owner; uintptr_t x; x = *xp; MPASS(x != LK_UNLOCKED); owner = (struct thread *)LK_HOLDER(x); for (;;) { MPASS(owner != curthread); if (owner == (struct thread *)LK_KERNPROC) return (false); if ((x & LK_SHARE) && LK_SHARERS(x) > 0) return (false); if (owner == NULL) return (false); if (!TD_IS_RUNNING(owner)) return (false); if ((x & LK_ALL_WAITERS) != 0) return (false); lock_delay(lda); x = lockmgr_read_value(lk); if (LK_CAN_SHARE(x, flags, false)) { *xp = x; return (true); } owner = (struct thread *)LK_HOLDER(x); } } /* Slow path for a shared request. Returns EDEADLK when the holder in the lock word is curthread, EBUSY for a failed try operation, the sleeplk() error when the sleep fails, and 0 once the shared reference is taken (or immediately, without locking, when KERNEL_PANICKED() is true). lwa may be NULL, in which case the wait parameters stored in the lock are used. */ static __noinline int lockmgr_slock_hard(struct lock *lk, u_int flags, struct lock_object *ilk, const char *file, int line, struct lockmgr_wait *lwa) { uintptr_t tid, x; int error = 0; const char *iwmesg; int ipri, itimo; #ifdef KDTRACE_HOOKS uint64_t sleep_time = 0; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif struct lock_delay_arg lda; if (KERNEL_PANICKED()) goto out; tid = (uintptr_t)curthread; if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? ilk : NULL); x = lockmgr_read_value(lk); lock_delay_arg_init(&lda, &lockmgr_delay); if (!lk_adaptive) flags &= ~LK_ADAPTIVE; /* * The lock may already be locked exclusive by curthread, * avoid deadlock. 
*/ if (LK_HOLDER(x) == tid) { LOCK_LOG2(lk, "%s: %p already held in exclusive mode", __func__, lk); error = EDEADLK; goto out; } for (;;) { if (lockmgr_slock_try(lk, &x, flags, false)) break; /* Spin only when LK_ADAPTIVE is set and LK_INTERLOCK is not. */ if ((flags & (LK_ADAPTIVE | LK_INTERLOCK)) == LK_ADAPTIVE) { if (lockmgr_slock_adaptive(&lda, lk, &x, flags)) continue; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probably will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lockmgr_read_value(lk); retry_sleepq: /* * if the lock can be acquired in shared mode, try * again. */ if (LK_CAN_SHARE(x, flags, false)) { sleepq_release(&lk->lock_object); continue; } /* * Try to set the LK_SHARED_WAITERS flag. If we fail, * loop back and retry. */ if ((x & LK_SHARED_WAITERS) == 0) { if (!atomic_fcmpset_acq_ptr(&lk->lk_lock, &x, x | LK_SHARED_WAITERS)) { goto retry_sleepq; } LOCK_LOG2(lk, "%s: %p set shared waiters flag", __func__, lk); } /* Without caller-supplied wait parameters fall back to the name, priority and timeout stored in the lock. */ if (lwa == NULL) { iwmesg = lk->lock_object.lo_name; ipri = lk->lk_pri; itimo = lk->lk_timo; } else { iwmesg = lwa->iwmesg; ipri = lwa->ipri; itimo = lwa->itimo; } /* * As far as we have been unable to acquire the * shared lock and the shared waiters flag is set, * we will sleep. 
*/ #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&lk->lock_object); #endif error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_SHARED_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&lk->lock_object); #endif /* sleeplk() has already unlocked the interlock, so keep lockmgr_exit() from unlocking it again. */ flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); x = lockmgr_read_value(lk); } if (error == 0) { #ifdef KDTRACE_HOOKS if (sleep_time != 0) LOCKSTAT_RECORD4(lockmgr__block, lk, sleep_time, LOCKSTAT_READER, (x & LK_SHARE) == 0, (x & LK_SHARE) == 0 ? 0 : LK_SHARERS(x)); #endif #ifdef LOCK_PROFILING lockmgr_note_shared_acquire(lk, contested, waittime, file, line, flags); #else lockmgr_note_shared_acquire(lk, 0, 0, file, line, flags); #endif } out: lockmgr_exit(flags, ilk, 0); return (error); } /* Spin with lock_delay() for an exclusive request. Returns true with *xp updated once the lock word reads LK_UNLOCKED; returns false when there is no owner, the lock is held shared, the holder is LK_KERNPROC, the owner is not running, or a waiter bit is set. */ static bool lockmgr_xlock_adaptive(struct lock_delay_arg *lda, struct lock *lk, uintptr_t *xp) { struct thread *owner; uintptr_t x; x = *xp; MPASS(x != LK_UNLOCKED); owner = (struct thread *)LK_HOLDER(x); for (;;) { MPASS(owner != curthread); if (owner == NULL) return (false); if ((x & LK_SHARE) && LK_SHARERS(x) > 0) return (false); if (owner == (struct thread *)LK_KERNPROC) return (false); if (!TD_IS_RUNNING(owner)) return (false); if ((x & LK_ALL_WAITERS) != 0) return (false); lock_delay(lda); x = lockmgr_read_value(lk); if (x == LK_UNLOCKED) { *xp = x; return (true); } owner = (struct thread *)LK_HOLDER(x); } } /* Slow path for an exclusive request. When curthread already owns the lock it recurses if allowed, otherwise returns EBUSY for a try operation or panics. Otherwise it spins or sleeps on the exclusive queue until the lock word can be claimed. Returns 0, EBUSY, or the sleeplk() error. lwa may be NULL, in which case the wait parameters stored in the lock are used. */ static __noinline int lockmgr_xlock_hard(struct lock *lk, u_int flags, struct lock_object *ilk, const char *file, int line, struct lockmgr_wait *lwa) { struct lock_class *class; uintptr_t tid, x, v; int error = 0; const char *iwmesg; int ipri, itimo; #ifdef KDTRACE_HOOKS uint64_t sleep_time = 0; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif struct lock_delay_arg lda; if (KERNEL_PANICKED()) goto out; tid = (uintptr_t)curthread; if (LK_CAN_WITNESS(flags)) 
WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * If curthread already holds the lock and this one is * allowed to recurse, simply recurse on it. */ if (lockmgr_xlocked(lk)) { if ((flags & LK_CANRECURSE) == 0 && (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) { /* * If the lock is expected to not panic just * give up and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; goto out; } if (flags & LK_INTERLOCK) { class = LOCK_CLASS(ilk); class->lc_unlock(ilk); } STACK_PRINT(lk); panic("%s: recursing on non recursive lockmgr %p " "@ %s:%d\n", __func__, lk, file, line); } /* Recursion is permitted: flag the lock word and bump the recursion count. */ atomic_set_ptr(&lk->lk_lock, LK_WRITER_RECURSED); lk->lk_recurse++; LOCK_LOG2(lk, "%s: %p recursing", __func__, lk); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); goto out; } /* Begin by assuming the lock is free; NOTE(review): this relies on a failed fcmpset leaving the observed lock word in x - confirm against atomic(9). */ x = LK_UNLOCKED; lock_delay_arg_init(&lda, &lockmgr_delay); if (!lk_adaptive) flags &= ~LK_ADAPTIVE; for (;;) { if (x == LK_UNLOCKED) { if (atomic_fcmpset_acq_ptr(&lk->lk_lock, &x, tid)) break; continue; } /* Spin only when LK_ADAPTIVE is set and LK_INTERLOCK is not. */ if ((flags & (LK_ADAPTIVE | LK_INTERLOCK)) == LK_ADAPTIVE) { if (lockmgr_xlock_adaptive(&lda, lk, &x)) continue; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probably will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lockmgr_read_value(lk); retry_sleepq: /* * if the lock has been released while we spun on * the sleepqueue chain lock just try again. 
*/ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } /* * The lock can be in the state where there is a * pending queue of waiters, but still no owner. * This happens when the lock is contested and an * owner is going to claim the lock. * If curthread is the one successfully acquiring it * claim lock ownership and return, preserving waiters * flags. */ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v &= ~LK_EXCLUSIVE_SPINNERS; if (atomic_fcmpset_acq_ptr(&lk->lk_lock, &x, tid | v)) { sleepq_release(&lk->lock_object); LOCK_LOG2(lk, "%s: %p claimed by a new writer", __func__, lk); break; } goto retry_sleepq; } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. */ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_fcmpset_ptr(&lk->lk_lock, &x, x | LK_EXCLUSIVE_WAITERS)) { goto retry_sleepq; } LOCK_LOG2(lk, "%s: %p set excl waiters flag", __func__, lk); } /* Without caller-supplied wait parameters fall back to the name, priority and timeout stored in the lock. */ if (lwa == NULL) { iwmesg = lk->lock_object.lo_name; ipri = lk->lk_pri; itimo = lk->lk_timo; } else { iwmesg = lwa->iwmesg; ipri = lwa->ipri; itimo = lwa->itimo; } /* * As far as we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep. */ #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(&lk->lock_object); #endif error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_EXCLUSIVE_QUEUE); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(&lk->lock_object); #endif /* sleeplk() has already unlocked the interlock, so keep lockmgr_exit() from unlocking it again. */ flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); x = lockmgr_read_value(lk); } if (error == 0) { #ifdef KDTRACE_HOOKS if (sleep_time != 0) LOCKSTAT_RECORD4(lockmgr__block, lk, sleep_time, LOCKSTAT_WRITER, (x & LK_SHARE) == 0, (x & LK_SHARE) == 0 ? 
0 : LK_SHARERS(x)); #endif #ifdef LOCK_PROFILING lockmgr_note_exclusive_acquire(lk, contested, waittime, file, line, flags); #else lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, flags); #endif } out: lockmgr_exit(flags, ilk, 0); return (error); } /* Upgrade a shared hold to an exclusive one. With more than one sharer recorded, LK_TRYUPGRADE fails with EBUSY while other operations drop the shared reference and go through lockmgr_xlock_hard(). With a single sharer the lock word is switched to curthread in place, keeping the LK_ALL_WAITERS bits. */ static __noinline int lockmgr_upgrade(struct lock *lk, u_int flags, struct lock_object *ilk, const char *file, int line, struct lockmgr_wait *lwa) { uintptr_t tid, v, setv; int error = 0; int op; if (KERNEL_PANICKED()) goto out; tid = (uintptr_t)curthread; _lockmgr_assert(lk, KA_SLOCKED, file, line); op = flags & LK_TYPE_MASK; v = lockmgr_read_value(lk); for (;;) { if (LK_SHARERS(v) > 1) { if (op == LK_TRYUPGRADE) { LOCK_LOG2(lk, "%s: %p failed the nowait upgrade", __func__, lk); error = EBUSY; goto out; } if (atomic_fcmpset_rel_ptr(&lk->lk_lock, &v, v - LK_ONE_SHARER)) { lockmgr_note_shared_release(lk, file, line); goto out_xlock; } continue; } MPASS((v & ~LK_ALL_WAITERS) == LK_SHARERS_LOCK(1)); setv = tid; setv |= (v & LK_ALL_WAITERS); /* * Try to switch from one shared lock to an exclusive one. * We need to preserve waiters flags during the operation. */ if (atomic_fcmpset_ptr(&lk->lk_lock, &v, setv)) { LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); LOCKSTAT_RECORD0(lockmgr__upgrade, lk); TD_SLOCKS_DEC(curthread); goto out; } } out_xlock: error = lockmgr_xlock_hard(lk, flags, ilk, file, line, lwa); /* lockmgr_xlock_hard() ends with its own lockmgr_exit() call, so LK_INTERLOCK is cleared before the exit below. */ flags &= ~LK_INTERLOCK; out: lockmgr_exit(flags, ilk, 0); return (error); } /* Lock entry point with inline fast paths for LK_SHARED and LK_EXCLUSIVE. Upgrades are passed to lockmgr_upgrade(); any request not completed here is forwarded to __lockmgr_args() with the default wait parameters. Returns 0 without locking when KERNEL_PANICKED() is true. */ int lockmgr_lock_flags(struct lock *lk, u_int flags, struct lock_object *ilk, const char *file, int line) { struct lock_class *class; uintptr_t x, tid; u_int op; bool locked; if (KERNEL_PANICKED()) return (0); op = flags & LK_TYPE_MASK; locked = false; switch (op) { case LK_SHARED: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? 
ilk : NULL); /* Shared locking is disabled for this lock: leave locked false so the request is forwarded to __lockmgr_args(). */ if (__predict_false(lk->lock_object.lo_flags & LK_NOSHARE)) break; x = lockmgr_read_value(lk); if (lockmgr_slock_try(lk, &x, flags, true)) { lockmgr_note_shared_acquire(lk, 0, 0, file, line, flags); locked = true; } else { return (lockmgr_slock_hard(lk, flags, ilk, file, line, NULL)); } break; case LK_EXCLUSIVE: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); tid = (uintptr_t)curthread; if (lockmgr_read_value(lk) == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) { lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, flags); locked = true; } else { return (lockmgr_xlock_hard(lk, flags, ilk, file, line, NULL)); } break; case LK_UPGRADE: case LK_TRYUPGRADE: return (lockmgr_upgrade(lk, flags, ilk, file, line, NULL)); default: break; } /* A fast path succeeded: unlock the interlock if one was passed. Everything else goes to __lockmgr_args() with the default wait parameters. */ if (__predict_true(locked)) { if (__predict_false(flags & LK_INTERLOCK)) { class = LOCK_CLASS(ilk); class->lc_unlock(ilk); } return (0); } else { return (__lockmgr_args(lk, flags, ilk, LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, file, line)); } } /* Slow path of a shared unlock: delegate to wakeupshlk() and run the common exit step. Always returns 0. */ static __noinline int lockmgr_sunlock_hard(struct lock *lk, uintptr_t x, u_int flags, struct lock_object *ilk, const char *file, int line) { int wakeup_swapper = 0; if (KERNEL_PANICKED()) goto out; wakeup_swapper = wakeupshlk(lk, file, line); out: lockmgr_exit(flags, ilk, wakeup_swapper); return (0); } /* Slow path of an exclusive unlock: unrecurses a recursed lock, otherwise releases the lock word and wakes one of the sleep queues. Always returns 0. */ static __noinline int lockmgr_xunlock_hard(struct lock *lk, uintptr_t x, u_int flags, struct lock_object *ilk, const char *file, int line) { uintptr_t tid, v; int wakeup_swapper = 0; u_int realexslp; int queue; if (KERNEL_PANICKED()) goto out; tid = (uintptr_t)curthread; /* * As first option, treat the lock as if it has not * any waiter. * Fix-up the tid var if the lock has been disowned. */ if (LK_HOLDER(x) == LK_KERNPROC) tid = LK_KERNPROC; /* * The lock is held in exclusive mode. * If the lock is recursed also, then unrecurse it. 
*/ if (lockmgr_recursed_v(x)) { LOCK_LOG2(lk, "%s: %p unrecursing", __func__, lk); lk->lk_recurse--; if (lk->lk_recurse == 0) atomic_clear_ptr(&lk->lk_lock, LK_WRITER_RECURSED); goto out; } if (tid != LK_KERNPROC) LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, LOCKSTAT_WRITER); /* Quick attempt: when the lock word passed in is exactly tid, a single compare-and-set to LK_UNLOCKED completes the release. */ if (x == tid && atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) goto out; sleepq_lock(&lk->lock_object); x = lockmgr_read_value(lk); v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them * preference in order to avoid deadlock with * shared runners up. * If interruptible sleeps left the exclusive queue * empty avoid a starvation for the threads sleeping * on the shared queue by giving them precedence * and cleaning up the exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying * about the real number of waiters with the * LK_SLEEPFAIL flag on because they may be used in * conjunction with interruptible sleeps so * lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { - if (lk->lk_exslpfail < realexslp) { + if (lk->lk_exslpfail != USHRT_MAX && lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL * on and using interruptible sleeps/timeout * may have left spurious lk_exslpfail counts * on, so clean it up anyway. 
*/ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); /* Store the new lock word, then broadcast to the chosen queue before releasing the sleepqueue lock. */ atomic_store_rel_ptr(&lk->lk_lock, v); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); out: lockmgr_exit(flags, ilk, wakeup_swapper); return (0); } /* * Lightweight entry points for common operations. * * Functionality is similar to sx locks, in that none of the additional lockmgr * features are supported. To be clear, these are NOT supported: * 1. shared locking disablement * 2. returning with an error after sleep * 3. unlocking the interlock * * If in doubt, use lockmgr_lock_flags. */ int lockmgr_slock(struct lock *lk, u_int flags, const char *file, int line) { uintptr_t x; MPASS((flags & LK_TYPE_MASK) == LK_SHARED); MPASS((flags & LK_INTERLOCK) == 0); MPASS((lk->lock_object.lo_flags & LK_NOSHARE) == 0); if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, NULL); x = lockmgr_read_value(lk); if (__predict_true(lockmgr_slock_try(lk, &x, flags, true))) { lockmgr_note_shared_acquire(lk, 0, 0, file, line, flags); return (0); } return (lockmgr_slock_hard(lk, flags | LK_ADAPTIVE, NULL, file, line, NULL)); } /* Lightweight exclusive lock: a single compare-and-set from LK_UNLOCKED to curthread, otherwise lockmgr_xlock_hard() with LK_ADAPTIVE added to the flags. */ int lockmgr_xlock(struct lock *lk, u_int flags, const char *file, int line) { uintptr_t tid; MPASS((flags & LK_TYPE_MASK) == LK_EXCLUSIVE); MPASS((flags & LK_INTERLOCK) == 0); if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); tid = (uintptr_t)curthread; if (atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) { lockmgr_note_exclusive_acquire(lk, 0, 0, file, line, flags); return (0); } return (lockmgr_xlock_hard(lk, flags | LK_ADAPTIVE, NULL, file, line, NULL)); } /* Lightweight unlock for either mode. The file and line reported to the debugging hooks are those of this function, not of the caller. */ int lockmgr_unlock(struct lock *lk) { uintptr_t x, tid; const char *file; int line; file = __FILE__; line = __LINE__; _lockmgr_assert(lk, KA_LOCKED, file, line); x = 
lockmgr_read_value(lk); if (__predict_true(x & LK_SHARE) != 0) { /* Shared hold: try the wakeup-free release first and fall back to the hard path. */ lockmgr_note_shared_release(lk, file, line); if (lockmgr_sunlock_try(lk, &x)) { LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk, LOCKSTAT_READER); } else { return (lockmgr_sunlock_hard(lk, x, LK_RELEASE, NULL, file, line)); } } else { /* Exclusive hold: a lock word equal to curthread is released with one compare-and-set, anything else takes the hard path. */ tid = (uintptr_t)curthread; lockmgr_note_exclusive_release(lk, file, line); if (x == tid && atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) { LOCKSTAT_PROFILE_RELEASE_RWLOCK(lockmgr__release, lk,LOCKSTAT_WRITER); } else { return (lockmgr_xunlock_hard(lk, x, LK_RELEASE, NULL, file, line)); } } return (0); } int __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, const char *file, int line) { GIANT_DECLARE; struct lockmgr_wait lwa; struct lock_class *class; const char *iwmesg; uintptr_t tid, v, x; u_int op, realexslp; int error, ipri, itimo, queue, wakeup_swapper; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif if (KERNEL_PANICKED()) return (0); error = 0; tid = (uintptr_t)curthread; op = (flags & LK_TYPE_MASK); /* Replace the *_DEFAULT placeholders with the name, priority and timeout stored in the lock. */ iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg; ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri; itimo = (timo == LK_TIMO_DEFAULT) ? lk->lk_timo : timo; lwa.iwmesg = iwmesg; lwa.ipri = ipri; lwa.itimo = itimo; MPASS((flags & ~LK_TOTAL_MASK) == 0); KASSERT((op & (op - 1)) == 0, ("%s: Invalid requested operation @ %s:%d", __func__, file, line)); KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 || (op != LK_DOWNGRADE && op != LK_RELEASE), ("%s: Invalid flags in regard of the operation desired @ %s:%d", __func__, file, line)); KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL, ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d", __func__, file, line)); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread, lk->lock_object.lo_name, file, line)); class = (flags & LK_INTERLOCK) ? 
LOCK_CLASS(ilk) : NULL; if (lk->lock_object.lo_flags & LK_NOSHARE) { switch (op) { case LK_SHARED: op = LK_EXCLUSIVE; break; case LK_UPGRADE: case LK_TRYUPGRADE: case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED, file, line); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); return (0); } } wakeup_swapper = 0; switch (op) { case LK_SHARED: return (lockmgr_slock_hard(lk, flags, ilk, file, line, &lwa)); break; case LK_UPGRADE: case LK_TRYUPGRADE: return (lockmgr_upgrade(lk, flags, ilk, file, line, &lwa)); break; case LK_EXCLUSIVE: return (lockmgr_xlock_hard(lk, flags, ilk, file, line, &lwa)); break; case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED, file, line); WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n", __func__, iwmesg, file, line); } TD_SLOCKS_INC(curthread); /* * In order to preserve waiters flags, just spin. */ for (;;) { x = lockmgr_read_value(lk); MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_SHARERS_LOCK(1) | x)) break; cpu_spinwait(); } LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(lockmgr__downgrade, lk); break; case LK_RELEASE: _lockmgr_assert(lk, KA_LOCKED, file, line); x = lockmgr_read_value(lk); if (__predict_true(x & LK_SHARE) != 0) { lockmgr_note_shared_release(lk, file, line); return (lockmgr_sunlock_hard(lk, x, flags, ilk, file, line)); } else { lockmgr_note_exclusive_release(lk, file, line); return (lockmgr_xunlock_hard(lk, x, flags, ilk, file, line)); } break; case LK_DRAIN: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * Trying to drain a lock we already own will result in a * deadlock. 
*/ if (lockmgr_xlocked(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: draining %s with the lock held @ %s:%d\n", __func__, iwmesg, file, line); } for (;;) { if (lk->lk_lock == LK_UNLOCKED && atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) break; #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probabilly will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lockmgr_read_value(lk); /* * if the lock has been released while we spun on * the sleepqueue chain lock just try again. */ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v = (x & ~LK_EXCLUSIVE_SPINNERS); /* * If interruptible sleeps left the exclusive * queue empty avoid a starvation for the * threads sleeping on the shared queue by * giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be * lying about the real number of waiters with * the LK_SLEEPFAIL flag on because they may * be used in conjunction with interruptible * sleeps so lk_exslpfail might be considered * an 'upper limit' bound, including the edge * cases. */ if (v & LK_EXCLUSIVE_WAITERS) { queue = SQ_EXCLUSIVE_QUEUE; v &= ~LK_EXCLUSIVE_WAITERS; } else { /* * Exclusive waiters sleeping with * LK_SLEEPFAIL on and using * interruptible sleeps/timeout may * have left spourious lk_exslpfail * counts on, so clean it up anyway. 
*/ MPASS(v & LK_SHARED_WAITERS); lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; } if (queue == SQ_EXCLUSIVE_QUEUE) { realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if (lk->lk_exslpfail >= realexslp) { lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; if (realexslp != 0) { LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); } } else lk->lk_exslpfail = 0; } if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG3(lk, "%s: %p waking up all threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, queue); /* * If shared waiters have been woken up we need * to wait for one of them to acquire the lock * before to set the exclusive waiters in * order to avoid a deadlock. */ if (queue == SQ_SHARED_QUEUE) { for (v = lk->lk_lock; (v & LK_SHARE) && !LK_SHARERS(v); v = lk->lk_lock) cpu_spinwait(); } } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. */ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set drain waiters flag", __func__, lk); } /* * As far as we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep. 
*/ if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK, SQ_EXCLUSIVE_QUEUE); sleepq_wait(&lk->lock_object, ipri & PRIMASK); GIANT_RESTORE(); LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } break; default: if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: unknown lockmgr request 0x%x\n", __func__, op); } if (flags & LK_INTERLOCK) class->lc_unlock(ilk); if (wakeup_swapper) kick_proc0(); return (error); } void _lockmgr_disown(struct lock *lk, const char *file, int line) { uintptr_t tid, x; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; _lockmgr_assert(lk, KA_XLOCKED, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) panic("%s: disown a recursed lockmgr @ %s:%d\n", __func__, file, line); /* * If the owner is already LK_KERNPROC just skip the whole operation. */ if (LK_HOLDER(lk->lk_lock) != tid) return; lock_profile_release_lock(&lk->lock_object); LOCKSTAT_RECORD1(lockmgr__disown, lk, LOCKSTAT_WRITER); LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line); WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); STACK_SAVE(lk); /* * In order to preserve waiters flags, just spin. 
*/ for (;;) { x = lockmgr_read_value(lk); MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_KERNPROC | x)) return; cpu_spinwait(); } } void lockmgr_printinfo(const struct lock *lk) { struct thread *td; uintptr_t x; if (lk->lk_lock == LK_UNLOCKED) printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name); else if (lk->lk_lock & LK_SHARE) printf("lock type %s: SHARED (count %ju)\n", lk->lock_object.lo_name, (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); if (td == (struct thread *)LK_KERNPROC) printf("lock type %s: EXCL by KERNPROC\n", lk->lock_object.lo_name); else printf("lock type %s: EXCL by thread %p " "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, td, td->td_proc->p_pid, td->td_proc->p_comm, td->td_tid); } x = lk->lk_lock; if (x & LK_EXCLUSIVE_WAITERS) printf(" with exclusive waiters pending\n"); if (x & LK_SHARED_WAITERS) printf(" with shared waiters pending\n"); if (x & LK_EXCLUSIVE_SPINNERS) printf(" with exclusive spinners pending\n"); STACK_PRINT(lk); } int lockstatus(const struct lock *lk) { uintptr_t v, x; int ret; ret = LK_SHARED; x = lockmgr_read_value(lk); v = LK_HOLDER(x); if ((x & LK_SHARE) == 0) { if (v == (uintptr_t)curthread || v == LK_KERNPROC) ret = LK_EXCLUSIVE; else ret = LK_EXCLOTHER; } else if (x == LK_UNLOCKED) ret = 0; return (ret); } #ifdef INVARIANT_SUPPORT FEATURE(invariant_support, "Support for modules compiled with INVARIANTS option"); #ifndef INVARIANTS #undef _lockmgr_assert #endif void _lockmgr_assert(const struct lock *lk, int what, const char *file, int line) { int slocked = 0; if (KERNEL_PANICKED()) return; switch (what) { case KA_SLOCKED: case KA_SLOCKED | KA_NOTRECURSED: case KA_SLOCKED | KA_RECURSED: slocked = 1; case KA_LOCKED: case KA_LOCKED | KA_NOTRECURSED: case KA_LOCKED | KA_RECURSED: #ifdef WITNESS /* * We cannot trust WITNESS if the lock is held in exclusive * mode and a call to lockmgr_disown() happened. 
* Workaround this skipping the check if the lock is held in * exclusive mode even for the KA_LOCKED case. */ if (slocked || (lk->lk_lock & LK_SHARE)) { witness_assert(&lk->lock_object, what, file, line); break; } #endif if (lk->lk_lock == LK_UNLOCKED || ((lk->lk_lock & LK_SHARE) == 0 && (slocked || (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))))) panic("Lock %s not %slocked @ %s:%d\n", lk->lock_object.lo_name, slocked ? "share" : "", file, line); if ((lk->lk_lock & LK_SHARE) == 0) { if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } break; case KA_XLOCKED: case KA_XLOCKED | KA_NOTRECURSED: case KA_XLOCKED | KA_RECURSED: if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)) panic("Lock %s not exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); break; case KA_UNLOCKED: if (lockmgr_xlocked(lk) || lockmgr_disowned(lk)) panic("Lock %s exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); break; default: panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file, line); } } #endif #ifdef DDB int lockmgr_chain(struct thread *td, struct thread **ownerp) { const struct lock *lk; lk = td->td_wchan; if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr) return (0); db_printf("blocked on lockmgr %s", lk->lock_object.lo_name); if (lk->lk_lock & LK_SHARE) db_printf("SHARED (count %ju)\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else db_printf("EXCL\n"); *ownerp = lockmgr_xholder(lk); return (1); } static void db_show_lockmgr(const struct lock_object *lock) { struct thread *td; const struct lock *lk; lk = (const struct lock 
*)lock; db_printf(" state: "); if (lk->lk_lock == LK_UNLOCKED) db_printf("UNLOCKED\n"); else if (lk->lk_lock & LK_SHARE) db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); if (td == (struct thread *)LK_KERNPROC) db_printf("XLOCK: LK_KERNPROC\n"); else db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm); if (lockmgr_recursed(lk)) db_printf(" recursed: %d\n", lk->lk_recurse); } db_printf(" waiters: "); switch (lk->lk_lock & LK_ALL_WAITERS) { case LK_SHARED_WAITERS: db_printf("shared\n"); break; case LK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case LK_ALL_WAITERS: db_printf("shared and exclusive\n"); break; default: db_printf("none\n"); } db_printf(" spinners: "); if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS) db_printf("exclusive\n"); else db_printf("none\n"); } #endif diff --git a/sys/kern/subr_lock.c b/sys/kern/subr_lock.c index b69ee349482c..a74f7e62db4a 100644 --- a/sys/kern/subr_lock.c +++ b/sys/kern/subr_lock.c @@ -1,751 +1,752 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2006 John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This module holds the global variables and functions used to maintain * lock_object structures. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mprof.h" #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include SDT_PROVIDER_DEFINE(lock); SDT_PROBE_DEFINE1(lock, , , starvation, "u_int"); CTASSERT(LOCK_CLASS_MAX == 15); struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = { &lock_class_mtx_spin, &lock_class_mtx_sleep, &lock_class_sx, &lock_class_rm, &lock_class_rm_sleepable, &lock_class_rw, &lock_class_lockmgr, }; void lock_init(struct lock_object *lock, struct lock_class *class, const char *name, const char *type, int flags) { int i; /* Check for double-init and zero object. */ KASSERT(flags & LO_NEW || !lock_initialized(lock), ("lock \"%s\" %p already initialized", name, lock)); /* Look up lock class to find its index. */ for (i = 0; i < LOCK_CLASS_MAX; i++) if (lock_classes[i] == class) { lock->lo_flags = i << LO_CLASSSHIFT; break; } KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class)); /* Initialize the lock object. */ lock->lo_name = name; lock->lo_flags |= flags | LO_INITIALIZED; LOCK_LOG_INIT(lock, 0); WITNESS_INIT(lock, (type != NULL) ? 
type : name); } void lock_destroy(struct lock_object *lock) { KASSERT(lock_initialized(lock), ("lock %p is not initialized", lock)); WITNESS_DESTROY(lock); LOCK_LOG_DESTROY(lock, 0); lock->lo_flags &= ~LO_INITIALIZED; } static SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "lock debugging"); static SYSCTL_NODE(_debug_lock, OID_AUTO, delay, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "lock delay"); static u_int __read_mostly starvation_limit = 131072; SYSCTL_INT(_debug_lock_delay, OID_AUTO, starvation_limit, CTLFLAG_RW, &starvation_limit, 0, ""); static u_int __read_mostly restrict_starvation = 0; SYSCTL_INT(_debug_lock_delay, OID_AUTO, restrict_starvation, CTLFLAG_RW, &restrict_starvation, 0, ""); void lock_delay(struct lock_delay_arg *la) { struct lock_delay_config *lc = la->config; u_short i; la->delay <<= 1; if (__predict_false(la->delay > lc->max)) la->delay = lc->max; for (i = la->delay; i > 0; i--) cpu_spinwait(); la->spin_cnt += la->delay; if (__predict_false(la->spin_cnt > starvation_limit)) { SDT_PROBE1(lock, , , starvation, la->delay); if (restrict_starvation) la->delay = lc->base; } } static u_int lock_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } void lock_delay_default_init(struct lock_delay_config *lc) { lc->base = 1; lc->max = lock_roundup_2(mp_ncpus) * 256; if (lc->max > 32678) lc->max = 32678; } struct lock_delay_config __read_frequently locks_delay; u_short __read_frequently locks_delay_retries; u_short __read_frequently locks_delay_loops; SYSCTL_U16(_debug_lock, OID_AUTO, delay_base, CTLFLAG_RW, &locks_delay.base, 0, ""); SYSCTL_U16(_debug_lock, OID_AUTO, delay_max, CTLFLAG_RW, &locks_delay.max, 0, ""); SYSCTL_U16(_debug_lock, OID_AUTO, delay_retries, CTLFLAG_RW, &locks_delay_retries, 0, ""); SYSCTL_U16(_debug_lock, OID_AUTO, delay_loops, CTLFLAG_RW, &locks_delay_loops, 0, ""); static void locks_delay_init(void *arg __unused) { lock_delay_default_init(&locks_delay); 
locks_delay_retries = 10; locks_delay_loops = max(10000, locks_delay.max); } LOCK_DELAY_SYSINIT(locks_delay_init); #ifdef DDB DB_SHOW_COMMAND(lock, db_show_lock) { struct lock_object *lock; struct lock_class *class; if (!have_addr) return; lock = (struct lock_object *)addr; if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) { db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock)); return; } class = LOCK_CLASS(lock); db_printf(" class: %s\n", class->lc_name); db_printf(" name: %s\n", lock->lo_name); class->lc_ddb_show(lock); } #endif #ifdef LOCK_PROFILING /* * One object per-thread for each lock the thread owns. Tracks individual * lock instances. */ struct lock_profile_object { LIST_ENTRY(lock_profile_object) lpo_link; struct lock_object *lpo_obj; const char *lpo_file; int lpo_line; uint16_t lpo_ref; uint16_t lpo_cnt; uint64_t lpo_acqtime; uint64_t lpo_waittime; u_int lpo_contest_locking; }; /* * One lock_prof for each (file, line, lock object) triple. */ struct lock_prof { SLIST_ENTRY(lock_prof) link; struct lock_class *class; const char *file; const char *name; int line; int ticks; uintmax_t cnt_wait_max; uintmax_t cnt_max; uintmax_t cnt_tot; uintmax_t cnt_wait; uintmax_t cnt_cur; uintmax_t cnt_contest_locking; }; SLIST_HEAD(lphead, lock_prof); #define LPROF_HASH_SIZE 4096 #define LPROF_HASH_MASK (LPROF_HASH_SIZE - 1) #define LPROF_CACHE_SIZE 4096 /* * Array of objects and profs for each type of object for each cpu. Spinlocks * are handled separately because a thread may be preempted and acquire a * spinlock while in the lock profiling code of a non-spinlock. In this way * we only need a critical section to protect the per-cpu lists. */ struct lock_prof_type { struct lphead lpt_lpalloc; struct lpohead lpt_lpoalloc; struct lphead lpt_hash[LPROF_HASH_SIZE]; struct lock_prof lpt_prof[LPROF_CACHE_SIZE]; struct lock_profile_object lpt_objs[LPROF_CACHE_SIZE]; }; struct lock_prof_cpu { struct lock_prof_type lpc_types[2]; /* One for spin one for other. 
*/ }; DPCPU_DEFINE_STATIC(struct lock_prof_cpu, lp); #define LP_CPU_SELF (DPCPU_PTR(lp)) #define LP_CPU(cpu) (DPCPU_ID_PTR((cpu), lp)) volatile int __read_mostly lock_prof_enable; static volatile int lock_prof_resetting; #define LPROF_SBUF_SIZE 256 static int lock_prof_rejected; static int lock_prof_skipspin; static int lock_prof_skipcount; #ifndef USE_CPU_NANOSECONDS uint64_t nanoseconds(void) { struct bintime bt; uint64_t ns; binuptime(&bt); /* From bintime2timespec */ ns = bt.sec * (uint64_t)1000000000; ns += ((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32; return (ns); } #endif static void lock_prof_init_type(struct lock_prof_type *type) { int i; SLIST_INIT(&type->lpt_lpalloc); LIST_INIT(&type->lpt_lpoalloc); for (i = 0; i < LPROF_CACHE_SIZE; i++) { SLIST_INSERT_HEAD(&type->lpt_lpalloc, &type->lpt_prof[i], link); LIST_INSERT_HEAD(&type->lpt_lpoalloc, &type->lpt_objs[i], lpo_link); } } static void lock_prof_init(void *arg) { int cpu; CPU_FOREACH(cpu) { lock_prof_init_type(&LP_CPU(cpu)->lpc_types[0]); lock_prof_init_type(&LP_CPU(cpu)->lpc_types[1]); } } SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL); static void lock_prof_reset_wait(void) { /* * Spin relinquishing our cpu so that quiesce_all_cpus may * complete. */ while (lock_prof_resetting) sched_relinquish(curthread); } static void lock_prof_reset(void) { struct lock_prof_cpu *lpc; int enabled, i, cpu; /* * We not only race with acquiring and releasing locks but also * thread exit. To be certain that threads exit without valid head * pointers they must see resetting set before enabled is cleared. * Otherwise a lock may not be removed from a per-thread list due * to disabled being set but not wait for reset() to remove it below. */ atomic_store_rel_int(&lock_prof_resetting, 1); enabled = lock_prof_enable; lock_prof_enable = 0; /* * This both publishes lock_prof_enable as disabled and makes sure * everyone else reads it if they are not far enough. We wait for the * rest down below. 
*/ cpus_fence_seq_cst(); quiesce_all_critical(); /* * Some objects may have migrated between CPUs. Clear all links * before we zero the structures. Some items may still be linked * into per-thread lists as well. */ CPU_FOREACH(cpu) { lpc = LP_CPU(cpu); for (i = 0; i < LPROF_CACHE_SIZE; i++) { LIST_REMOVE(&lpc->lpc_types[0].lpt_objs[i], lpo_link); LIST_REMOVE(&lpc->lpc_types[1].lpt_objs[i], lpo_link); } } CPU_FOREACH(cpu) { lpc = LP_CPU(cpu); bzero(lpc, sizeof(*lpc)); lock_prof_init_type(&lpc->lpc_types[0]); lock_prof_init_type(&lpc->lpc_types[1]); } /* * Paired with the fence from cpus_fence_seq_cst() */ atomic_store_rel_int(&lock_prof_resetting, 0); lock_prof_enable = enabled; } static void lock_prof_output(struct lock_prof *lp, struct sbuf *sb) { const char *p; for (p = lp->file; p != NULL && strncmp(p, "../", 3) == 0; p += 3); sbuf_printf(sb, "%8ju %9ju %11ju %11ju %11ju %6ju %6ju %2ju %6ju %s:%d (%s:%s)\n", lp->cnt_max / 1000, lp->cnt_wait_max / 1000, lp->cnt_tot / 1000, lp->cnt_wait / 1000, lp->cnt_cur, lp->cnt_cur == 0 ? (uintmax_t)0 : lp->cnt_tot / (lp->cnt_cur * 1000), lp->cnt_cur == 0 ? 
(uintmax_t)0 : lp->cnt_wait / (lp->cnt_cur * 1000), (uintmax_t)0, lp->cnt_contest_locking, p, lp->line, lp->class->lc_name, lp->name); } static void lock_prof_sum(struct lock_prof *match, struct lock_prof *dst, int hash, int spin, int t) { struct lock_prof_type *type; struct lock_prof *l; int cpu; dst->file = match->file; dst->line = match->line; dst->class = match->class; dst->name = match->name; CPU_FOREACH(cpu) { type = &LP_CPU(cpu)->lpc_types[spin]; SLIST_FOREACH(l, &type->lpt_hash[hash], link) { if (l->ticks == t) continue; if (l->file != match->file || l->line != match->line || l->name != match->name) continue; l->ticks = t; if (l->cnt_max > dst->cnt_max) dst->cnt_max = l->cnt_max; if (l->cnt_wait_max > dst->cnt_wait_max) dst->cnt_wait_max = l->cnt_wait_max; dst->cnt_tot += l->cnt_tot; dst->cnt_wait += l->cnt_wait; dst->cnt_cur += l->cnt_cur; dst->cnt_contest_locking += l->cnt_contest_locking; } } } static void lock_prof_type_stats(struct lock_prof_type *type, struct sbuf *sb, int spin, int t) { struct lock_prof *l; int i; for (i = 0; i < LPROF_HASH_SIZE; ++i) { SLIST_FOREACH(l, &type->lpt_hash[i], link) { struct lock_prof lp = {}; if (l->ticks == t) continue; lock_prof_sum(l, &lp, i, spin, t); lock_prof_output(&lp, sb); } } } static int dump_lock_prof_stats(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error, cpu, t; int enabled; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, LPROF_SBUF_SIZE, req); sbuf_printf(sb, "\n%8s %9s %11s %11s %11s %6s %6s %2s %6s %s\n", "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name"); enabled = lock_prof_enable; lock_prof_enable = 0; /* * See the comment in lock_prof_reset */ cpus_fence_seq_cst(); quiesce_all_critical(); t = ticks; CPU_FOREACH(cpu) { lock_prof_type_stats(&LP_CPU(cpu)->lpc_types[0], sb, 0, t); lock_prof_type_stats(&LP_CPU(cpu)->lpc_types[1], sb, 1, t); } atomic_thread_fence_rel(); lock_prof_enable = 
enabled; error = sbuf_finish(sb); /* Output a trailing NUL. */ if (error == 0) error = SYSCTL_OUT(req, "", 1); sbuf_delete(sb); return (error); } static int enable_lock_prof(SYSCTL_HANDLER_ARGS) { int error, v; v = lock_prof_enable; error = sysctl_handle_int(oidp, &v, v, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == lock_prof_enable) return (0); if (v == 1) lock_prof_reset(); lock_prof_enable = !!v; return (0); } static int reset_lock_prof_stats(SYSCTL_HANDLER_ARGS) { int error, v; v = 0; error = sysctl_handle_int(oidp, &v, 0, req); if (error) return (error); if (req->newptr == NULL) return (error); if (v == 0) return (0); lock_prof_reset(); return (0); } static struct lock_prof * lock_profile_lookup(struct lock_object *lo, int spin, const char *file, int line) { const char *unknown = "(unknown)"; struct lock_prof_type *type; struct lock_prof *lp; struct lphead *head; const char *p; u_int hash; p = file; if (p == NULL || *p == '\0') p = unknown; hash = (uintptr_t)lo->lo_name * 31 + (uintptr_t)p * 31 + line; hash &= LPROF_HASH_MASK; type = &LP_CPU_SELF->lpc_types[spin]; head = &type->lpt_hash[hash]; SLIST_FOREACH(lp, head, link) { if (lp->line == line && lp->file == p && lp->name == lo->lo_name) return (lp); } lp = SLIST_FIRST(&type->lpt_lpalloc); if (lp == NULL) { lock_prof_rejected++; return (lp); } SLIST_REMOVE_HEAD(&type->lpt_lpalloc, link); lp->file = p; lp->line = line; lp->class = LOCK_CLASS(lo); lp->name = lo->lo_name; SLIST_INSERT_HEAD(&type->lpt_hash[hash], lp, link); return (lp); } static struct lock_profile_object * lock_profile_object_lookup(struct lock_object *lo, int spin, const char *file, int line) { struct lock_profile_object *l; struct lock_prof_type *type; struct lpohead *head; head = &curthread->td_lprof[spin]; LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo && l->lpo_file == file && l->lpo_line == line) return (l); type = &LP_CPU_SELF->lpc_types[spin]; l = LIST_FIRST(&type->lpt_lpoalloc); if (l == NULL) 
{ lock_prof_rejected++; return (NULL); } LIST_REMOVE(l, lpo_link); l->lpo_obj = lo; l->lpo_file = file; l->lpo_line = line; l->lpo_cnt = 0; LIST_INSERT_HEAD(head, l, lpo_link); return (l); } void lock_profile_obtain_lock_success(struct lock_object *lo, int contested, uint64_t waittime, const char *file, int line) { static int lock_prof_count; struct lock_profile_object *l; int spin; if (SCHEDULER_STOPPED()) return; /* don't reset the timer when/if recursing */ if (!lock_prof_enable || (lo->lo_flags & LO_NOPROFILE)) return; if (lock_prof_skipcount && (++lock_prof_count % lock_prof_skipcount) != 0) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; if (spin && lock_prof_skipspin == 1) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0) goto out; l = lock_profile_object_lookup(lo, spin, file, line); if (l == NULL) goto out; l->lpo_cnt++; if (++l->lpo_ref > 1) goto out; l->lpo_contest_locking = contested; l->lpo_acqtime = nanoseconds(); if (waittime && (l->lpo_acqtime > waittime)) l->lpo_waittime = l->lpo_acqtime - waittime; else l->lpo_waittime = 0; out: /* * Paired with cpus_fence_seq_cst(). */ atomic_thread_fence_rel(); critical_exit(); } void lock_profile_thread_exit(struct thread *td) { #ifdef INVARIANTS struct lock_profile_object *l; MPASS(curthread->td_critnest == 0); #endif /* * If lock profiling was disabled we have to wait for reset to * clear our pointers before we can exit safely. 
*/ lock_prof_reset_wait(); #ifdef INVARIANTS LIST_FOREACH(l, &td->td_lprof[0], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); LIST_FOREACH(l, &td->td_lprof[1], lpo_link) printf("thread still holds lock acquired at %s:%d\n", l->lpo_file, l->lpo_line); #endif MPASS(LIST_FIRST(&td->td_lprof[0]) == NULL); MPASS(LIST_FIRST(&td->td_lprof[1]) == NULL); } void lock_profile_release_lock(struct lock_object *lo) { struct lock_profile_object *l; struct lock_prof_type *type; struct lock_prof *lp; uint64_t curtime, holdtime; struct lpohead *head; int spin; if (SCHEDULER_STOPPED()) return; if (lo->lo_flags & LO_NOPROFILE) return; spin = (LOCK_CLASS(lo)->lc_flags & LC_SPINLOCK) ? 1 : 0; head = &curthread->td_lprof[spin]; if (LIST_FIRST(head) == NULL) return; critical_enter(); /* Recheck enabled now that we're in a critical section. */ if (lock_prof_enable == 0 && lock_prof_resetting == 1) goto out; /* * If lock profiling is not enabled we still want to remove the * lpo from our queue. */ LIST_FOREACH(l, head, lpo_link) if (l->lpo_obj == lo) break; if (l == NULL) goto out; if (--l->lpo_ref > 0) goto out; lp = lock_profile_lookup(lo, spin, l->lpo_file, l->lpo_line); if (lp == NULL) goto release; curtime = nanoseconds(); if (curtime < l->lpo_acqtime) goto release; holdtime = curtime - l->lpo_acqtime; /* * Record if the lock has been held longer now than ever * before. */ if (holdtime > lp->cnt_max) lp->cnt_max = holdtime; if (l->lpo_waittime > lp->cnt_wait_max) lp->cnt_wait_max = l->lpo_waittime; lp->cnt_tot += holdtime; lp->cnt_wait += l->lpo_waittime; lp->cnt_contest_locking += l->lpo_contest_locking; lp->cnt_cur += l->lpo_cnt; release: LIST_REMOVE(l, lpo_link); type = &LP_CPU_SELF->lpc_types[spin]; LIST_INSERT_HEAD(&type->lpt_lpoalloc, l, lpo_link); out: /* * Paired with cpus_fence_seq_cst(). 
*/ atomic_thread_fence_rel(); critical_exit(); } static SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "lock profiling"); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipspin, CTLFLAG_RW, &lock_prof_skipspin, 0, "Skip profiling on spinlocks."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, skipcount, CTLFLAG_RW, &lock_prof_skipcount, 0, "Sample approximately every N lock acquisitions."); SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD, &lock_prof_rejected, 0, "Number of rejected profiling records"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics"); SYSCTL_PROC(_debug_lock_prof, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, enable_lock_prof, "I", "Enable lock profiling"); #endif diff --git a/sys/sys/_lockmgr.h b/sys/sys/_lockmgr.h index 62e50df1ac4e..b0d164bdb4dc 100644 --- a/sys/sys/_lockmgr.h +++ b/sys/sys/_lockmgr.h @@ -1,51 +1,51 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008 Attilio Rao * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * $FreeBSD$ */ #ifndef _SYS__LOCKMGR_H_ #define _SYS__LOCKMGR_H_ #ifdef DEBUG_LOCKS #include #endif struct lock { struct lock_object lock_object; volatile uintptr_t lk_lock; - u_int lk_exslpfail; + u_short lk_exslpfail; + u_short lk_pri; int lk_timo; - int lk_pri; #ifdef DEBUG_LOCKS struct stack lk_stack; #endif }; #endif diff --git a/sys/sys/param.h b/sys/sys/param.h index 058aef99e077..fa02b55d1f1b 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -1,374 +1,375 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * documentation/content/en/books/porters-handbook/versions/chapter.adoc * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. 
*/ #undef __FreeBSD_version #define __FreeBSD_version 1400004 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_CK_CYLGRP 1200046 #define P_OSREL_VMTOTAL64 1200054 #define P_OSREL_CK_SUPERBLOCK 1300000 #define P_OSREL_CK_INODE 1300005 #define P_OSREL_POWERPC_NEW_AUX_ARGS 1300070 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. 
NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 255 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL #ifndef LOCORE /* Signals. */ #include #endif #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? 
(daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif -#define PRIMASK 0x0ff -#define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ -#define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ +#define PRIMASK 0x0ff +#define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ +#define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ +#define PRILASTFLAG 0x200 /* Last flag defined above */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. 
If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) __align_down(x, y) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) __align_up(x, y) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. 
*/ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */