Index: head/sys/kern/kern_sx.c =================================================================== --- head/sys/kern/kern_sx.c (revision 228432) +++ head/sys/kern/kern_sx.c (revision 228433) @@ -1,1193 +1,1193 @@ /*- * Copyright (c) 2007 Attilio Rao * Copyright (c) 2001 Jason Evans * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * Shared/exclusive locks. This implementation attempts to ensure * deterministic lock granting behavior, so that slocks and xlocks are * interleaved. * * Priority propagation will not generally raise the priority of lock holders, * so should not be relied upon in combination with sx locks. */ #include "opt_ddb.h" #include "opt_kdtrace.h" #include "opt_no_adaptive_sx.h" #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include -#include #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #include #endif #ifdef DDB #include #endif #if defined(SMP) && !defined(NO_ADAPTIVE_SX) #define ADAPTIVE_SX #endif CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE); /* Handy macros for sleep queues. */ #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 /* * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file. We * drop Giant anytime we have to sleep or if we adaptively spin. */ #define GIANT_DECLARE \ int _giantcnt = 0; \ WITNESS_SAVE_DECL(Giant) \ #define GIANT_SAVE() do { \ if (mtx_owned(&Giant)) { \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _giantcnt++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) #define GIANT_RESTORE() do { \ if (_giantcnt > 0) { \ mtx_assert(&Giant, MA_NOTOWNED); \ while (_giantcnt--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) /* * Returns true if an exclusive lock is recursed. It assumes * curthread currently has an exclusive lock. 
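 *
 * For example, with SX_RECURSE (LO_RECURSABLE) set, each nested sx_xlock()
 * by the owning thread bumps sx_recurse and sets SX_LOCK_RECURSED; the
 * matching sx_xunlock() calls drop the count again, and sx_recursed()
 * stays true until it reaches zero.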
*/ #define sx_recurse lock_object.lo_data #define sx_recursed(sx) ((sx)->sx_recurse != 0) static void assert_sx(const struct lock_object *lock, int what); #ifdef DDB static void db_show_sx(const struct lock_object *lock); #endif static void lock_sx(struct lock_object *lock, int how); #ifdef KDTRACE_HOOKS static int owner_sx(const struct lock_object *lock, struct thread **owner); #endif static int unlock_sx(struct lock_object *lock); struct lock_class lock_class_sx = { .lc_name = "sx", .lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE, .lc_assert = assert_sx, #ifdef DDB .lc_ddb_show = db_show_sx, #endif .lc_lock = lock_sx, .lc_unlock = unlock_sx, #ifdef KDTRACE_HOOKS .lc_owner = owner_sx, #endif }; #ifndef INVARIANTS #define _sx_assert(sx, what, file, line) #endif #ifdef ADAPTIVE_SX static u_int asx_retries = 10; static u_int asx_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging"); SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, ""); SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, ""); #endif void assert_sx(const struct lock_object *lock, int what) { sx_assert((const struct sx *)lock, what); } void lock_sx(struct lock_object *lock, int how) { struct sx *sx; sx = (struct sx *)lock; if (how) sx_xlock(sx); else sx_slock(sx); } int unlock_sx(struct lock_object *lock) { struct sx *sx; sx = (struct sx *)lock; sx_assert(sx, SA_LOCKED | SA_NOTRECURSED); if (sx_xlocked(sx)) { sx_xunlock(sx); return (1); } else { sx_sunlock(sx); return (0); } } #ifdef KDTRACE_HOOKS int owner_sx(const struct lock_object *lock, struct thread **owner) { const struct sx *sx = (const struct sx *)lock; uintptr_t x = sx->sx_lock; *owner = (struct thread *)SX_OWNER(x); return ((x & SX_LOCK_SHARED) != 0 ? 
(SX_SHARERS(x) != 0) : (*owner != NULL)); } #endif void sx_sysinit(void *arg) { struct sx_args *sargs = arg; sx_init_flags(sargs->sa_sx, sargs->sa_desc, sargs->sa_flags); } void sx_init_flags(struct sx *sx, const char *description, int opts) { int flags; MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK | SX_NOPROFILE | SX_NOADAPTIVE)) == 0); ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock, ("%s: sx_lock not aligned for %s: %p", __func__, description, &sx->sx_lock)); flags = LO_SLEEPABLE | LO_UPGRADABLE; if (opts & SX_DUPOK) flags |= LO_DUPOK; if (opts & SX_NOPROFILE) flags |= LO_NOPROFILE; if (!(opts & SX_NOWITNESS)) flags |= LO_WITNESS; if (opts & SX_RECURSE) flags |= LO_RECURSABLE; if (opts & SX_QUIET) flags |= LO_QUIET; flags |= opts & SX_NOADAPTIVE; sx->sx_lock = SX_LOCK_UNLOCKED; sx->sx_recurse = 0; lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags); } void sx_destroy(struct sx *sx) { KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held")); KASSERT(sx->sx_recurse == 0, ("sx lock still recursed")); sx->sx_lock = SX_LOCK_DESTROYED; lock_destroy(&sx->lock_object); } int _sx_slock(struct sx *sx, int opts, const char *file, int line) { int error = 0; if (SCHEDULER_STOPPED()) return (0); MPASS(curthread != NULL); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_slock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL); error = __sx_slock(sx, opts, file, line); if (!error) { LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line); WITNESS_LOCK(&sx->lock_object, 0, file, line); curthread->td_locks++; } return (error); } int sx_try_slock_(struct sx *sx, const char *file, int line) { uintptr_t x; if (SCHEDULER_STOPPED()) return (1); for (;;) { x = sx->sx_lock; KASSERT(x != SX_LOCK_DESTROYED, ("sx_try_slock() of destroyed sx @ %s:%d", file, line)); if (!(x & SX_LOCK_SHARED)) break; if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) { LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line); WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line); curthread->td_locks++; return (1); } } LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line); return (0); } int _sx_xlock(struct sx *sx, int opts, const char *file, int line) { int error = 0; if (SCHEDULER_STOPPED()) return (0); MPASS(curthread != NULL); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xlock() of destroyed sx @ %s:%d", file, line)); WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL); error = __sx_xlock(sx, curthread, opts, file, line); if (!error) { LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); curthread->td_locks++; } return (error); } int sx_try_xlock_(struct sx *sx, const char *file, int line) { int rval; if (SCHEDULER_STOPPED()) return (1); MPASS(curthread != NULL); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_xlock() of destroyed sx @ %s:%d", file, line)); if (sx_xlocked(sx) && (sx->lock_object.lo_flags & LO_RECURSABLE) != 0) { sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); rval = 1; } else rval = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, (uintptr_t)curthread); LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line); if (rval) { WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); curthread->td_locks++; } return (rval); } void _sx_sunlock(struct sx *sx, const char *file, int line) { if (SCHEDULER_STOPPED()) return; MPASS(curthread != NULL); 
KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_sunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); curthread->td_locks--; WITNESS_UNLOCK(&sx->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line); __sx_sunlock(sx, file, line); LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_SUNLOCK_RELEASE, sx); } void _sx_xunlock(struct sx *sx, const char *file, int line) { if (SCHEDULER_STOPPED()) return; MPASS(curthread != NULL); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_xunlock() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED, file, line); curthread->td_locks--; WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line); LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file, line); if (!sx_recursed(sx)) LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_XUNLOCK_RELEASE, sx); __sx_xunlock(sx, curthread, file, line); } /* * Try to do a non-blocking upgrade from a shared lock to an exclusive lock. * This will only succeed if this thread holds a single shared lock. * Return 1 if if the upgrade succeed, 0 otherwise. */ int sx_try_upgrade_(struct sx *sx, const char *file, int line) { uintptr_t x; int success; if (SCHEDULER_STOPPED()) return (1); KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_SLOCKED, file, line); /* * Try to switch from one shared lock to an exclusive lock. We need * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that * we will wake up the exclusive waiters when we drop the lock. */ x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS; success = atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x, (uintptr_t)curthread | x); LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line); if (success) { WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file, line); LOCKSTAT_RECORD0(LS_SX_TRYUPGRADE_UPGRADE, sx); } return (success); } /* * Downgrade an unrecursed exclusive lock into a single shared lock. */ void sx_downgrade_(struct sx *sx, const char *file, int line) { uintptr_t x; int wakeup_swapper; if (SCHEDULER_STOPPED()) return; KASSERT(sx->sx_lock != SX_LOCK_DESTROYED, ("sx_downgrade() of destroyed sx @ %s:%d", file, line)); _sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line); #ifndef INVARIANTS if (sx_recursed(sx)) panic("downgrade of a recursed lock"); #endif WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line); /* * Try to switch from an exclusive lock with no shared waiters * to one sharer with no shared waiters. If there are * exclusive waiters, we don't need to lock the sleep queue so * long as we preserve the flag. We do one quick try and if * that fails we grab the sleepq lock to keep the flags from * changing and do it the slow way. * * We have to lock the sleep queue if there are shared waiters * so we can wake them up. */ x = sx->sx_lock; if (!(x & SX_LOCK_SHARED_WAITERS) && atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS))) { LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); return; } /* * Lock the sleep queue so we can read the waiters bits * without any races and wakeup any shared waiters. */ sleepq_lock(&sx->lock_object); /* * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single * shared lock. If there are any shared waiters, wake them up. 
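 *
 * Roughly, the store below performs the transition
 *
 *	(uintptr_t)curthread | waiter bits
 *		-> SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS)
 *
 * and then broadcasts on the shared queue if shared waiters were seen.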
*/ wakeup_swapper = 0; x = sx->sx_lock; atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | (x & SX_LOCK_EXCLUSIVE_WAITERS)); if (x & SX_LOCK_SHARED_WAITERS) wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_SHARED_QUEUE); sleepq_release(&sx->lock_object); LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line); LOCKSTAT_RECORD0(LS_SX_DOWNGRADE_DOWNGRADE, sx); if (wakeup_swapper) kick_proc0(); } /* * This function represents the so-called 'hard case' for sx_xlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ int _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file, int line) { GIANT_DECLARE; #ifdef ADAPTIVE_SX volatile struct thread *owner; u_int i, spintries = 0; #endif uintptr_t x; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif int error = 0; #ifdef KDTRACE_HOOKS uint64_t spin_cnt = 0; uint64_t sleep_cnt = 0; int64_t sleep_time = 0; #endif if (SCHEDULER_STOPPED()) return (0); /* If we already hold an exclusive lock, then recurse. */ if (sx_xlocked(sx)) { KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0, ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n", sx->lock_object.lo_name, file, line)); sx->sx_recurse++; atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx); return (0); } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__, sx->lock_object.lo_name, (void *)sx->sx_lock, file, line); while (!atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) { #ifdef KDTRACE_HOOKS spin_cnt++; #endif lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef ADAPTIVE_SX /* * If the lock is write locked and the owner is * running on another CPU, spin until the owner stops * running or the state of the lock changes. */ x = sx->sx_lock; if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { if ((x & SX_LOCK_SHARED) == 0) { x = SX_OWNER(x); owner = (struct thread *)x; if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); GIANT_SAVE(); while (SX_OWNER(sx->sx_lock) == x && TD_IS_RUNNING(owner)) { cpu_spinwait(); #ifdef KDTRACE_HOOKS spin_cnt++; #endif } continue; } } else if (SX_SHARERS(x) && spintries < asx_retries) { GIANT_SAVE(); spintries++; for (i = 0; i < asx_loops; i++) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: shared spinning on %p with %u and %u", __func__, sx, spintries, i); x = sx->sx_lock; if ((x & SX_LOCK_SHARED) == 0 || SX_SHARERS(x) == 0) break; cpu_spinwait(); #ifdef KDTRACE_HOOKS spin_cnt++; #endif } if (i != asx_loops) continue; } } #endif sleepq_lock(&sx->lock_object); x = sx->sx_lock; /* * If the lock was released while spinning on the * sleep queue chain lock, try again. */ if (x == SX_LOCK_UNLOCKED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owners) while we were waiting on the sleep queue * chain lock. If so, drop the sleep queue lock and try * again. 
*/ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); continue; } } #endif /* * If an exclusive lock was released with both shared * and exclusive waiters and a shared waiter hasn't * woken up and acquired the lock yet, sx_lock will be * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS. * If we see that value, try to acquire it once. Note * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS * as there are other exclusive waiters still. If we * fail, restart the loop. */ if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) { if (atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS, tid | SX_LOCK_EXCLUSIVE_WAITERS)) { sleepq_release(&sx->lock_object); CTR2(KTR_LOCK, "%s: %p claimed by new writer", __func__, sx); break; } sleepq_release(&sx->lock_object); continue; } /* * Try to set the SX_LOCK_EXCLUSIVE_WAITERS. If we fail, * than loop back and retry. */ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { if (!atomic_cmpset_ptr(&sx->sx_lock, x, x | SX_LOCK_EXCLUSIVE_WAITERS)) { sleepq_release(&sx->lock_object); continue; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set excl waiters flag", __func__, sx); } /* * Since we have been unable to acquire the exclusive * lock and the exclusive waiters flag is set, we have * to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(); #endif GIANT_SAVE(); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); } GIANT_RESTORE(); if (!error) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_XLOCK_ACQUIRE, sx, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (sleep_time) LOCKSTAT_RECORD1(LS_SX_XLOCK_BLOCK, sx, sleep_time); if (spin_cnt > sleep_cnt) LOCKSTAT_RECORD1(LS_SX_XLOCK_SPIN, sx, (spin_cnt - sleep_cnt)); #endif return (error); } /* * This function represents the so-called 'hard case' for sx_xunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line) { uintptr_t x; int queue, wakeup_swapper; if (SCHEDULER_STOPPED()) return; MPASS(!(sx->sx_lock & SX_LOCK_SHARED)); /* If the lock is recursed, then unrecurse one level. */ if (sx_xlocked(sx) && sx_recursed(sx)) { if ((--sx->sx_recurse) == 0) atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx); return; } MPASS(sx->sx_lock & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)); if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p contested", __func__, sx); sleepq_lock(&sx->lock_object); x = SX_LOCK_UNLOCKED; /* * The wake up algorithm here is quite simple and probably not * ideal. 
It gives precedence to shared waiters if they are * present. For this condition, we have to preserve the * state of the exclusive waiters flag. * If interruptible sleeps left the shared queue empty avoid a * starvation for the threads sleeping on the exclusive queue by giving * them precedence and cleaning up the shared waiters bit anyway. */ if ((sx->sx_lock & SX_LOCK_SHARED_WAITERS) != 0 && sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) { queue = SQ_SHARED_QUEUE; x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS); } else queue = SQ_EXCLUSIVE_QUEUE; /* Wake up all the waiters for the specific queue. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue", __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); atomic_store_rel_ptr(&sx->sx_lock, x); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, queue); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); } /* * This function represents the so-called 'hard case' for sx_slock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ int _sx_slock_hard(struct sx *sx, int opts, const char *file, int line) { GIANT_DECLARE; #ifdef ADAPTIVE_SX volatile struct thread *owner; #endif #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif uintptr_t x; int error = 0; #ifdef KDTRACE_HOOKS uint64_t spin_cnt = 0; uint64_t sleep_cnt = 0; int64_t sleep_time = 0; #endif if (SCHEDULER_STOPPED()) return (0); /* * As with rwlocks, we don't make any attempt to try to block * shared locks once there is an exclusive waiter. */ for (;;) { #ifdef KDTRACE_HOOKS spin_cnt++; #endif x = sx->sx_lock; /* * If no other thread has an exclusive lock then try to bump up * the count of sharers. Since we have to preserve the state * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the * shared lock loop back and retry. */ if (x & SX_LOCK_SHARED) { MPASS(!(x & SX_LOCK_SHARED_WAITERS)); if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeed %p -> %p", __func__, sx, (void *)x, (void *)(x + SX_ONE_SHARER)); break; } continue; } lock_profile_obtain_lock_failed(&sx->lock_object, &contested, &waittime); #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { x = SX_OWNER(x); owner = (struct thread *)x; if (TD_IS_RUNNING(owner)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, sx, owner); GIANT_SAVE(); while (SX_OWNER(sx->sx_lock) == x && TD_IS_RUNNING(owner)) { #ifdef KDTRACE_HOOKS spin_cnt++; #endif cpu_spinwait(); } continue; } } #endif /* * Some other thread already has an exclusive lock, so * start the process of blocking. */ sleepq_lock(&sx->lock_object); x = sx->sx_lock; /* * The lock could have been released while we spun. * In this case loop back and retry. */ if (x & SX_LOCK_SHARED) { sleepq_release(&sx->lock_object); continue; } #ifdef ADAPTIVE_SX /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. 
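 *
 * Note that with the sleepqueue chain lock held the check below does not
 * spin in place; if the owner is still running it releases the chain lock
 * and retries the outer loop instead.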
*/ if (!(x & SX_LOCK_SHARED) && (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) { owner = (struct thread *)SX_OWNER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&sx->lock_object); continue; } } #endif /* * Try to set the SX_LOCK_SHARED_WAITERS flag. If we * fail to set it drop the sleep queue lock and loop * back. */ if (!(x & SX_LOCK_SHARED_WAITERS)) { if (!atomic_cmpset_ptr(&sx->sx_lock, x, x | SX_LOCK_SHARED_WAITERS)) { sleepq_release(&sx->lock_object); continue; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p set shared waiters flag", __func__, sx); } /* * Since we have been unable to acquire the shared lock, * we have to sleep. */ if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p blocking on sleep queue", __func__, sx); #ifdef KDTRACE_HOOKS sleep_time -= lockstat_nsecs(); #endif GIANT_SAVE(); sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name, SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ? SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE); if (!(opts & SX_INTERRUPTIBLE)) sleepq_wait(&sx->lock_object, 0); else error = sleepq_wait_sig(&sx->lock_object, 0); #ifdef KDTRACE_HOOKS sleep_time += lockstat_nsecs(); sleep_cnt++; #endif if (error) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: interruptible sleep by %p suspended by signal", __func__, sx); break; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p resuming from sleep queue", __func__, sx); } if (error == 0) LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_SLOCK_ACQUIRE, sx, contested, waittime, file, line); #ifdef KDTRACE_HOOKS if (sleep_time) LOCKSTAT_RECORD1(LS_SX_XLOCK_BLOCK, sx, sleep_time); if (spin_cnt > sleep_cnt) LOCKSTAT_RECORD1(LS_SX_XLOCK_SPIN, sx, (spin_cnt - sleep_cnt)); #endif GIANT_RESTORE(); return (error); } /* * This function represents the so-called 'hard case' for sx_sunlock * operation. All 'easy case' failures are redirected to this. Note * that ideally this would be a static function, but it needs to be * accessible from at least sx.h. */ void _sx_sunlock_hard(struct sx *sx, const char *file, int line) { uintptr_t x; int wakeup_swapper; if (SCHEDULER_STOPPED()) return; for (;;) { x = sx->sx_lock; /* * We should never have sharers while at least one thread * holds a shared lock. */ KASSERT(!(x & SX_LOCK_SHARED_WAITERS), ("%s: waiting sharers", __func__)); /* * See if there is more than one shared lock held. If * so, just drop one and return. */ if (SX_SHARERS(x) > 1) { if (atomic_cmpset_rel_ptr(&sx->sx_lock, x, x - SX_ONE_SHARER)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR4(KTR_LOCK, "%s: %p succeeded %p -> %p", __func__, sx, (void *)x, (void *)(x - SX_ONE_SHARER)); break; } continue; } /* * If there aren't any waiters for an exclusive lock, * then try to drop it quickly. */ if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) { MPASS(x == SX_SHARERS_LOCK(1)); if (atomic_cmpset_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1), SX_LOCK_UNLOCKED)) { if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p last succeeded", __func__, sx); break; } continue; } /* * At this point, there should just be one sharer with * exclusive waiters. */ MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS)); sleepq_lock(&sx->lock_object); /* * Wake up semantic here is quite simple: * Just wake up all the exclusive waiters. * Note that the state of the lock could have changed, * so if it fails loop back and retry. 
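 *
 * The expected transition is from
 * SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS to SX_LOCK_UNLOCKED; any
 * other value means the state changed underneath us, the cmpset below
 * fails, and we loop back and retry.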
*/ if (!atomic_cmpset_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS, SX_LOCK_UNLOCKED)) { sleepq_release(&sx->lock_object); continue; } if (LOCK_LOG_TEST(&sx->lock_object, 0)) CTR2(KTR_LOCK, "%s: %p waking up all thread on" "exclusive queue", __func__, sx); wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0, SQ_EXCLUSIVE_QUEUE); sleepq_release(&sx->lock_object); if (wakeup_swapper) kick_proc0(); break; } } #ifdef INVARIANT_SUPPORT #ifndef INVARIANTS #undef _sx_assert #endif /* * In the non-WITNESS case, sx_assert() can only detect that at least * *some* thread owns an slock, but it cannot guarantee that *this* * thread owns an slock. */ void _sx_assert(const struct sx *sx, int what, const char *file, int line) { #ifndef WITNESS int slocked = 0; #endif if (panicstr != NULL) return; switch (what) { case SA_SLOCKED: case SA_SLOCKED | SA_NOTRECURSED: case SA_SLOCKED | SA_RECURSED: #ifndef WITNESS slocked = 1; /* FALLTHROUGH */ #endif case SA_LOCKED: case SA_LOCKED | SA_NOTRECURSED: case SA_LOCKED | SA_RECURSED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If some other thread has an exclusive lock or we * have one and are asserting a shared lock, fail. * Also, if no one has a lock at all, fail. */ if (sx->sx_lock == SX_LOCK_UNLOCKED || (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked || sx_xholder(sx) != curthread))) panic("Lock %s not %slocked @ %s:%d\n", sx->lock_object.lo_name, slocked ? "share " : "", file, line); if (!(sx->sx_lock & SX_LOCK_SHARED)) { if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } #endif break; case SA_XLOCKED: case SA_XLOCKED | SA_NOTRECURSED: case SA_XLOCKED | SA_RECURSED: if (sx_xholder(sx) != curthread) panic("Lock %s not exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); if (sx_recursed(sx)) { if (what & SA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); } else if (what & SA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", sx->lock_object.lo_name, file, line); break; case SA_UNLOCKED: #ifdef WITNESS witness_assert(&sx->lock_object, what, file, line); #else /* * If we hold an exclusve lock fail. We can't * reliably check to see if we hold a shared lock or * not. 
*/ if (sx_xholder(sx) == curthread) panic("Lock %s exclusively locked @ %s:%d\n", sx->lock_object.lo_name, file, line); #endif break; default: panic("Unknown sx lock assertion: %d @ %s:%d", what, file, line); } } #endif /* INVARIANT_SUPPORT */ #ifdef DDB static void db_show_sx(const struct lock_object *lock) { struct thread *td; const struct sx *sx; sx = (const struct sx *)lock; db_printf(" state: "); if (sx->sx_lock == SX_LOCK_UNLOCKED) db_printf("UNLOCKED\n"); else if (sx->sx_lock == SX_LOCK_DESTROYED) { db_printf("DESTROYED\n"); return; } else if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else { td = sx_xholder(sx); db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_name); if (sx_recursed(sx)) db_printf(" recursed: %d\n", sx->sx_recurse); } db_printf(" waiters: "); switch(sx->sx_lock & (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) { case SX_LOCK_SHARED_WAITERS: db_printf("shared\n"); break; case SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS: db_printf("exclusive and shared\n"); break; default: db_printf("none\n"); } } /* * Check to see if a thread that is blocked on a sleep queue is actually * blocked on an sx lock. If so, output some details and return true. * If the lock has an exclusive owner, return that in *ownerp. */ int sx_chain(struct thread *td, struct thread **ownerp) { struct sx *sx; /* * Check to see if this thread is blocked on an sx lock. * First, we check the lock class. If that is ok, then we * compare the lock name against the wait message. */ sx = td->td_wchan; if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx || sx->lock_object.lo_name != td->td_wmesg) return (0); /* We think we have an sx lock, so output some details. */ db_printf("blocked on sx \"%s\" ", td->td_wmesg); *ownerp = sx_xholder(sx); if (sx->sx_lock & SX_LOCK_SHARED) db_printf("SLOCK (count %ju)\n", (uintmax_t)SX_SHARERS(sx->sx_lock)); else db_printf("XLOCK\n"); return (1); } #endif Index: head/sys/kern/vfs_cache.c =================================================================== --- head/sys/kern/vfs_cache.c (revision 228432) +++ head/sys/kern/vfs_cache.c (revision 228433) @@ -1,1280 +1,1280 @@ /*- * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_kdtrace.h" #include "opt_ktrace.h" #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #ifdef KTRACE #include #endif #include SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, hit, "struct vnode *", "struct char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, return, "int", "struct vnode *", "struct char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, hit-negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, purge, done, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, done, "struct vnode *", "char *"); /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct namecache { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name + nul */ }; /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. */ /* * Structures associated with name cacheing. 
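 *
 * The hash key mentioned above is computed over the name first and the
 * directory vnode pointer second, roughly as done in cache_lookup() and
 * cache_enter() below:
 *
 *	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
 *	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 *	ncpp = NCHHASH(hash);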
*/ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ static u_long nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long ncnegfactor = 16; /* ratio of negative entries */ SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); static u_long numneg; /* number of negative entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, "Number of negative entries in namecache"); static u_long numcache; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "Number of namecache entries"); static u_long numcachehv; /* number of cache entries with vnodes held */ SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "Number of namecache entries with vnodes held"); static u_int ncsizefactor = 2; SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); struct nchstats nchstats; /* cache effectiveness statistics */ static struct rwlock cache_lock; RW_SYSINIT(vfscache, &cache_lock, "Name Cache"); #define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock) #define CACHE_RLOCK() rw_rlock(&cache_lock) #define CACHE_RUNLOCK() rw_runlock(&cache_lock) #define CACHE_WLOCK() rw_wlock(&cache_lock) #define CACHE_WUNLOCK() rw_wunlock(&cache_lock) /* * UMA zones for the VFS cache. * * The small cache is used for entries with short names, which are the * most common. The large cache is used for entries which are too big to * fit in the small cache. */ static uma_zone_t cache_zone_small; static uma_zone_t cache_zone_large; #define CACHE_PATH_CUTOFF 35 #define CACHE_ZONE_SMALL (sizeof(struct namecache) + CACHE_PATH_CUTOFF \ + 1) #define CACHE_ZONE_LARGE (sizeof(struct namecache) + NAME_MAX + 1) #define cache_alloc(len) uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \ cache_zone_small : cache_zone_large, M_WAITOK) #define cache_free(ncp) do { \ if (ncp != NULL) \ uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \ cache_zone_small : cache_zone_large, (ncp)); \ } while (0) static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0, sizeof(struct namecache), "sizeof(struct namecache)"); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); #define STATNODE(mode, name, var, descr) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, descr); STATNODE(CTLFLAG_RD, numneg, &numneg, "Number of negative cache entries"); STATNODE(CTLFLAG_RD, numcache, &numcache, "Number of cache entries"); static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls, "Number of cache lookups"); static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits, "Number of '.' hits"); static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits, "Number of '..' 
hits"); static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks, "Number of checks in lookup"); static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss, "Number of cache misses"); static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap, "Number of cache misses we do not want to cache"); static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps, "Number of cache hits (positive) we do not want to cache"); static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits, "Number of cache hits (positive)"); static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps, "Number of cache hits (negative) we do not want to cache"); static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits, "Number of cache hits (negative)"); static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades, "Number of updates of the cache after lookup (write lock + retry)"); SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE, &nchstats, sizeof(nchstats), "LU", "VFS cache effectiveness statistics"); static void cache_zap(struct namecache *ncp); static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen); static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #ifdef DIAGNOSTIC /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count; n_nchash = nchash + 1; /* nchash is max index, not count */ if (!req->oldptr) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { CACHE_RLOCK(); count = 0; LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } CACHE_RUNLOCK(); error = SYSCTL_OUT(req, &count, sizeof(count)); if (error) return (error); } return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; CACHE_RLOCK(); LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } CACHE_RUNLOCK(); if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; pct = (used * 100 * 100) / n_nchash; error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths"); #endif /* * cache_zap(): * * 
Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap(ncp) struct namecache *ncp; { struct vnode *vp; rw_assert(&cache_lock, RA_WLOCKED); CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp); #ifdef KDTRACE_HOOKS if (ncp->nc_vp != NULL) { SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp, ncp->nc_name, ncp->nc_vp, 0, 0); } else { SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp, ncp->nc_name, 0, 0, 0); } #endif vp = NULL; LIST_REMOVE(ncp, nc_hash); if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == ncp->nc_dvp->v_cache_dd) ncp->nc_dvp->v_cache_dd = NULL; } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { vp = ncp->nc_dvp; numcachehv--; } } if (ncp->nc_vp) { TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); if (ncp == ncp->nc_vp->v_cache_dd) ncp->nc_vp->v_cache_dd = NULL; } else { TAILQ_REMOVE(&ncneg, ncp, nc_dst); numneg--; } numcache--; cache_free(ncp); if (vp) vdrop(vp); } /* * Lookup an entry in the cache * * Lookup is called with dvp pointing to the directory to search, * cnp pointing to the name of the entry being sought. If the lookup * succeeds, the vnode is returned in *vpp, and a status of -1 is * returned. If the lookup determines that the name does not exist * (negative cacheing), a status of ENOENT is returned. If the lookup * fails, a status of zero is returned. If the directory vnode is * recycled out from under us due to a forced unmount, a status of * ENOENT is returned. * * vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is * unlocked. If we're looking up . an extra ref is taken, but the lock is * not recursively acquired. */ int cache_lookup(dvp, vpp, cnp) struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { struct namecache *ncp; uint32_t hash; int error, ltype, wlocked; if (!doingcache) { cnp->cn_flags &= ~MAKEENTRY; return (0); } retry: CACHE_RLOCK(); wlocked = 0; numcalls++; error = 0; retry_wlocked: if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) { *vpp = dvp; CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", dvp, cnp->cn_nameptr); dothits++; SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".", *vpp, 0, 0); goto success; } if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { dotdothits++; if (dvp->v_cache_dd == NULL) { SDT_PROBE(vfs, namecache, lookup, miss, dvp, "..", NULL, 0, 0); goto unlock; } if ((cnp->cn_flags & MAKEENTRY) == 0) { if (!wlocked && !CACHE_UPGRADE_LOCK()) goto wlock; if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) cache_zap(dvp->v_cache_dd); dvp->v_cache_dd = NULL; CACHE_WUNLOCK(); return (0); } if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) *vpp = dvp->v_cache_dd->nc_vp; else *vpp = dvp->v_cache_dd->nc_dvp; /* Return failure if negative entry was found. 
*/ if (*vpp == NULL) { ncp = dvp->v_cache_dd; goto negative_success; } CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", dvp, cnp->cn_nameptr, *vpp); SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..", *vpp, 0, 0); goto success; } } hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { numchecks++; if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } /* We failed to find an entry */ if (ncp == NULL) { SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL, 0, 0); if ((cnp->cn_flags & MAKEENTRY) == 0) { nummisszap++; } else { nummiss++; } nchstats.ncs_miss++; goto unlock; } /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { numposzaps++; nchstats.ncs_badhits++; if (!wlocked && !CACHE_UPGRADE_LOCK()) goto wlock; cache_zap(ncp); CACHE_WUNLOCK(); return (0); } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp) { numposhits++; nchstats.ncs_goodhits++; *vpp = ncp->nc_vp; CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", dvp, cnp->cn_nameptr, *vpp, ncp); SDT_PROBE(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp, 0, 0); goto success; } negative_success: /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { numnegzaps++; nchstats.ncs_badhits++; if (!wlocked && !CACHE_UPGRADE_LOCK()) goto wlock; cache_zap(ncp); CACHE_WUNLOCK(); return (0); } if (!wlocked && !CACHE_UPGRADE_LOCK()) goto wlock; numneghits++; /* * We found a "negative" match, so we shift it to the end of * the "negative" cache entries queue to satisfy LRU. Also, * check to see if the entry is a whiteout; indicate this to * the componentname, if so. */ TAILQ_REMOVE(&ncneg, ncp, nc_dst); TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); nchstats.ncs_neghits++; if (ncp->nc_flag & NCF_WHITE) cnp->cn_flags |= ISWHITEOUT; SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, ncp->nc_name, 0, 0, 0); CACHE_WUNLOCK(); return (ENOENT); wlock: /* * We need to update the cache after our lookup, so upgrade to * a write lock and retry the operation. */ CACHE_RUNLOCK(); CACHE_WLOCK(); numupgrades++; wlocked = 1; goto retry_wlocked; success: /* * On success we return a locked and ref'd vnode as per the lookup * protocol. */ if (dvp == *vpp) { /* lookup on "." */ VREF(*vpp); if (wlocked) CACHE_WUNLOCK(); else CACHE_RUNLOCK(); /* * When we lookup "." we still can be asked to lock it * differently... 
*/ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if ((*vpp)->v_iflag & VI_DOOMED) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } ltype = 0; /* silence gcc warning */ if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp, 0); } VI_LOCK(*vpp); if (wlocked) CACHE_WUNLOCK(); else CACHE_RUNLOCK(); error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread); if (cnp->cn_flags & ISDOTDOT) { vn_lock(dvp, ltype | LK_RETRY); if (dvp->v_iflag & VI_DOOMED) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } } if (error) { *vpp = NULL; goto retry; } if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); } return (-1); unlock: if (wlocked) CACHE_WUNLOCK(); else CACHE_RUNLOCK(); return (0); } /* * Add an entry to the cache. */ void cache_enter(dvp, vp, cnp) struct vnode *dvp; struct vnode *vp; struct componentname *cnp; { struct namecache *ncp, *n2; struct nchashhead *ncpp; uint32_t hash; int flag; int hold; int zap; int len; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp, ("cache_enter: Adding a doomed vnode")); VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp, ("cache_enter: Doomed vnode used as src")); if (!doingcache) return; /* * Avoid blowout in namecache entries. */ if (numcache >= desiredvnodes * ncsizefactor) return; flag = 0; if (cnp->cn_nameptr[0] == '.') { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { CACHE_WLOCK(); /* * If dotdot entry already exists, just retarget it * to new parent vnode, otherwise continue with new * namecache entry allocation. */ if ((ncp = dvp->v_cache_dd) != NULL && ncp->nc_flag & NCF_ISDOTDOT) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); if (ncp->nc_vp != NULL) TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); else TAILQ_REMOVE(&ncneg, ncp, nc_dst); if (vp != NULL) TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); else TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); ncp->nc_vp = vp; CACHE_WUNLOCK(); return; } dvp->v_cache_dd = NULL; SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp, 0, 0); CACHE_WUNLOCK(); flag = NCF_ISDOTDOT; } } hold = 0; zap = 0; /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp = cache_alloc(cnp->cn_namelen); ncp->nc_vp = vp; ncp->nc_dvp = dvp; ncp->nc_flag = flag; len = ncp->nc_nlen = cnp->cn_namelen; hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT); strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1); hash = fnv_32_buf(&dvp, sizeof(dvp), hash); CACHE_WLOCK(); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); LIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { CACHE_WUNLOCK(); cache_free(ncp); return; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. 
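 *
 * In that case the racing entry wins and the entry pre-allocated here is
 * simply freed.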
*/ if (dvp->v_cache_dd != NULL) { CACHE_WUNLOCK(); cache_free(ncp); return; } KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); dvp->v_cache_dd = ncp; } numcache++; if (!vp) { numneg++; if (cnp->cn_flags & ISWHITEOUT) ncp->nc_flag |= NCF_WHITE; } else if (vp->v_type == VDIR) { if (flag != NCF_ISDOTDOT) { if ((n2 = vp->v_cache_dd) != NULL && (n2->nc_flag & NCF_ISDOTDOT) != 0) cache_zap(n2); vp->v_cache_dd = ncp; } } else { vp->v_cache_dd = NULL; } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ LIST_INSERT_HEAD(ncpp, ncp, nc_hash); if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { hold = 1; numcachehv++; } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE(vfs, namecache, enter, done, dvp, ncp->nc_name, vp, 0, 0); } else { TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); SDT_PROBE(vfs, namecache, enter_negative, done, dvp, ncp->nc_name, 0, 0, 0); } if (numneg * ncnegfactor > numcache) { ncp = TAILQ_FIRST(&ncneg); zap = 1; } if (hold) vhold(dvp); if (zap) cache_zap(ncp); CACHE_WUNLOCK(); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { TAILQ_INIT(&ncneg); cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); /* * Invalidate all entries to a particular vnode. */ void cache_purge(vp) struct vnode *vp; { CTR1(KTR_VFS, "cache_purge(%p)", vp); SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0); CACHE_WLOCK(); while (!LIST_EMPTY(&vp->v_cache_src)) cache_zap(LIST_FIRST(&vp->v_cache_src)); while (!TAILQ_EMPTY(&vp->v_cache_dst)) cache_zap(TAILQ_FIRST(&vp->v_cache_dst)); if (vp->v_cache_dd != NULL) { KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); cache_zap(vp->v_cache_dd); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); CACHE_WUNLOCK(); } /* * Invalidate all negative entries for a particular directory vnode. */ void cache_purge_negative(vp) struct vnode *vp; { struct namecache *cp, *ncp; CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0); CACHE_WLOCK(); LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) { if (cp->nc_vp == NULL) cache_zap(cp); } CACHE_WUNLOCK(); } /* * Flush all entries referencing a particular filesystem. */ void cache_purgevfs(mp) struct mount *mp; { struct nchashhead *ncpp; struct namecache *ncp, *nnp; /* Scan hash tables for applicable entries */ SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0); CACHE_WLOCK(); for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) { if (ncp->nc_dvp->v_mount == mp) cache_zap(ncp); } } CACHE_WUNLOCK(); } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. 
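 *
 * cache_lookup() return values as used below: 0 means the name was not
 * found and VOP_CACHEDLOOKUP() must do the real work, -1 means a positive
 * hit was returned in *vpp, and ENOENT means a cached negative entry (or
 * a doomed directory vnode) was found.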
*/ int vfs_cache_lookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; struct ucred *cred = cnp->cn_cred; int flags = cnp->cn_flags; struct thread *td = cnp->cn_thread; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = VOP_ACCESS(dvp, VEXEC, cred, td); if (error) return (error); error = cache_lookup(dvp, vpp, cnp); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } #ifndef _SYS_SYSPROTO_H_ struct __getcwd_args { u_char *buf; u_int buflen; }; #endif /* * XXX All of these sysctls would probably be more productive dead. */ static int disablecwd; SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "Disable the getcwd syscall"); /* Implementation of the getcwd syscall. */ int sys___getcwd(td, uap) struct thread *td; struct __getcwd_args *uap; { return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen)); } int kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen) { char *bp, *tmpbuf; struct filedesc *fdp; struct vnode *cdir, *rdir; int error, vfslocked; if (disablecwd) return (ENODEV); if (buflen < 2) return (EINVAL); if (buflen > MAXPATHLEN) buflen = MAXPATHLEN; tmpbuf = malloc(buflen, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); cdir = fdp->fd_cdir; VREF(cdir); rdir = fdp->fd_rdir; VREF(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen); vfslocked = VFS_LOCK_GIANT(rdir->v_mount); vrele(rdir); VFS_UNLOCK_GIANT(vfslocked); vfslocked = VFS_LOCK_GIANT(cdir->v_mount); vrele(cdir); VFS_UNLOCK_GIANT(vfslocked); if (!error) { if (bufseg == UIO_SYSSPACE) bcopy(bp, buf, strlen(bp) + 1); else error = copyout(bp, buf, strlen(bp) + 1); #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI)) ktrnamei(bp); #endif } free(tmpbuf, M_TEMP); return (error); } /* * Thus begins the fullpath magic. */ #undef STATNODE #define STATNODE(name, descr) \ static u_int name; \ SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr) static int disablefullpath; SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0, "Disable the vn_fullpath function"); /* These count for kern___getcwd(), too. 
*/ STATNODE(numfullpathcalls, "Number of fullpath search calls"); STATNODE(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); STATNODE(numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE(numfullpathfound, "Number of successful fullpath calls"); /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; struct filedesc *fdp; struct vnode *rdir; int error, vfslocked; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); rdir = fdp->fd_rdir; VREF(rdir); FILEDESC_SUNLOCK(fdp); error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN); vfslocked = VFS_LOCK_GIANT(rdir->v_mount); vrele(rdir); VFS_UNLOCK_GIANT(vfslocked); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; int error; if (disablefullpath) return (ENODEV); if (vn == NULL) return (EINVAL); buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) { int error; CACHE_RLOCK(); error = vn_vptocnp_locked(vp, cred, buf, buflen); if (error == 0) CACHE_RUNLOCK(); return (error); } static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen) { struct vnode *dvp; struct namecache *ncp; int error, vfslocked; TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT((*vp)->v_mount); vrele(*vp); VFS_UNLOCK_GIANT(vfslocked); numfullpathfail4++; error = ENOMEM; SDT_PROBE(vfs, namecache, fullpath, return, error, vp, NULL, 0, 0); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp, ncp->nc_name, vp, 0, 0); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT(dvp->v_mount); vrele(dvp); VFS_UNLOCK_GIANT(vfslocked); CACHE_RLOCK(); return (0); } SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0, 0, 0); CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT((*vp)->v_mount); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); vput(*vp); VFS_UNLOCK_GIANT(vfslocked); if (error) { numfullpathfail2++; SDT_PROBE(vfs, namecache, fullpath, return, error, vp, NULL, 0, 0); return (error); } *vp = dvp; CACHE_RLOCK(); if (dvp->v_iflag & VI_DOOMED) { /* forced unmount */ CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT(dvp->v_mount); vrele(dvp); VFS_UNLOCK_GIANT(vfslocked); error = ENOENT; SDT_PROBE(vfs, namecache, fullpath, return, error, vp, NULL, 0, 0); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * The magic behind kern___getcwd() and vn_fullpath(). 
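 *
 * vn_fullpath1() builds the path backwards: starting from vp it repeatedly
 * resolves the parent via vn_vptocnp_locked(), crosses mount points through
 * mnt_vnodecovered, and prepends each component and a '/' at the tail end
 * of buf until rdir or rootvnode is reached, so *retbuf ends up pointing
 * into the middle of buf.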
*/ static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, u_int buflen) { int error, slash_prefixed, vfslocked; #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; buflen--; buf[buflen] = '\0'; error = 0; slash_prefixed = 0; SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0); numfullpathcalls++; vref(vp); CACHE_RLOCK(); if (vp->v_type != VDIR) { error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen); if (error) return (error); if (buflen == 0) { CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (ENOMEM); } buf[--buflen] = '/'; slash_prefixed = 1; } while (vp != rdir && vp != rootvnode) { if (vp->v_vflag & VV_ROOT) { if (vp->v_iflag & VI_DOOMED) { /* forced unmount */ CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); error = ENOENT; SDT_PROBE(vfs, namecache, fullpath, return, error, vp, NULL, 0, 0); break; } vp1 = vp->v_mount->mnt_vnodecovered; vref(vp1); CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); vp = vp1; CACHE_RLOCK(); continue; } if (vp->v_type != VDIR) { CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); numfullpathfail1++; error = ENOTDIR; SDT_PROBE(vfs, namecache, fullpath, return, error, vp, NULL, 0, 0); break; } error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen); if (error) break; if (buflen == 0) { CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); error = ENOMEM; SDT_PROBE(vfs, namecache, fullpath, return, error, startvp, NULL, 0, 0); break; } buf[--buflen] = '/'; slash_prefixed = 1; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); numfullpathfail4++; SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL, 0, 0); return (ENOMEM); } buf[--buflen] = '/'; } numfullpathfound++; CACHE_RUNLOCK(); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen, 0, 0); *retbuf = buf + buflen; return (0); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; int l; CACHE_RLOCK(); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { CACHE_RUNLOCK(); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, ncp->nc_name, l); CACHE_RUNLOCK(); buf[l] = '\0'; return (0); } Index: head/sys/security/mac/mac_framework.c =================================================================== --- head/sys/security/mac/mac_framework.c (revision 228432) +++ head/sys/security/mac/mac_framework.c (revision 228433) @@ -1,597 +1,597 @@ /*- * Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson * Copyright (c) 2001 Ilmar S. Habibulin * Copyright (c) 2001-2005 Networks Associates Technology, Inc. * Copyright (c) 2005-2006 SPARTA, Inc. * Copyright (c) 2008-2009 Apple Inc. * All rights reserved. * * This software was developed by Robert Watson and Ilmar Habibulin for the * TrustedBSD Project. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. 
* * This software was enhanced by SPARTA ISSO under SPAWAR contract * N66001-04-C-6019 ("SEFOS"). * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Framework for extensible kernel access control. This file contains core * kernel infrastructure for the TrustedBSD MAC Framework, including policy * registration, versioning, locking, error composition operator, and system * calls. * * The MAC Framework implements three programming interfaces: * * - The kernel MAC interface, defined in mac_framework.h, and invoked * throughout the kernel to request security decisions, notify of security * related events, etc. * * - The MAC policy module interface, defined in mac_policy.h, which is * implemented by MAC policy modules and invoked by the MAC Framework to * forward kernel security requests and notifications to policy modules. * * - The user MAC API, defined in mac.h, which allows user programs to query * and set label state on objects. * * The majority of the MAC Framework implementation may be found in * src/sys/security/mac. Sample policy modules may be found in * src/sys/security/mac_*. */ #include "opt_kdtrace.h" #include "opt_mac.h" #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include #include -#include #include #include #include #include /* * DTrace SDT providers for MAC. */ SDT_PROVIDER_DEFINE(mac); SDT_PROVIDER_DEFINE(mac_framework); SDT_PROBE_DEFINE2(mac, kernel, policy, modevent, modevent, "int", "struct mac_policy_conf *mpc"); SDT_PROBE_DEFINE1(mac, kernel, policy, register, register, "struct mac_policy_conf *"); SDT_PROBE_DEFINE1(mac, kernel, policy, unregister, unregister, "struct mac_policy_conf *"); /* * Root sysctl node for all MAC and MAC policy controls. */ SYSCTL_NODE(_security, OID_AUTO, mac, CTLFLAG_RW, 0, "TrustedBSD MAC policy controls"); /* * Declare that the kernel provides MAC support, version 3 (FreeBSD 7.x). * This permits modules to refuse to be loaded if the necessary support isn't * present, even if it's pre-boot. 
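 *
 * A module can state that requirement explicitly with MODULE_DEPEND(9), as
 * in the illustrative line below; mac_mypolicy is a made-up module name,
 * and policies built with the MAC_POLICY_SET() macro typically have an
 * equivalent dependency declared on their behalf.
 */

/* Hypothetical policy module pinning itself to this MAC interface version. */
MODULE_DEPEND(mac_mypolicy, kernel_mac_support, MAC_VERSION, MAC_VERSION,
    MAC_VERSION);

/*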
*/ MODULE_VERSION(kernel_mac_support, MAC_VERSION); static unsigned int mac_version = MAC_VERSION; SYSCTL_UINT(_security_mac, OID_AUTO, version, CTLFLAG_RD, &mac_version, 0, ""); /* * Labels consist of a indexed set of "slots", which are allocated policies * as required. The MAC Framework maintains a bitmask of slots allocated so * far to prevent reuse. Slots cannot be reused, as the MAC Framework * guarantees that newly allocated slots in labels will be NULL unless * otherwise initialized, and because we do not have a mechanism to garbage * collect slots on policy unload. As labeled policies tend to be statically * loaded during boot, and not frequently unloaded and reloaded, this is not * generally an issue. */ #if MAC_MAX_SLOTS > 32 #error "MAC_MAX_SLOTS too large" #endif static unsigned int mac_max_slots = MAC_MAX_SLOTS; static unsigned int mac_slot_offsets_free = (1 << MAC_MAX_SLOTS) - 1; SYSCTL_UINT(_security_mac, OID_AUTO, max_slots, CTLFLAG_RD, &mac_max_slots, 0, ""); /* * Has the kernel started generating labeled objects yet? All read/write * access to this variable is serialized during the boot process. Following * the end of serialization, we don't update this flag; no locking. */ static int mac_late = 0; /* * Each policy declares a mask of object types requiring labels to be * allocated for them. For convenience, we combine and cache the bitwise or * of the per-policy object flags to track whether we will allocate a label * for an object type at run-time. */ uint64_t mac_labeled; SYSCTL_UQUAD(_security_mac, OID_AUTO, labeled, CTLFLAG_RD, &mac_labeled, 0, "Mask of object types being labeled"); MALLOC_DEFINE(M_MACTEMP, "mactemp", "MAC temporary label storage"); /* * MAC policy modules are placed in one of two lists: mac_static_policy_list, * for policies that are loaded early and cannot be unloaded, and * mac_policy_list, which holds policies either loaded later in the boot * cycle or that may be unloaded. The static policy list does not require * locks to iterate over, but the dynamic list requires synchronization. * Support for dynamic policy loading can be compiled out using the * MAC_STATIC kernel option. * * The dynamic policy list is protected by two locks: modifying the list * requires both locks to be held exclusively. One of the locks, * mac_policy_rm, is acquired over policy entry points that will never sleep; * the other, mac_policy_sx, is acquire over policy entry points that may * sleep. The former category will be used when kernel locks may be held * over calls to the MAC Framework, during network processing in ithreads, * etc. The latter will tend to involve potentially blocking memory * allocations, extended attribute I/O, etc. */ #ifndef MAC_STATIC static struct rmlock mac_policy_rm; /* Non-sleeping entry points. */ static struct sx mac_policy_sx; /* Sleeping entry points. */ #endif struct mac_policy_list_head mac_policy_list; struct mac_policy_list_head mac_static_policy_list; u_int mac_policy_count; /* Registered policy count. 
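 */

/*
 * Illustration of the slot scheme described above: slots are handed out
 * with ffs() on a free bitmask and are never recycled.  The sketch below
 * is a self-contained userland rendition; MAX_SLOTS and the loop that
 * "registers" policies are invented for the example.
 */

#include <err.h>
#include <stdio.h>
#include <strings.h>

#define	MAX_SLOTS	4			/* invented for the sketch */

static unsigned int slots_free = (1 << MAX_SLOTS) - 1;

/* Hand out the lowest free slot, or -1 once the mask is exhausted. */
static int
slot_alloc(void)
{
	int slot;

	slot = ffs(slots_free);
	if (slot == 0)
		return (-1);
	slot--;
	slots_free &= ~(1 << slot);
	return (slot);
}

int
main(void)
{
	int i, slot;

	for (i = 0; i < 5; i++) {
		slot = slot_alloc();
		if (slot == -1)
			errx(1, "out of label slots after %d policies", i);
		printf("policy %d gets label slot %d\n", i, slot);
	}
	return (0);
}

/*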
*/ static void mac_policy_xlock(void); static void mac_policy_xlock_assert(void); static void mac_policy_xunlock(void); void mac_policy_slock_nosleep(struct rm_priotracker *tracker) { #ifndef MAC_STATIC if (!mac_late) return; rm_rlock(&mac_policy_rm, tracker); #endif } void mac_policy_slock_sleep(void) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "mac_policy_slock_sleep"); #ifndef MAC_STATIC if (!mac_late) return; sx_slock(&mac_policy_sx); #endif } void mac_policy_sunlock_nosleep(struct rm_priotracker *tracker) { #ifndef MAC_STATIC if (!mac_late) return; rm_runlock(&mac_policy_rm, tracker); #endif } void mac_policy_sunlock_sleep(void) { #ifndef MAC_STATIC if (!mac_late) return; sx_sunlock(&mac_policy_sx); #endif } static void mac_policy_xlock(void) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "mac_policy_xlock()"); #ifndef MAC_STATIC if (!mac_late) return; sx_xlock(&mac_policy_sx); rm_wlock(&mac_policy_rm); #endif } static void mac_policy_xunlock(void) { #ifndef MAC_STATIC if (!mac_late) return; rm_wunlock(&mac_policy_rm); sx_xunlock(&mac_policy_sx); #endif } static void mac_policy_xlock_assert(void) { #ifndef MAC_STATIC if (!mac_late) return; /* XXXRW: rm_assert(&mac_policy_rm, RA_WLOCKED); */ sx_assert(&mac_policy_sx, SA_XLOCKED); #endif } /* * Initialize the MAC subsystem, including appropriate SMP locks. */ static void mac_init(void) { LIST_INIT(&mac_static_policy_list); LIST_INIT(&mac_policy_list); mac_labelzone_init(); #ifndef MAC_STATIC rm_init_flags(&mac_policy_rm, "mac_policy_rm", RM_NOWITNESS); sx_init_flags(&mac_policy_sx, "mac_policy_sx", SX_NOWITNESS); #endif } /* * For the purposes of modules that want to know if they were loaded "early", * set the mac_late flag once we've processed modules either linked into the * kernel, or loaded before the kernel startup. */ static void mac_late_init(void) { mac_late = 1; } /* * Given a policy, derive from its set of non-NULL label init methods what * object types the policy is interested in. */ static uint64_t mac_policy_getlabeled(struct mac_policy_conf *mpc) { uint64_t labeled; #define MPC_FLAG(method, flag) \ if (mpc->mpc_ops->mpo_ ## method != NULL) \ labeled |= (flag); \ labeled = 0; MPC_FLAG(cred_init_label, MPC_OBJECT_CRED); MPC_FLAG(proc_init_label, MPC_OBJECT_PROC); MPC_FLAG(vnode_init_label, MPC_OBJECT_VNODE); MPC_FLAG(inpcb_init_label, MPC_OBJECT_INPCB); MPC_FLAG(socket_init_label, MPC_OBJECT_SOCKET); MPC_FLAG(devfs_init_label, MPC_OBJECT_DEVFS); MPC_FLAG(mbuf_init_label, MPC_OBJECT_MBUF); MPC_FLAG(ipq_init_label, MPC_OBJECT_IPQ); MPC_FLAG(ifnet_init_label, MPC_OBJECT_IFNET); MPC_FLAG(bpfdesc_init_label, MPC_OBJECT_BPFDESC); MPC_FLAG(pipe_init_label, MPC_OBJECT_PIPE); MPC_FLAG(mount_init_label, MPC_OBJECT_MOUNT); MPC_FLAG(posixsem_init_label, MPC_OBJECT_POSIXSEM); MPC_FLAG(posixshm_init_label, MPC_OBJECT_POSIXSHM); MPC_FLAG(sysvmsg_init_label, MPC_OBJECT_SYSVMSG); MPC_FLAG(sysvmsq_init_label, MPC_OBJECT_SYSVMSQ); MPC_FLAG(sysvsem_init_label, MPC_OBJECT_SYSVSEM); MPC_FLAG(sysvshm_init_label, MPC_OBJECT_SYSVSHM); MPC_FLAG(syncache_init_label, MPC_OBJECT_SYNCACHE); MPC_FLAG(ip6q_init_label, MPC_OBJECT_IP6Q); #undef MPC_FLAG return (labeled); } /* * When policies are loaded or unloaded, walk the list of registered policies * and built mac_labeled, a bitmask representing the union of all objects * requiring labels across all policies. 
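 *
 * The per-policy contribution comes from mac_policy_getlabeled() above:
 * every non-NULL *_init_label hook sets one MPC_OBJECT_* bit.  A toy
 * userland sketch of that pattern follows; the structure, hook and bit
 * names are invented and are not the real mac_policy_ops layout.
 */

#include <stdint.h>
#include <stdio.h>

/* Invented miniature of a policy ops vector with two label init hooks. */
struct mini_ops {
	void	(*cred_init_label)(void *);
	void	(*vnode_init_label)(void *);
};

#define	OBJ_CRED	0x01
#define	OBJ_VNODE	0x02

static void
my_cred_init(void *label)
{

	(void)label;
}

/* Same pattern as mac_policy_getlabeled(): a non-NULL hook sets its bit. */
static uint64_t
getlabeled(const struct mini_ops *ops)
{
	uint64_t labeled = 0;

	if (ops->cred_init_label != NULL)
		labeled |= OBJ_CRED;
	if (ops->vnode_init_label != NULL)
		labeled |= OBJ_VNODE;
	return (labeled);
}

int
main(void)
{
	struct mini_ops ops = { .cred_init_label = my_cred_init };

	/* Prints 0x1: only credentials need labels for this policy. */
	printf("labeled mask: 0x%jx\n", (uintmax_t)getlabeled(&ops));
	return (0);
}

/*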
*/ static void mac_policy_update(void) { struct mac_policy_conf *mpc; mac_policy_xlock_assert(); mac_labeled = 0; mac_policy_count = 0; LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) { mac_labeled |= mac_policy_getlabeled(mpc); mac_policy_count++; } LIST_FOREACH(mpc, &mac_policy_list, mpc_list) { mac_labeled |= mac_policy_getlabeled(mpc); mac_policy_count++; } } static int mac_policy_register(struct mac_policy_conf *mpc) { struct mac_policy_conf *tmpc; int error, slot, static_entry; error = 0; /* * We don't technically need exclusive access while !mac_late, but * hold it for assertion consistency. */ mac_policy_xlock(); /* * If the module can potentially be unloaded, or we're loading late, * we have to stick it in the non-static list and pay an extra * performance overhead. Otherwise, we can pay a light locking cost * and stick it in the static list. */ static_entry = (!mac_late && !(mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK)); if (static_entry) { LIST_FOREACH(tmpc, &mac_static_policy_list, mpc_list) { if (strcmp(tmpc->mpc_name, mpc->mpc_name) == 0) { error = EEXIST; goto out; } } } else { LIST_FOREACH(tmpc, &mac_policy_list, mpc_list) { if (strcmp(tmpc->mpc_name, mpc->mpc_name) == 0) { error = EEXIST; goto out; } } } if (mpc->mpc_field_off != NULL) { slot = ffs(mac_slot_offsets_free); if (slot == 0) { error = ENOMEM; goto out; } slot--; mac_slot_offsets_free &= ~(1 << slot); *mpc->mpc_field_off = slot; } mpc->mpc_runtime_flags |= MPC_RUNTIME_FLAG_REGISTERED; /* * If we're loading a MAC module after the framework has initialized, * it has to go into the dynamic list. If we're loading it before * we've finished initializing, it can go into the static list with * weaker locker requirements. */ if (static_entry) LIST_INSERT_HEAD(&mac_static_policy_list, mpc, mpc_list); else LIST_INSERT_HEAD(&mac_policy_list, mpc, mpc_list); /* * Per-policy initialization. Currently, this takes place under the * exclusive lock, so policies must not sleep in their init method. * In the future, we may want to separate "init" from "start", with * "init" occuring without the lock held. Likewise, on tear-down, * breaking out "stop" from "destroy". */ if (mpc->mpc_ops->mpo_init != NULL) (*(mpc->mpc_ops->mpo_init))(mpc); mac_policy_update(); SDT_PROBE(mac, kernel, policy, register, mpc, 0, 0, 0, 0); printf("Security policy loaded: %s (%s)\n", mpc->mpc_fullname, mpc->mpc_name); out: mac_policy_xunlock(); return (error); } static int mac_policy_unregister(struct mac_policy_conf *mpc) { /* * If we fail the load, we may get a request to unload. Check to see * if we did the run-time registration, and if not, silently succeed. */ mac_policy_xlock(); if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED) == 0) { mac_policy_xunlock(); return (0); } #if 0 /* * Don't allow unloading modules with private data. */ if (mpc->mpc_field_off != NULL) { mac_policy_xunlock(); return (EBUSY); } #endif /* * Only allow the unload to proceed if the module is unloadable by * its own definition. */ if ((mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK) == 0) { mac_policy_xunlock(); return (EBUSY); } if (mpc->mpc_ops->mpo_destroy != NULL) (*(mpc->mpc_ops->mpo_destroy))(mpc); LIST_REMOVE(mpc, mpc_list); mpc->mpc_runtime_flags &= ~MPC_RUNTIME_FLAG_REGISTERED; mac_policy_update(); mac_policy_xunlock(); SDT_PROBE(mac, kernel, policy, unregister, mpc, 0, 0, 0, 0); printf("Security policy unload: %s (%s)\n", mpc->mpc_fullname, mpc->mpc_name); return (0); } /* * Allow MAC policy modules to register during boot, etc. 
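 * Registration normally reaches mac_policy_modevent() below through the
 * MAC_POLICY_SET() macro from mac_policy.h.  A hedged sketch of a minimal
 * policy follows; the policy name is invented, only the init hook is
 * filled in, and the macro's exact argument list should be checked against
 * mac_policy.h before use.
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/systm.h>

#include <security/mac/mac_policy.h>

static void
mac_mypolicy_init(struct mac_policy_conf *mpc)
{

	printf("%s: initialized\n", mpc->mpc_name);
}

static struct mac_policy_ops mac_mypolicy_ops = {
	.mpo_init = mac_mypolicy_init,
};

MAC_POLICY_SET(&mac_mypolicy_ops, mac_mypolicy, "Example policy",
    MPC_LOADTIME_FLAG_UNLOADOK, NULL);

/*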
*/ int mac_policy_modevent(module_t mod, int type, void *data) { struct mac_policy_conf *mpc; int error; error = 0; mpc = (struct mac_policy_conf *) data; #ifdef MAC_STATIC if (mac_late) { printf("mac_policy_modevent: MAC_STATIC and late\n"); return (EBUSY); } #endif SDT_PROBE(mac, kernel, policy, modevent, type, mpc, 0, 0, 0); switch (type) { case MOD_LOAD: if (mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_NOTLATE && mac_late) { printf("mac_policy_modevent: can't load %s policy " "after booting\n", mpc->mpc_name); error = EBUSY; break; } error = mac_policy_register(mpc); break; case MOD_UNLOAD: /* Don't unregister the module if it was never registered. */ if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED) != 0) error = mac_policy_unregister(mpc); else error = 0; break; default: error = EOPNOTSUPP; break; } return (error); } /* * Define an error value precedence, and given two arguments, selects the * value with the higher precedence. */ int mac_error_select(int error1, int error2) { /* Certain decision-making errors take top priority. */ if (error1 == EDEADLK || error2 == EDEADLK) return (EDEADLK); /* Invalid arguments should be reported where possible. */ if (error1 == EINVAL || error2 == EINVAL) return (EINVAL); /* Precedence goes to "visibility", with both process and file. */ if (error1 == ESRCH || error2 == ESRCH) return (ESRCH); if (error1 == ENOENT || error2 == ENOENT) return (ENOENT); /* Precedence goes to DAC/MAC protections. */ if (error1 == EACCES || error2 == EACCES) return (EACCES); /* Precedence goes to privilege. */ if (error1 == EPERM || error2 == EPERM) return (EPERM); /* Precedence goes to error over success; otherwise, arbitrary. */ if (error1 != 0) return (error1); return (error2); } int mac_check_structmac_consistent(struct mac *mac) { if (mac->m_buflen < 0 || mac->m_buflen > MAC_MAX_LABEL_BUF_LEN) return (EINVAL); return (0); } SYSINIT(mac, SI_SUB_MAC, SI_ORDER_FIRST, mac_init, NULL); SYSINIT(mac_late, SI_SUB_MAC_LATE, SI_ORDER_FIRST, mac_late_init, NULL); Index: head/sys/security/mac/mac_priv.c =================================================================== --- head/sys/security/mac/mac_priv.c (revision 228432) +++ head/sys/security/mac/mac_priv.c (revision 228433) @@ -1,96 +1,97 @@ /*- * Copyright (c) 2006 nCircle Network Security, Inc. * Copyright (c) 2009 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert N. M. Watson for the TrustedBSD * Project under contract to nCircle Network Security, Inc. * * This software was developed at the University of Cambridge Computer * Laboratory with support from a grant from Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY, * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * MAC checks for system privileges. */ #include "sys/cdefs.h" __FBSDID("$FreeBSD$"); #include "opt_kdtrace.h" #include "opt_mac.h" #include +#include #include #include #include #include #include #include #include /* * The MAC Framework interacts with kernel privilege checks in two ways: it * may restrict the granting of privilege to a subject, and it may grant * additional privileges to the subject. Policies may implement none, one, * or both of these entry points. Restriction of privilege by any policy * always overrides granting of privilege by any policy or other privilege * mechanism. See kern_priv.c:priv_check_cred() for details of the * composition. */ MAC_CHECK_PROBE_DEFINE2(priv_check, "struct ucred *", "int"); /* * Restrict access to a privilege for a credential. Return failure if any * policy denies access. */ int mac_priv_check(struct ucred *cred, int priv) { int error; MAC_POLICY_CHECK_NOSLEEP(priv_check, cred, priv); MAC_CHECK_PROBE2(priv_check, error, cred, priv); return (error); } MAC_GRANT_PROBE_DEFINE2(priv_grant, "struct ucred *", "int"); /* * Grant access to a privilege for a credential. Return success if any * policy grants access. */ int mac_priv_grant(struct ucred *cred, int priv) { int error; MAC_POLICY_GRANT_NOSLEEP(priv_grant, cred, priv); MAC_GRANT_PROBE2(priv_grant, error, cred, priv); return (error); }
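
/*
 * Rough sketch of the composition described above, in the spirit of
 * kern_priv.c:priv_check_cred() but heavily simplified: a MAC veto is
 * final, while a MAC grant is consulted only once the usual sources of
 * privilege would have denied the request.  The superuser short-cut and
 * the error handling here are illustrative assumptions, not the real
 * priv_check_cred() logic.
 */
static int
priv_check_sketch(struct ucred *cred, int priv)
{
	int error;

	/* Restriction: any policy may veto the privilege outright. */
	error = mac_priv_check(cred, priv);
	if (error != 0)
		return (error);

	/* An ordinary source of privilege, e.g. the superuser policy. */
	if (cred->cr_uid == 0)
		return (0);

	/* Granting: a policy may still award the privilege explicitly. */
	if (mac_priv_grant(cred, priv) == 0)
		return (0);

	return (EPERM);
}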