Index: stable/10/share/man/man9/lock.9 =================================================================== --- stable/10/share/man/man9/lock.9 (revision 274605) +++ stable/10/share/man/man9/lock.9 (revision 274606) @@ -1,400 +1,423 @@ .\" .\" Copyright (C) 2002 Chad David . All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice(s), this list of conditions and the following disclaimer as .\" the first lines of this file unmodified other than the possible .\" addition of one or more copyright notices. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice(s), this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY .\" EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE .\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY .\" DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES .\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR .\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER .\" CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH .\" DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd October 6, 2013 +.Dd November 2, 2014 .Dt LOCK 9 .Os .Sh NAME .Nm lockinit , .Nm lockdestroy , .Nm lockmgr , .Nm lockmgr_args , .Nm lockmgr_args_rw , .Nm lockmgr_disown , .Nm lockmgr_printinfo , .Nm lockmgr_recursed , .Nm lockmgr_rw , .Nm lockmgr_waiters , .Nm lockstatus , .Nm lockmgr_assert .Nd "lockmgr family of functions" .Sh SYNOPSIS .In sys/types.h .In sys/lock.h .In sys/lockmgr.h .Ft void .Fn lockinit "struct lock *lkp" "int prio" "const char *wmesg" "int timo" "int flags" .Ft void .Fn lockdestroy "struct lock *lkp" .Ft int .Fn lockmgr "struct lock *lkp" "u_int flags" "struct mtx *ilk" .Ft int .Fn lockmgr_args "struct lock *lkp" "u_int flags" "struct mtx *ilk" "const char *wmesg" "int prio" "int timo" .Ft int .Fn lockmgr_args_rw "struct lock *lkp" "u_int flags" "struct rwlock *ilk" "const char *wmesg" "int prio" "int timo" .Ft void .Fn lockmgr_disown "struct lock *lkp" .Ft void .Fn lockmgr_printinfo "const struct lock *lkp" .Ft int .Fn lockmgr_recursed "const struct lock *lkp" .Ft int .Fn lockmgr_rw "struct lock *lkp" "u_int flags" "struct rwlock *ilk" .Ft int .Fn lockmgr_waiters "const struct lock *lkp" .Ft int .Fn lockstatus "const struct lock *lkp" .Pp .Cd "options INVARIANTS" .Cd "options INVARIANT_SUPPORT" .Ft void .Fn lockmgr_assert "const struct lock *lkp" "int what" .Sh DESCRIPTION The .Fn lockinit function is used to initialize a lock. It must be called before any operation can be performed on a lock. Its arguments are: .Bl -tag -width ".Fa wmesg" .It Fa lkp A pointer to the lock to initialize. .It Fa prio The priority passed to .Xr sleep 9 . .It Fa wmesg The lock message. This is used for both debugging output and .Xr sleep 9 . .It Fa timo The timeout value passed to .Xr sleep 9 . .It Fa flags The flags the lock is to be initialized with: .Bl -tag -width ".Dv LK_CANRECURSE" .It Dv LK_ADAPTIVE Enable adaptive spinning for this lock if the kernel is compiled with the ADAPTIVE_LOCKMGRS option. .It Dv LK_CANRECURSE Allow recursive exclusive locks. .It Dv LK_NOPROFILE Disable lock profiling for this lock. .It Dv LK_NOSHARE Allow exclusive locks only. .It Dv LK_NOWITNESS Instruct .Xr witness 4 to ignore this lock. .It Dv LK_NODUP .Xr witness 4 should log messages about duplicate locks being acquired. .It Dv LK_QUIET Disable .Xr ktr 4 logging for this lock. .It Dv LK_TIMELOCK Use .Fa timo during a sleep; otherwise, 0 is used. .El .El .Pp The .Fn lockdestroy function is used to destroy a lock, and while it is called in a number of places in the kernel, it currently does nothing. .Pp The .Fn lockmgr and .Fn lockmgr_rw functions handle general locking functionality within the kernel, including support for shared and exclusive locks, and recursion. .Fn lockmgr and .Fn lockmgr_rw are also able to upgrade and downgrade locks. .Pp Their arguments are: .Bl -tag -width ".Fa flags" .It Fa lkp A pointer to the lock to manipulate. .It Fa flags Flags indicating what action is to be taken. -.Bl -tag -width ".Dv LK_CANRECURSE" +.Bl -tag -width ".Dv LK_NODDLKTREAT" .It Dv LK_SHARED Acquire a shared lock. If an exclusive lock is currently held, .Dv EDEADLK will be returned. .It Dv LK_EXCLUSIVE Acquire an exclusive lock. If an exclusive lock is already held, and .Dv LK_CANRECURSE is not set, the system will .Xr panic 9 . .It Dv LK_DOWNGRADE Downgrade exclusive lock to a shared lock. Downgrading a shared lock is not permitted. If an exclusive lock has been recursed, the system will .Xr panic 9 . .It Dv LK_UPGRADE Upgrade a shared lock to an exclusive lock. If this call fails, the shared lock is lost, even if the .Dv LK_NOWAIT flag is specified. During the upgrade, the shared lock could be temporarily dropped. Attempts to upgrade an exclusive lock will cause a .Xr panic 9 . .It Dv LK_TRYUPGRADE Try to upgrade a shared lock to an exclusive lock. The failure to upgrade does not result in the dropping of the shared lock ownership. .It Dv LK_RELEASE Release the lock. Releasing a lock that is not held can cause a .Xr panic 9 . .It Dv LK_DRAIN Wait for all activity on the lock to end, then mark it decommissioned. This is used before freeing a lock that is part of a piece of memory that is about to be freed. (As documented in .In sys/lockmgr.h . ) .It Dv LK_SLEEPFAIL Fail if operation has slept. .It Dv LK_NOWAIT Do not allow the call to sleep. This can be used to test the lock. .It Dv LK_NOWITNESS Skip the .Xr witness 4 checks for this instance. .It Dv LK_CANRECURSE Allow recursion on an exclusive lock. For every lock there must be a release. .It Dv LK_INTERLOCK Unlock the interlock (which should be locked already). +.It Dv LK_NODDLKTREAT +Normally, +.Fn lockmgr +postpones serving further shared requests for shared-locked lock if there is +exclusive waiter, to avoid exclusive lock starvation. +But, if the thread requesting the shared lock already owns a shared lockmgr +lock, the request is granted even in presence of the parallel exclusive lock +request, which is done to avoid deadlocks with recursive shared acquisition. +.Pp +The +.Dv LK_NODDLKTREAT +flag can only be used by code which requests shared non-recursive lock. +The flag allows exclusive requests to preempt the current shared request +even if the current thread owns shared locks. +This is safe since shared lock is guaranteed to not recurse, and is used +when thread is known to held unrelated shared locks, to not cause +unneccessary starvation. An example is +.Dv vp +locking in VFS +.Xr lookup 9 , +when +.Dv dvp +is already locked. .El .It Fa ilk An interlock mutex for controlling group access to the lock. If .Dv LK_INTERLOCK is specified, .Fn lockmgr and .Fn lockmgr_rw assume .Fa ilk is currently owned and not recursed, and will return it unlocked. See .Xr mtx_assert 9 . .El .Pp The .Fn lockmgr_args and .Fn lockmgr_args_rw function work like .Fn lockmgr and .Fn lockmgr_rw but accepting a .Fa wmesg , .Fa timo and .Fa prio on a per-instance basis. The specified values will override the default ones, but this can still be used passing, respectively, .Dv LK_WMESG_DEFAULT , .Dv LK_PRIO_DEFAULT and .Dv LK_TIMO_DEFAULT . .Pp The .Fn lockmgr_disown function switches the owner from the current thread to be .Dv LK_KERNPROC , if the lock is already held. .Pp The .Fn lockmgr_printinfo function prints debugging information about the lock. It is used primarily by .Xr VOP_PRINT 9 functions. .Pp The .Fn lockmgr_recursed function returns true if the lock is recursed, 0 otherwise. .Pp The .Fn lockmgr_waiters function returns true if the lock has waiters, 0 otherwise. .Pp The .Fn lockstatus function returns the status of the lock in relation to the current thread. .Pp When compiled with .Cd "options INVARIANTS" and .Cd "options INVARIANT_SUPPORT" , the .Fn lockmgr_assert function tests .Fa lkp for the assertions specified in .Fa what , and panics if they are not met. One of the following assertions must be specified: .Bl -tag -width ".Dv KA_UNLOCKED" .It Dv KA_LOCKED Assert that the current thread has either a shared or an exclusive lock on the .Vt lkp lock pointed to by the first argument. .It Dv KA_SLOCKED Assert that the current thread has a shared lock on the .Vt lkp lock pointed to by the first argument. .It Dv KA_XLOCKED Assert that the current thread has an exclusive lock on the .Vt lkp lock pointed to by the first argument. .It Dv KA_UNLOCKED Assert that the current thread has no lock on the .Vt lkp lock pointed to by the first argument. .El .Pp In addition, one of the following optional assertions can be used with either an .Dv KA_LOCKED , .Dv KA_SLOCKED , or .Dv KA_XLOCKED assertion: .Bl -tag -width ".Dv KA_NOTRECURSED" .It Dv KA_RECURSED Assert that the current thread has a recursed lock on .Fa lkp . .It Dv KA_NOTRECURSED Assert that the current thread does not have a recursed lock on .Fa lkp . .El .Sh RETURN VALUES The .Fn lockmgr and .Fn lockmgr_rw functions return 0 on success and non-zero on failure. .Pp The .Fn lockstatus function returns: .Bl -tag -width ".Dv LK_EXCLUSIVE" .It Dv LK_EXCLUSIVE An exclusive lock is held by the current thread. .It Dv LK_EXCLOTHER An exclusive lock is held by someone other than the current thread. .It Dv LK_SHARED A shared lock is held. .It Li 0 The lock is not held by anyone. .El .Sh ERRORS .Fn lockmgr and .Fn lockmgr_rw fail if: .Bl -tag -width Er .It Bq Er EBUSY .Dv LK_FORCEUPGRADE was requested and another thread had already requested a lock upgrade. .It Bq Er EBUSY .Dv LK_NOWAIT was set, and a sleep would have been required, or .Dv LK_TRYUPGRADE operation was not able to upgrade the lock. .It Bq Er ENOLCK .Dv LK_SLEEPFAIL was set and .Fn lockmgr or .Fn lockmgr_rw did sleep. .It Bq Er EINTR .Dv PCATCH was set in the lock priority, and a signal was delivered during a sleep. Note the .Er ERESTART error below. .It Bq Er ERESTART .Dv PCATCH was set in the lock priority, a signal was delivered during a sleep, and the system call is to be restarted. .It Bq Er EWOULDBLOCK a non-zero timeout was given, and the timeout expired. .El .Sh LOCKS If .Dv LK_INTERLOCK is passed in the .Fa flags argument to .Fn lockmgr or .Fn lockmgr_rw , the .Fa ilk must be held prior to calling .Fn lockmgr or .Fn lockmgr_rw , and will be returned unlocked. .Pp Upgrade attempts that fail result in the loss of the lock that is currently held. Also, it is invalid to upgrade an exclusive lock, and a .Xr panic 9 will be the result of trying. .Sh SEE ALSO .Xr condvar 9 , .Xr locking 9 , .Xr mutex 9 , .Xr rwlock 9 , .Xr sleep 9 , .Xr sx 9 , .Xr mtx_assert 9 , .Xr panic 9 , .Xr VOP_PRINT 9 .Sh AUTHORS This manual page was written by .An Chad David Aq davidc@acns.ab.ca . Index: stable/10/sys/kern/kern_lock.c =================================================================== --- stable/10/sys/kern/kern_lock.c (revision 274605) +++ stable/10/sys/kern/kern_lock.c (revision 274606) @@ -1,1526 +1,1527 @@ /*- * Copyright (c) 2008 Attilio Rao * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include "opt_adaptive_lockmgrs.h" #include "opt_ddb.h" #include "opt_hwpmc_hooks.h" #include "opt_kdtrace.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #ifdef DEBUG_LOCKS #include #endif #include #include #include #ifdef DDB #include #endif #ifdef HWPMC_HOOKS #include PMC_SOFT_DECLARE( , , lock, failed); #endif CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) == (LK_ADAPTIVE | LK_NOSHARE)); CTASSERT(LK_UNLOCKED == (LK_UNLOCKED & ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS))); #define SQ_EXCLUSIVE_QUEUE 0 #define SQ_SHARED_QUEUE 1 #ifndef INVARIANTS #define _lockmgr_assert(lk, what, file, line) #define TD_LOCKS_INC(td) #define TD_LOCKS_DEC(td) #else #define TD_LOCKS_INC(td) ((td)->td_locks++) #define TD_LOCKS_DEC(td) ((td)->td_locks--) #endif #define TD_SLOCKS_INC(td) ((td)->td_lk_slocks++) #define TD_SLOCKS_DEC(td) ((td)->td_lk_slocks--) #ifndef DEBUG_LOCKS #define STACK_PRINT(lk) #define STACK_SAVE(lk) #define STACK_ZERO(lk) #else #define STACK_PRINT(lk) stack_print_ddb(&(lk)->lk_stack) #define STACK_SAVE(lk) stack_save(&(lk)->lk_stack) #define STACK_ZERO(lk) stack_zero(&(lk)->lk_stack) #endif #define LOCK_LOG2(lk, string, arg1, arg2) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR2(KTR_LOCK, (string), (arg1), (arg2)) #define LOCK_LOG3(lk, string, arg1, arg2, arg3) \ if (LOCK_LOG_TEST(&(lk)->lock_object, 0)) \ CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3)) #define GIANT_DECLARE \ int _i = 0; \ WITNESS_SAVE_DECL(Giant) #define GIANT_RESTORE() do { \ if (_i > 0) { \ while (_i--) \ mtx_lock(&Giant); \ WITNESS_RESTORE(&Giant.lock_object, Giant); \ } \ } while (0) #define GIANT_SAVE() do { \ if (mtx_owned(&Giant)) { \ WITNESS_SAVE(&Giant.lock_object, Giant); \ while (mtx_owned(&Giant)) { \ _i++; \ mtx_unlock(&Giant); \ } \ } \ } while (0) -#define LK_CAN_SHARE(x) \ - (((x) & LK_SHARE) && (((x) & LK_EXCLUSIVE_WAITERS) == 0 || \ - ((x) & LK_EXCLUSIVE_SPINNERS) == 0 || \ - curthread->td_lk_slocks || (curthread->td_pflags & TDP_DEADLKTREAT))) +#define LK_CAN_SHARE(x, flags) \ + (((x) & LK_SHARE) && \ + (((x) & (LK_EXCLUSIVE_WAITERS | LK_EXCLUSIVE_SPINNERS)) == 0 || \ + (curthread->td_lk_slocks != 0 && !(flags & LK_NODDLKTREAT)) || \ + (curthread->td_pflags & TDP_DEADLKTREAT))) #define LK_TRYOP(x) \ ((x) & LK_NOWAIT) #define LK_CAN_WITNESS(x) \ (((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x)) #define LK_TRYWIT(x) \ (LK_TRYOP(x) ? LOP_TRYLOCK : 0) #define LK_CAN_ADAPT(lk, f) \ (((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 && \ ((f) & LK_SLEEPFAIL) == 0) #define lockmgr_disowned(lk) \ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC) #define lockmgr_xlocked(lk) \ (((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread) static void assert_lockmgr(const struct lock_object *lock, int how); #ifdef DDB static void db_show_lockmgr(const struct lock_object *lock); #endif static void lock_lockmgr(struct lock_object *lock, uintptr_t how); #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner); #endif static uintptr_t unlock_lockmgr(struct lock_object *lock); struct lock_class lock_class_lockmgr = { .lc_name = "lockmgr", .lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE, .lc_assert = assert_lockmgr, #ifdef DDB .lc_ddb_show = db_show_lockmgr, #endif .lc_lock = lock_lockmgr, .lc_unlock = unlock_lockmgr, #ifdef KDTRACE_HOOKS .lc_owner = owner_lockmgr, #endif }; #ifdef ADAPTIVE_LOCKMGRS static u_int alk_retries = 10; static u_int alk_loops = 10000; static SYSCTL_NODE(_debug, OID_AUTO, lockmgr, CTLFLAG_RD, NULL, "lockmgr debugging"); SYSCTL_UINT(_debug_lockmgr, OID_AUTO, retries, CTLFLAG_RW, &alk_retries, 0, ""); SYSCTL_UINT(_debug_lockmgr, OID_AUTO, loops, CTLFLAG_RW, &alk_loops, 0, ""); #endif static __inline struct thread * lockmgr_xholder(const struct lock *lk) { uintptr_t x; x = lk->lk_lock; return ((x & LK_SHARE) ? NULL : (struct thread *)LK_HOLDER(x)); } /* * It assumes sleepq_lock held and returns with this one unheld. * It also assumes the generic interlock is sane and previously checked. * If LK_INTERLOCK is specified the interlock is not reacquired after the * sleep. */ static __inline int sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, int queue) { GIANT_DECLARE; struct lock_class *class; int catch, error; class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; catch = pri & PCATCH; pri &= PRIMASK; error = 0; LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk, (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared"); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0) lk->lk_exslpfail++; GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? SLEEPQ_INTERRUPTIBLE : 0), queue); if ((flags & LK_TIMELOCK) && timo) sleepq_set_timeout(&lk->lock_object, timo); /* * Decisional switch for real sleeping. */ if ((flags & LK_TIMELOCK) && timo && catch) error = sleepq_timedwait_sig(&lk->lock_object, pri); else if ((flags & LK_TIMELOCK) && timo) error = sleepq_timedwait(&lk->lock_object, pri); else if (catch) error = sleepq_wait_sig(&lk->lock_object, pri); else sleepq_wait(&lk->lock_object, pri); GIANT_RESTORE(); if ((flags & LK_SLEEPFAIL) && error == 0) error = ENOLCK; return (error); } static __inline int wakeupshlk(struct lock *lk, const char *file, int line) { uintptr_t v, x; u_int realexslp; int queue, wakeup_swapper; WITNESS_UNLOCK(&lk->lock_object, 0, file, line); LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line); wakeup_swapper = 0; for (;;) { x = lk->lk_lock; /* * If there is more than one shared lock held, just drop one * and return. */ if (LK_SHARERS(x) > 1) { if (atomic_cmpset_rel_ptr(&lk->lk_lock, x, x - LK_ONE_SHARER)) break; continue; } /* * If there are not waiters on the exclusive queue, drop the * lock quickly. */ if ((x & LK_ALL_WAITERS) == 0) { MPASS((x & ~LK_EXCLUSIVE_SPINNERS) == LK_SHARERS_LOCK(1)); if (atomic_cmpset_rel_ptr(&lk->lk_lock, x, LK_UNLOCKED)) break; continue; } /* * We should have a sharer with waiters, so enter the hard * path in order to handle wakeups correctly. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them preference in * order to avoid deadlock with shared runners up. * If interruptible sleeps left the exclusive queue empty * avoid a starvation for the threads sleeping on the shared * queue by giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying about * the real number of waiters with the LK_SLEEPFAIL flag on * because they may be used in conjuction with interruptible * sleeps so lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { if (lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL on * and using interruptible sleeps/timeout may have * left spourious lk_exslpfail counts on, so clean * it up anyway. */ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } if (!atomic_cmpset_rel_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x, v)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; } lock_profile_release_lock(&lk->lock_object); TD_LOCKS_DEC(curthread); TD_SLOCKS_DEC(curthread); return (wakeup_swapper); } static void assert_lockmgr(const struct lock_object *lock, int what) { panic("lockmgr locks do not support assertions"); } static void lock_lockmgr(struct lock_object *lock, uintptr_t how) { panic("lockmgr locks do not support sleep interlocking"); } static uintptr_t unlock_lockmgr(struct lock_object *lock) { panic("lockmgr locks do not support sleep interlocking"); } #ifdef KDTRACE_HOOKS static int owner_lockmgr(const struct lock_object *lock, struct thread **owner) { panic("lockmgr locks do not support owner inquiring"); } #endif void lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags) { int iflags; MPASS((flags & ~LK_INIT_MASK) == 0); ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock, ("%s: lockmgr not aligned for %s: %p", __func__, wmesg, &lk->lk_lock)); iflags = LO_SLEEPABLE | LO_UPGRADABLE; if (flags & LK_CANRECURSE) iflags |= LO_RECURSABLE; if ((flags & LK_NODUP) == 0) iflags |= LO_DUPOK; if (flags & LK_NOPROFILE) iflags |= LO_NOPROFILE; if ((flags & LK_NOWITNESS) == 0) iflags |= LO_WITNESS; if (flags & LK_QUIET) iflags |= LO_QUIET; if (flags & LK_IS_VNODE) iflags |= LO_IS_VNODE; iflags |= flags & (LK_ADAPTIVE | LK_NOSHARE); lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL, iflags); lk->lk_lock = LK_UNLOCKED; lk->lk_recurse = 0; lk->lk_exslpfail = 0; lk->lk_timo = timo; lk->lk_pri = pri; STACK_ZERO(lk); } /* * XXX: Gross hacks to manipulate external lock flags after * initialization. Used for certain vnode and buf locks. */ void lockallowshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LK_NOSHARE; } void lockdisableshare(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LK_NOSHARE; } void lockallowrecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags |= LO_RECURSABLE; } void lockdisablerecurse(struct lock *lk) { lockmgr_assert(lk, KA_XLOCKED); lk->lock_object.lo_flags &= ~LO_RECURSABLE; } void lockdestroy(struct lock *lk) { KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held")); KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed")); KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters")); lock_destroy(&lk->lock_object); } int __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int pri, int timo, const char *file, int line) { GIANT_DECLARE; struct lock_class *class; const char *iwmesg; uintptr_t tid, v, x; u_int op, realexslp; int error, ipri, itimo, queue, wakeup_swapper; #ifdef LOCK_PROFILING uint64_t waittime = 0; int contested = 0; #endif #ifdef ADAPTIVE_LOCKMGRS volatile struct thread *owner; u_int i, spintries = 0; #endif error = 0; tid = (uintptr_t)curthread; op = (flags & LK_TYPE_MASK); iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg; ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri; itimo = (timo == LK_TIMO_DEFAULT) ? lk->lk_timo : timo; MPASS((flags & ~LK_TOTAL_MASK) == 0); KASSERT((op & (op - 1)) == 0, ("%s: Invalid requested operation @ %s:%d", __func__, file, line)); KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 || (op != LK_DOWNGRADE && op != LK_RELEASE), ("%s: Invalid flags in regard of the operation desired @ %s:%d", __func__, file, line)); KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL, ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d", __func__, file, line)); KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread), ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread, lk->lock_object.lo_name, file, line)); class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL; if (panicstr != NULL) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); return (0); } if (lk->lock_object.lo_flags & LK_NOSHARE) { switch (op) { case LK_SHARED: op = LK_EXCLUSIVE; break; case LK_UPGRADE: case LK_TRYUPGRADE: case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED, file, line); if (flags & LK_INTERLOCK) class->lc_unlock(ilk); return (0); } } wakeup_swapper = 0; switch (op) { case LK_SHARED: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? ilk : NULL); for (;;) { x = lk->lk_lock; /* * If no other thread has an exclusive lock, or * no exclusive waiter is present, bump the count of * sharers. Since we have to preserve the state of * waiters, if we fail to acquire the shared lock * loop back and retry. */ - if (LK_CAN_SHARE(x)) { + if (LK_CAN_SHARE(x, flags)) { if (atomic_cmpset_acq_ptr(&lk->lk_lock, x, x + LK_ONE_SHARER)) break; continue; } #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is already held by curthread in * exclusive way avoid a deadlock. */ if (LK_HOLDER(x) == tid) { LOCK_LOG2(lk, "%s: %p already held in exclusive mode", __func__, lk); error = EDEADLK; break; } /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } #ifdef ADAPTIVE_LOCKMGRS /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. We need a double-state handle here * because for a failed acquisition the lock can be * either held in exclusive mode or shared mode * (for the writer starvation avoidance technique). */ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 && LK_HOLDER(x) != LK_KERNPROC) { owner = (struct thread *)LK_HOLDER(x); if (LOCK_LOG_TEST(&lk->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, lk, owner); /* * If we are holding also an interlock drop it * in order to avoid a deadlock if the lockmgr * owner is adaptively spinning on the * interlock itself. */ if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); while (LK_HOLDER(lk->lk_lock) == (uintptr_t)owner && TD_IS_RUNNING(owner)) cpu_spinwait(); GIANT_RESTORE(); continue; } else if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) != 0 && LK_SHARERS(x) && spintries < alk_retries) { if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); spintries++; for (i = 0; i < alk_loops; i++) { if (LOCK_LOG_TEST(&lk->lock_object, 0)) CTR4(KTR_LOCK, "%s: shared spinning on %p with %u and %u", __func__, lk, spintries, i); x = lk->lk_lock; if ((x & LK_SHARE) == 0 || - LK_CAN_SHARE(x) != 0) + LK_CAN_SHARE(x, flags) != 0) break; cpu_spinwait(); } GIANT_RESTORE(); if (i != alk_loops) continue; } #endif /* * Acquire the sleepqueue chain lock because we * probabilly will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * if the lock can be acquired in shared mode, try * again. */ - if (LK_CAN_SHARE(x)) { + if (LK_CAN_SHARE(x, flags)) { sleepq_release(&lk->lock_object); continue; } #ifdef ADAPTIVE_LOCKMGRS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owner) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 && LK_HOLDER(x) != LK_KERNPROC) { owner = (struct thread *)LK_HOLDER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&lk->lock_object); continue; } } #endif /* * Try to set the LK_SHARED_WAITERS flag. If we fail, * loop back and retry. */ if ((x & LK_SHARED_WAITERS) == 0) { if (!atomic_cmpset_acq_ptr(&lk->lk_lock, x, x | LK_SHARED_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set shared waiters flag", __func__, lk); } /* * As far as we have been unable to acquire the * shared lock and the shared waiters flag is set, * we will sleep. */ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_SHARED_QUEUE); flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file, line); WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); TD_SLOCKS_INC(curthread); STACK_SAVE(lk); } break; case LK_UPGRADE: case LK_TRYUPGRADE: _lockmgr_assert(lk, KA_SLOCKED, file, line); v = lk->lk_lock; x = v & LK_ALL_WAITERS; v &= LK_EXCLUSIVE_SPINNERS; /* * Try to switch from one shared lock to an exclusive one. * We need to preserve waiters flags during the operation. */ if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v, tid | x)) { LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_SLOCKS_DEC(curthread); break; } /* * In LK_TRYUPGRADE mode, do not drop the lock, * returning EBUSY instead. */ if (op == LK_TRYUPGRADE) { LOCK_LOG2(lk, "%s: %p failed the nowait upgrade", __func__, lk); error = EBUSY; break; } /* * We have been unable to succeed in upgrading, so just * give up the shared lock. */ wakeup_swapper |= wakeupshlk(lk, file, line); /* FALLTHROUGH */ case LK_EXCLUSIVE: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * If curthread already holds the lock and this one is * allowed to recurse, simply recurse on it. */ if (lockmgr_xlocked(lk)) { if ((flags & LK_CANRECURSE) == 0 && (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) { /* * If the lock is expected to not panic just * give up and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: recursing on non recursive lockmgr %s @ %s:%d\n", __func__, iwmesg, file, line); } lk->lk_recurse++; LOCK_LOG2(lk, "%s: %p recursing", __func__, lk); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); break; } while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) { #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } #ifdef ADAPTIVE_LOCKMGRS /* * If the owner is running on another CPU, spin until * the owner stops running or the state of the lock * changes. */ x = lk->lk_lock; if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 && LK_HOLDER(x) != LK_KERNPROC) { owner = (struct thread *)LK_HOLDER(x); if (LOCK_LOG_TEST(&lk->lock_object, 0)) CTR3(KTR_LOCK, "%s: spinning on %p held by %p", __func__, lk, owner); /* * If we are holding also an interlock drop it * in order to avoid a deadlock if the lockmgr * owner is adaptively spinning on the * interlock itself. */ if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); while (LK_HOLDER(lk->lk_lock) == (uintptr_t)owner && TD_IS_RUNNING(owner)) cpu_spinwait(); GIANT_RESTORE(); continue; } else if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) != 0 && LK_SHARERS(x) && spintries < alk_retries) { if ((x & LK_EXCLUSIVE_SPINNERS) == 0 && !atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_SPINNERS)) continue; if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); spintries++; for (i = 0; i < alk_loops; i++) { if (LOCK_LOG_TEST(&lk->lock_object, 0)) CTR4(KTR_LOCK, "%s: shared spinning on %p with %u and %u", __func__, lk, spintries, i); if ((lk->lk_lock & LK_EXCLUSIVE_SPINNERS) == 0) break; cpu_spinwait(); } GIANT_RESTORE(); if (i != alk_loops) continue; } #endif /* * Acquire the sleepqueue chain lock because we * probabilly will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * if the lock has been released while we spun on * the sleepqueue chain lock just try again. */ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } #ifdef ADAPTIVE_LOCKMGRS /* * The current lock owner might have started executing * on another CPU (or the lock could have changed * owner) while we were waiting on the turnstile * chain lock. If so, drop the turnstile lock and try * again. */ if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 && LK_HOLDER(x) != LK_KERNPROC) { owner = (struct thread *)LK_HOLDER(x); if (TD_IS_RUNNING(owner)) { sleepq_release(&lk->lock_object); continue; } } #endif /* * The lock can be in the state where there is a * pending queue of waiters, but still no owner. * This happens when the lock is contested and an * owner is going to claim the lock. * If curthread is the one successfully acquiring it * claim lock ownership and return, preserving waiters * flags. */ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v &= ~LK_EXCLUSIVE_SPINNERS; if (atomic_cmpset_acq_ptr(&lk->lk_lock, x, tid | v)) { sleepq_release(&lk->lock_object); LOCK_LOG2(lk, "%s: %p claimed by a new writer", __func__, lk); break; } sleepq_release(&lk->lock_object); continue; } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. */ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set excl waiters flag", __func__, lk); } /* * As far as we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep. */ error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo, SQ_EXCLUSIVE_QUEUE); flags &= ~LK_INTERLOCK; if (error) { LOCK_LOG3(lk, "%s: interrupted sleep for %p with %d", __func__, lk, error); break; } LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } break; case LK_DOWNGRADE: _lockmgr_assert(lk, KA_XLOCKED, file, line); LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line); WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n", __func__, iwmesg, file, line); } TD_SLOCKS_INC(curthread); /* * In order to preserve waiters flags, just spin. */ for (;;) { x = lk->lk_lock; MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_SHARERS_LOCK(1) | x)) break; cpu_spinwait(); } break; case LK_RELEASE: _lockmgr_assert(lk, KA_LOCKED, file, line); x = lk->lk_lock; if ((x & LK_SHARE) == 0) { /* * As first option, treact the lock as if it has not * any waiter. * Fix-up the tid var if the lock has been disowned. */ if (LK_HOLDER(x) == LK_KERNPROC) tid = LK_KERNPROC; else { WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); } LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0, lk->lk_recurse, file, line); /* * The lock is held in exclusive mode. * If the lock is recursed also, then unrecurse it. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) { LOCK_LOG2(lk, "%s: %p unrecursing", __func__, lk); lk->lk_recurse--; break; } if (tid != LK_KERNPROC) lock_profile_release_lock(&lk->lock_object); if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid, LK_UNLOCKED)) break; sleepq_lock(&lk->lock_object); x = lk->lk_lock; v = LK_UNLOCKED; /* * If the lock has exclusive waiters, give them * preference in order to avoid deadlock with * shared runners up. * If interruptible sleeps left the exclusive queue * empty avoid a starvation for the threads sleeping * on the shared queue by giving them precedence * and cleaning up the exclusive waiters bit anyway. * Please note that lk_exslpfail count may be lying * about the real number of waiters with the * LK_SLEEPFAIL flag on because they may be used in * conjuction with interruptible sleeps so * lk_exslpfail might be considered an 'upper limit' * bound, including the edge cases. */ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { if (lk->lk_exslpfail < realexslp) { lk->lk_exslpfail = 0; queue = SQ_EXCLUSIVE_QUEUE; v |= (x & LK_SHARED_WAITERS); } else { lk->lk_exslpfail = 0; LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); queue = SQ_SHARED_QUEUE; } } else { /* * Exclusive waiters sleeping with LK_SLEEPFAIL * on and using interruptible sleeps/timeout * may have left spourious lk_exslpfail counts * on, so clean it up anyway. */ lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; } LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); atomic_store_rel_ptr(&lk->lk_lock, v); wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; } else wakeup_swapper = wakeupshlk(lk, file, line); break; case LK_DRAIN: if (LK_CAN_WITNESS(flags)) WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ? ilk : NULL); /* * Trying to drain a lock we already own will result in a * deadlock. */ if (lockmgr_xlocked(lk)) { if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: draining %s with the lock held @ %s:%d\n", __func__, iwmesg, file, line); } while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) { #ifdef HWPMC_HOOKS PMC_SOFT_CALL( , , lock, failed); #endif lock_profile_obtain_lock_failed(&lk->lock_object, &contested, &waittime); /* * If the lock is expected to not sleep just give up * and return. */ if (LK_TRYOP(flags)) { LOCK_LOG2(lk, "%s: %p fails the try operation", __func__, lk); error = EBUSY; break; } /* * Acquire the sleepqueue chain lock because we * probabilly will need to manipulate waiters flags. */ sleepq_lock(&lk->lock_object); x = lk->lk_lock; /* * if the lock has been released while we spun on * the sleepqueue chain lock just try again. */ if (x == LK_UNLOCKED) { sleepq_release(&lk->lock_object); continue; } v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v = (x & ~LK_EXCLUSIVE_SPINNERS); /* * If interruptible sleeps left the exclusive * queue empty avoid a starvation for the * threads sleeping on the shared queue by * giving them precedence and cleaning up the * exclusive waiters bit anyway. * Please note that lk_exslpfail count may be * lying about the real number of waiters with * the LK_SLEEPFAIL flag on because they may * be used in conjuction with interruptible * sleeps so lk_exslpfail might be considered * an 'upper limit' bound, including the edge * cases. */ if (v & LK_EXCLUSIVE_WAITERS) { queue = SQ_EXCLUSIVE_QUEUE; v &= ~LK_EXCLUSIVE_WAITERS; } else { /* * Exclusive waiters sleeping with * LK_SLEEPFAIL on and using * interruptible sleeps/timeout may * have left spourious lk_exslpfail * counts on, so clean it up anyway. */ MPASS(v & LK_SHARED_WAITERS); lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; } if (queue == SQ_EXCLUSIVE_QUEUE) { realexslp = sleepq_sleepcnt(&lk->lock_object, SQ_EXCLUSIVE_QUEUE); if (lk->lk_exslpfail >= realexslp) { lk->lk_exslpfail = 0; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; if (realexslp != 0) { LOCK_LOG2(lk, "%s: %p has only LK_SLEEPFAIL sleepers", __func__, lk); LOCK_LOG2(lk, "%s: %p waking up threads on the exclusive queue", __func__, lk); wakeup_swapper = sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); } } else lk->lk_exslpfail = 0; } if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG3(lk, "%s: %p waking up all threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); wakeup_swapper |= sleepq_broadcast( &lk->lock_object, SLEEPQ_LK, 0, queue); /* * If shared waiters have been woken up we need * to wait for one of them to acquire the lock * before to set the exclusive waiters in * order to avoid a deadlock. */ if (queue == SQ_SHARED_QUEUE) { for (v = lk->lk_lock; (v & LK_SHARE) && !LK_SHARERS(v); v = lk->lk_lock) cpu_spinwait(); } } /* * Try to set the LK_EXCLUSIVE_WAITERS flag. If we * fail, loop back and retry. */ if ((x & LK_EXCLUSIVE_WAITERS) == 0) { if (!atomic_cmpset_ptr(&lk->lk_lock, x, x | LK_EXCLUSIVE_WAITERS)) { sleepq_release(&lk->lock_object); continue; } LOCK_LOG2(lk, "%s: %p set drain waiters flag", __func__, lk); } /* * As far as we have been unable to acquire the * exclusive lock and the exclusive waiters flag * is set, we will sleep. */ if (flags & LK_INTERLOCK) { class->lc_unlock(ilk); flags &= ~LK_INTERLOCK; } GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK, SQ_EXCLUSIVE_QUEUE); sleepq_wait(&lk->lock_object, ipri & PRIMASK); GIANT_RESTORE(); LOCK_LOG2(lk, "%s: %p resuming from the sleep queue", __func__, lk); } if (error == 0) { lock_profile_obtain_lock_success(&lk->lock_object, contested, waittime, file, line); LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0, lk->lk_recurse, file, line); WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE | LK_TRYWIT(flags), file, line); TD_LOCKS_INC(curthread); STACK_SAVE(lk); } break; default: if (flags & LK_INTERLOCK) class->lc_unlock(ilk); panic("%s: unknown lockmgr request 0x%x\n", __func__, op); } if (flags & LK_INTERLOCK) class->lc_unlock(ilk); if (wakeup_swapper) kick_proc0(); return (error); } void _lockmgr_disown(struct lock *lk, const char *file, int line) { uintptr_t tid, x; if (SCHEDULER_STOPPED()) return; tid = (uintptr_t)curthread; _lockmgr_assert(lk, KA_XLOCKED, file, line); /* * Panic if the lock is recursed. */ if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) panic("%s: disown a recursed lockmgr @ %s:%d\n", __func__, file, line); /* * If the owner is already LK_KERNPROC just skip the whole operation. */ if (LK_HOLDER(lk->lk_lock) != tid) return; lock_profile_release_lock(&lk->lock_object); LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line); WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line); TD_LOCKS_DEC(curthread); STACK_SAVE(lk); /* * In order to preserve waiters flags, just spin. */ for (;;) { x = lk->lk_lock; MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); x &= LK_ALL_WAITERS; if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x, LK_KERNPROC | x)) return; cpu_spinwait(); } } void lockmgr_printinfo(const struct lock *lk) { struct thread *td; uintptr_t x; if (lk->lk_lock == LK_UNLOCKED) printf("lock type %s: UNLOCKED\n", lk->lock_object.lo_name); else if (lk->lk_lock & LK_SHARE) printf("lock type %s: SHARED (count %ju)\n", lk->lock_object.lo_name, (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); printf("lock type %s: EXCL by thread %p " "(pid %d, %s, tid %d)\n", lk->lock_object.lo_name, td, td->td_proc->p_pid, td->td_proc->p_comm, td->td_tid); } x = lk->lk_lock; if (x & LK_EXCLUSIVE_WAITERS) printf(" with exclusive waiters pending\n"); if (x & LK_SHARED_WAITERS) printf(" with shared waiters pending\n"); if (x & LK_EXCLUSIVE_SPINNERS) printf(" with exclusive spinners pending\n"); STACK_PRINT(lk); } int lockstatus(const struct lock *lk) { uintptr_t v, x; int ret; ret = LK_SHARED; x = lk->lk_lock; v = LK_HOLDER(x); if ((x & LK_SHARE) == 0) { if (v == (uintptr_t)curthread || v == LK_KERNPROC) ret = LK_EXCLUSIVE; else ret = LK_EXCLOTHER; } else if (x == LK_UNLOCKED) ret = 0; return (ret); } #ifdef INVARIANT_SUPPORT FEATURE(invariant_support, "Support for modules compiled with INVARIANTS option"); #ifndef INVARIANTS #undef _lockmgr_assert #endif void _lockmgr_assert(const struct lock *lk, int what, const char *file, int line) { int slocked = 0; if (panicstr != NULL) return; switch (what) { case KA_SLOCKED: case KA_SLOCKED | KA_NOTRECURSED: case KA_SLOCKED | KA_RECURSED: slocked = 1; case KA_LOCKED: case KA_LOCKED | KA_NOTRECURSED: case KA_LOCKED | KA_RECURSED: #ifdef WITNESS /* * We cannot trust WITNESS if the lock is held in exclusive * mode and a call to lockmgr_disown() happened. * Workaround this skipping the check if the lock is held in * exclusive mode even for the KA_LOCKED case. */ if (slocked || (lk->lk_lock & LK_SHARE)) { witness_assert(&lk->lock_object, what, file, line); break; } #endif if (lk->lk_lock == LK_UNLOCKED || ((lk->lk_lock & LK_SHARE) == 0 && (slocked || (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))))) panic("Lock %s not %slocked @ %s:%d\n", lk->lock_object.lo_name, slocked ? "share" : "", file, line); if ((lk->lk_lock & LK_SHARE) == 0) { if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } break; case KA_XLOCKED: case KA_XLOCKED | KA_NOTRECURSED: case KA_XLOCKED | KA_RECURSED: if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)) panic("Lock %s not exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); if (lockmgr_recursed(lk)) { if (what & KA_NOTRECURSED) panic("Lock %s recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); } else if (what & KA_RECURSED) panic("Lock %s not recursed @ %s:%d\n", lk->lock_object.lo_name, file, line); break; case KA_UNLOCKED: if (lockmgr_xlocked(lk) || lockmgr_disowned(lk)) panic("Lock %s exclusively locked @ %s:%d\n", lk->lock_object.lo_name, file, line); break; default: panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file, line); } } #endif #ifdef DDB int lockmgr_chain(struct thread *td, struct thread **ownerp) { struct lock *lk; lk = td->td_wchan; if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr) return (0); db_printf("blocked on lockmgr %s", lk->lock_object.lo_name); if (lk->lk_lock & LK_SHARE) db_printf("SHARED (count %ju)\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else db_printf("EXCL\n"); *ownerp = lockmgr_xholder(lk); return (1); } static void db_show_lockmgr(const struct lock_object *lock) { struct thread *td; const struct lock *lk; lk = (const struct lock *)lock; db_printf(" state: "); if (lk->lk_lock == LK_UNLOCKED) db_printf("UNLOCKED\n"); else if (lk->lk_lock & LK_SHARE) db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock)); else { td = lockmgr_xholder(lk); if (td == (struct thread *)LK_KERNPROC) db_printf("XLOCK: LK_KERNPROC\n"); else db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td, td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm); if (lockmgr_recursed(lk)) db_printf(" recursed: %d\n", lk->lk_recurse); } db_printf(" waiters: "); switch (lk->lk_lock & LK_ALL_WAITERS) { case LK_SHARED_WAITERS: db_printf("shared\n"); break; case LK_EXCLUSIVE_WAITERS: db_printf("exclusive\n"); break; case LK_ALL_WAITERS: db_printf("shared and exclusive\n"); break; default: db_printf("none\n"); } db_printf(" spinners: "); if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS) db_printf("exclusive\n"); else db_printf("none\n"); } #endif Index: stable/10/sys/kern/vfs_lookup.c =================================================================== --- stable/10/sys/kern/vfs_lookup.c (revision 274605) +++ stable/10/sys/kern/vfs_lookup.c (revision 274606) @@ -1,1240 +1,1241 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_kdtrace.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #define NAMEI_DIAGNOSTIC 1 #undef NAMEI_DIAGNOSTIC SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *", "unsigned long"); SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *"); /* * Allocation zone for namei */ uma_zone_t namei_zone; /* * Placeholder vnode for mp traversal */ static struct vnode *vp_crossmp; static void nameiinit(void *dummy __unused) { namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp); vn_lock(vp_crossmp, LK_EXCLUSIVE); VN_LOCK_ASHARE(vp_crossmp); VOP_UNLOCK(vp_crossmp, 0); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL); static int lookup_shared = 1; SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RW, &lookup_shared, 0, "Enables/Disables shared locks for path name translation"); TUNABLE_INT("vfs.lookup_shared", &lookup_shared); /* * Convert a pathname into a pointer to a locked vnode. * * The FOLLOW flag is set when symbolic links are to be followed * when they occur at the end of the name translation process. * Symbolic links are always followed for all other pathname * components other than the last. * * The segflg defines whether the name is to be copied from user * space or kernel space. * * Overall outline of namei: * * copy in name * get starting directory * while (!done && !error) { * call lookup to search path. * if symbolic link, massage name in buffer and continue * } */ static void namei_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; #endif } int namei(struct nameidata *ndp) { struct filedesc *fdp; /* pointer to file descriptor state */ char *cp; /* pointer into pathname argument */ struct vnode *dp; /* the directory we are searching */ struct iovec aiov; /* uio for reading symbolic links */ struct uio auio; int error, linklen; struct componentname *cnp = &ndp->ni_cnd; struct thread *td = cnp->cn_thread; struct proc *p = td->td_proc; ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred; KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, ("namei: nameiop contaminated with flags")); KASSERT((cnp->cn_flags & OPMASK) == 0, ("namei: flags contaminated with nameiops")); if (!lookup_shared) cnp->cn_flags &= ~LOCKSHARED; fdp = p->p_fd; /* We will set this ourselves if we need it. */ cnp->cn_flags &= ~TRAILINGSLASH; /* * Get a buffer for the name to be translated, and copy the * name into the buffer. */ if ((cnp->cn_flags & HASBUF) == 0) cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); if (ndp->ni_segflg == UIO_SYSSPACE) error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); else error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); /* * Don't allow empty pathnames. */ if (!error && *cnp->cn_pnbuf == '\0') error = ENOENT; #ifdef CAPABILITY_MODE /* * In capability mode, lookups must be "strictly relative" (i.e. * not an absolute path, and not containing '..' components) to * a real file descriptor, not the pseudo-descriptor AT_FDCWD. */ if (error == 0 && IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) { ndp->ni_strictrelative = 1; if (ndp->ni_dirfd == AT_FDCWD) { #ifdef KTRACE if (KTRPOINT(td, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif error = ECAPMODE; } } #endif if (error) { namei_cleanup_cnp(cnp); ndp->ni_vp = NULL; return (error); } ndp->ni_loopcnt = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_NAMEI)) { KASSERT(cnp->cn_thread == curthread, ("namei not using curthread")); ktrnamei(cnp->cn_pnbuf); } #endif /* * Get starting point for the translation. */ FILEDESC_SLOCK(fdp); ndp->ni_rootdir = fdp->fd_rdir; ndp->ni_topdir = fdp->fd_jdir; /* * If we are auditing the kernel pathname, save the user pathname. */ if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_UPATH1(td, ndp->ni_dirfd, cnp->cn_pnbuf); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_UPATH2(td, ndp->ni_dirfd, cnp->cn_pnbuf); dp = NULL; if (cnp->cn_pnbuf[0] != '/') { if (ndp->ni_startdir != NULL) { dp = ndp->ni_startdir; error = 0; } else if (ndp->ni_dirfd != AT_FDCWD) { cap_rights_t rights; rights = ndp->ni_rightsneeded; cap_rights_set(&rights, CAP_LOOKUP); if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_ATFD1(ndp->ni_dirfd); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_ATFD2(ndp->ni_dirfd); error = fgetvp_rights(td, ndp->ni_dirfd, &rights, &ndp->ni_filecaps, &dp); #ifdef CAPABILITIES /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { ndp->ni_strictrelative = 1; } #endif } if (error != 0 || dp != NULL) { FILEDESC_SUNLOCK(fdp); if (error == 0 && dp->v_type != VDIR) { vrele(dp); error = ENOTDIR; } } if (error) { namei_cleanup_cnp(cnp); return (error); } } if (dp == NULL) { dp = fdp->fd_cdir; VREF(dp); FILEDESC_SUNLOCK(fdp); if (ndp->ni_startdir != NULL) vrele(ndp->ni_startdir); } SDT_PROBE(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf, cnp->cn_flags, 0, 0); for (;;) { /* * Check if root directory should replace current directory. * Done at start of translation and after symbolic link. */ cnp->cn_nameptr = cnp->cn_pnbuf; if (*(cnp->cn_nameptr) == '/') { vrele(dp); if (ndp->ni_strictrelative != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif namei_cleanup_cnp(cnp); return (ENOTCAPABLE); } while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } dp = ndp->ni_rootdir; VREF(dp); } ndp->ni_startdir = dp; error = lookup(ndp); if (error) { namei_cleanup_cnp(cnp); SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0, 0, 0); return (error); } /* * If not a symbolic link, we're done. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) { namei_cleanup_cnp(cnp); } else cnp->cn_flags |= HASBUF; SDT_PROBE(vfs, namei, lookup, return, 0, ndp->ni_vp, 0, 0, 0); return (0); } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; break; } #ifdef MAC if ((cnp->cn_flags & NOMACCHECK) == 0) { error = mac_vnode_check_readlink(td->td_ucred, ndp->ni_vp); if (error) break; } #endif if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENOENT; break; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENAMETOOLONG; break; } if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; vput(ndp->ni_vp); dp = ndp->ni_dvp; } namei_cleanup_cnp(cnp); vput(ndp->ni_vp); ndp->ni_vp = NULL; vrele(ndp->ni_dvp); SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0, 0, 0); return (error); } static int compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags) { if (mp == NULL || ((lkflags & LK_SHARED) && (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) || ((cnflags & ISDOTDOT) && (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) { lkflags &= ~LK_SHARED; lkflags |= LK_EXCLUSIVE; } + lkflags |= LK_NODDLKTREAT; return (lkflags); } static __inline int needs_exclusive_leaf(struct mount *mp, int flags) { /* * Intermediate nodes can use shared locks, we only need to * force an exclusive lock for leaf nodes. */ if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF)) return (0); /* Always use exclusive locks if LOCKSHARED isn't set. */ if (!(flags & LOCKSHARED)) return (1); /* * For lookups during open(), if the mount point supports * extended shared operations, then use a shared lock for the * leaf node, otherwise use an exclusive lock. */ if ((flags & ISOPEN) != 0) return (!MNT_EXTENDED_SHARED(mp)); /* * Lookup requests outside of open() that specify LOCKSHARED * only need a shared lock on the leaf vnode. */ return (0); } /* * Search a pathname. * This is a very central and rather complicated routine. * * The pathname is pointed to by ni_ptr and is of length ni_pathlen. * The starting directory is taken from ni_startdir. The pathname is * descended until done, or a symbolic link is encountered. The variable * ni_more is clear if the path is completed; it is set to one if a * symbolic link needing interpretation is encountered. * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it, the parent directory is returned * locked. If flag has WANTPARENT or'ed into it, the parent directory is * returned unlocked. Otherwise the parent directory is not returned. If * the target of the pathname exists and LOCKLEAF is or'ed into the flag * the target is returned locked, otherwise it is returned unlocked. * When creating or renaming and LOCKPARENT is specified, the target may not * be ".". When deleting and LOCKPARENT is specified, the target may be ".". * * Overall outline of lookup: * * dirloop: * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent * call VOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, * find mounted on vnode * if more components of name, do next level at dirloop * return the answer in ni_vp, locked if LOCKLEAF set * if LOCKPARENT set, return locked parent in ni_dvp * if WANTPARENT set, return unlocked parent in ni_dvp */ int lookup(struct nameidata *ndp) { char *cp; /* pointer into pathname argument */ struct vnode *dp = 0; /* the directory we are searching */ struct vnode *tdp; /* saved dp */ struct mount *mp; /* mount table entry */ struct prison *pr; int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; int dpunlocked = 0; /* dp has already been unlocked */ struct componentname *cnp = &ndp->ni_cnd; int lkflags_save; int ni_dvp_unlocked; /* * Setup: break out flag bits into variables. */ ni_dvp_unlocked = 0; wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); KASSERT(cnp->cn_nameiop == LOOKUP || wantparent, ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT.")); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || (wantparent && cnp->cn_nameiop != CREATE && cnp->cn_nameiop != LOOKUP)) docache = 0; rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; ndp->ni_dvp = NULL; /* * We use shared locks until we hit the parent of the last cn then * we adjust based on the requesting flags. */ if (lookup_shared) cnp->cn_lkflags = LK_SHARED; else cnp->cn_lkflags = LK_EXCLUSIVE; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); dirloop: /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ cnp->cn_consume = 0; for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) continue; cnp->cn_namelen = cp - cnp->cn_nameptr; if (cnp->cn_namelen > NAME_MAX) { error = ENAMETOOLONG; goto bad; } #ifdef NAMEI_DIAGNOSTIC { char c = *cp; *cp = '\0'; printf("{%s}: ", cnp->cn_nameptr); *cp = c; } #endif ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; /* * Replace multiple slashes by a single slash and trailing slashes * by a null. This must be done before VOP_LOOKUP() because some * fs's don't know about trailing slashes. Remember if there were * trailing slashes to handle symlinks, existing non-directories * and non-existing files that won't be directories specially later. */ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { cp++; ndp->ni_pathlen--; if (*cp == '\0') { *ndp->ni_next = '\0'; cnp->cn_flags |= TRAILINGSLASH; } } ndp->ni_next = cp; cnp->cn_flags |= MAKEENTRY; if (*cp == '\0' && docache == 0) cnp->cn_flags &= ~MAKEENTRY; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; if (*ndp->ni_next == 0) cnp->cn_flags |= ISLASTCN; else cnp->cn_flags &= ~ISLASTCN; if ((cnp->cn_flags & ISLASTCN) != 0 && cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EINVAL; goto bad; } /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') { if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (cnp->cn_nameiop != LOOKUP) { error = EISDIR; goto bad; } if (wantparent) { ndp->ni_dvp = dp; VREF(dp); } ndp->ni_vp = dp; if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_VNODE1(dp); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_VNODE2(dp); if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) VOP_UNLOCK(dp, 0); /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); goto success; } /* * Handle "..": five special cases. * 0. If doing a capability lookup, return ENOTCAPABLE (this is a * fairly conservative design choice, but it's the only one that we * are satisfied guarantees the property we're looking for). * 1. Return an error if this is the last component of * the name and the operation is DELETE or RENAME. * 2. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. * 3. If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other filesystem. * 4. If the vnode is the top directory of * the jail or chroot, don't let them out. */ if (cnp->cn_flags & ISDOTDOT) { if (ndp->ni_strictrelative != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif error = ENOTCAPABLE; goto bad; } if ((cnp->cn_flags & ISLASTCN) != 0 && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EINVAL; goto bad; } for (;;) { for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dp == pr->pr_root) break; if (dp == ndp->ni_rootdir || dp == ndp->ni_topdir || dp == rootvnode || pr != NULL || ((dp->v_vflag & VV_ROOT) != 0 && (cnp->cn_flags & NOCROSSMOUNT) != 0)) { ndp->ni_dvp = dp; ndp->ni_vp = dp; VREF(dp); goto nextname; } if ((dp->v_vflag & VV_ROOT) == 0) break; if (dp->v_iflag & VI_DOOMED) { /* forced unmount */ error = ENOENT; goto bad; } tdp = dp; dp = dp->v_mount->mnt_vnodecovered; VREF(dp); vput(tdp); vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, ISDOTDOT)); } } /* * We now have a segment name to search for, and a directory to search. */ unionlookup: #ifdef MAC if ((cnp->cn_flags & NOMACCHECK) == 0) { error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp, cnp); if (error) goto bad; } #endif ndp->ni_dvp = dp; ndp->ni_vp = NULL; ASSERT_VOP_LOCKED(dp, "lookup"); /* * If we have a shared lock we may need to upgrade the lock for the * last operation. */ if (dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED && (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT)) vn_lock(dp, LK_UPGRADE|LK_RETRY); if ((dp->v_iflag & VI_DOOMED) != 0) { error = ENOENT; goto bad; } /* * If we're looking up the last component and we need an exclusive * lock, adjust our lkflags. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags)) cnp->cn_lkflags = LK_EXCLUSIVE; #ifdef NAMEI_DIAGNOSTIC vprint("lookup in", dp); #endif lkflags_save = cnp->cn_lkflags; cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags, cnp->cn_flags); if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) { cnp->cn_lkflags = lkflags_save; KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif if ((error == ENOENT) && (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) && (dp->v_mount->mnt_flag & MNT_UNION)) { tdp = dp; dp = dp->v_mount->mnt_vnodecovered; VREF(dp); vput(tdp); vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); goto unionlookup; } if (error != EJUSTRETURN) goto bad; /* * At this point, we know we're at the end of the * pathname. If creating / renaming, we can consider * allowing the file or directory to be created / renamed, * provided we're not on a read-only filesystem. */ if (rdonly) { error = EROFS; goto bad; } /* trailing slash only allowed for directories */ if ((cnp->cn_flags & TRAILINGSLASH) && !(cnp->cn_flags & WILLBEDIR)) { error = ENOENT; goto bad; } if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp, 0); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } goto success; } else cnp->cn_lkflags = lkflags_save; #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif /* * Take into account any additional components consumed by * the underlying filesystem. */ if (cnp->cn_consume > 0) { cnp->cn_nameptr += cnp->cn_consume; ndp->ni_next += cnp->cn_consume; ndp->ni_pathlen -= cnp->cn_consume; cnp->cn_consume = 0; } dp = ndp->ni_vp; /* * Check to see if the vnode has been mounted on; * if so find the root of the mounted filesystem. */ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && (cnp->cn_flags & NOCROSSMOUNT) == 0) { if (vfs_busy(mp, 0)) continue; vput(dp); if (dp != ndp->ni_dvp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vref(vp_crossmp); ndp->ni_dvp = vp_crossmp; error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags, cnp->cn_flags), &tdp); vfs_unbusy(mp); if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) panic("vp_crossmp exclusively locked or reclaimed"); if (error) { dpunlocked = 1; goto bad2; } ndp->ni_vp = dp = tdp; } /* * Check for symbolic link */ if ((dp->v_type == VLNK) && ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) || *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; if (dp->v_iflag & VI_DOOMED) { /* * We can't know whether the directory was mounted with * NOSYMFOLLOW, so we can't follow safely. */ error = ENOENT; goto bad2; } if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { error = EACCES; goto bad2; } /* * Symlink code always expects an unlocked dvp. */ if (ndp->ni_dvp != ndp->ni_vp) { VOP_UNLOCK(ndp->ni_dvp, 0); ni_dvp_unlocked = 1; } goto success; } nextname: /* * Not a symbolic link that we will follow. Continue with the * next component if there is any; otherwise, we're done. */ KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/', ("lookup: invalid path state.")); if (*ndp->ni_next == '/') { cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); goto dirloop; } /* * If we're processing a path with a trailing slash, * check that the end result is a directory. */ if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) { error = ENOTDIR; goto bad2; } /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad2; } if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } if (!wantparent) { ni_dvp_unlocked = 2; if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) { VOP_UNLOCK(ndp->ni_dvp, 0); ni_dvp_unlocked = 1; } if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_VNODE1(dp); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_VNODE2(dp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0); success: /* * Because of lookup_shared we may have the vnode shared locked, but * the caller may want it to be exclusively locked. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) && VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { vn_lock(dp, LK_UPGRADE | LK_RETRY); if (dp->v_iflag & VI_DOOMED) { error = ENOENT; goto bad2; } } return (0); bad2: if (ni_dvp_unlocked != 2) { if (dp != ndp->ni_dvp && !ni_dvp_unlocked) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); } bad: if (!dpunlocked) vput(dp); ndp->ni_vp = NULL; return (error); } /* * relookup - lookup a path name component * Used by lookup to re-acquire things. */ int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct vnode *dp = 0; /* the directory we are searching */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; KASSERT(cnp->cn_flags & ISLASTCN, ("relookup: Not given last component.")); /* * Setup: break out flag bits into variables. */ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); KASSERT(wantparent, ("relookup: parent not wanted.")); rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; dp = dvp; cnp->cn_lkflags = LK_EXCLUSIVE; vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ #ifdef NAMEI_DIAGNOSTIC printf("{%s}: ", cnp->cn_nameptr); #endif /* * Check for "" which represents the root directory after slash * removal. */ if (cnp->cn_nameptr[0] == '\0') { /* * Support only LOOKUP for "/" because lookup() * can't succeed for CREATE, DELETE and RENAME. */ KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP")); KASSERT(dp->v_type == VDIR, ("dp is not a directory")); if (!(cnp->cn_flags & LOCKLEAF)) VOP_UNLOCK(dp, 0); *vpp = dp; /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } if (cnp->cn_flags & ISDOTDOT) panic ("relookup: lookup on dot-dot"); /* * We now have a segment name to search for, and a directory to search. */ #ifdef NAMEI_DIAGNOSTIC vprint("search in:", dp); #endif if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { KASSERT(*vpp == NULL, ("leaf should be empty")); if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp, 0); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ return (0); } dp = *vpp; /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { if (dvp == dp) vrele(dvp); else vput(dvp); error = EROFS; goto bad; } /* * Set the parent lock/ref state to the requested state. */ if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) { if (wantparent) VOP_UNLOCK(dvp, 0); else vput(dvp); } else if (!wantparent) vrele(dvp); /* * Check for symbolic link */ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), ("relookup: symlink found.\n")); /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0); return (0); bad: vput(dp); *vpp = NULL; return (error); } void NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg, const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp, struct thread *td) { ndp->ni_cnd.cn_nameiop = op; ndp->ni_cnd.cn_flags = flags; ndp->ni_segflg = segflg; ndp->ni_dirp = namep; ndp->ni_dirfd = dirfd; ndp->ni_startdir = startdir; ndp->ni_strictrelative = 0; if (rightsp != NULL) ndp->ni_rightsneeded = *rightsp; else cap_rights_init(&ndp->ni_rightsneeded); filecaps_init(&ndp->ni_filecaps); ndp->ni_cnd.cn_thread = td; } /* * Free data allocated by namei(); see namei(9) for details. */ void NDFREE(struct nameidata *ndp, const u_int flags) { int unlock_dvp; int unlock_vp; unlock_dvp = 0; unlock_vp = 0; if (!(flags & NDF_NO_FREE_PNBUF) && (ndp->ni_cnd.cn_flags & HASBUF)) { uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); ndp->ni_cnd.cn_flags &= ~HASBUF; } if (!(flags & NDF_NO_VP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) unlock_vp = 1; if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { if (unlock_vp) { vput(ndp->ni_vp); unlock_vp = 0; } else vrele(ndp->ni_vp); ndp->ni_vp = NULL; } if (unlock_vp) VOP_UNLOCK(ndp->ni_vp, 0); if (!(flags & NDF_NO_DVP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKPARENT) && ndp->ni_dvp != ndp->ni_vp) unlock_dvp = 1; if (!(flags & NDF_NO_DVP_RELE) && (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { if (unlock_dvp) { vput(ndp->ni_dvp); unlock_dvp = 0; } else vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; } if (unlock_dvp) VOP_UNLOCK(ndp->ni_dvp, 0); if (!(flags & NDF_NO_STARTDIR_RELE) && (ndp->ni_cnd.cn_flags & SAVESTART)) { vrele(ndp->ni_startdir); ndp->ni_startdir = NULL; } } /* * Determine if there is a suitable alternate filename under the specified * prefix for the specified path. If the create flag is set, then the * alternate prefix will be used so long as the parent directory exists. * This is used by the various compatiblity ABIs so that Linux binaries prefer * files under /compat/linux for example. The chosen path (whether under * the prefix or under /) is returned in a kernel malloc'd buffer pointed * to by pathbuf. The caller is responsible for free'ing the buffer from * the M_TEMP bucket if one is returned. */ int kern_alternate_path(struct thread *td, const char *prefix, const char *path, enum uio_seg pathseg, char **pathbuf, int create, int dirfd) { struct nameidata nd, ndroot; char *ptr, *buf, *cp; size_t len, sz; int error; buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); *pathbuf = buf; /* Copy the prefix into the new pathname as a starting point. */ len = strlcpy(buf, prefix, MAXPATHLEN); if (len >= MAXPATHLEN) { *pathbuf = NULL; free(buf, M_TEMP); return (EINVAL); } sz = MAXPATHLEN - len; ptr = buf + len; /* Append the filename to the prefix. */ if (pathseg == UIO_SYSSPACE) error = copystr(path, ptr, sz, &len); else error = copyinstr(path, ptr, sz, &len); if (error) { *pathbuf = NULL; free(buf, M_TEMP); return (error); } /* Only use a prefix with absolute pathnames. */ if (*ptr != '/') { error = EINVAL; goto keeporig; } if (dirfd != AT_FDCWD) { /* * We want the original because the "prefix" is * included in the already opened dirfd. */ bcopy(ptr, buf, len); return (0); } /* * We know that there is a / somewhere in this pathname. * Search backwards for it, to find the file's parent dir * to see if it exists in the alternate tree. If it does, * and we want to create a file (cflag is set). We don't * need to worry about the root comparison in this case. */ if (create) { for (cp = &ptr[len] - 1; *cp != '/'; cp--); *cp = '\0'; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); error = namei(&nd); *cp = '/'; if (error != 0) goto keeporig; } else { NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); error = namei(&nd); if (error != 0) goto keeporig; /* * We now compare the vnode of the prefix to the one * vnode asked. If they resolve to be the same, then we * ignore the match so that the real root gets used. * This avoids the problem of traversing "../.." to find the * root directory and never finding it, because "/" resolves * to the emulation root directory. This is expensive :-( */ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix, td); /* We shouldn't ever get an error from this namei(). */ error = namei(&ndroot); if (error == 0) { if (nd.ni_vp == ndroot.ni_vp) error = ENOENT; NDFREE(&ndroot, NDF_ONLY_PNBUF); vrele(ndroot.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); keeporig: /* If there was an error, use the original path name. */ if (error) bcopy(ptr, buf, len); return (error); } Index: stable/10/sys/sys/lockmgr.h =================================================================== --- stable/10/sys/sys/lockmgr.h (revision 274605) +++ stable/10/sys/sys/lockmgr.h (revision 274606) @@ -1,198 +1,199 @@ /*- * Copyright (c) 2008 Attilio Rao * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_LOCKMGR_H_ #define _SYS_LOCKMGR_H_ #include #include #include #include #define LK_SHARE 0x01 #define LK_SHARED_WAITERS 0x02 #define LK_EXCLUSIVE_WAITERS 0x04 #define LK_EXCLUSIVE_SPINNERS 0x08 #define LK_ALL_WAITERS \ (LK_SHARED_WAITERS | LK_EXCLUSIVE_WAITERS) #define LK_FLAGMASK \ (LK_SHARE | LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS) #define LK_HOLDER(x) ((x) & ~LK_FLAGMASK) #define LK_SHARERS_SHIFT 4 #define LK_SHARERS(x) (LK_HOLDER(x) >> LK_SHARERS_SHIFT) #define LK_SHARERS_LOCK(x) ((x) << LK_SHARERS_SHIFT | LK_SHARE) #define LK_ONE_SHARER (1 << LK_SHARERS_SHIFT) #define LK_UNLOCKED LK_SHARERS_LOCK(0) #define LK_KERNPROC ((uintptr_t)(-1) & ~LK_FLAGMASK) #ifdef _KERNEL #if !defined(LOCK_FILE) || !defined(LOCK_LINE) #error "LOCK_FILE and LOCK_LINE not defined, include before" #endif struct thread; #define lk_recurse lock_object.lo_data /* * Function prototipes. Routines that start with an underscore are not part * of the public interface and might be wrappered with a macro. */ int __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk, const char *wmesg, int prio, int timo, const char *file, int line); #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) void _lockmgr_assert(const struct lock *lk, int what, const char *file, int line); #endif void _lockmgr_disown(struct lock *lk, const char *file, int line); void lockallowrecurse(struct lock *lk); void lockallowshare(struct lock *lk); void lockdestroy(struct lock *lk); void lockdisablerecurse(struct lock *lk); void lockdisableshare(struct lock *lk); void lockinit(struct lock *lk, int prio, const char *wmesg, int timo, int flags); #ifdef DDB int lockmgr_chain(struct thread *td, struct thread **ownerp); #endif void lockmgr_printinfo(const struct lock *lk); int lockstatus(const struct lock *lk); /* * As far as the ilk can be a static NULL pointer these functions need a * strict prototype in order to safely use the lock_object member. */ static __inline int _lockmgr_args(struct lock *lk, u_int flags, struct mtx *ilk, const char *wmesg, int prio, int timo, const char *file, int line) { return (__lockmgr_args(lk, flags, (ilk != NULL) ? &ilk->lock_object : NULL, wmesg, prio, timo, file, line)); } static __inline int _lockmgr_args_rw(struct lock *lk, u_int flags, struct rwlock *ilk, const char *wmesg, int prio, int timo, const char *file, int line) { return (__lockmgr_args(lk, flags, (ilk != NULL) ? &ilk->lock_object : NULL, wmesg, prio, timo, file, line)); } /* * Define aliases in order to complete lockmgr KPI. */ #define lockmgr(lk, flags, ilk) \ _lockmgr_args((lk), (flags), (ilk), LK_WMESG_DEFAULT, \ LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, LOCK_FILE, LOCK_LINE) #define lockmgr_args(lk, flags, ilk, wmesg, prio, timo) \ _lockmgr_args((lk), (flags), (ilk), (wmesg), (prio), (timo), \ LOCK_FILE, LOCK_LINE) #define lockmgr_args_rw(lk, flags, ilk, wmesg, prio, timo) \ _lockmgr_args_rw((lk), (flags), (ilk), (wmesg), (prio), (timo), \ LOCK_FILE, LOCK_LINE) #define lockmgr_disown(lk) \ _lockmgr_disown((lk), LOCK_FILE, LOCK_LINE) #define lockmgr_recursed(lk) \ ((lk)->lk_recurse != 0) #define lockmgr_rw(lk, flags, ilk) \ _lockmgr_args_rw((lk), (flags), (ilk), LK_WMESG_DEFAULT, \ LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, LOCK_FILE, LOCK_LINE) #define lockmgr_waiters(lk) \ ((lk)->lk_lock & LK_ALL_WAITERS) #ifdef INVARIANTS #define lockmgr_assert(lk, what) \ _lockmgr_assert((lk), (what), LOCK_FILE, LOCK_LINE) #else #define lockmgr_assert(lk, what) #endif /* * Flags for lockinit(). */ #define LK_INIT_MASK 0x0000FF #define LK_CANRECURSE 0x000001 #define LK_NODUP 0x000002 #define LK_NOPROFILE 0x000004 #define LK_NOSHARE 0x000008 #define LK_NOWITNESS 0x000010 #define LK_QUIET 0x000020 #define LK_ADAPTIVE 0x000040 #define LK_IS_VNODE 0x000080 /* Tell WITNESS about a VNODE lock */ /* * Additional attributes to be used in lockmgr(). */ #define LK_EATTR_MASK 0x00FF00 #define LK_INTERLOCK 0x000100 #define LK_NOWAIT 0x000200 #define LK_RETRY 0x000400 #define LK_SLEEPFAIL 0x000800 #define LK_TIMELOCK 0x001000 +#define LK_NODDLKTREAT 0x002000 /* * Operations for lockmgr(). */ #define LK_TYPE_MASK 0xFF0000 #define LK_DOWNGRADE 0x010000 #define LK_DRAIN 0x020000 #define LK_EXCLOTHER 0x040000 #define LK_EXCLUSIVE 0x080000 #define LK_RELEASE 0x100000 #define LK_SHARED 0x200000 #define LK_UPGRADE 0x400000 #define LK_TRYUPGRADE 0x800000 #define LK_TOTAL_MASK (LK_INIT_MASK | LK_EATTR_MASK | LK_TYPE_MASK) /* * Default values for lockmgr_args(). */ #define LK_WMESG_DEFAULT (NULL) #define LK_PRIO_DEFAULT (0) #define LK_TIMO_DEFAULT (0) /* * Assertion flags. */ #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT) #define KA_LOCKED LA_LOCKED #define KA_SLOCKED LA_SLOCKED #define KA_XLOCKED LA_XLOCKED #define KA_UNLOCKED LA_UNLOCKED #define KA_RECURSED LA_RECURSED #define KA_NOTRECURSED LA_NOTRECURSED #endif #endif /* _KERNEL */ #endif /* !_SYS_LOCKMGR_H_ */ Index: stable/10 =================================================================== --- stable/10 (revision 274605) +++ stable/10 (revision 274606) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r273966,273986