Index: sys/fs/unionfs/union_vfsops.c =================================================================== --- sys/fs/unionfs/union_vfsops.c +++ sys/fs/unionfs/union_vfsops.c @@ -310,6 +310,30 @@ return (ENOENT); } + /* + * Specify that the covered vnode lock should remain held while + * lookup() performs the cross-mount walk. This prevents a lock-order + * reversal between the covered vnode lock (which is also locked by + * unionfs_lock()) and the mountpoint's busy count. Without this, + * unmount will lock the covered vnode lock (directly through the + * covered vnode) and wait for the busy count to drain, while a + * concurrent lookup will increment the busy count and then lock + * the covered vnode lock (indirectly through unionfs_lock()). + * + * Note that we can't yet use this facility for the 'below' case + * in which the upper vnode is the covered vnode, because that would + * introduce a different LOR in which the cross-mount lookup would + * effectively hold the upper vnode lock before acquiring the lower + * vnode lock, while an unrelated lock operation would still acquire + * the lower vnode lock before the upper vnode lock, which is the + * order unionfs currently requires. + */ + if (!below) { + vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY); + mp->mnt_vnodecovered->v_vflag |= VV_CROSSLOCK; + VOP_UNLOCK(mp->mnt_vnodecovered); + } + MNT_ILOCK(mp); if ((lowermp->mnt_flag & MNT_LOCAL) != 0 && (uppermp->mnt_flag & MNT_LOCAL) != 0) @@ -362,6 +386,9 @@ if (error) return (error); + vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY); + mp->mnt_vnodecovered->v_vflag &= ~VV_CROSSLOCK; + VOP_UNLOCK(mp->mnt_vnodecovered); vfs_unregister_upper(ump->um_lowervp->v_mount, &ump->um_lower_link); vfs_unregister_upper(ump->um_uppervp->v_mount, &ump->um_upper_link); free(ump, M_UNIONFSMNT); Index: sys/kern/vfs_lookup.c =================================================================== --- sys/kern/vfs_lookup.c +++ sys/kern/vfs_lookup.c @@ -98,21 +98,15 @@ { struct vnode *vp; struct lock *lk __diagused; - const char *file __witness_used; - int flags, line __witness_used; + int flags; vp = ap->a_vp; lk = vp->v_vnlock; flags = ap->a_flags; - file = ap->a_file; - line = ap->a_line; - if ((flags & LK_SHARED) == 0) - panic("invalid lock request for crossmp"); + KASSERT((flags & (LK_SHARED | LK_NOWAIT)) == (LK_SHARED | LK_NOWAIT), + ("%s: invalid lock request 0x%x for crossmp", __func__, flags)); - WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, - flags & LK_INTERLOCK ? &VI_MTX(vp)->lock_object : NULL); - WITNESS_LOCK(&lk->lock_object, 0, file, line); if ((flags & LK_INTERLOCK) != 0) VI_UNLOCK(vp); LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, ap->a_line); @@ -128,7 +122,6 @@ vp = ap->a_vp; lk = vp->v_vnlock; - WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE); LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE, LOCK_LINE); return (0); @@ -910,6 +903,8 @@ struct componentname *cnp = &ndp->ni_cnd; int lkflags_save; int ni_dvp_unlocked; + int crosslkflags; + bool crosslock; /* * Setup: break out flag bits into variables. @@ -1243,18 +1238,32 @@ */ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && (cnp->cn_flags & NOCROSSMOUNT) == 0) { + crosslock = ((dp->v_vflag & VV_CROSSLOCK) != 0); + crosslkflags = compute_cn_lkflags(mp, cnp->cn_lkflags, + cnp->cn_flags); + if (__predict_false(crosslock) && + (crosslkflags & LK_EXCLUSIVE) != 0 && + VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { + vn_lock(dp, LK_UPGRADE | LK_RETRY); + if (VN_IS_DOOMED(dp)) { + error = ENOENT; + goto bad2; + } + } if (vfs_busy(mp, 0)) continue; - vput(dp); + if (__predict_true(!crosslock)) + vput(dp); if (dp != ndp->ni_dvp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vrefact(vp_crossmp); ndp->ni_dvp = vp_crossmp; - error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags, - cnp->cn_flags), &tdp); + error = VFS_ROOT(mp, crosslkflags, &tdp); vfs_unbusy(mp); + if (__predict_false(crosslock)) + vput(dp); if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) panic("vp_crossmp exclusively locked or reclaimed"); if (error) { Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -770,17 +770,22 @@ * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: - * VOP_LOOKUP() obtains B while A is held - * vfs_busy() obtains a shared lock on F while A and B are held - * vput() releases lock on B - * vput() releases lock on A - * VFS_ROOT() obtains lock on D while shared lock on F is held - * vfs_unbusy() releases shared lock on F - * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. - * Attempt to lock A (instead of vp_crossmp) while D is held would - * violate the global order, causing deadlocks. + * 1. VOP_LOOKUP() obtains B while A is held + * 2. vfs_busy() obtains a shared lock on F while A and B are held + * 3. vput() releases lock on B + * 4. vput() releases lock on A + * 5. VFS_ROOT() obtains lock on D while shared lock on F is held + * 6. vfs_unbusy() releases shared lock on F + * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. + * Attempt to lock A (instead of vp_crossmp) while D is held would + * violate the global order, causing deadlocks. * - * dounmount() locks B while F is drained. + * dounmount() locks B while F is drained. Note that for stacked + * filesystems, D and B in the example above may be the same lock, + * which introdues potential lock order reversal deadlock between + * dounmount() and step 5 above. These filesystems may avoid the LOR + * by setting VV_CROSSLOCK on the covered vnode so that lock B will + * remain held until after step 5. */ int vfs_busy(struct mount *mp, int flags) Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -276,6 +276,7 @@ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VV_UNREF 0x4000 /* vunref, do not drop lock in inactive() */ +#define VV_CROSSLOCK 0x8000 /* vnode lock is shared w/ root mounted here */ #define VMP_LAZYLIST 0x0001 /* Vnode is on mnt's lazy list */