diff --git a/sys/fs/unionfs/union_vfsops.c b/sys/fs/unionfs/union_vfsops.c --- a/sys/fs/unionfs/union_vfsops.c +++ b/sys/fs/unionfs/union_vfsops.c @@ -310,6 +310,30 @@ return (ENOENT); } + /* + * Specify that the covered vnode lock should remain held while + * lookup() performs the cross-mount walk. This prevents a lock-order + * reversal between the covered vnode lock (which is also locked by + * unionfs_lock()) and the mountpoint's busy count. Without this, + * unmount will lock the covered vnode lock (directly through the + * covered vnode) and wait for the busy count to drain, while a + * concurrent lookup will increment the busy count and then lock + * the covered vnode lock (indirectly through unionfs_lock()). + * + * Note that we can't yet use this facility for the 'below' case + * in which the upper vnode is the covered vnode, because that would + * introduce a different LOR in which the cross-mount lookup would + * effectively hold the upper vnode lock before acquiring the lower + * vnode lock, while an unrelated lock operation would still acquire + * the lower vnode lock before the upper vnode lock, which is the + * order unionfs currently requires. + */ + if (!below) { + vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY); + mp->mnt_vnodecovered->v_vflag |= VV_CROSSLOCK; + VOP_UNLOCK(mp->mnt_vnodecovered); + } + MNT_ILOCK(mp); if ((lowermp->mnt_flag & MNT_LOCAL) != 0 && (uppermp->mnt_flag & MNT_LOCAL) != 0) @@ -362,6 +386,9 @@ if (error) return (error); + vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY); + mp->mnt_vnodecovered->v_vflag &= ~VV_CROSSLOCK; + VOP_UNLOCK(mp->mnt_vnodecovered); vfs_unregister_upper(ump->um_lowervp->v_mount, &ump->um_lower_link); vfs_unregister_upper(ump->um_uppervp->v_mount, &ump->um_upper_link); free(ump, M_UNIONFSMNT); diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -105,21 +105,15 @@ { struct vnode *vp; struct lock *lk __diagused; - const char *file __witness_used; - int flags, line __witness_used; + int flags; vp = ap->a_vp; lk = vp->v_vnlock; flags = ap->a_flags; - file = ap->a_file; - line = ap->a_line; - if ((flags & LK_SHARED) == 0) - panic("invalid lock request for crossmp"); + KASSERT((flags & (LK_SHARED | LK_NOWAIT)) == (LK_SHARED | LK_NOWAIT), + ("%s: invalid lock request 0x%x for crossmp", __func__, flags)); - WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, - flags & LK_INTERLOCK ? &VI_MTX(vp)->lock_object : NULL); - WITNESS_LOCK(&lk->lock_object, 0, file, line); if ((flags & LK_INTERLOCK) != 0) VI_UNLOCK(vp); LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, ap->a_line); @@ -135,7 +129,6 @@ vp = ap->a_vp; lk = vp->v_vnlock; - WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE); LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE, LOCK_LINE); return (0); @@ -901,6 +894,8 @@ struct componentname *cnp = &ndp->ni_cnd; int lkflags_save; int ni_dvp_unlocked; + int crosslkflags; + bool crosslock; /* * Setup: break out flag bits into variables. @@ -1226,33 +1221,6 @@ #endif dp = ndp->ni_vp; - /* - * Check to see if the vnode has been mounted on; - * if so find the root of the mounted filesystem. - */ - while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && - (cnp->cn_flags & NOCROSSMOUNT) == 0) { - if (vfs_busy(mp, 0)) - continue; - vput(dp); - if (dp != ndp->ni_dvp) - vput(ndp->ni_dvp); - else - vrele(ndp->ni_dvp); - vrefact(vp_crossmp); - ndp->ni_dvp = vp_crossmp; - error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags, - cnp->cn_flags), &tdp); - vfs_unbusy(mp); - if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) - panic("vp_crossmp exclusively locked or reclaimed"); - if (error) { - dpunlocked = 1; - goto bad2; - } - ndp->ni_vp = dp = tdp; - } - /* * Check for symbolic link */ @@ -1280,7 +1248,54 @@ ni_dvp_unlocked = 1; } goto success; - } + } else if ((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0) { + if ((cnp->cn_flags & NOCROSSMOUNT) != 0) + goto nextname; + } else + goto nextname; + + /* + * Check to see if the vnode has been mounted on; + * if so find the root of the mounted filesystem. + */ + do { + mp = dp->v_mountedhere; + KASSERT(mp != NULL, + ("%s: NULL mountpoint for VIRF_MOUNTPOINT vnode", __func__)); + crosslock = (dp->v_vflag & VV_CROSSLOCK) != 0; + crosslkflags = compute_cn_lkflags(mp, cnp->cn_lkflags, + cnp->cn_flags); + if (__predict_false(crosslock) && + (crosslkflags & LK_EXCLUSIVE) != 0 && + VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { + vn_lock(dp, LK_UPGRADE | LK_RETRY); + if (VN_IS_DOOMED(dp)) { + error = ENOENT; + goto bad2; + } + } + if (vfs_busy(mp, 0) != 0) + continue; + if (__predict_true(!crosslock)) + vput(dp); + if (dp != ndp->ni_dvp) + vput(ndp->ni_dvp); + else + vrele(ndp->ni_dvp); + vrefact(vp_crossmp); + ndp->ni_dvp = vp_crossmp; + error = VFS_ROOT(mp, crosslkflags, &tdp); + vfs_unbusy(mp); + if (__predict_false(crosslock)) + vput(dp); + if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) + panic("vp_crossmp exclusively locked or reclaimed"); + if (error != 0) { + dpunlocked = 1; + goto bad2; + } + ndp->ni_vp = dp = tdp; + } while ((vn_irflag_read(dp) & VIRF_MOUNTPOINT) != 0); nextname: /* diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -770,17 +770,22 @@ * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: - * VOP_LOOKUP() obtains B while A is held - * vfs_busy() obtains a shared lock on F while A and B are held - * vput() releases lock on B - * vput() releases lock on A - * VFS_ROOT() obtains lock on D while shared lock on F is held - * vfs_unbusy() releases shared lock on F - * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. - * Attempt to lock A (instead of vp_crossmp) while D is held would - * violate the global order, causing deadlocks. + * 1. VOP_LOOKUP() obtains B while A is held + * 2. vfs_busy() obtains a shared lock on F while A and B are held + * 3. vput() releases lock on B + * 4. vput() releases lock on A + * 5. VFS_ROOT() obtains lock on D while shared lock on F is held + * 6. vfs_unbusy() releases shared lock on F + * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. + * Attempt to lock A (instead of vp_crossmp) while D is held would + * violate the global order, causing deadlocks. * - * dounmount() locks B while F is drained. + * dounmount() locks B while F is drained. Note that for stacked + * filesystems, D and B in the example above may be the same lock, + * which introdues potential lock order reversal deadlock between + * dounmount() and step 5 above. These filesystems may avoid the LOR + * by setting VV_CROSSLOCK on the covered vnode so that lock B will + * remain held until after step 5. */ int vfs_busy(struct mount *mp, int flags) @@ -814,8 +819,7 @@ * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, - * telling the caller that mount mount it tried to busy is no longer - * valid. + * telling the caller the mount it tried to busy is no longer valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -276,6 +276,7 @@ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VV_UNREF 0x4000 /* vunref, do not drop lock in inactive() */ +#define VV_CROSSLOCK 0x8000 /* vnode lock is shared w/ root mounted here */ #define VMP_LAZYLIST 0x0001 /* Vnode is on mnt's lazy list */