diff --git a/sys/fs/unionfs/union_vfsops.c b/sys/fs/unionfs/union_vfsops.c --- a/sys/fs/unionfs/union_vfsops.c +++ b/sys/fs/unionfs/union_vfsops.c @@ -310,6 +310,30 @@ return (ENOENT); } + /* + * Specify that the covered vnode lock should remain held while + * lookup() performs the cross-mount walk. This prevents a lock-order + * reversal between the covered vnode lock (which is also locked by + * unionfs_lock()) and the mountpoint's busy count. Without this, + * unmount will lock the covered vnode lock (directly through the + * covered vnode) and wait for the busy count to drain, while a + * concurrent lookup will increment the busy count and then lock + * the covered vnode lock (indirectly through unionfs_lock()). + * + * Note that we can't yet use this facility for the 'below' case + * in which the upper vnode is the covered vnode, because that would + * introduce a different LOR in which the cross-mount lookup would + * effectively hold the upper vnode lock before acquiring the lower + * vnode lock, while an unrelated lock operation would still acquire + * the lower vnode lock before the upper vnode lock, which is the + * order unionfs currently requires. 
+ */ + if (!below) { + vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY); + mp->mnt_vnodecovered->v_vflag |= VV_CROSSLOCK; + VOP_UNLOCK(mp->mnt_vnodecovered); + } + MNT_ILOCK(mp); if ((lowermp->mnt_flag & MNT_LOCAL) != 0 && (uppermp->mnt_flag & MNT_LOCAL) != 0) @@ -362,6 +386,9 @@ if (error) return (error); + vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY); + mp->mnt_vnodecovered->v_vflag &= ~VV_CROSSLOCK; + VOP_UNLOCK(mp->mnt_vnodecovered); vfs_unregister_upper(ump->um_lowervp->v_mount, &ump->um_lower_link); vfs_unregister_upper(ump->um_uppervp->v_mount, &ump->um_upper_link); free(ump, M_UNIONFSMNT); diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -901,6 +901,8 @@ struct componentname *cnp = &ndp->ni_cnd; int lkflags_save; int ni_dvp_unlocked; + int crosslkflags; + bool crosslock; /* * Setup: break out flag bits into variables. @@ -1232,18 +1234,32 @@ */ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && (cnp->cn_flags & NOCROSSMOUNT) == 0) { + crosslock = (dp->v_vflag & VV_CROSSLOCK) != 0; + crosslkflags = compute_cn_lkflags(mp, cnp->cn_lkflags, + cnp->cn_flags); + if (__predict_false(crosslock) && + (crosslkflags & LK_EXCLUSIVE) != 0 && + VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { + vn_lock(dp, LK_UPGRADE | LK_RETRY); + if (VN_IS_DOOMED(dp)) { + error = ENOENT; + goto bad2; + } + } if (vfs_busy(mp, 0)) continue; - vput(dp); + if (__predict_true(!crosslock)) + vput(dp); if (dp != ndp->ni_dvp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vrefact(vp_crossmp); ndp->ni_dvp = vp_crossmp; - error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags, - cnp->cn_flags), &tdp); + error = VFS_ROOT(mp, crosslkflags, &tdp); vfs_unbusy(mp); + if (__predict_false(crosslock)) + vput(dp); if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) panic("vp_crossmp exclusively locked or reclaimed"); if (error) { diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c --- a/sys/kern/vfs_subr.c +++ 
b/sys/kern/vfs_subr.c @@ -770,17 +770,22 @@ * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: - * VOP_LOOKUP() obtains B while A is held - * vfs_busy() obtains a shared lock on F while A and B are held - * vput() releases lock on B - * vput() releases lock on A - * VFS_ROOT() obtains lock on D while shared lock on F is held - * vfs_unbusy() releases shared lock on F - * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. - * Attempt to lock A (instead of vp_crossmp) while D is held would - * violate the global order, causing deadlocks. + * 1. VOP_LOOKUP() obtains B while A is held + * 2. vfs_busy() obtains a shared lock on F while A and B are held + * 3. vput() releases lock on B + * 4. vput() releases lock on A + * 5. VFS_ROOT() obtains lock on D while shared lock on F is held + * 6. vfs_unbusy() releases shared lock on F + * 7. vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. + * Attempt to lock A (instead of vp_crossmp) while D is held would + * violate the global order, causing deadlocks. * - * dounmount() locks B while F is drained. + * dounmount() locks B while F is drained. Note that for stacked + * filesystems, D and B in the example above may be the same lock, + * which introduces a potential lock order reversal deadlock between + * dounmount() and step 5 above. These filesystems may avoid the LOR + * by setting VV_CROSSLOCK on the covered vnode so that lock B will + * remain held until after step 5. */ int vfs_busy(struct mount *mp, int flags) diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -276,6 +276,7 @@ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VV_UNREF 0x4000 /* vunref, do not drop lock in inactive() */ +#define VV_CROSSLOCK 0x8000 /* vnode lock is shared w/ root mounted here */ #define VMP_LAZYLIST 0x0001 /* Vnode is on mnt's lazy list */