diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -102,6 +102,10 @@
 /* Allocation zone for namei. */
 uma_zone_t namei_zone;
 
+/* Forward declaration (for use in vn_cross_mounts()). */
+static int enforce_lkflags(struct mount *mp, int lkflags);
+
+
 /* Placeholder vnode for mp traversal. */
 static struct vnode *vp_crossmp;
 
@@ -157,6 +161,233 @@
  * gets allocated early.  See nameiinit for the direct call below.
  */
 
+/*
+ * Busies and returns the mount point mounted on the passed vnode, if any.
+ *
+ * The vnode's lock must be held on entry and may be released on output, as
+ * indicated by '*unlocked'.  The caller must also hold an active reference on
+ * the vnode (vref() or vget()), which is preserved across the call.  On
+ * success, the busied mount point is returned through '*mp'.
+ *
+ * If the vnode is not mounted-on, EJUSTRETURN is returned and '*mp' is set to
+ * NULL.  Concurrent unmounts/remounts of the covering mount are handled
+ * transparently by restarting the process (doing so is not currently strictly
+ * necessary for correctness, but is closer to the historical behavior where
+ * unmounts/remounts were prevented from happening in this case, and will be
+ * required (although not sufficient) if we ever want to implement such things
+ * as atomic mount substitutions).  ENOENT is returned if the vnode was doomed
+ * while trying to determine its covering mount, and '*mp' is set to NULL.
+ * Otherwise, '*mp' is set to the busied mount point and 0 is returned.
+ */
+int
+vn_busy_mountedhere(struct vnode *vp, bool *unlocked, struct mount **mp)
+{
+	int error;
+
+	ASSERT_VOP_LOCKED(vp, __func__);
+	ASSERT_VI_UNLOCKED(vp, __func__);
+
+	*unlocked = false;
+	*mp = NULL;
+
+	if (VN_IS_DOOMED(vp))
+		return (ENOENT);
+
+	if (__predict_true((vn_irflag_read(vp) & VIRF_MOUNTPOINT) == 0))
+		return (EJUSTRETURN);
+
+	*mp = vp->v_mountedhere;
+	MPASS(*mp != NULL);
+
+	/*
+	 * Opportunistically try to busy the mount point.  On success, this can
+	 * avoid a superfluous unlock/relock cycle on 'vp' in some cases (in
+	 * particular, the vfs_lookup() case), and always avoids a pair of
+	 * vfs_ref()/vfs_rel() calls.
+	 */
+	error = vfs_busy(*mp, MBF_NOWAIT);
+	if (__predict_true(error == 0))
+		return (0);
+
+	/* Make sure '*mp' survives the unlock of 'vp'. */
+	vfs_ref(*mp);
+	VOP_UNLOCK(vp);
+	*unlocked = true;
+
+	for (;;) {
+		error = vfs_busy(*mp, 0);
+		vfs_rel(*mp);
+
+		if (__predict_true(error == 0))
+			return (0);
+
+		*mp = NULL;
+		VI_LOCK(vp);
+
+		if (VN_IS_DOOMED(vp)) {
+			error = ENOENT;
+			goto unlock_exit;
+		}
+
+		if (__predict_true(
+		    (vn_irflag_read(vp) & VIRF_MOUNTPOINT) == 0)) {
+			error = EJUSTRETURN;
+			goto unlock_exit;
+		}
+		/*
+		 * We can't retrieve the same (conceptual) mount point as
+		 * before, since the vfs_busy() above returned an error only
+		 * after 'v_mountedhere' was cleared on the covered vnode
+		 * (although we may well retrieve the same pointer if the
+		 * structure has been recycled).
+		 */
+		*mp = vp->v_mountedhere;
+		MPASS(*mp != NULL);
+
+		/*
+		 * This establishes the order "covered vnode's interlock" ->
+		 * "mounted-here mount point's interlock".  Note that this
+		 * order between a vnode and a mount point is the reverse of
+		 * that of "vnode's owning mount point" -> "vnode's interlock",
+		 * which does not cause trouble since the mount points involved
+		 * are different.  This causes a spurious LOR report with the
+		 * initial 'devfs' being mounted at '/' and then remounted at
+		 * '/dev' (see vfs_mountroot()).
+		 */
+		vfs_ref(*mp);
+		VI_UNLOCK(vp);
+	}
+
+	__assert_unreachable();
+
+unlock_exit:
+	VI_UNLOCK(vp);
+	return (error);
+}
+
+/*
+ * Cross a single mounted-on vnode, returning the mount's root vnode.
+ *
+ * The vnode's lock must be held on entry and may be released on output, as
+ * indicated by '*unlocked'; it is always released on success.  The caller must
+ * also hold an active reference on the vnode (vref() or vget()), which is
+ * preserved across the call.  On success, the mount's root vnode is returned
+ * locked according to 'root_lkflags' and with an active reference.
+ *
+ * For errors and outputs, behaves essentially like vn_busy_mountedhere(),
+ * which it calls first, with '*vpp' taking the role of '*mp'.  If this step
+ * succeeds, VFS_ROOT() is called and its result is returned.  On any error,
+ * '*vpp' is set to NULL.  On overall success, '*unlocked' is guaranteed to be
+ * true.
+ */
+int
+vn_cross_single_mount(struct vnode *vp, int root_lkflags,
+    bool *unlocked, struct vnode **vpp)
+{
+	struct mount *mp;
+	int error;
+
+	*vpp = NULL;
+
+	error = vn_busy_mountedhere(vp, unlocked, &mp);
+	if (__predict_false(error == 0)) {
+		if (__predict_true(!*unlocked)) {
+			VOP_UNLOCK(vp);
+			*unlocked = true;
+		}
+		error = VFS_ROOT(mp, root_lkflags, vpp);
+		vfs_unbusy(mp);
+	}
+
+	return (error);
+}
+
+static void
+vn_lock_enforced_flags(struct vnode *vp, int lkflags)
+{
+	int error __unused;
+
+	error = vn_lock(vp, enforce_lkflags(vp->v_mount, lkflags | LK_RETRY));
+	KASSERT(error == 0,
+	    ("%s: vn_lock(LK_RETRY) returned %d", __func__, error));
+}
+
+/*
+ * Repeatedly cross mounts starting from a given vnode.
+ *
+ * Traverses all successive mounts on the same path, locking the successive
+ * vnodes as specified by enforce_lkflags() and unlocking them after obtaining
+ * their covering mount.  Ensures the final vnode is locked and actively
+ * referenced.  The initial vnode is unlocked and its active reference is
+ * released, unless it is also the final vnode (no mount point to cross).
+ *
+ * Mounts are crossed until reaching a vnode that is not covered by a mount,
+ * which is returned locked.  If some traversed vnode happens to be doomed,
+ * ENOENT is returned; errors reported by VFS_ROOT() can also be returned.  On
+ * success, puts the final vnode into '*vpp' and returns 0.
+ *
+ * This function ensures that a crossed mount point is never busied while the
+ * initial vnode is locked.  The goal is to avoid establishing a lock order
+ * between them, which could lead to deadlocks: at lookup, with stacked
+ * filesystems (nullfs, unionfs) mounted, locking the mount point's root vnode
+ * leads to locking the covered vnode as well, and vice versa; at unmount,
+ * parallel vfs_busy() calls block while dounmount() acquires the covered
+ * vnode's lock, which establishes the acquisition order mount point ->
+ * covered vnode.  This function (through the VFS_ROOT() call) only establishes
+ * the acquisition order mount point -> root vnode, which implies mount point
+ * -> covered vnode for stacked filesystems, i.e., the same order as that of
+ * dounmount().  In other words, the legal order is that a mount point
+ * reference must always be acquired before the vnode's lock, be it the mount
+ * point's root vnode or the vnode it covers.
+ */
+int
+vn_cross_mounts(struct vnode *vp, int lkflags, struct vnode **vpp)
+{
+	int error;
+	bool unlocked;
+
+	for (;;) {
+		error = vn_cross_single_mount(vp, lkflags, &unlocked, vpp);
+
+		/* Optimize for the non-mount-point case. */
+		if (__predict_true(error == EJUSTRETURN)) {
+			/* No more mounts to cross. */
+			*vpp = vp;
+			error = 0;
+
+			if (__predict_false(unlocked)) {
+				vn_lock_enforced_flags(vp, lkflags);
+				if (VN_IS_DOOMED(vp)) {
+					vput(vp);
+
+					*vpp = NULL;
+					error = ENOENT;
+				}
+			}
+
+			return (error);
+		}
+
+		if (__predict_false(error != 0)) {
+			if (__predict_true(unlocked))
+				vrele(vp);
+			else
+				vput(vp);
+
+			return (error);
+		}
+
+		/* Crossed one mount.  Try to cross another one. */
+		MPASS(unlocked);
+		ASSERT_VOP_UNLOCKED(vp, __func__);
+		vrele(vp);
+		vp = *vpp;
+		ASSERT_VOP_LOCKED(vp, __func__);
+	}
+
+	__assert_unreachable();
+}
+
 struct nameicap_tracker {
 	struct vnode *dp;
 	TAILQ_ENTRY(nameicap_tracker) nm_link;
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -826,6 +826,12 @@
 int	vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize,
 	    struct uio *uio);
 
+int	vn_busy_mountedhere(struct vnode *vp, bool *unlocked,
+	    struct mount **mp);
+int	vn_cross_single_mount(struct vnode *vp, int root_lkflags,
+	    bool *unlocked, struct vnode **vpp);
+int	vn_cross_mounts(struct vnode *vp, int lkflags, struct vnode **vpp);
+
 void	vn_seqc_write_begin_locked(struct vnode *vp);
 void	vn_seqc_write_begin(struct vnode *vp);
 void	vn_seqc_write_end_locked(struct vnode *vp);
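
For illustration only, here is a minimal caller sketch (not part of the patch) showing how the vn_cross_mounts() interface added above is meant to be used from code that holds a locked, referenced directory vnode.  The wrapper name cross_mounts_at() and its surrounding error handling are assumptions introduced for this example, not code from this change.

/* Hypothetical usage sketch; not included in the diff above. */
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/vnode.h>

/*
 * Given a locked, actively referenced vnode 'dvp' (e.g. the result of a
 * lookup step), replace it with the root vnode of whatever is ultimately
 * mounted on it, crossing any stack of mounts.
 */
static int
cross_mounts_at(struct vnode *dvp, int lkflags, struct vnode **vpp)
{
	int error;

	ASSERT_VOP_LOCKED(dvp, __func__);

	/*
	 * vn_cross_mounts() consumes our use of 'dvp': on success, '*vpp'
	 * is returned locked and actively referenced, and 'dvp' has been
	 * unlocked and released unless it is '*vpp' itself (nothing was
	 * mounted on it).  On error, 'dvp' has been released and '*vpp'
	 * is NULL.
	 */
	error = vn_cross_mounts(dvp, lkflags, vpp);
	if (error != 0)
		return (error);

	ASSERT_VOP_LOCKED(*vpp, __func__);
	return (0);
}

In particular, the caller must not touch 'dvp' again after the call unless it comes back as '*vpp', since vn_cross_mounts() releases both the lock and the reference in every other case.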