diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -113,6 +113,7 @@
 static bool vhold_recycle_free(struct vnode *);
 static void vdropl_recycle(struct vnode *vp);
 static void vdrop_recycle(struct vnode *vp);
+static void vdrop_doomed_process(void);
 static void vfs_knllock(void *arg);
 static void vfs_knlunlock(void *arg);
 static void vfs_knl_assert_lock(void *arg, int what);
@@ -226,6 +227,10 @@
 SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD,
     &deferred_inact, 0, "Number of times inactive processing was deferred");
 
+static u_long deferred_drop;
+SYSCTL_ULONG(_vfs, OID_AUTO, deferred_drop, CTLFLAG_RD,
+    &deferred_drop, 0, "Number of times drop was deferred");
+
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
 
@@ -318,6 +323,9 @@
 
 static void vdbatch_dequeue(struct vnode *vp);
 
+static struct mtx vnode_deferred_drop_mtx;
+static struct vnodelst vnode_deferred_drop_list;
+
 /*
  * The syncer will require at least SYNCER_MAXDELAY iterations to shutdown;
  * we probably don't want to pause for the whole second each time.
@@ -774,6 +782,8 @@
 	}
 	wantfreevnodes = desiredvnodes / 4;
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
+	TAILQ_INIT(&vnode_deferred_drop_list);
+	mtx_init(&vnode_deferred_drop_mtx, "vnode_defdrop", NULL, MTX_DEF);
 	TAILQ_INIT(&vnode_list);
 	mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
 	/*
@@ -1784,6 +1794,8 @@
 	for (;;) {
 		kproc_suspend_check(vnlruproc);
 
+		vdrop_doomed_process();
+
 		if (force == 0 && vnlru_proc_light())
 			continue;
 
@@ -2217,6 +2229,7 @@
 	    ("dirty blk trie not empty"));
 	VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
 	    ("Leaked inactivation"));
+	VNPASS((vp->v_iflag & VI_DEFDROP) == 0, vp);
 	VI_UNLOCK(vp);
 	cache_assert_no_entries(vp);
 
@@ -3450,6 +3463,7 @@
 	struct mount *mp;
 
 	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
+	VNPASS((vp->v_iflag & VI_DEFDROP) == 0, vp);
 	if ((vp->v_mflag & VMP_LAZYLIST) != 0)
 		return;
 
@@ -3644,26 +3658,26 @@
 		}
 		break;
 	}
-	if (error == 0) {
-		if (func == VUNREF) {
-			VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
-			    ("recursive vunref"));
-			vp->v_vflag |= VV_UNREF;
-		}
-		for (;;) {
-			error = vinactive(vp);
-			if (want_unlock)
-				VOP_UNLOCK(vp);
-			if (error != ERELOOKUP || !want_unlock)
-				break;
-			VOP_LOCK(vp, LK_EXCLUSIVE);
-		}
-		if (func == VUNREF)
-			vp->v_vflag &= ~VV_UNREF;
-		vdropl(vp);
-	} else {
+	if (error != 0) {
 		vdefer_inactive(vp);
+		return;
+	}
+	if (func == VUNREF) {
+		VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
+		    ("recursive vunref"));
+		vp->v_vflag |= VV_UNREF;
 	}
+	for (;;) {
+		error = vinactive(vp);
+		if (want_unlock)
+			VOP_UNLOCK(vp);
+		if (error != ERELOOKUP || !want_unlock)
+			break;
+		VOP_LOCK(vp, LK_EXCLUSIVE);
+	}
+	if (func == VUNREF)
+		vp->v_vflag &= ~VV_UNREF;
+	vdropl(vp);
 	return;
 out:
 	if (func == VPUT)
@@ -3771,6 +3785,14 @@
  *
  * However, while this is more performant, it hinders debugging by eliminating
  * the previously mentioned invariant.
+ *
+ * Note: there is a corner case where VHOLD_NO_SMR can transiently appear and
+ * disappear. This is fine as if vhold_smr fails, the caller is expected to try
+ * to reach the vnode in other ways, if at all possible. See vdropl_final.
+ *
+ * FIXME: dooming should set a flag preventing new ref acquires from vhold_smr,
+ * possibly even VHOLD_NO_SMR itself. Getting there would require sorting out
+ * all the open-coded ->v_holdcnt access and plain atomic vs ref API confusion.
  */
 bool
 vhold_smr(struct vnode *vp)
@@ -3960,6 +3982,92 @@
 	MPASS(vp->v_dbatchcpu == NOCPU);
 }
 
+/*
+ * Deferred vdrop for doomed vnodes.
+ *
+ * This is *not* a general-purpose async vdrop mechanism.
+ *
+ * The only purpose is to facilitate the invariant that VOP_UNLOCK is safe to
+ * call *after* dropping the reference.
+ *
+ * It works around races of the following sort:
+ *
+ * Suppose the vnode is doomed, v_usecount is 2, v_holdcnt is 1 (on account of
+ * v_usecount > 0) and both usecount holders are issuing vput while having a
+ * read lock on the vnode.
+ *
+ * thread1				thread2
+ * refcount_release usecount
+ * // v_usecount is now 1
+ * // will proceed to unlock
+ *					refcount_release usecount
+ *					// v_usecount is now 0
+ *					// will proceed to vdrop
+ *					VOP_UNLOCK
+ *					refcount_release holdcnt
+ *					// v_holdcnt is now 0
+ *					// since the vnode is doomed
+ *					// it will proceed to free it
+ *					freevnode
+ * // goes ahead with unlock, but
+ * // the vnode is already freed
+ * VOP_UNLOCK
+ *
+ * In order to combat the problem we check whether any lock holders are still
+ * active and wait for them to exit. It might not be safe to do so in some of
+ * the places vdrop is called from, hence we defer this work.
+ */
+static void
+vdrop_doomed_defer(struct vnode *vp)
+{
+
+	ASSERT_VI_LOCKED(vp, __func__);
+	VNPASS(VN_IS_DOOMED(vp), vp);
+	VNPASS(vp->v_mount == NULL, vp);
+	VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
+	VNPASS((vp->v_iflag & VI_DEFDROP) == 0, vp);
+
+	vp->v_iflag |= VI_DEFDROP;
+	mtx_lock(&vnode_deferred_drop_mtx);
+	TAILQ_INSERT_TAIL(&vnode_deferred_drop_list, vp, v_lazylist);
+	deferred_drop++;
+	mtx_unlock(&vnode_deferred_drop_mtx);
+	VI_UNLOCK(vp);
+}
+
+/*
+ * Called by vnlru.
+ *
+ * If dooming ever gets reworked this should probably land in the syncer.
+ */
+static void __noinline
+vdrop_doomed_process(void)
+{
+	struct vnodelst drop_list;
+	struct vnode *vp, *vptmp;
+
+	if (TAILQ_EMPTY(&vnode_deferred_drop_list))
+		return;
+
+	TAILQ_INIT(&drop_list);
+	mtx_lock(&vnode_deferred_drop_mtx);
+	TAILQ_CONCAT(&drop_list, &vnode_deferred_drop_list, v_lazylist);
+	mtx_unlock(&vnode_deferred_drop_mtx);
+
+	TAILQ_FOREACH_SAFE(vp, &drop_list, v_lazylist, vptmp) {
+		VNPASS(VN_IS_DOOMED(vp), vp);
+		VNPASS((vp->v_iflag & VI_DEFDROP) != 0, vp);
+
+		VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
+		VOP_UNLOCK(vp);
+
+		VI_LOCK(vp);
+		vp->v_iflag &= ~VI_DEFDROP;
+		vdropl(vp);
+		/* unlocks the interlock */
+	}
+}
+
 /*
  * Drop the hold count of the vnode.
  *
@@ -3975,6 +4083,8 @@
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNPASS(VN_IS_DOOMED(vp), vp);
+	VNPASS((vp->v_iflag & VI_DEFDROP) == 0, vp);
+
 	/*
 	 * Set the VHOLD_NO_SMR flag.
 	 *
@@ -3990,6 +4100,40 @@
 		 */
 		return;
 	}
+
+	/*
+	 * v_holdcnt is now VHOLD_NO_SMR and we hold the interlock.
+	 *
+	 * While nobody can legally gain a reference, we need to wait for any
+	 * pending unlock operations to finish. See vdrop_doomed_defer for
+	 * details.
+	 *
+	 * If someone holds the lock, it may not be safe to wait for them.
+	 * Instead, we backpedal the above transition and defer the work.
+	 *
+	 * Note we depend on a full fence from atomic_cmpset_int to both
+	 * publish VHOLD_NO_SMR and only observe the state of the lock
+	 * afterwards.
+	 */
+	if (__predict_false(VOP_ISLOCKED(vp))) {
+		if (!atomic_cmpset_int(&vp->v_holdcnt, VHOLD_NO_SMR, 1)) {
+			panic("%s: failed to transition holdcnt from "
+			    "VHOLD_NO_SMR to 1 for vnode %p", __func__, vp);
+		}
+
+		/*
+		 * Don't bump freevnodes as we have just undone the 1->0
+		 * transition and the counter has not been touched so far.
+		 */
+		vdrop_doomed_defer(vp);
+		return;
+	}
+
+	/*
+	 * Paired with the release fence in VOP_UNLOCK.
+	 */
+	atomic_thread_fence_acq();
+
 	/*
 	 * Don't bump freevnodes as this one is going away.
 	 */
@@ -4651,8 +4795,10 @@
 		strlcat(buf, "|VI_DEFINACT", sizeof(buf));
 	if (vp->v_iflag & VI_FOPENING)
 		strlcat(buf, "|VI_FOPENING", sizeof(buf));
+	if (vp->v_iflag & VI_DEFDROP)
+		strlcat(buf, "|VI_DEFDROP", sizeof(buf));
 	flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT |
-	    VI_OWEINACT | VI_DEFINACT | VI_FOPENING);
+	    VI_OWEINACT | VI_DEFINACT | VI_FOPENING | VI_DEFDROP);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -268,6 +268,7 @@
 #define	VI_DEFINACT	0x0010	/* deferred inactive */
 #define	VI_FOPENING	0x0020	/* In open, with opening process having the
 				   first right to advlock file */
+#define	VI_DEFDROP	0x0040	/* deferred vdrop */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
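
The splice-and-drain structure used by vdrop_doomed_defer() and vdrop_doomed_process() can be illustrated outside the kernel. The following is a hypothetical userspace sketch (pthreads, invented names such as defer_teardown and process_deferred, and a plain LIFO list instead of the kernel's TAILQ), not code from the patch: producers that cannot finish teardown in their current context park the object on a mutex-protected list, and a worker later detaches the whole list in O(1) under the lock and completes the work without holding it.

/* Hypothetical userspace analogue of the defer/drain pattern; not kernel code. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	struct obj *next;	/* deferred-list linkage, akin to reusing v_lazylist */
	int id;
};

static pthread_mutex_t defer_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct obj *defer_head;	/* stand-in for vnode_deferred_drop_list */
static unsigned long deferred;	/* stand-in for the deferred_drop counter */

/* Producer side: cheap while holding the lock, like vdrop_doomed_defer(). */
static void
defer_teardown(struct obj *o)
{
	pthread_mutex_lock(&defer_mtx);
	o->next = defer_head;	/* LIFO for brevity; the kernel keeps FIFO order */
	defer_head = o;
	deferred++;
	pthread_mutex_unlock(&defer_mtx);
}

/*
 * Consumer side: detach everything in O(1), then work unlocked, like
 * vdrop_doomed_process() splicing the list away with TAILQ_CONCAT.
 */
static void
process_deferred(void)
{
	struct obj *o, *next;

	pthread_mutex_lock(&defer_mtx);
	o = defer_head;
	defer_head = NULL;
	pthread_mutex_unlock(&defer_mtx);

	for (; o != NULL; o = next) {
		next = o->next;
		/*
		 * The kernel version locks and unlocks the vnode here to
		 * synchronize with any pending VOP_UNLOCK callers before
		 * doing the final vdropl().
		 */
		printf("finishing teardown of obj %d\n", o->id);
		free(o);
	}
}

int
main(void)
{
	struct obj *o;
	int i;

	for (i = 0; i < 3; i++) {
		if ((o = malloc(sizeof(*o))) == NULL)
			abort();
		o->id = i;
		defer_teardown(o);
	}
	process_deferred();
	printf("%lu deferrals\n", deferred);
	return (0);
}

With the patch applied, the number of deferrals taken by the real mechanism should be visible as the new vfs.deferred_drop sysctl.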
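
The ordering argument in the new vdropl_final() comment can also be shown in miniature. The sketch below is likewise hypothetical userspace code (invented names; C11 atomics standing in for the kernel's atomic_cmpset_int, VOP_ISLOCKED and atomic_thread_fence_acq): install the sentinel with a sequentially consistent compare-and-swap, only then inspect the lock state, and either back the sentinel out and defer, or issue an acquire fence that pairs with the unlock path's release ordering and proceed to free.

/* Hypothetical sketch of the vdropl_final() flow; C11 atomics, not the kernel API. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define	HOLD_NO_SMR	0x80000000u	/* stand-in for VHOLD_NO_SMR */

struct fake_vnode {
	_Atomic unsigned int holdcnt;
	_Atomic int lock_holders;	/* stand-in for the vnode lock state */
};

static void
drop_final(struct fake_vnode *vp)
{
	unsigned int exp = 0;

	/*
	 * 0 -> sentinel; seq_cst here plays the role of the full barrier of
	 * atomic_cmpset_int, so the lock check below cannot be satisfied by
	 * a value read before the sentinel became visible.
	 */
	if (!atomic_compare_exchange_strong(&vp->holdcnt, &exp, HOLD_NO_SMR))
		return;		/* lost the race; someone else owns teardown */

	if (atomic_load(&vp->lock_holders) != 0) {
		/* Back out: sentinel -> 1, then defer the final drop. */
		exp = HOLD_NO_SMR;
		if (!atomic_compare_exchange_strong(&vp->holdcnt, &exp, 1))
			abort();	/* nobody else may touch the count here */
		printf("lock still held, deferring\n");
		return;
	}

	/* Pairs with the release ordering of the unlock path. */
	atomic_thread_fence(memory_order_acquire);
	printf("freeing\n");
}

int
main(void)
{
	struct fake_vnode v = { 0, 1 };

	drop_final(&v);			/* lock held -> defers */
	atomic_store(&v.lock_holders, 0);
	atomic_store(&v.holdcnt, 0);
	drop_final(&v);			/* lock free -> frees */
	return (0);
}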