Page Menu · Home · FreeBSD

D52628.id162565.diff
No One · Temporary

D52628.id162565.diff

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -113,6 +113,7 @@
static bool vhold_recycle_free(struct vnode *);
static void vdropl_recycle(struct vnode *vp);
static void vdrop_recycle(struct vnode *vp);
+static void vdrop_doomed_process(void);
static void vfs_knllock(void *arg);
static void vfs_knlunlock(void *arg);
static void vfs_knl_assert_lock(void *arg, int what);
@@ -226,6 +227,10 @@
SYSCTL_ULONG(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD,
&deferred_inact, 0, "Number of times inactive processing was deferred");
+static u_long deferred_drop;
+SYSCTL_ULONG(_vfs, OID_AUTO, deferred_drop, CTLFLAG_RD,
+ &deferred_drop, 0, "Number of times drop was deferred");
+
/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;
@@ -318,6 +323,9 @@
static void vdbatch_dequeue(struct vnode *vp);
+struct mtx vnode_deferred_drop_mtx;
+struct vnodelst vnode_deferred_drop_list;
+
/*
* The syncer will require at least SYNCER_MAXDELAY iterations to shutdown;
* we probably don't want to pause for the whole second each time.
@@ -774,6 +782,8 @@
}
wantfreevnodes = desiredvnodes / 4;
mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
+ TAILQ_INIT(&vnode_deferred_drop_list);
+ mtx_init(&vnode_deferred_drop_mtx, "vnode_defdrop", NULL, MTX_DEF);
TAILQ_INIT(&vnode_list);
mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
/*
@@ -1784,6 +1794,8 @@
for (;;) {
kproc_suspend_check(vnlruproc);
+ vdrop_doomed_process();
+
if (force == 0 && vnlru_proc_light())
continue;
@@ -2217,6 +2229,7 @@
("dirty blk trie not empty"));
VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
("Leaked inactivation"));
+ VNPASS((vp->v_iflag & VI_DEFDROP) == 0, vp);
VI_UNLOCK(vp);
cache_assert_no_entries(vp);
@@ -3450,6 +3463,7 @@
struct mount *mp;
VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
+ VNPASS((vp->v_iflag & VI_DEFDROP) == 0, vp);
if ((vp->v_mflag & VMP_LAZYLIST) != 0)
return;
@@ -3644,26 +3658,26 @@
}
break;
}
- if (error == 0) {
- if (func == VUNREF) {
- VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
- ("recursive vunref"));
- vp->v_vflag |= VV_UNREF;
- }
- for (;;) {
- error = vinactive(vp);
- if (want_unlock)
- VOP_UNLOCK(vp);
- if (error != ERELOOKUP || !want_unlock)
- break;
- VOP_LOCK(vp, LK_EXCLUSIVE);
- }
- if (func == VUNREF)
- vp->v_vflag &= ~VV_UNREF;
- vdropl(vp);
- } else {
+ if (error != 0) {
vdefer_inactive(vp);
+ return;
+ }
+ if (func == VUNREF) {
+ VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
+ ("recursive vunref"));
+ vp->v_vflag |= VV_UNREF;
+ }
+ for (;;) {
+ error = vinactive(vp);
+ if (want_unlock)
+ VOP_UNLOCK(vp);
+ if (error != ERELOOKUP || !want_unlock)
+ break;
+ VOP_LOCK(vp, LK_EXCLUSIVE);
}
+ if (func == VUNREF)
+ vp->v_vflag &= ~VV_UNREF;
+ vdropl(vp);
return;
out:
if (func == VPUT)
@@ -3771,6 +3785,10 @@
*
* However, while this is more performant, it hinders debugging by eliminating
* the previously mentioned invariant.
+ *
+ * Note: there is a corner case where VHOLD_NO_SMR can transiently appear and
+ * disappear. This is fine as if vhold_smr fails, the caller is expected to try
+ * to reach the vnode in other ways, if at all possible. See vdropl_final.
*/
bool
vhold_smr(struct vnode *vp)
@@ -3960,6 +3978,92 @@
MPASS(vp->v_dbatchcpu == NOCPU);
}
+/*
+ * Deferred vdrop for doomed vnodes.
+ *
+ * This is *not* a general-purpose async vdrop mechanism.
+ *
+ * The only purpose is to facilitate the invariant that VOP_UNLOCK is safe to
+ * call *after* dropping the reference.
+ *
+ * It works around the following race:
+ *
+ * Suppose the vnode is doomed, v_usecount is 2, v_holdcnt is 1 (on account of
+ * v_usecount > 0) and both usecount holders are issuing vput while having a
+ * read lock on the vnode.
+ *
+ * thread1 thread2
+ * refcount_release usecount
+ * // v_usecount is now 1
+ * // will proceed to unlock
+ * refcount_release usecount
+ * // v_usecount is now 0
+ * // will proceed to vdrop
+ * VOP_UNLOCK
+ * refcount_release holdcnt
+ * // v_holdcnt is now 0
+ * // since the vnode is doomed
+ * // it will proceed to free it
+ * freevnode
+ * // goes ahead with unlock, but
+ * // the vnode is already freed
+ * VOP_UNLOCK
+ *
+ * In order to combat the problem we check if there are any lock holders
+ * still active and wait for them to exit. It might not be safe to do it
+ * in some of the places vdrop is called from, hence we defer this work.
+ */
+static void
+vdrop_doomed_defer(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ VNPASS(VN_IS_DOOMED(vp), vp);
+ VNPASS(vp->v_mount == NULL, vp);
+ VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
+ VNPASS((vp->v_iflag & VI_DEFDROP) == 0, vp);
+
+ vp->v_iflag |= VI_DEFDROP;
+ mtx_lock(&vnode_deferred_drop_mtx);
+ TAILQ_INSERT_TAIL(&vnode_deferred_drop_list, vp, v_lazylist);
+ deferred_drop++;
+ mtx_unlock(&vnode_deferred_drop_mtx);
+ VI_UNLOCK(vp);
+}
+
+/*
+ * Called by vnlru.
+ *
+ * If dooming ever gets reworked this should probably land in the syncer.
+ */
+static void __noinline
+vdrop_doomed_process(void)
+{
+ struct vnodelst drop_list;
+ struct vnode *vp, *vptmp;
+
+ if (TAILQ_EMPTY(&vnode_deferred_drop_list))
+ return;
+
+ TAILQ_INIT(&drop_list);
+ mtx_lock(&vnode_deferred_drop_mtx);
+ TAILQ_CONCAT(&drop_list, &vnode_deferred_drop_list, v_lazylist);
+ mtx_unlock(&vnode_deferred_drop_mtx);
+
+ TAILQ_FOREACH_SAFE(vp, &drop_list, v_lazylist, vptmp) {
+ VNPASS(VN_IS_DOOMED(vp), vp);
+ VNPASS((vp->v_iflag & VI_DEFDROP) != 0, vp);
+
+ VOP_LOCK(vp, LK_EXCLUSIVE | LK_RETRY);
+ VOP_UNLOCK(vp);
+
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_DEFDROP;
+ vdropl(vp);
+ /* unlocks the interlock */
+ }
+}
+
/*
* Drop the hold count of the vnode.
*
@@ -3975,6 +4079,7 @@
ASSERT_VI_LOCKED(vp, __func__);
VNPASS(VN_IS_DOOMED(vp), vp);
+ VNPASS((vp->v_iflag & VI_DEFDROP) == 0, vp);
/*
* Set the VHOLD_NO_SMR flag.
*
@@ -3990,6 +4095,27 @@
*/
return;
}
+
+ /*
+ * v_holdcnt is now VHOLD_NO_SMR and we hold the interlock.
+ *
+ * While nobody can legally gain a reference, we need to wait for any pending
+ * unlock operations to finish. See vdrop_doomed_defer for details.
+ *
+ * If someone holds the lock it may be we can't safely wait for them.
+ * Instead, we backpedal the above transition and defer the work.
+ *
+ * Note we depend on a full fence from atomic_cmpset_int.
+ */
+ if (__predict_false(VOP_ISLOCKED(vp))) {
+ if (!atomic_cmpset_int(&vp->v_holdcnt, VHOLD_NO_SMR, 1)) {
+ VNASSERT(0, vp, ("failed to transition holdcnt from VHOLD_NO_SMR to 1"));
+ panic("failed to transition holdcnt from VHOLD_NO_SMR to 1 on vnode %p", vp);
+ }
+ vdrop_doomed_defer(vp);
+ return;
+ }
+
/*
* Don't bump freevnodes as this one is going away.
*/
@@ -4651,8 +4777,10 @@
strlcat(buf, "|VI_DEFINACT", sizeof(buf));
if (vp->v_iflag & VI_FOPENING)
strlcat(buf, "|VI_FOPENING", sizeof(buf));
+ if (vp->v_iflag & VI_DEFDROP)
+ strlcat(buf, "|VI_DEFDROP", sizeof(buf));
flags = vp->v_iflag & ~(VI_MOUNT | VI_DOINGINACT |
- VI_OWEINACT | VI_DEFINACT | VI_FOPENING);
+ VI_OWEINACT | VI_DEFINACT | VI_FOPENING | VI_DEFDROP);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -268,6 +268,7 @@
#define VI_DEFINACT 0x0010 /* deferred inactive */
#define VI_FOPENING 0x0020 /* In open, with opening process having the
first right to advlock file */
+#define VI_DEFDROP 0x0040 /* deferred vdrop */
#define VV_ROOT 0x0001 /* root of its filesystem */
#define VV_ISTTY 0x0002 /* vnode represents a tty */

File Metadata

Mime Type
text/plain
Expires
Sat, Mar 28, 7:00 AM (10 h, 55 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
30462601
Default Alt Text
D52628.id162565.diff (7 KB)

Event Timeline