Index: sys/fs/nfsserver/nfs_nfsdport.c =================================================================== --- sys/fs/nfsserver/nfs_nfsdport.c +++ sys/fs/nfsserver/nfs_nfsdport.c @@ -3318,6 +3318,7 @@ nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED); TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist); TAILQ_INIT(&nfsv4root_mnt.mnt_activevnodelist); + TAILQ_INIT(&nfsv4root_mnt.mnt_dirtyvnodelist); nfsv4root_mnt.mnt_export = NULL; TAILQ_INIT(&nfsv4root_opt); TAILQ_INIT(&nfsv4root_newopt); @@ -3325,6 +3326,7 @@ nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt; nfsv4root_mnt.mnt_nvnodelistsize = 0; nfsv4root_mnt.mnt_activevnodelistsize = 0; + nfsv4root_mnt.mnt_dirtyvnodelistsize = 0; } /* Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -1230,6 +1230,8 @@ VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp, ("neg writecount increment %d", ap->a_inc)); vp->v_writecount += ap->a_inc; + if (vp->v_writecount > 0 && vp->v_mount != NULL) + vdirty(vp); error = 0; } VI_UNLOCK(vp); Index: sys/kern/vfs_mount.c =================================================================== --- sys/kern/vfs_mount.c +++ sys/kern/vfs_mount.c @@ -506,6 +506,8 @@ mp->mnt_activevnodelistsize = 0; TAILQ_INIT(&mp->mnt_tmpfreevnodelist); mp->mnt_tmpfreevnodelistsize = 0; + TAILQ_INIT(&mp->mnt_dirtyvnodelist); + mp->mnt_dirtyvnodelistsize = 0; if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 || mp->mnt_writeopcount != 0) panic("%s: non-zero counters on new mp %p\n", __func__, mp); @@ -575,6 +577,8 @@ panic("vfs_mount_destroy: nonzero activevnodelistsize"); if (mp->mnt_tmpfreevnodelistsize != 0) panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize"); + if (mp->mnt_dirtyvnodelistsize != 0) + panic("vfs_mount_destroy: nonzero dirtyvnodelistsize"); if (mp->mnt_lockref != 0) panic("vfs_mount_destroy: nonzero lock refcount"); MNT_IUNLOCK(mp); Index: sys/kern/vfs_subr.c 
=================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -1747,6 +1747,15 @@ mp->mnt_activevnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } + if (vp->v_mflag & VMP_DIRTYLIST) { + mtx_lock(&mp->mnt_listmtx); + if (vp->v_mflag & VMP_DIRTYLIST) { + vp->v_mflag &= ~VMP_DIRTYLIST; + TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, vp, v_dirtylist); + mp->mnt_dirtyvnodelistsize--; + } + mtx_unlock(&mp->mnt_listmtx); + } vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, @@ -2974,6 +2983,25 @@ return (vp->v_usecount); } +void +vdirty(struct vnode *vp) +{ + struct mount *mp; + + VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); + + if ((vp->v_mflag & VMP_DIRTYLIST) != 0) + return; + mp = vp->v_mount; + mtx_lock(&mp->mnt_listmtx); + if ((vp->v_mflag & VMP_DIRTYLIST) == 0) { + vp->v_mflag |= VMP_DIRTYLIST; + TAILQ_INSERT_TAIL(&mp->mnt_dirtyvnodelist, vp, v_dirtylist); + mp->mnt_dirtyvnodelistsize++; + } + mtx_unlock(&mp->mnt_listmtx); +} + static void vdefer_inactive(struct vnode *vp) { @@ -2987,6 +3015,7 @@ vdropl(vp); return; } + vdirty(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); counter_u64_add(deferred_inact, 1); @@ -3263,6 +3292,11 @@ ("vdrop: freeing when we shouldn't")); mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); + if (vp->v_mflag & VMP_DIRTYLIST) { + vp->v_mflag &= ~VMP_DIRTYLIST; + TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, vp, v_dirtylist); + mp->mnt_dirtyvnodelistsize--; + } if (vp->v_iflag & VI_ACTIVE) { vp->v_iflag &= ~VI_ACTIVE; TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist); @@ -3840,7 +3874,9 @@ } if (vp->v_mflag & VMP_TMPMNTFREELIST) strlcat(buf, "|VMP_TMPMNTFREELIST", sizeof(buf)); - flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST); + if (vp->v_mflag & VMP_DIRTYLIST) + strlcat(buf, "|VMP_DIRTYLIST", sizeof(buf)); + flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST | VMP_DIRTYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, 
buf2, sizeof(buf)); @@ -4060,6 +4096,8 @@ db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_activevnodelistsize = %d\n", mp->mnt_activevnodelistsize); + db_printf(" mnt_dirtyvnodelistsize = %d\n", + mp->mnt_dirtyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); @@ -4410,6 +4448,13 @@ vdefer_inactive_cond(vp); } +static int +vfs_periodic_inactive_filter(struct vnode *vp, void *arg) +{ + + return (vp->v_iflag & VI_DEFINACT); +} + static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { @@ -4420,7 +4465,7 @@ if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; - MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { + MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; @@ -4435,12 +4480,27 @@ { struct vm_object *obj; + /* + * This test may be performed without any locks held. + * We rely on vm_object's type stability. + */ if (vp->v_vflag & VV_NOSYNC) return (false); obj = vp->v_object; return (obj != NULL && vm_object_mightbedirty(obj)); } +static int +vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) +{ + + if (vp->v_vflag & VV_NOSYNC) + return (false); + if (vp->v_iflag & VI_DEFINACT) + return (true); + return (vfs_want_msync(vp)); +} + static void __noinline vfs_periodic_msync_inactive(struct mount *mp, int flags) { @@ -4460,7 +4520,7 @@ objflags = OBJPC_SYNC; } - MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { + MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { seen_defer = false; if (vp->v_iflag & VI_DEFINACT) { vp->v_iflag &= ~VI_DEFINACT; @@ -6174,3 +6234,212 @@ mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_active(mvp, mp); } + +/* + * These are helper functions for filesystems to traverse their + * dirty vnodes. 
See MNT_VNODE_FOREACH_DIRTY() in sys/mount.h + */ +static void +mnt_vnode_markerfree_dirty(struct vnode **mvp, struct mount *mp) +{ + + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); + + MNT_ILOCK(mp); + MNT_REL(mp); + MNT_IUNLOCK(mp); + free(*mvp, M_VNODE_MARKER); + *mvp = NULL; +} + +/* + * Relock the mp mount vnode list lock with the vp vnode interlock in the + * conventional lock order during mnt_vnode_next_dirty iteration. + * + * On entry, the mount vnode list lock is held and the vnode interlock is not. + * The list lock is dropped and reacquired. On success, both locks are held. + * On failure, the mount vnode list lock is held but the vnode interlock is + * not, and the procedure may have yielded. + */ +static bool +mnt_vnode_next_dirty_relock(struct vnode *mvp, struct mount *mp, + struct vnode *vp) +{ + const struct vnode *tmp; + bool held, ret; + + VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && + TAILQ_NEXT(mvp, v_dirtylist) != NULL, mvp, + ("%s: bad marker", __func__)); + VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, + ("%s: inappropriate vnode", __func__)); + ASSERT_VI_UNLOCKED(vp, __func__); + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + + ret = false; + + TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, mvp, v_dirtylist); + TAILQ_INSERT_BEFORE(vp, mvp, v_dirtylist); + + /* + * Use a hold to prevent vp from disappearing while the mount vnode + * list lock is dropped and reacquired. Normally a hold would be + * acquired with vhold(), but that might try to acquire the vnode + * interlock, which would be a LOR with the mount vnode list lock. + */ + held = refcount_acquire_if_not_zero(&vp->v_holdcnt); + mtx_unlock(&mp->mnt_listmtx); + if (!held) + goto abort; + VI_LOCK(vp); + if (!refcount_release_if_not_last(&vp->v_holdcnt)) { + vdropl(vp); + goto abort; + } + mtx_lock(&mp->mnt_listmtx); + + /* + * Determine whether the vnode is still the next one after the marker, + * excepting any other markers. 
If the vnode has not been doomed by + * vgone() then the hold should have ensured that it remained on the + * dirty list. If it has been doomed but is still on the dirty list, + * don't abort, but rather skip over it (avoid spinning on doomed + * vnodes). + */ + tmp = mvp; + do { + tmp = TAILQ_NEXT(tmp, v_dirtylist); + } while (tmp != NULL && tmp->v_type == VMARKER); + if (tmp != vp) { + mtx_unlock(&mp->mnt_listmtx); + VI_UNLOCK(vp); + goto abort; + } + + ret = true; + goto out; +abort: + maybe_yield(); + mtx_lock(&mp->mnt_listmtx); +out: + if (ret) + ASSERT_VI_LOCKED(vp, __func__); + else + ASSERT_VI_UNLOCKED(vp, __func__); + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + return (ret); +} + +static struct vnode * +mnt_vnode_next_dirty(struct vnode **mvp, struct mount *mp, mnt_dirty_cb_t *cb, + void *cbarg) +{ + struct vnode *vp, *nvp; + + mtx_assert(&mp->mnt_listmtx, MA_OWNED); + KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); +restart: + vp = TAILQ_NEXT(*mvp, v_dirtylist); + while (vp != NULL) { + if (vp->v_type == VMARKER) { + vp = TAILQ_NEXT(vp, v_dirtylist); + continue; + } + /* + * See if we want to process the vnode. Note we may encounter a + * long string of vnodes we don't care about and hog the list + * as a result. Check for it and requeue the marker. + */ + if (VN_IS_DOOMED(vp) || !cb(vp, cbarg)) { + if (!should_yield()) { + vp = TAILQ_NEXT(vp, v_dirtylist); + continue; + } + TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, *mvp, + v_dirtylist); + TAILQ_INSERT_AFTER(&mp->mnt_dirtyvnodelist, vp, *mvp, + v_dirtylist); + mtx_unlock(&mp->mnt_listmtx); + kern_yield(PRI_USER); + mtx_lock(&mp->mnt_listmtx); + goto restart; + } + /* + * Try-lock because this is the wrong lock order. If that does + * not succeed, drop the mount vnode list lock and try to + * reacquire it and the vnode interlock in the right order. 
+ */ + if (!VI_TRYLOCK(vp) && + !mnt_vnode_next_dirty_relock(*mvp, mp, vp)) + goto restart; + KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); + KASSERT(vp->v_mount == mp || vp->v_mount == NULL, + ("alien vnode on the dirty list %p %p", vp, mp)); + if (vp->v_mount == mp && !VN_IS_DOOMED(vp)) + break; + nvp = TAILQ_NEXT(vp, v_dirtylist); + VI_UNLOCK(vp); + vp = nvp; + } + TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, *mvp, v_dirtylist); + + /* Check if we are done */ + if (vp == NULL) { + mtx_unlock(&mp->mnt_listmtx); + mnt_vnode_markerfree_dirty(mvp, mp); + return (NULL); + } + TAILQ_INSERT_AFTER(&mp->mnt_dirtyvnodelist, vp, *mvp, v_dirtylist); + mtx_unlock(&mp->mnt_listmtx); + ASSERT_VI_LOCKED(vp, "dirty iter"); + KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp)); + return (vp); +} + +struct vnode * +__mnt_vnode_next_dirty(struct vnode **mvp, struct mount *mp, mnt_dirty_cb_t *cb, + void *cbarg) +{ + + if (should_yield()) + kern_yield(PRI_USER); + mtx_lock(&mp->mnt_listmtx); + return (mnt_vnode_next_dirty(mvp, mp, cb, cbarg)); +} + +struct vnode * +__mnt_vnode_first_dirty(struct vnode **mvp, struct mount *mp, mnt_dirty_cb_t *cb, + void *cbarg) +{ + struct vnode *vp; + + *mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); + MNT_ILOCK(mp); + MNT_REF(mp); + MNT_IUNLOCK(mp); + (*mvp)->v_type = VMARKER; + (*mvp)->v_mount = mp; + + mtx_lock(&mp->mnt_listmtx); + vp = TAILQ_FIRST(&mp->mnt_dirtyvnodelist); + if (vp == NULL) { + mtx_unlock(&mp->mnt_listmtx); + mnt_vnode_markerfree_dirty(mvp, mp); + return (NULL); + } + TAILQ_INSERT_BEFORE(vp, *mvp, v_dirtylist); + return (mnt_vnode_next_dirty(mvp, mp, cb, cbarg)); +} + +void +__mnt_vnode_markerfree_dirty(struct vnode **mvp, struct mount *mp) +{ + + if (*mvp == NULL) + return; + + mtx_lock(&mp->mnt_listmtx); + TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, *mvp, v_dirtylist); + mtx_unlock(&mp->mnt_listmtx); + mnt_vnode_markerfree_dirty(mvp, mp); +} Index: sys/sys/mount.h 
=================================================================== --- sys/sys/mount.h +++ sys/sys/mount.h @@ -223,6 +223,8 @@ int mnt_activevnodelistsize;/* (l) # of active vnodes */ struct vnodelst mnt_tmpfreevnodelist; /* (l) list of free vnodes */ int mnt_tmpfreevnodelistsize;/* (l) # of free vnodes */ + struct vnodelst mnt_dirtyvnodelist; /* (l) list of dirty vnodes */ + int mnt_dirtyvnodelistsize; /* (l) # of dirty vnodes */ struct lock mnt_explock; /* vfs_export walkers lock */ TAILQ_ENTRY(mount) mnt_upper_link; /* (m) we in the all uppers */ TAILQ_HEAD(, mount) mnt_uppers; /* (m) upper mounts over us*/ @@ -267,6 +269,24 @@ #define MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp) \ __mnt_vnode_markerfree_active(&(mvp), (mp)) +/* + * Definitions for MNT_VNODE_FOREACH_DIRTY. + */ +typedef int mnt_dirty_cb_t(struct vnode *, void *); +struct vnode *__mnt_vnode_next_dirty(struct vnode **mvp, struct mount *mp, + mnt_dirty_cb_t *cb, void *cbarg); +struct vnode *__mnt_vnode_first_dirty(struct vnode **mvp, struct mount *mp, + mnt_dirty_cb_t *cb, void *cbarg); +void __mnt_vnode_markerfree_dirty(struct vnode **mvp, struct mount *mp); + +#define MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, cb, cbarg) \ + for (vp = __mnt_vnode_first_dirty(&(mvp), (mp), (cb), (cbarg)); \ + (vp) != NULL; \ + vp = __mnt_vnode_next_dirty(&(mvp), (mp), (cb), (cbarg))) + +#define MNT_VNODE_FOREACH_DIRTY_ABORT(mp, mvp) \ + __mnt_vnode_markerfree_dirty(&(mvp), (mp)) + #define MNT_ILOCK(mp) mtx_lock(&(mp)->mnt_mtx) #define MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx) #define MNT_IUNLOCK(mp) mtx_unlock(&(mp)->mnt_mtx) Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -148,6 +148,7 @@ * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_actfreelist; /* l vnode active/free lists */ + TAILQ_ENTRY(vnode) v_dirtylist; /* l vnode dirty list */ struct bufobj v_bufobj; /* * Buffer cache object */ /* @@ -261,6 +262,7 @@ #define 
VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VMP_TMPMNTFREELIST 0x0001 /* Vnode is on mnt's tmp free list */ +#define VMP_DIRTYLIST 0x0002 /* Vnode is on mnt's dirty list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value @@ -654,6 +656,7 @@ struct ucred *cred, int *privused); void vattr_null(struct vattr *vap); int vcount(struct vnode *vp); +void vdirty(struct vnode *); void vdrop(struct vnode *); void vdropl(struct vnode *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);