Index: sys/fs/nfsserver/nfs_nfsdport.c
===================================================================
--- sys/fs/nfsserver/nfs_nfsdport.c
+++ sys/fs/nfsserver/nfs_nfsdport.c
@@ -3318,6 +3318,7 @@
 	nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
 	TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
 	TAILQ_INIT(&nfsv4root_mnt.mnt_activevnodelist);
+	TAILQ_INIT(&nfsv4root_mnt.mnt_dirtyvnodelist);
 	nfsv4root_mnt.mnt_export = NULL;
 	TAILQ_INIT(&nfsv4root_opt);
 	TAILQ_INIT(&nfsv4root_newopt);
@@ -3325,6 +3326,7 @@
 	nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
 	nfsv4root_mnt.mnt_nvnodelistsize = 0;
 	nfsv4root_mnt.mnt_activevnodelistsize = 0;
+	nfsv4root_mnt.mnt_dirtyvnodelistsize = 0;
 }
 
 /*
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -1230,6 +1230,8 @@
 		VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
 		    ("neg writecount increment %d", ap->a_inc));
 		vp->v_writecount += ap->a_inc;
+		if (vp->v_writecount > 0 && vp->v_mount != NULL)
+			vdirty(vp);
 		error = 0;
 	}
 	VI_UNLOCK(vp);
Index: sys/kern/vfs_mount.c
===================================================================
--- sys/kern/vfs_mount.c
+++ sys/kern/vfs_mount.c
@@ -506,6 +506,8 @@
 	mp->mnt_activevnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_tmpfreevnodelist);
 	mp->mnt_tmpfreevnodelistsize = 0;
+	TAILQ_INIT(&mp->mnt_dirtyvnodelist);
+	mp->mnt_dirtyvnodelistsize = 0;
 	if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 ||
 	    mp->mnt_writeopcount != 0)
 		panic("%s: non-zero counters on new mp %p\n", __func__, mp);
@@ -575,6 +577,8 @@
 		panic("vfs_mount_destroy: nonzero activevnodelistsize");
 	if (mp->mnt_tmpfreevnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize");
+	if (mp->mnt_dirtyvnodelistsize != 0)
+		panic("vfs_mount_destroy: nonzero dirtyvnodelistsize");
 	if (mp->mnt_lockref != 0)
 		panic("vfs_mount_destroy: nonzero lock refcount");
 	MNT_IUNLOCK(mp);
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -1784,6 +1784,15 @@
 		mp->mnt_activevnodelistsize--;
 		mtx_unlock(&mp->mnt_listmtx);
 	}
+	if (vp->v_mflag & VMP_DIRTYLIST) {
+		mtx_lock(&mp->mnt_listmtx);
+		if (vp->v_mflag & VMP_DIRTYLIST) {
+			vp->v_mflag &= ~VMP_DIRTYLIST;
+			TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, vp, v_dirtylist);
+			mp->mnt_dirtyvnodelistsize--;
+		}
+		mtx_unlock(&mp->mnt_listmtx);
+	}
 	vp->v_mount = NULL;
 	VI_UNLOCK(vp);
 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
@@ -3011,6 +3020,25 @@
 	return (vp->v_usecount);
 }
 
+void
+vdirty(struct vnode *vp)
+{
+	struct mount *mp;
+
+	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
+
+	if ((vp->v_mflag & VMP_DIRTYLIST) != 0)
+		return;
+	mp = vp->v_mount;
+	mtx_lock(&mp->mnt_listmtx);
+	if ((vp->v_mflag & VMP_DIRTYLIST) == 0) {
+		vp->v_mflag |= VMP_DIRTYLIST;
+		TAILQ_INSERT_TAIL(&mp->mnt_dirtyvnodelist, vp, v_dirtylist);
+		mp->mnt_dirtyvnodelistsize++;
+	}
+	mtx_unlock(&mp->mnt_listmtx);
+}
+
 static void
 vdefer_inactive(struct vnode *vp)
 {
@@ -3025,6 +3053,7 @@
 		vdropl(vp);
 		return;
 	}
+	vdirty(vp);
 	vp->v_iflag |= VI_DEFINACT;
 	VI_UNLOCK(vp);
 	counter_u64_add(deferred_inact, 1);
@@ -3301,6 +3330,11 @@
 	    ("vdrop: freeing when we shouldn't"));
 	mp = vp->v_mount;
 	mtx_lock(&mp->mnt_listmtx);
+	if (vp->v_mflag & VMP_DIRTYLIST) {
+		vp->v_mflag &= ~VMP_DIRTYLIST;
+		TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, vp, v_dirtylist);
+		mp->mnt_dirtyvnodelistsize--;
+	}
 	if (vp->v_iflag & VI_ACTIVE) {
		vp->v_iflag &= ~VI_ACTIVE;
 		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
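
The vdirty(), delmntque(), and vdropl() changes above all share one idiom: VMP_DIRTYLIST is tested without the list lock, then re-tested under mnt_listmtx before the list is touched. The unlocked test is safe because the caller holds the vnode, so it cannot be reclaimed underneath us, and a stale read merely costs one lock acquisition. Below is a minimal userspace sketch of the idiom; struct node, node_mark_dirty, and the pthread locking are hypothetical stand-ins for the vnode machinery, not the kernel code itself.

#include <pthread.h>
#include <sys/queue.h>

#define	NODE_ONDIRTY	0x1

struct node {
	int flags;			/* flag writes happen under list_lock */
	TAILQ_ENTRY(node) dirty_entry;	/* linkage protected by list_lock */
};

static TAILQ_HEAD(, node) dirty_list = TAILQ_HEAD_INITIALIZER(dirty_list);
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
node_mark_dirty(struct node *np)
{

	/* Unlocked read: if the flag is already set there is nothing to do. */
	if (np->flags & NODE_ONDIRTY)
		return;
	pthread_mutex_lock(&list_lock);
	/* Re-check: another thread may have won the race meanwhile. */
	if ((np->flags & NODE_ONDIRTY) == 0) {
		np->flags |= NODE_ONDIRTY;
		TAILQ_INSERT_TAIL(&dirty_list, np, dirty_entry);
	}
	pthread_mutex_unlock(&list_lock);
}

The same shape appears in reverse in delmntque() and vdropl() above, which re-check, clear the flag, and unlink under the same lock.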
@@ -3878,7 +3912,9 @@
 	}
 	if (vp->v_mflag & VMP_TMPMNTFREELIST)
 		strlcat(buf, "|VMP_TMPMNTFREELIST", sizeof(buf));
-	flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST);
+	if (vp->v_mflag & VMP_DIRTYLIST)
+		strlcat(buf, "|VMP_DIRTYLIST", sizeof(buf));
+	flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST | VMP_DIRTYLIST);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
@@ -4098,6 +4134,8 @@
 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
 	db_printf("    mnt_activevnodelistsize = %d\n",
 	    mp->mnt_activevnodelistsize);
+	db_printf("    mnt_dirtyvnodelistsize = %d\n",
+	    mp->mnt_dirtyvnodelistsize);
 	db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
@@ -4449,6 +4487,13 @@
 	vdefer_inactive_cond(vp);
 }
 
+static int
+vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
+{
+
+	return (vp->v_iflag & VI_DEFINACT);
+}
+
 static void __noinline
 vfs_periodic_inactive(struct mount *mp, int flags)
 {
@@ -4459,7 +4504,7 @@
 	if (flags != MNT_WAIT)
 		lkflags |= LK_NOWAIT;
 
-	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
+	MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
 		if ((vp->v_iflag & VI_DEFINACT) == 0) {
 			VI_UNLOCK(vp);
 			continue;
@@ -4474,12 +4519,27 @@
 {
 	struct vm_object *obj;
 
+	/*
+	 * This test may be performed without any locks held.
+	 * We rely on vm_object's type stability.
+	 */
 	if (vp->v_vflag & VV_NOSYNC)
 		return (false);
 	obj = vp->v_object;
 	return (obj != NULL && vm_object_mightbedirty(obj));
 }
 
+static int
+vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
+{
+
+	if (vp->v_vflag & VV_NOSYNC)
+		return (false);
+	if (vp->v_iflag & VI_DEFINACT)
+		return (true);
+	return (vfs_want_msync(vp));
+}
+
 static void __noinline
 vfs_periodic_msync_inactive(struct mount *mp, int flags)
 {
@@ -4499,7 +4559,7 @@
 		objflags = OBJPC_SYNC;
 	}
 
-	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
+	MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
 		seen_defer = false;
 		if (vp->v_iflag & VI_DEFINACT) {
 			vp->v_iflag &= ~VI_DEFINACT;
@@ -6026,10 +6086,10 @@
 
 /*
  * These are helper functions for filesystems to traverse their
- * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
+ * dirty vnodes.  See MNT_VNODE_FOREACH_DIRTY() in sys/mount.h
  */
 static void
-mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
+mnt_vnode_markerfree_dirty(struct vnode **mvp, struct mount *mp)
 {
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
@@ -6037,13 +6097,13 @@
 	MNT_ILOCK(mp);
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
-	vn_free_marker(*mvp);
+	free(*mvp, M_VNODE_MARKER);
 	*mvp = NULL;
 }
 
 /*
  * Relock the mp mount vnode list lock with the vp vnode interlock in the
- * conventional lock order during mnt_vnode_next_active iteration.
+ * conventional lock order during mnt_vnode_next_dirty iteration.
  *
 * On entry, the mount vnode list lock is held and the vnode interlock is not.
 * The list lock is dropped and reacquired.  On success, both locks are held.
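
Both the old active-list walk and the new dirty-list walk keep their place with a marker: a dummy VMARKER vnode planted in the list, so mnt_listmtx can be dropped and retaken mid-scan without losing the position, and markers left by concurrent walkers are simply stepped over. A stripped-down sketch of the technique under hypothetical userspace names (struct node, scan_advance), not the kernel code itself:

#include <stdbool.h>
#include <sys/queue.h>

struct node {
	bool is_marker;
	TAILQ_ENTRY(node) entry;
};
TAILQ_HEAD(nodelist, node);

/*
 * Move the marker just past the next real node and return that node;
 * markers planted by other walkers are skipped.  The caller holds the
 * list lock and may drop it once the marker has been re-planted.
 */
static struct node *
scan_advance(struct nodelist *head, struct node *marker)
{
	struct node *np;

	np = TAILQ_NEXT(marker, entry);
	while (np != NULL && np->is_marker)
		np = TAILQ_NEXT(np, entry);
	TAILQ_REMOVE(head, marker, entry);
	if (np != NULL)
		TAILQ_INSERT_AFTER(head, np, marker, entry);
	return (np);
}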
@@ -6051,14 +6111,14 @@
  * On failure, the mount vnode list lock is held but the vnode interlock is
  * not, and the procedure may have yielded.
 */
 static bool
-mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp,
+mnt_vnode_next_dirty_relock(struct vnode *mvp, struct mount *mp,
     struct vnode *vp)
 {
 	const struct vnode *tmp;
 	bool held, ret;
 
 	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
-	    TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp,
+	    TAILQ_NEXT(mvp, v_dirtylist) != NULL, mvp,
 	    ("%s: bad marker", __func__));
 	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
 	    ("%s: inappropriate vnode", __func__));
@@ -6067,8 +6127,8 @@
 
 	ret = false;
 
-	TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist);
-	TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist);
+	TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, mvp, v_dirtylist);
+	TAILQ_INSERT_BEFORE(vp, mvp, v_dirtylist);
 
 	/*
 	 * Use a hold to prevent vp from disappearing while the mount vnode
@@ -6091,13 +6151,13 @@
 	 * Determine whether the vnode is still the next one after the marker,
 	 * excepting any other markers.  If the vnode has not been doomed by
 	 * vgone() then the hold should have ensured that it remained on the
-	 * active list.  If it has been doomed but is still on the active list,
+	 * dirty list.  If it has been doomed but is still on the dirty list,
 	 * don't abort, but rather skip over it (avoid spinning on doomed
 	 * vnodes).
 	 */
 	tmp = mvp;
 	do {
-		tmp = TAILQ_NEXT(tmp, v_actfreelist);
+		tmp = TAILQ_NEXT(tmp, v_dirtylist);
 	} while (tmp != NULL && tmp->v_type == VMARKER);
 	if (tmp != vp) {
 		mtx_unlock(&mp->mnt_listmtx);
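
mnt_vnode_next_dirty_relock() above deals with a lock-order inversion: the scan holds the list lock and then wants a vnode interlock, the reverse of the conventional interlock-then-list-lock order, so the fast path try-locks and the slow path drops the list lock, takes both locks in the documented order, and revalidates its position via the marker. A sketch of that shape using assumed pthread locks; lock_b_reversed is a hypothetical name, and unlike the kernel routine this simplified version cannot fail:

#include <pthread.h>
#include <stdbool.h>

static bool
lock_b_reversed(pthread_mutex_t *a, pthread_mutex_t *b)
{

	/* Fast path: opportunistic try-lock in the wrong order. */
	if (pthread_mutex_trylock(b) == 0)
		return (true);
	/* Slow path: drop A, then take both in the conventional order. */
	pthread_mutex_unlock(a);
	pthread_mutex_lock(b);
	pthread_mutex_lock(a);
	/*
	 * The caller must revalidate its state here: the world may have
	 * changed while neither lock was held.  The kernel code re-checks
	 * that the vnode is still the one right after the marker.
	 */
	return (true);
}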
@@ -6120,91 +6180,115 @@
 }
 
 static struct vnode *
-mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
+mnt_vnode_next_dirty(struct vnode **mvp, struct mount *mp, mnt_dirty_cb_t *cb,
+    void *cbarg)
 {
 	struct vnode *vp, *nvp;
 
 	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 restart:
-	vp = TAILQ_NEXT(*mvp, v_actfreelist);
+	vp = TAILQ_NEXT(*mvp, v_dirtylist);
 	while (vp != NULL) {
 		if (vp->v_type == VMARKER) {
-			vp = TAILQ_NEXT(vp, v_actfreelist);
+			vp = TAILQ_NEXT(vp, v_dirtylist);
 			continue;
 		}
+		/*
+		 * See if we want to process the vnode. Note we may encounter a
+		 * long string of vnodes we don't care about and hog the list
+		 * as a result. Check for it and requeue the marker.
+		 */
+		if (VN_IS_DOOMED(vp) || !cb(vp, cbarg)) {
+			if (!should_yield()) {
+				vp = TAILQ_NEXT(vp, v_dirtylist);
+				continue;
+			}
+			TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, *mvp,
+			    v_dirtylist);
+			TAILQ_INSERT_AFTER(&mp->mnt_dirtyvnodelist, vp, *mvp,
+			    v_dirtylist);
+			mtx_unlock(&mp->mnt_listmtx);
+			kern_yield(PRI_USER);
+			mtx_lock(&mp->mnt_listmtx);
+			goto restart;
+		}
 		/*
 		 * Try-lock because this is the wrong lock order.  If that does
 		 * not succeed, drop the mount vnode list lock and try to
 		 * reacquire it and the vnode interlock in the right order.
 		 */
 		if (!VI_TRYLOCK(vp) &&
-		    !mnt_vnode_next_active_relock(*mvp, mp, vp))
+		    !mnt_vnode_next_dirty_relock(*mvp, mp, vp))
 			goto restart;
 		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
 		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
-		    ("alien vnode on the active list %p %p", vp, mp));
+		    ("alien vnode on the dirty list %p %p", vp, mp));
 		if (vp->v_mount == mp && !VN_IS_DOOMED(vp))
 			break;
-		nvp = TAILQ_NEXT(vp, v_actfreelist);
+		nvp = TAILQ_NEXT(vp, v_dirtylist);
 		VI_UNLOCK(vp);
 		vp = nvp;
 	}
-	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
+	TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, *mvp, v_dirtylist);
 
 	/* Check if we are done */
 	if (vp == NULL) {
 		mtx_unlock(&mp->mnt_listmtx);
-		mnt_vnode_markerfree_active(mvp, mp);
+		mnt_vnode_markerfree_dirty(mvp, mp);
 		return (NULL);
 	}
-	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
+	TAILQ_INSERT_AFTER(&mp->mnt_dirtyvnodelist, vp, *mvp, v_dirtylist);
 	mtx_unlock(&mp->mnt_listmtx);
-	ASSERT_VI_LOCKED(vp, "active iter");
+	ASSERT_VI_LOCKED(vp, "dirty iter");
 	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
 	return (vp);
 }
 
 struct vnode *
-__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
+__mnt_vnode_next_dirty(struct vnode **mvp, struct mount *mp, mnt_dirty_cb_t *cb,
+    void *cbarg)
 {
 
 	if (should_yield())
 		kern_yield(PRI_USER);
 	mtx_lock(&mp->mnt_listmtx);
-	return (mnt_vnode_next_active(mvp, mp));
+	return (mnt_vnode_next_dirty(mvp, mp, cb, cbarg));
 }
 
 struct vnode *
-__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
+__mnt_vnode_first_dirty(struct vnode **mvp, struct mount *mp, mnt_dirty_cb_t *cb,
+    void *cbarg)
 {
 	struct vnode *vp;
 
-	*mvp = vn_alloc_marker(mp);
+	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
 	MNT_ILOCK(mp);
 	MNT_REF(mp);
 	MNT_IUNLOCK(mp);
+	(*mvp)->v_type = VMARKER;
+	(*mvp)->v_mount = mp;
 
 	mtx_lock(&mp->mnt_listmtx);
-	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
+	vp = TAILQ_FIRST(&mp->mnt_dirtyvnodelist);
 	if (vp == NULL) {
 		mtx_unlock(&mp->mnt_listmtx);
-		mnt_vnode_markerfree_active(mvp, mp);
+		mnt_vnode_markerfree_dirty(mvp, mp);
 		return (NULL);
 	}
-	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
-	return (mnt_vnode_next_active(mvp, mp));
+	TAILQ_INSERT_BEFORE(vp, *mvp, v_dirtylist);
+	return (mnt_vnode_next_dirty(mvp, mp, cb, cbarg));
 }
 
 void
-__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
+__mnt_vnode_markerfree_dirty(struct vnode **mvp, struct mount *mp)
 {
 
	if (*mvp == NULL)
 		return;
 
 	mtx_lock(&mp->mnt_listmtx);
-	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
+	TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, *mvp, v_dirtylist);
 	mtx_unlock(&mp->mnt_listmtx);
-	mnt_vnode_markerfree_active(mvp, mp);
+	mnt_vnode_markerfree_dirty(mvp, mp);
 }
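
One behavioral difference from the active-list walk is visible above: the callback lets mnt_vnode_next_dirty() reject vnodes without locking them, but a long run of rejected vnodes would pin mnt_listmtx, hence the should_yield() check that parks the marker and yields. The shape of that loop, sketched with hypothetical stand-ins (time_to_yield for should_yield()/kern_yield(PRI_USER), the wanted field for the mnt_dirty_cb_t callback):

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>
#include <sys/queue.h>

struct node {
	bool is_marker;
	bool wanted;		/* stand-in for the callback's verdict */
	TAILQ_ENTRY(node) entry;
};
TAILQ_HEAD(nodelist, node);

/* Hypothetical stand-in for the kernel's should_yield(). */
static bool
time_to_yield(void)
{

	return (false);
}

/*
 * Return the next wanted node after the marker, bounding how long the
 * list lock stays held across runs of unwanted nodes.
 */
static struct node *
scan_next_wanted(struct nodelist *head, struct node *marker,
    pthread_mutex_t *lock)
{
	struct node *np;

restart:
	np = TAILQ_NEXT(marker, entry);
	while (np != NULL && (np->is_marker || !np->wanted)) {
		if (!time_to_yield()) {
			np = TAILQ_NEXT(np, entry);
			continue;
		}
		/* Park the marker here, drop the lock, yield, resume. */
		TAILQ_REMOVE(head, marker, entry);
		TAILQ_INSERT_AFTER(head, np, marker, entry);
		pthread_mutex_unlock(lock);
		sched_yield();
		pthread_mutex_lock(lock);
		goto restart;
	}
	return (np);
}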
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h
+++ sys/sys/mount.h
@@ -223,6 +223,8 @@
 	int		mnt_activevnodelistsize;/* (l) # of active vnodes */
 	struct vnodelst	mnt_tmpfreevnodelist;	/* (l) list of free vnodes */
 	int		mnt_tmpfreevnodelistsize;/* (l) # of free vnodes */
+	struct vnodelst	mnt_dirtyvnodelist;	/* (l) list of dirty vnodes */
+	int		mnt_dirtyvnodelistsize;	/* (l) # of dirty vnodes */
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
 	TAILQ_ENTRY(mount) mnt_upper_link;	/* (m) we in the all uppers */
 	TAILQ_HEAD(, mount) mnt_uppers;		/* (m) upper mounts over us */
@@ -267,6 +269,24 @@
 #define MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp)				\
 	__mnt_vnode_markerfree_active(&(mvp), (mp))
 
+/*
+ * Definitions for MNT_VNODE_FOREACH_DIRTY.
+ */
+typedef int mnt_dirty_cb_t(struct vnode *, void *);
+struct vnode *__mnt_vnode_next_dirty(struct vnode **mvp, struct mount *mp,
+    mnt_dirty_cb_t *cb, void *cbarg);
+struct vnode *__mnt_vnode_first_dirty(struct vnode **mvp, struct mount *mp,
+    mnt_dirty_cb_t *cb, void *cbarg);
+void __mnt_vnode_markerfree_dirty(struct vnode **mvp, struct mount *mp);
+
+#define MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, cb, cbarg)			\
+	for (vp = __mnt_vnode_first_dirty(&(mvp), (mp), (cb), (cbarg));	\
+	    (vp) != NULL;						\
+	    vp = __mnt_vnode_next_dirty(&(mvp), (mp), (cb), (cbarg)))
+
+#define MNT_VNODE_FOREACH_DIRTY_ABORT(mp, mvp)				\
+	__mnt_vnode_markerfree_dirty(&(mvp), (mp))
+
 #define	MNT_ILOCK(mp)	mtx_lock(&(mp)->mnt_mtx)
 #define	MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx)
 #define	MNT_IUNLOCK(mp)	mtx_unlock(&(mp)->mnt_mtx)
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -148,6 +148,7 @@
 	 *	The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_actfreelist;	/* l vnode active/free lists */
+	TAILQ_ENTRY(vnode) v_dirtylist;		/* l vnode dirty list */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
@@ -260,6 +261,7 @@
 #define	VV_READLINK	0x2000	/* fdescfs linux vnode */
 
 #define	VMP_TMPMNTFREELIST	0x0001	/* Vnode is on mnt's tmp free list */
+#define	VMP_DIRTYLIST		0x0002	/* Vnode is on mnt's dirty list */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
@@ -653,6 +655,7 @@
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
+void	vdirty(struct vnode *);
 void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
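
For completeness, here is the consumption pattern the new interface expects, modeled on the vfs_periodic_inactive() conversion above; my_filter and my_scan are hypothetical names. The filter is invoked under mnt_listmtx without the vnode interlock, so it may only inspect flags; the iterator then returns each selected vnode with its interlock held, and every path through the loop body must drop it.

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/vnode.h>

static int
my_filter(struct vnode *vp, void *arg __unused)
{

	/* Runs under mnt_listmtx, without the vnode interlock. */
	return (vp->v_iflag & VI_DEFINACT);
}

static void
my_scan(struct mount *mp)
{
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, my_filter, NULL) {
		/* The iterator returns vp with the vnode interlock held. */
		if ((vp->v_iflag & VI_DEFINACT) == 0) {
			VI_UNLOCK(vp);
			continue;
		}
		/* ... process vp; drop the interlock on every path ... */
		VI_UNLOCK(vp);
	}
}

A consumer that leaves the loop early is expected to call MNT_VNODE_FOREACH_DIRTY_ABORT(mp, mvp), mirroring the existing _ACTIVE convention, so the marker is unlinked and freed.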