Index: sys/compat/linux/linux_stats.c
===================================================================
--- sys/compat/linux/linux_stats.c
+++ sys/compat/linux/linux_stats.c
@@ -666,7 +666,7 @@
 	if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 	    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 		save = curthread_pflags_set(TDP_SYNCIO);
-		vfs_msync(mp, MNT_NOWAIT);
+		vfs_periodic(mp, MNT_NOWAIT);
 		VFS_SYNC(mp, MNT_NOWAIT);
 		curthread_pflags_restore(save);
 		vn_finished_write(mp);
Index: sys/fs/nfsserver/nfs_nfsdport.c
===================================================================
--- sys/fs/nfsserver/nfs_nfsdport.c
+++ sys/fs/nfsserver/nfs_nfsdport.c
@@ -3319,6 +3319,7 @@
 	nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
 	TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
 	TAILQ_INIT(&nfsv4root_mnt.mnt_activevnodelist);
+	TAILQ_INIT(&nfsv4root_mnt.mnt_dirtyvnodelist);
 	nfsv4root_mnt.mnt_export = NULL;
 	TAILQ_INIT(&nfsv4root_opt);
 	TAILQ_INIT(&nfsv4root_newopt);
@@ -3326,6 +3327,7 @@
 	nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
 	nfsv4root_mnt.mnt_nvnodelistsize = 0;
 	nfsv4root_mnt.mnt_activevnodelistsize = 0;
+	nfsv4root_mnt.mnt_dirtyvnodelistsize = 0;
 }
 
 /*
Index: sys/geom/journal/g_journal.c
===================================================================
--- sys/geom/journal/g_journal.c
+++ sys/geom/journal/g_journal.c
@@ -2874,7 +2874,7 @@
 	save = curthread_pflags_set(TDP_SYNCIO);
 
 	GJ_TIMER_START(1, &bt);
-	vfs_msync(mp, MNT_NOWAIT);
+	vfs_periodic(mp, MNT_NOWAIT);
 	GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
 
 	GJ_TIMER_START(1, &bt);
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -1236,6 +1236,8 @@
 		VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp,
 		    ("neg writecount increment %d", ap->a_inc));
 		vp->v_writecount += ap->a_inc;
+		if (vp->v_writecount > 0 && vp->v_mount != NULL)
+			vdirty(vp);
 		error = 0;
 	}
 	VI_UNLOCK(vp);
Index: sys/kern/vfs_mount.c
===================================================================
--- sys/kern/vfs_mount.c
+++ sys/kern/vfs_mount.c
@@ -506,6 +506,8 @@
 	mp->mnt_activevnodelistsize = 0;
 	TAILQ_INIT(&mp->mnt_tmpfreevnodelist);
 	mp->mnt_tmpfreevnodelistsize = 0;
+	TAILQ_INIT(&mp->mnt_dirtyvnodelist);
+	mp->mnt_dirtyvnodelistsize = 0;
 	if (mp->mnt_ref != 0 || mp->mnt_lockref != 0 ||
 	    mp->mnt_writeopcount != 0)
 		panic("%s: non-zero counters on new mp %p\n", __func__, mp);
@@ -575,6 +577,8 @@
 		panic("vfs_mount_destroy: nonzero activevnodelistsize");
 	if (mp->mnt_tmpfreevnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero tmpfreevnodelistsize");
+	if (mp->mnt_dirtyvnodelistsize != 0)
+		panic("vfs_mount_destroy: nonzero dirtyvnodelistsize");
 	if (mp->mnt_lockref != 0)
 		panic("vfs_mount_destroy: nonzero lock refcount");
 	MNT_IUNLOCK(mp);
@@ -1692,7 +1696,7 @@
 	if (coveredvp != NULL)
 		vdrop(coveredvp);
 
-	vfs_msync(mp, MNT_WAIT);
+	vfs_periodic(mp, MNT_WAIT);
 	MNT_ILOCK(mp);
 	async_flag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -213,10 +213,9 @@
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW | CTLFLAG_STATS,
     &reassignbufcalls, 0, "Number of calls to reassignbuf");
 
-static counter_u64_t free_owe_inact;
-SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
-    "Number of times free vnodes kept on active list due to VFS "
-    "owing inactivation");
+static counter_u64_t deferred_inact;
+SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
+    "Number of times inactive processing was deferred");
 
 /* To keep more than one thread at a time from running vfs_getnewfsid */
 static struct mtx mntid_mtx;
@@ -581,7 +580,7 @@
 
 	vnodes_created = counter_u64_alloc(M_WAITOK);
 	recycles_count = counter_u64_alloc(M_WAITOK);
-	free_owe_inact = counter_u64_alloc(M_WAITOK);
+	deferred_inact = counter_u64_alloc(M_WAITOK);
 
 	/*
 	 * Initialize the filesystem syncer.
@@ -1748,6 +1747,15 @@
 		mp->mnt_activevnodelistsize--;
 		mtx_unlock(&mp->mnt_listmtx);
 	}
+	if (vp->v_mflag & VMP_DIRTYLIST) {
+		mtx_lock(&mp->mnt_listmtx);
+		if (vp->v_mflag & VMP_DIRTYLIST) {
+			vp->v_mflag &= ~VMP_DIRTYLIST;
+			TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, vp, v_dirtylist);
+			mp->mnt_dirtyvnodelistsize--;
+		}
+		mtx_unlock(&mp->mnt_listmtx);
+	}
 	vp->v_mount = NULL;
 	VI_UNLOCK(vp);
 	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
@@ -2975,6 +2983,42 @@
 	return (vp->v_usecount);
 }
 
+void
+vdirty(struct vnode *vp)
+{
+	struct mount *mp;
+
+	VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
+
+	if ((vp->v_mflag & VMP_DIRTYLIST) != 0)
+		return;
+	mp = vp->v_mount;
+	mtx_lock(&mp->mnt_listmtx);
+	if ((vp->v_mflag & VMP_DIRTYLIST) == 0) {
+		vp->v_mflag |= VMP_DIRTYLIST;
+		TAILQ_INSERT_TAIL(&mp->mnt_dirtyvnodelist, vp, v_dirtylist);
+		mp->mnt_dirtyvnodelistsize++;
+	}
+	mtx_unlock(&mp->mnt_listmtx);
+}
+
+static void
+vdefer_inactive(struct vnode *vp)
+{
+
+	ASSERT_VI_LOCKED(vp, __func__);
+	VNASSERT(vp->v_iflag & VI_OWEINACT, vp,
+	    ("%s: vnode without VI_OWEINACT", __func__));
+	if (vp->v_iflag & VI_DEFINACT) {
+		vdropl(vp);
+		return;
+	}
+	vdirty(vp);
+	vp->v_iflag |= VI_DEFINACT;
+	VI_UNLOCK(vp);
+	counter_u64_add(deferred_inact, 1);
+}
+
 enum vputx_op { VPUTX_VRELE, VPUTX_VPUT, VPUTX_VUNREF };
 
 /*
@@ -3064,8 +3108,12 @@
 		vinactive(vp, curthread);
 		if (func != VPUTX_VUNREF)
 			VOP_UNLOCK(vp, 0);
+		vdropl(vp);
+	} else if (vp->v_iflag & VI_OWEINACT) {
+		vdefer_inactive(vp);
+	} else {
+		vdropl(vp);
 	}
-	vdropl(vp);
 }
 
 /*
@@ -3220,28 +3268,32 @@
 	    ("vdrop: vnode already reclaimed."));
 	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
 	    ("vnode already free"));
+	VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
+	    ("vnode with VI_OWEINACT set"));
+	VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp,
+	    ("vnode with VI_DEFINACT set"));
 	VNASSERT(vp->v_holdcnt == 0, vp,
 	    ("vdrop: freeing when we shouldn't"));
-	if ((vp->v_iflag & VI_OWEINACT) == 0) {
-		mp = vp->v_mount;
-		mtx_lock(&mp->mnt_listmtx);
-		if (vp->v_iflag & VI_ACTIVE) {
-			vp->v_iflag &= ~VI_ACTIVE;
-			TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
-			mp->mnt_activevnodelistsize--;
-		}
-		TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
-		mp->mnt_tmpfreevnodelistsize++;
-		vp->v_iflag |= VI_FREE;
-		vp->v_mflag |= VMP_TMPMNTFREELIST;
-		VI_UNLOCK(vp);
-		if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch)
-			vnlru_return_batch_locked(mp);
-		mtx_unlock(&mp->mnt_listmtx);
-	} else {
-		VI_UNLOCK(vp);
-		counter_u64_add(free_owe_inact, 1);
+	mp = vp->v_mount;
+	mtx_lock(&mp->mnt_listmtx);
+	if (vp->v_mflag & VMP_DIRTYLIST) {
+		vp->v_mflag &= ~VMP_DIRTYLIST;
+		TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, vp, v_dirtylist);
+		mp->mnt_dirtyvnodelistsize--;
+	}
+	if (vp->v_iflag & VI_ACTIVE) {
+		vp->v_iflag &= ~VI_ACTIVE;
+		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
+		mp->mnt_activevnodelistsize--;
 	}
+	TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
+	mp->mnt_tmpfreevnodelistsize++;
+	vp->v_iflag |= VI_FREE;
+	vp->v_mflag |= VMP_TMPMNTFREELIST;
+	VI_UNLOCK(vp);
+	if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch)
+		vnlru_return_batch_locked(mp);
+	mtx_unlock(&mp->mnt_listmtx);
 }
 
 void
@@ -3592,7 +3644,16 @@
 	 */
 	active = vp->v_usecount > 0;
 	oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
-	VI_UNLOCK(vp);
+	/*
+	 * If we need to do inactive processing, VI_OWEINACT will be set.
+	 */
+	if (vp->v_iflag & VI_DEFINACT) {
+		vp->v_iflag &= ~VI_DEFINACT;
+		vdropl(vp);
+	} else {
+		VI_UNLOCK(vp);
+	}
+	VNASSERT(vp->v_holdcnt > 0, vp, ("no refs?"));
 	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
 
 	/*
@@ -3787,15 +3848,19 @@
 		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
 	if (vp->v_iflag & VI_OWEINACT)
 		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
+	if (vp->v_iflag & VI_DEFINACT)
+		strlcat(buf, "|VI_DEFINACT", sizeof(buf));
 	flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_FREE | VI_ACTIVE |
-	    VI_DOINGINACT | VI_OWEINACT);
+	    VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
 	}
 	if (vp->v_mflag & VMP_TMPMNTFREELIST)
 		strlcat(buf, "|VMP_TMPMNTFREELIST", sizeof(buf));
-	flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST);
+	if (vp->v_mflag & VMP_DIRTYLIST)
+		strlcat(buf, "|VMP_DIRTYLIST", sizeof(buf));
+	flags = vp->v_mflag & ~(VMP_TMPMNTFREELIST | VMP_DIRTYLIST);
 	if (flags != 0) {
 		snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
 		strlcat(buf, buf2, sizeof(buf));
@@ -4015,6 +4080,8 @@
 	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
 	db_printf("    mnt_activevnodelistsize = %d\n",
 	    mp->mnt_activevnodelistsize);
+	db_printf("    mnt_dirtyvnodelistsize = %d\n",
+	    mp->mnt_dirtyvnodelistsize);
 	db_printf("    mnt_writeopcount = %d (with %d in the struct)\n",
 	    vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
 	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
@@ -4345,48 +4412,143 @@
 		unmount_or_warn(rootdevmp);
 }
 
-/*
- * perform msync on all vnodes under a mount point
- * the mount point must be locked.
- */
-void
-vfs_msync(struct mount *mp, int flags)
+static void
+vfs_deferred_inactive(struct vnode *vp, int lkflags)
+{
+
+	ASSERT_VI_LOCKED(vp, __func__);
+	if ((vp->v_iflag & VI_OWEINACT) == 0) {
+		vdropl(vp);
+		return;
+	}
+	if (vn_lock(vp, lkflags) == 0) {
+		VI_LOCK(vp);
+		if ((vp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == VI_OWEINACT)
+			vinactive(vp, curthread);
+		VOP_UNLOCK(vp, 0);
+		vdropl(vp);
+		return;
+	}
+	VI_LOCK(vp);
+	if (VN_IS_DOOMED(vp) ||
+	    (vp->v_iflag & VI_DEFINACT) ||
+	    (vp->v_iflag & VI_OWEINACT) == 0) {
+		vdropl(vp);
+		return;
+	}
+	/*
+	 * Try again later.
+	 */
+	vp->v_iflag |= VI_DEFINACT;
+	VI_UNLOCK(vp);
+}
+
+static bool
+vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
+{
+
+	return (vp->v_iflag & VI_DEFINACT);
+}
+
+static void __noinline
+vfs_periodic_inactive(struct mount *mp, int flags)
 {
 	struct vnode *vp, *mvp;
+	int lkflags;
+
+	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
+	if (flags != MNT_WAIT)
+		lkflags |= LK_NOWAIT;
+
+	MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
+		if ((vp->v_iflag & VI_DEFINACT) == 0) {
+			VI_UNLOCK(vp);
+			continue;
+		}
+		vp->v_iflag &= ~VI_DEFINACT;
+		vfs_deferred_inactive(vp, lkflags);
+	}
+}
+
+static inline bool
+vfs_want_msync(struct vnode *vp)
+{
 	struct vm_object *obj;
 
-	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
+	/*
+	 * This test may be performed without any locks held.
+	 * We rely on vm_object's type stability.
+	 */
+	if (vp->v_vflag & VV_NOSYNC)
+		return (false);
+	obj = vp->v_object;
+	return (obj != NULL && vm_object_mightbedirty(obj));
+}
 
-	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
-		return;
+static bool
+vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
+{
 
-	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
-		obj = vp->v_object;
-		if (obj != NULL && vm_object_mightbedirty(obj) &&
-		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
-			if (!vget(vp,
-			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
-			    curthread)) {
-				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
-					vput(vp);
-					continue;
-				}
+	if (vp->v_vflag & VV_NOSYNC)
+		return (false);
+	if (vp->v_iflag & VI_DEFINACT)
+		return (true);
+	return (vfs_want_msync(vp));
+}
 
-				obj = vp->v_object;
-				if (obj != NULL) {
-					VM_OBJECT_WLOCK(obj);
-					vm_object_page_clean(obj, 0, 0,
-					    flags == MNT_WAIT ?
-					    OBJPC_SYNC : OBJPC_NOSYNC);
-					VM_OBJECT_WUNLOCK(obj);
-				}
-				vput(vp);
+static void __noinline
+vfs_periodic_msync_inactive(struct mount *mp, int flags)
+{
+	struct vnode *vp, *mvp;
+	struct vm_object *obj;
+	struct thread *td;
+	int lkflags;
+	bool seen_defer;
+
+	lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
+	if (flags != MNT_WAIT)
+		lkflags |= LK_NOWAIT;
+
+	td = curthread;
+	MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
+		seen_defer = false;
+		if (vp->v_iflag & VI_DEFINACT) {
+			vp->v_iflag &= ~VI_DEFINACT;
+			seen_defer = true;
+		}
+		if (!vfs_want_msync(vp)) {
+			if (seen_defer)
+				vfs_deferred_inactive(vp, lkflags);
+			else
+				VI_UNLOCK(vp);
+			continue;
+		}
+		if (vget(vp, lkflags, td) == 0) {
+			obj = vp->v_object;
+			if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
+				VM_OBJECT_WLOCK(obj);
+				vm_object_page_clean(obj, 0, 0,
+				    flags == MNT_WAIT ?
				    OBJPC_SYNC : OBJPC_NOSYNC);
+				VM_OBJECT_WUNLOCK(obj);
 			}
-		} else
-			VI_UNLOCK(vp);
+			vput(vp);
+		}
+		if (seen_defer)
+			vdrop(vp);
 	}
 }
 
+void
+vfs_periodic(struct mount *mp, int flags)
+{
+
+	if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
+		vfs_periodic_inactive(mp, flags);
+	else
+		vfs_periodic_msync_inactive(mp, flags);
+}
+
 static void
 destroy_vpollinfo_free(struct vpollinfo *vi)
 {
@@ -4597,7 +4759,7 @@
 	 * batch.  Return them instead of letting them stay there indefinitely.
 	 */
 	vnlru_return_batch(mp);
-	vfs_msync(mp, MNT_NOWAIT);
+	vfs_periodic(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY);
 	curthread_pflags_restore(save);
 	vn_finished_write(mp);
@@ -6064,3 +6226,212 @@
 	mtx_unlock(&mp->mnt_listmtx);
 	mnt_vnode_markerfree_active(mvp, mp);
 }
+
+/*
+ * These are helper functions for filesystems to traverse their
+ * dirty vnodes. See MNT_VNODE_FOREACH_DIRTY() in sys/mount.h
+ */
+static void
+mnt_vnode_markerfree_dirty(struct vnode **mvp, struct mount *mp)
+{
+
+	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+
+	MNT_ILOCK(mp);
+	MNT_REL(mp);
+	MNT_IUNLOCK(mp);
+	free(*mvp, M_VNODE_MARKER);
+	*mvp = NULL;
+}
+
+/*
+ * Relock the mp mount vnode list lock with the vp vnode interlock in the
+ * conventional lock order during mnt_vnode_next_dirty iteration.
+ *
+ * On entry, the mount vnode list lock is held and the vnode interlock is not.
+ * The list lock is dropped and reacquired.  On success, both locks are held.
+ * On failure, the mount vnode list lock is held but the vnode interlock is
+ * not, and the procedure may have yielded.
+ */
+static bool
+mnt_vnode_next_dirty_relock(struct vnode *mvp, struct mount *mp,
+    struct vnode *vp)
+{
+	const struct vnode *tmp;
+	bool held, ret;
+
+	VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
+	    TAILQ_NEXT(mvp, v_dirtylist) != NULL, mvp,
+	    ("%s: bad marker", __func__));
+	VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
+	    ("%s: inappropriate vnode", __func__));
+	ASSERT_VI_UNLOCKED(vp, __func__);
+	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+
+	ret = false;
+
+	TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, mvp, v_dirtylist);
+	TAILQ_INSERT_BEFORE(vp, mvp, v_dirtylist);
+
+	/*
+	 * Use a hold to prevent vp from disappearing while the mount vnode
+	 * list lock is dropped and reacquired.  Normally a hold would be
+	 * acquired with vhold(), but that might try to acquire the vnode
+	 * interlock, which would be a LOR with the mount vnode list lock.
+	 */
+	held = refcount_acquire_if_not_zero(&vp->v_holdcnt);
+	mtx_unlock(&mp->mnt_listmtx);
+	if (!held)
+		goto abort;
+	VI_LOCK(vp);
+	if (!refcount_release_if_not_last(&vp->v_holdcnt)) {
+		vdropl(vp);
+		goto abort;
+	}
+	mtx_lock(&mp->mnt_listmtx);
+
+	/*
+	 * Determine whether the vnode is still the next one after the marker,
+	 * excepting any other markers.  If the vnode has not been doomed by
+	 * vgone() then the hold should have ensured that it remained on the
+	 * dirty list.  If it has been doomed but is still on the dirty list,
+	 * don't abort, but rather skip over it (avoid spinning on doomed
+	 * vnodes).
+	 */
+	tmp = mvp;
+	do {
+		tmp = TAILQ_NEXT(tmp, v_dirtylist);
+	} while (tmp != NULL && tmp->v_type == VMARKER);
+	if (tmp != vp) {
+		mtx_unlock(&mp->mnt_listmtx);
+		VI_UNLOCK(vp);
+		goto abort;
+	}
+
+	ret = true;
+	goto out;
+abort:
+	maybe_yield();
+	mtx_lock(&mp->mnt_listmtx);
+out:
+	if (ret)
+		ASSERT_VI_LOCKED(vp, __func__);
+	else
+		ASSERT_VI_UNLOCKED(vp, __func__);
+	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+	return (ret);
+}
+
+static struct vnode *
+mnt_vnode_next_dirty(struct vnode **mvp, struct mount *mp,
+    bool (*cb)(struct vnode *, void *), void *cbarg)
+{
+	struct vnode *vp, *nvp;
+
+	mtx_assert(&mp->mnt_listmtx, MA_OWNED);
+	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
+restart:
+	vp = TAILQ_NEXT(*mvp, v_dirtylist);
+	while (vp != NULL) {
+		if (vp->v_type == VMARKER) {
+			vp = TAILQ_NEXT(vp, v_dirtylist);
+			continue;
+		}
+		/*
+		 * See if we want to process the vnode. Note we may encounter a
+		 * long string of vnodes we don't care about and hog the list
+		 * as a result. Check for it and requeue the marker.
+		 */
+		if (VN_IS_DOOMED(vp) || !cb(vp, cbarg)) {
+			if (!should_yield()) {
+				vp = TAILQ_NEXT(vp, v_dirtylist);
+				continue;
+			}
+			TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, *mvp,
+			    v_dirtylist);
+			TAILQ_INSERT_AFTER(&mp->mnt_dirtyvnodelist, vp, *mvp,
+			    v_dirtylist);
+			mtx_unlock(&mp->mnt_listmtx);
+			kern_yield(PRI_USER);
+			mtx_lock(&mp->mnt_listmtx);
+			goto restart;
+		}
+		/*
+		 * Try-lock because this is the wrong lock order.  If that does
+		 * not succeed, drop the mount vnode list lock and try to
+		 * reacquire it and the vnode interlock in the right order.
+		 */
+		if (!VI_TRYLOCK(vp) &&
+		    !mnt_vnode_next_dirty_relock(*mvp, mp, vp))
+			goto restart;
+		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
+		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
+		    ("alien vnode on the dirty list %p %p", vp, mp));
+		if (vp->v_mount == mp && !VN_IS_DOOMED(vp))
+			break;
+		nvp = TAILQ_NEXT(vp, v_dirtylist);
+		VI_UNLOCK(vp);
+		vp = nvp;
+	}
+	TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, *mvp, v_dirtylist);
+
+	/* Check if we are done */
+	if (vp == NULL) {
+		mtx_unlock(&mp->mnt_listmtx);
+		mnt_vnode_markerfree_dirty(mvp, mp);
+		return (NULL);
+	}
+	TAILQ_INSERT_AFTER(&mp->mnt_dirtyvnodelist, vp, *mvp, v_dirtylist);
+	mtx_unlock(&mp->mnt_listmtx);
+	ASSERT_VI_LOCKED(vp, "dirty iter");
+	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
+	return (vp);
+}
+
+struct vnode *
+__mnt_vnode_next_dirty(struct vnode **mvp, struct mount *mp,
+    bool (*cb)(struct vnode *, void *), void *cbarg)
+{
+
+	if (should_yield())
+		kern_yield(PRI_USER);
+	mtx_lock(&mp->mnt_listmtx);
+	return (mnt_vnode_next_dirty(mvp, mp, cb, cbarg));
+}
+
+struct vnode *
+__mnt_vnode_first_dirty(struct vnode **mvp, struct mount *mp,
+    bool (*cb)(struct vnode *, void *), void *cbarg)
+{
+	struct vnode *vp;
+
+	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
+	MNT_ILOCK(mp);
+	MNT_REF(mp);
+	MNT_IUNLOCK(mp);
+	(*mvp)->v_type = VMARKER;
+	(*mvp)->v_mount = mp;
+
+	mtx_lock(&mp->mnt_listmtx);
+	vp = TAILQ_FIRST(&mp->mnt_dirtyvnodelist);
+	if (vp == NULL) {
+		mtx_unlock(&mp->mnt_listmtx);
+		mnt_vnode_markerfree_dirty(mvp, mp);
+		return (NULL);
+	}
+	TAILQ_INSERT_BEFORE(vp, *mvp, v_dirtylist);
+	return (mnt_vnode_next_dirty(mvp, mp, cb, cbarg));
+}
+
+void
+__mnt_vnode_markerfree_dirty(struct vnode **mvp, struct mount *mp)
+{
+
+	if (*mvp == NULL)
+		return;
+
+	mtx_lock(&mp->mnt_listmtx);
+	TAILQ_REMOVE(&mp->mnt_dirtyvnodelist, *mvp, v_dirtylist);
+	mtx_unlock(&mp->mnt_listmtx);
+	mnt_vnode_markerfree_dirty(mvp, mp);
+}
Index: sys/kern/vfs_syscalls.c
===================================================================
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -129,7 +129,7 @@
 	if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 	    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
 		save = curthread_pflags_set(TDP_SYNCIO);
-		vfs_msync(mp, MNT_NOWAIT);
+		vfs_periodic(mp, MNT_NOWAIT);
 		VFS_SYNC(mp, MNT_NOWAIT);
 		curthread_pflags_restore(save);
 		vn_finished_write(mp);
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h
+++ sys/sys/mount.h
@@ -223,6 +223,8 @@
 	int		mnt_activevnodelistsize;/* (l) # of active vnodes */
 	struct vnodelst	mnt_tmpfreevnodelist;	/* (l) list of free vnodes */
 	int		mnt_tmpfreevnodelistsize;/* (l) # of free vnodes */
+	struct vnodelst	mnt_dirtyvnodelist;	/* (l) list of dirty vnodes */
+	int		mnt_dirtyvnodelistsize;	/* (l) # of dirty vnodes */
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
 	TAILQ_ENTRY(mount) mnt_upper_link;	/* (m) we in the all uppers */
 	TAILQ_HEAD(, mount) mnt_uppers;		/* (m) upper mounts over us*/
@@ -267,6 +269,23 @@
 #define	MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp)				\
 	__mnt_vnode_markerfree_active(&(mvp), (mp))
 
+/*
+ * Definitions for MNT_VNODE_FOREACH_DIRTY.
+ */
+struct vnode *__mnt_vnode_next_dirty(struct vnode **mvp, struct mount *mp,
+    bool (*cb)(struct vnode *, void *), void *cbarg);
+struct vnode *__mnt_vnode_first_dirty(struct vnode **mvp, struct mount *mp,
+    bool (*cb)(struct vnode *, void *), void *cbarg);
+void          __mnt_vnode_markerfree_dirty(struct vnode **mvp, struct mount *);
+
+#define	MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, cb, cbarg)			\
+	for (vp = __mnt_vnode_first_dirty(&(mvp), (mp), (cb), (cbarg));	\
+	    (vp) != NULL;						\
+	    vp = __mnt_vnode_next_dirty(&(mvp), (mp), (cb), (cbarg)))
+
+#define	MNT_VNODE_FOREACH_DIRTY_ABORT(mp, mvp)				\
+	__mnt_vnode_markerfree_dirty(&(mvp), (mp))
+
 #define	MNT_ILOCK(mp)	mtx_lock(&(mp)->mnt_mtx)
 #define	MNT_ITRYLOCK(mp) mtx_trylock(&(mp)->mnt_mtx)
 #define	MNT_IUNLOCK(mp)	mtx_unlock(&(mp)->mnt_mtx)
@@ -396,7 +415,7 @@
 #define	MNTK_UNMOUNTF	0x00000001	/* forced unmount in progress */
 #define	MNTK_ASYNC	0x00000002	/* filtered async flag */
 #define	MNTK_SOFTDEP	0x00000004	/* async disabled by softdep */
-#define	MNTK_NOMSYNC	0x00000008	/* don't do vfs_msync */
+#define	MNTK_NOMSYNC	0x00000008	/* don't do msync */
 #define	MNTK_DRAINING	0x00000010	/* lock draining is happening */
 #define	MNTK_REFEXPIRE	0x00000020	/* refcount expiring is happening */
 #define	MNTK_EXTENDED_SHARED	0x00000040 /* Allow shared locking for more ops */
@@ -903,7 +922,7 @@
 		    const char *value);
 int	vfs_setpublicfs			    /* set publicly exported fs */
 	    (struct mount *, struct netexport *, struct export_args *);
-void	vfs_msync(struct mount *, int);
+void	vfs_periodic(struct mount *, int);
 int	vfs_busy(struct mount *, int);
 int	vfs_export			 /* process mount export info */
 	    (struct mount *, struct export_args *);
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -148,6 +148,7 @@
 	 * The machinery of being a vnode
 	 */
 	TAILQ_ENTRY(vnode) v_actfreelist;	/* l vnode active/free lists */
+	TAILQ_ENTRY(vnode) v_dirtylist;		/* l vnode dirty list */
 	struct bufobj	v_bufobj;		/* * Buffer cache object */
 
 	/*
@@ -243,6 +244,7 @@
 #define	VI_ACTIVE	0x0200	/* This vnode is on the active list */
 #define	VI_DOINGINACT	0x0800	/* VOP_INACTIVE is in progress */
 #define	VI_OWEINACT	0x1000	/* Need to call inactive */
+#define	VI_DEFINACT	0x2000	/* deferred inactive */
 
 #define	VV_ROOT		0x0001	/* root of its filesystem */
 #define	VV_ISTTY	0x0002	/* vnode represents a tty */
@@ -260,6 +262,7 @@
 #define	VV_READLINK	0x2000	/* fdescfs linux vnode */
 
 #define	VMP_TMPMNTFREELIST	0x0001	/* Vnode is on mnt's tmp free list */
+#define	VMP_DIRTYLIST		0x0002	/* Vnode is on mnt's dirty list */
 
 /*
  * Vnode attributes.  A field value of VNOVAL represents a field whose value
@@ -653,6 +656,7 @@
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
+void	vdirty(struct vnode *);
 void	vdrop(struct vnode *);
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
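
Usage note (not part of the patch): the sketch below shows how a filesystem-level sync routine could consume the new iterator. The examplefs_* names are hypothetical; the only interfaces assumed are MNT_VNODE_FOREACH_DIRTY(), vm_object_mightbedirty(), vget()/vput() and VOP_FSYNC() as they exist in the tree this diff applies to. The filter callback runs with the mount's vnode list lock held and must not sleep; the loop body receives each vnode with its interlock held, and vget() with LK_INTERLOCK consumes that interlock whether or not it succeeds.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_object.h>

/*
 * Hypothetical filter: only visit vnodes whose VM object may contain
 * dirty pages.  Runs under the mount vnode list lock; must not sleep.
 */
static bool
examplefs_sync_filter(struct vnode *vp, void *arg __unused)
{

	return (vp->v_object != NULL && vm_object_mightbedirty(vp->v_object));
}

/*
 * Hypothetical VFS_SYNC() helper that walks only the dirty vnode list
 * instead of every active vnode on the mount.
 */
static int
examplefs_sync(struct mount *mp, int waitfor)
{
	struct vnode *vp, *mvp;
	int allerror, error;

	allerror = 0;
	MNT_VNODE_FOREACH_DIRTY(vp, mp, mvp, examplefs_sync_filter, NULL) {
		/*
		 * vp is returned with its interlock held; vget() with
		 * LK_INTERLOCK releases it on both success and failure.
		 */
		error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
		    curthread);
		if (error != 0)
			continue;
		error = VOP_FSYNC(vp, waitfor, curthread);
		if (error != 0)
			allerror = error;
		vput(vp);
	}
	return (allerror);
}

A caller that breaks out of the loop early must call MNT_VNODE_FOREACH_DIRTY_ABORT(mp, mvp) to free the marker vnode, mirroring the existing _ACTIVE iterator; the sketch above always runs to completion, in which case the iterator frees the marker itself.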