Index: head/sys/kern/vfs_subr.c
===================================================================
--- head/sys/kern/vfs_subr.c
+++ head/sys/kern/vfs_subr.c
@@ -191,10 +191,11 @@
  * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
  * whenever vnlru_proc() becomes active.
  */
-static u_long wantfreevnodes;
-static u_long __exclusive_cache_line freevnodes;
+static long wantfreevnodes;
+static long __exclusive_cache_line freevnodes;
 SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
     &freevnodes, 0, "Number of \"free\" vnodes");
+static long freevnodes_old;
 
 static counter_u64_t recycles_count;
 SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
@@ -299,6 +300,7 @@
 #define VDBATCH_SIZE 8
 struct vdbatch {
 	u_int index;
+	long freevnodes;
 	struct mtx lock;
 	struct vnode *tab[VDBATCH_SIZE];
 };
@@ -323,6 +325,8 @@
 static u_long vstir;		/* nonzero to stir non-free vnodes */
 static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
 
+static u_long vnlru_read_freevnodes(void);
+
 /*
  * Note that no attempt is made to sanitize these parameters.
  */
@@ -1205,15 +1209,17 @@
 /*
  * Attempt to reduce the free list by the requested amount.
  */
-static void
+static int
 vnlru_free_locked(int count, struct vfsops *mnt_op)
 {
 	struct vnode *vp, *mvp;
 	struct mount *mp;
+	int ocount;
 
 	mtx_assert(&vnode_list_mtx, MA_OWNED);
 	if (count > max_vnlru_free)
 		count = max_vnlru_free;
+	ocount = count;
 	mvp = vnode_list_free_marker;
 restart:
 	vp = mvp;
@@ -1254,6 +1260,7 @@
 		mtx_lock(&vnode_list_mtx);
 		goto restart;
 	}
+	return (ocount - count);
 }
 
 void
@@ -1283,6 +1290,38 @@
 static struct proc *vnlruproc;
 static int vnlruproc_sig;
 
+/*
+ * The main freevnodes counter is only updated when threads requeue their vnode
+ * batches. CPUs are conditionally walked to compute a more accurate total.
+ *
+ * Limit how much of a slop are we willing to tolerate. Note: the actual value
+ * at any given moment can still exceed slop, but it should not be by significant
+ * margin in practice.
+ */
+#define VNLRU_FREEVNODES_SLOP 128
+
+static u_long
+vnlru_read_freevnodes(void)
+{
+	struct vdbatch *vd;
+	long slop;
+	int cpu;
+
+	mtx_assert(&vnode_list_mtx, MA_OWNED);
+	if (freevnodes > freevnodes_old)
+		slop = freevnodes - freevnodes_old;
+	else
+		slop = freevnodes_old - freevnodes;
+	if (slop < VNLRU_FREEVNODES_SLOP)
+		return (freevnodes >= 0 ? freevnodes : 0);
+	freevnodes_old = freevnodes;
+	CPU_FOREACH(cpu) {
+		vd = DPCPU_ID_PTR((cpu), vd);
+		freevnodes_old += vd->freevnodes;
+	}
+	return (freevnodes_old >= 0 ? freevnodes_old : 0);
+}
+
 static bool
 vnlru_under(u_long rnumvnodes, u_long limit)
 {
@@ -1293,6 +1332,23 @@
 
 	space = desiredvnodes - rnumvnodes;
 	if (space < limit) {
+		rfreevnodes = vnlru_read_freevnodes();
+		if (rfreevnodes > wantfreevnodes)
+			space += rfreevnodes - wantfreevnodes;
+	}
+	return (space < limit);
+}
+
+static bool
+vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
+{
+	long rfreevnodes, space;
+
+	if (__predict_false(rnumvnodes > desiredvnodes))
+		return (true);
+
+	space = desiredvnodes - rnumvnodes;
+	if (space < limit) {
 		rfreevnodes = atomic_load_long(&freevnodes);
 		if (rfreevnodes > wantfreevnodes)
 			space += rfreevnodes - wantfreevnodes;
@@ -1317,16 +1373,23 @@
 	u_long rnumvnodes, rfreevnodes, target;
 	unsigned long onumvnodes;
 	int done, force, trigger, usevnodes;
-	bool reclaim_nc_src;
+	bool reclaim_nc_src, want_reread;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
 	    SHUTDOWN_PRI_FIRST);
 
 	force = 0;
+	want_reread = false;
 	for (;;) {
 		kproc_suspend_check(vnlruproc);
 		mtx_lock(&vnode_list_mtx);
 		rnumvnodes = atomic_load_long(&numvnodes);
+
+		if (want_reread) {
+			force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
+			want_reread = false;
+		}
+
 		/*
 		 * If numvnodes is too large (due to desiredvnodes being
 		 * adjusted using its sysctl, or emergency growth), first
@@ -1354,7 +1417,7 @@
 			    PVFS|PDROP, "vlruwt", hz);
 			continue;
 		}
-		rfreevnodes = atomic_load_long(&freevnodes);
+		rfreevnodes = vnlru_read_freevnodes();
 
 		onumvnodes = rnumvnodes;
 		/*
@@ -1397,16 +1460,14 @@
 				force = 3;
 				continue;
 			}
+			want_reread = true;
 			force = 0;
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
-		} else
+		} else {
+			want_reread = true;
 			kern_yield(PRI_USER);
-		/*
-		 * After becoming active to expand above low water, keep
-		 * active until above high water.
-		 */
-		force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
+		}
 	}
 }
 
@@ -1510,7 +1571,7 @@
 		vn_alloc_cyclecount = 0;
 		goto alloc;
 	}
-	rfreevnodes = atomic_load_long(&freevnodes);
+	rfreevnodes = vnlru_read_freevnodes();
 	if (vn_alloc_cyclecount++ >= rfreevnodes) {
 		vn_alloc_cyclecount = 0;
 		vstir = 1;
@@ -1525,10 +1586,8 @@
 	 * should be chosen so that we never wait or even reclaim from
 	 * the free list to below its target minimum.
 	 */
-	if (rfreevnodes > 0) {
-		vnlru_free_locked(1, NULL);
+	if (vnlru_free_locked(1, NULL) > 0)
 		goto alloc;
-	}
 	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
 		/*
 		 * Wait for space for a new vnode.
@@ -1536,7 +1595,7 @@
 		vnlru_kick();
 		msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
 		if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
-		    atomic_load_long(&freevnodes) > 1)
+		    vnlru_read_freevnodes() > 1)
 			vnlru_free_locked(1, NULL);
 	}
 alloc:
@@ -1555,7 +1614,7 @@
 	if (__predict_false(vn_alloc_cyclecount != 0))
 		return (vn_alloc_hard(mp));
 	rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
-	if (__predict_false(vnlru_under(rnumvnodes, vlowat))) {
+	if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
 		atomic_subtract_long(&numvnodes, 1);
 		return (vn_alloc_hard(mp));
 	}
@@ -3177,13 +3236,17 @@
 static void
 vhold_activate(struct vnode *vp)
 {
+	struct vdbatch *vd;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT(vp->v_holdcnt == 0, vp,
 	    ("%s: wrong hold count", __func__));
 	VNASSERT(vp->v_op != NULL, vp,
 	    ("%s: vnode already reclaimed.", __func__));
-	atomic_subtract_long(&freevnodes, 1);
+	critical_enter();
+	vd = DPCPU_PTR(vd);
+	vd->freevnodes--;
+	critical_exit();
 	refcount_acquire(&vp->v_holdcnt);
 }
 
@@ -3233,9 +3296,12 @@
 	int i;
 
 	mtx_assert(&vd->lock, MA_OWNED);
+	MPASS(curthread->td_pinned > 0);
 	MPASS(vd->index == VDBATCH_SIZE);
 
 	mtx_lock(&vnode_list_mtx);
+	critical_enter();
+	freevnodes += vd->freevnodes;
 	for (i = 0; i < VDBATCH_SIZE; i++) {
 		vp = vd->tab[i];
 		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
@@ -3244,6 +3310,8 @@
 		vp->v_dbatchcpu = NOCPU;
 	}
 	mtx_unlock(&vnode_list_mtx);
+	critical_exit();
+	vd->freevnodes = 0;
 	bzero(vd->tab, sizeof(vd->tab));
 	vd->index = 0;
 }
@@ -3257,20 +3325,24 @@
 	VNASSERT(!VN_IS_DOOMED(vp), vp,
 	    ("%s: deferring requeue of a doomed vnode", __func__));
 
+	critical_enter();
+	vd = DPCPU_PTR(vd);
+	vd->freevnodes++;
 	if (vp->v_dbatchcpu != NOCPU) {
 		VI_UNLOCK(vp);
+		critical_exit();
 		return;
 	}
 
-	/*
-	 * A hack: pin us to the current CPU so that we know what to put in
-	 * ->v_dbatchcpu.
-	 */
 	sched_pin();
-	vd = DPCPU_PTR(vd);
+	critical_exit();
 	mtx_lock(&vd->lock);
 	MPASS(vd->index < VDBATCH_SIZE);
 	MPASS(vd->tab[vd->index] == NULL);
+	/*
+	 * A hack: we depend on being pinned so that we know what to put in
+	 * ->v_dbatchcpu.
+	 */
 	vp->v_dbatchcpu = curcpu;
 	vd->tab[vd->index] = vp;
 	vd->index++;
@@ -3355,7 +3427,6 @@
 		mp->mnt_lazyvnodelistsize--;
 		mtx_unlock(&mp->mnt_listmtx);
 	}
-	atomic_add_long(&freevnodes, 1);
 	vdbatch_enqueue(vp);
 }
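
A minimal userspace sketch of the counting scheme the patch introduces: each CPU accumulates freevnodes adjustments locally and only folds them into the global counter when it requeues a vnode batch, while readers walk the per-CPU buckets only once the cached global value has drifted by more than a fixed slop since the last full aggregation. The program below is illustrative only; NCPU, SLOP, pcpu_delta and the counter_*() helpers are stand-ins rather than kernel interfaces, and the serialization that vnode_list_mtx and the critical sections provide in the kernel is omitted.

#include <stdio.h>
#include <stdlib.h>

#define NCPU	8
#define SLOP	128

static long pcpu_delta[NCPU];	/* per-CPU pending adjustments */
static long global_count;	/* only updated when a batch is flushed */
static long global_old;		/* snapshot taken at the last full walk */

/* Fast path: a CPU adjusts only its local delta (vhold/vdrop analogue). */
static void
counter_add(int cpu, long n)
{

	pcpu_delta[cpu] += n;
}

/* Batch-requeue analogue: fold one CPU's delta into the global counter. */
static void
counter_flush(int cpu)
{

	global_count += pcpu_delta[cpu];
	pcpu_delta[cpu] = 0;
}

/*
 * Reader: if the global counter moved by less than SLOP since the last
 * full walk, trust it as-is; otherwise walk every per-CPU bucket, the
 * way vnlru_read_freevnodes() does in the patch.
 */
static long
counter_read(void)
{
	long slop;
	int cpu;

	slop = labs(global_count - global_old);
	if (slop < SLOP)
		return (global_count >= 0 ? global_count : 0);
	global_old = global_count;
	for (cpu = 0; cpu < NCPU; cpu++)
		global_old += pcpu_delta[cpu];
	return (global_old >= 0 ? global_old : 0);
}

int
main(void)
{

	counter_add(0, 200);	/* CPU 0 queues 200 newly freed vnodes */
	counter_add(1, -50);	/* CPU 1 reactivates 50 of its free vnodes */
	printf("before flush: %ld\n", counter_read());
	counter_flush(0);
	counter_flush(1);
	printf("after flush:  %ld\n", counter_read());
	return (0);
}

Run as-is it prints 0 before the flushes and 150 after; the stale pre-flush read mirrors the caveat in the patch comment that the cached value can temporarily be off by more than the slop until some CPU requeues a batch.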