diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
--- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
+++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
@@ -285,6 +285,12 @@
 #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
 
 extern int zfs_super_owner;
+extern uint64_t zfs_znode_count;
+extern uint64_t zfs_znode_inuse_count;
+extern wmsum_t zfs_znode_pruning_requested;
+extern wmsum_t zfs_znode_pruning_skipped;
+extern wmsum_t zfs_znode_pruning_withwaiter;
+extern wmsum_t zfs_znode_pruning_withwaiter_throttled;
 
 extern void zfs_init(void);
 extern void zfs_fini(void);
diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h
--- a/sys/contrib/openzfs/include/sys/arc.h
+++ b/sys/contrib/openzfs/include/sys/arc.h
@@ -321,6 +321,7 @@
 void arc_set_limits(uint64_t);
 void arc_init(void);
 void arc_fini(void);
+boolean_t arc_is_waiting_evict(void);
 
 /*
  * Level 2 ARC
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include
 
 extern struct vfsops zfs_vfsops;
 
@@ -146,18 +147,45 @@
 
 static eventhandler_tag arc_event_lowmem = NULL;
 
+/*
+ * The vm_lowmem event counters.
+ */
+wmsum_t zfs_arc_vm_lowmem_events;
+wmsum_t zfs_arc_vm_lowmem_kmem;
+wmsum_t zfs_arc_vm_lowmem_pages;
+wmsum_t zfs_arc_vm_lowmem_nofree;
+wmsum_t zfs_arc_vm_lowmem_pagedaemon;
+
 static void
-arc_lowmem(void *arg __unused, int howto __unused)
+arc_lowmem(void *arg __unused, int howto)
 {
     int64_t free_memory, to_free;
 
+    wmsum_add(&zfs_arc_vm_lowmem_events, 1);
+    switch (howto) {
+    case VM_LOW_KMEM:
+        wmsum_add(&zfs_arc_vm_lowmem_kmem, 1);
+        break;
+
+    case VM_LOW_PAGES:
+        wmsum_add(&zfs_arc_vm_lowmem_pages, 1);
+        break;
+
+    default:
+        break;
+    }
+    if (curproc == pageproc)
+        wmsum_add(&zfs_arc_vm_lowmem_pagedaemon, 1);
+
     arc_no_grow = B_TRUE;
     arc_warm = B_TRUE;
     arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
     free_memory = arc_available_memory();
     int64_t can_free = arc_c - arc_c_min;
-    if (can_free <= 0)
+    if (can_free <= 0) {
+        wmsum_add(&zfs_arc_vm_lowmem_nofree, 1);
         return;
+    }
     to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0);
     DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
     arc_reduce_target_size(to_free);
@@ -174,6 +202,11 @@
 void
 arc_lowmem_init(void)
 {
+    wmsum_init(&zfs_arc_vm_lowmem_events, 0);
+    wmsum_init(&zfs_arc_vm_lowmem_kmem, 0);
+    wmsum_init(&zfs_arc_vm_lowmem_pages, 0);
+    wmsum_init(&zfs_arc_vm_lowmem_nofree, 0);
+    wmsum_init(&zfs_arc_vm_lowmem_pagedaemon, 0);
     arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
         EVENTHANDLER_PRI_FIRST);
 }
@@ -183,6 +216,11 @@
 {
     if (arc_event_lowmem != NULL)
         EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+    wmsum_fini(&zfs_arc_vm_lowmem_events);
+    wmsum_fini(&zfs_arc_vm_lowmem_kmem);
+    wmsum_fini(&zfs_arc_vm_lowmem_pages);
+    wmsum_fini(&zfs_arc_vm_lowmem_nofree);
+    wmsum_fini(&zfs_arc_vm_lowmem_pagedaemon);
 }
 
 void
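A note on the counter type: wmsum_t on FreeBSD appears to be backed by
counter(9) per-CPU counters; the sysctl_os.c hunk below externs these same
symbols as counter_u64_t, which suggests exactly that equivalence. The
per-CPU design is what keeps wmsum_add() cheap enough for a low-memory
event handler. A minimal sketch of the counter(9) pattern, with
hypothetical demo_* names that are not part of this patch:

    #include <sys/param.h>
    #include <sys/counter.h>
    #include <sys/malloc.h>

    static counter_u64_t demo_events;

    static void
    demo_init(void)
    {
        /* Allocate the per-CPU slots; M_WAITOK may sleep. */
        demo_events = counter_u64_alloc(M_WAITOK);
    }

    static void
    demo_hot_path(void)
    {
        /* Lock-free per-CPU increment, safe on hot or low-memory paths. */
        counter_u64_add(demo_events, 1);
    }

    static uint64_t
    demo_read(void)
    {
        /* Sum all per-CPU slots; the cost lands on the reader only. */
        return (counter_u64_fetch(demo_events));
    }

    static void
    demo_fini(void)
    {
        counter_u64_free(demo_events);
    }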
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -121,6 +121,7 @@
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, znode, CTLFLAG_RW, 0, "ZFS znode");
 SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
     "ZFS livelist condense");
 
@@ -468,6 +469,35 @@
     "size of l2c_only state");
 /* END CSTYLED */
 
+/* arc_os.c */
+
+extern counter_u64_t zfs_arc_vm_lowmem_events;
+extern counter_u64_t zfs_arc_vm_lowmem_kmem;
+extern counter_u64_t zfs_arc_vm_lowmem_pages;
+extern counter_u64_t zfs_arc_vm_lowmem_nofree;
+extern counter_u64_t zfs_arc_vm_lowmem_pagedaemon;
+
+SYSCTL_NODE(_vfs_zfs_arc, OID_AUTO, vm_lowmem, CTLFLAG_RW, 0,
+    "vm_lowmem kernel events received by the ARC");
+
+/* BEGIN CSTYLED */
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, events,
+    CTLFLAG_RD, &zfs_arc_vm_lowmem_events,
+    "total vm_lowmem events");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, kmem,
+    CTLFLAG_RD, &zfs_arc_vm_lowmem_kmem,
+    "low kernel memory events");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, pages,
+    CTLFLAG_RD, &zfs_arc_vm_lowmem_pages,
+    "low page events");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, nofree,
+    CTLFLAG_RD, &zfs_arc_vm_lowmem_nofree,
+    "events where no ARC memory could be freed");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, pagedaemon,
+    CTLFLAG_RD, &zfs_arc_vm_lowmem_pagedaemon,
+    "events delivered in pagedaemon context");
+/* END CSTYLED */
+
 /* dbuf.c */
 
 /* dmu.c */
@@ -885,3 +915,43 @@
     CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
     "Exclude metadata buffers from dumps as well");
 /* END CSTYLED */
+
+/* zfs_vfsops.c */
+
+static int
+param_get_znode_prunable_count(SYSCTL_HANDLER_ARGS)
+{
+    int64_t val;
+    uint64_t count, inuse;
+
+    count = atomic_load_acq_64(&zfs_znode_count);
+    inuse = atomic_load_acq_64(&zfs_znode_inuse_count);
+
+    val = count - inuse;
+    return (sysctl_handle_64(oidp, &val, 0, req));
+}
+
+/* BEGIN CSTYLED */
+SYSCTL_UQUAD(_vfs_zfs_znode, OID_AUTO, count,
+    CTLFLAG_RD, &zfs_znode_count, 0,
+    "number of ZFS vnodes");
+SYSCTL_UQUAD(_vfs_zfs_znode, OID_AUTO, inuse,
+    CTLFLAG_RD, &zfs_znode_inuse_count, 0,
+    "number of ZFS vnodes in use");
+SYSCTL_PROC(_vfs_zfs_znode, OID_AUTO, prunable,
+    CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+    NULL, 0, param_get_znode_prunable_count, "Q",
+    "number of ARC-prunable ZFS vnodes");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_requested,
+    CTLFLAG_RD, &zfs_znode_pruning_requested,
+    "number of ARC pruning requests");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_skipped,
+    CTLFLAG_RD, &zfs_znode_pruning_skipped,
+    "number of skipped ARC pruning attempts");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_withwaiter,
+    CTLFLAG_RD, &zfs_znode_pruning_withwaiter,
+    "number of ARC pruning runs executed due to waiters");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_withwaiter_throttled,
+    CTLFLAG_RD, &zfs_znode_pruning_withwaiter_throttled,
+    "number of ARC pruning runs with waiters that were throttled");
+/* END CSTYLED */
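The new OIDs can be sampled from userland with sysctl(3). Below is a small
checker (hypothetical, not part of the patch) that recomputes the prunable
estimate the same way param_get_znode_prunable_count() does; the two
counters are read independently, so the difference is only an estimate:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    rd(const char *oid)
    {
        uint64_t v = 0;
        size_t len = sizeof(v);

        if (sysctlbyname(oid, &v, &len, NULL, 0) != 0)
            perror(oid);
        return (v);
    }

    int
    main(void)
    {
        uint64_t count = rd("vfs.zfs.znode.count");
        uint64_t inuse = rd("vfs.zfs.znode.inuse");

        /* Matches what vfs.zfs.znode.prunable reports, modulo races. */
        printf("znodes=%ju inuse=%ju prunable=%jd\n", (uintmax_t)count,
            (uintmax_t)inuse, (intmax_t)(int64_t)(count - inuse));
        return (0);
    }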
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -185,6 +185,11 @@
         return (error);
     }
 
+    /*
+     * Do not account for the ZFS sfs vnodes; such vnodes are not
+     * subject to ARC pruning.
+     */
+
     /*
      * Exclusively lock the vnode vnode while it's being constructed.
      */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -40,6 +40,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -167,6 +168,36 @@
  */
 static uint32_t zfs_active_fs_count = 0;
 
+/*
+ * The counts of all znodes and of the znodes in use (vp->v_usecount > 0).
+ * They are used to estimate the number of ARC-prunable [vz]nodes and
+ * dnodes.
+ */
+uint64_t zfs_znode_count;
+uint64_t zfs_znode_inuse_count;
+
+/*
+ * ARC pruning statistics.
+ *
+ * - zfs_znode_pruning_requested
+ *   The number of ARC pruning requests.
+ *
+ * - zfs_znode_pruning_skipped
+ *   The pruning attempts skipped because the prunable znodes do not
+ *   cover the requested scan size.
+ *
+ * - zfs_znode_pruning_withwaiter
+ *   The pruning attempts executed because at least one thread is
+ *   waiting for ARC eviction.
+ *
+ * - zfs_znode_pruning_withwaiter_throttled
+ *   The pruning attempts not boosted due to the rate limit.
+ */
+wmsum_t zfs_znode_pruning_requested;
+wmsum_t zfs_znode_pruning_skipped;
+wmsum_t zfs_znode_pruning_withwaiter;
+wmsum_t zfs_znode_pruning_withwaiter_throttled;
+
 int
 zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
     uint64_t *val, char *setpoint)
@@ -1208,6 +1239,9 @@
 #if defined(_KERNEL) && !defined(KMEM_DEBUG)
     vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
 #endif
+
+    vfsp->mnt_fsvninusep = &zfs_znode_inuse_count;
+
     /*
      * The fsid is 64 bits, composed of an 8-bit fs type, which
      * separates our fsid from any other filesystem types, and a
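mnt_fsvninusep is presumably a new struct mount member (its declaration is
not part of this diff) through which the generic vnode layer maintains the
filesystem's in-use count as use references appear and disappear. A
hypothetical sketch of that hand-off, using illustrative fs_mount and
fs_vnode types rather than the real FreeBSD structures:

    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdint.h>

    struct fs_mount {
        _Atomic uint64_t *inusep;    /* counterpart of mnt_fsvninusep */
    };

    struct fs_vnode {
        struct fs_mount *mp;
        _Atomic int usecount;
    };

    static void
    fs_vnode_ref(struct fs_vnode *vp)
    {
        /* The 0 -> 1 use transition marks the vnode as in use. */
        if (atomic_fetch_add(&vp->usecount, 1) == 0 &&
            vp->mp->inusep != NULL)
            atomic_fetch_add(vp->mp->inusep, 1);
    }

    static void
    fs_vnode_rele(struct fs_vnode *vp)
    {
        /* The 1 -> 0 use transition makes the vnode prunable again. */
        if (atomic_fetch_sub(&vp->usecount, 1) == 1 &&
            vp->mp->inusep != NULL)
            atomic_fetch_sub(vp->mp->inusep, 1);
    }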
@@ -2077,17 +2111,102 @@
 static arc_prune_t *zfs_prune;
 
 static void
-zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
+zfs_prune_task(uint64_t dn_to_scan, void *arg __unused)
 {
-    if (nr_to_scan > INT_MAX)
-        nr_to_scan = INT_MAX;
+    boolean_t update_ts_last_withwaiter;
+    int64_t zn_prunable, dn_total, zn_delta;
+    uint64_t zn_total, zn_inuse, zn_to_scan;
+    struct timespec ts_now, ts_delta;
+    static struct timespec ts_last_withwaiter;
+    static const struct timespec ts_pause_withwaiter =
+        {.tv_sec = 1, .tv_nsec = 0};
+
+    wmsum_add(&zfs_znode_pruning_requested, 1);
+
+    zn_total = atomic_load_acq_64(&zfs_znode_count);
+    zn_inuse = atomic_load_acq_64(&zfs_znode_inuse_count);
+
+    /*
+     * Work around in-use counter errors that can appear under heavy
+     * load.
+     *
+     * Fix the in-use counter only when the counters are stable, i.e.
+     * their values do not change across multiple reads.  Otherwise,
+     * defer the fix to the next chance.
+     */
+    if (__predict_false(zn_total < zn_inuse))
+        zn_delta = zn_inuse - zn_total;
+    else if (__predict_false(((int64_t)zn_inuse) < 0))
+        zn_delta = (int64_t)zn_inuse;
+    else
+        zn_delta = 0;
+
+    if (__predict_false(zn_delta != 0)) {
+        if (zn_total == atomic_load_64(&zfs_znode_count)) {
+            if (atomic_cmpset_64(&zfs_znode_inuse_count, zn_inuse,
+                zn_inuse - zn_delta)) {
+                if (__predict_false(zn_total !=
+                    atomic_load_64(&zfs_znode_count))) {
+                    atomic_add_64(&zfs_znode_inuse_count,
+                        zn_delta);
+                }
+            }
+        }
+    }
+
+    /* Estimate against the corrected in-use value. */
+    zn_prunable = zn_total - zn_inuse + zn_delta;
+
+    /*
+     * Scale the requested number of dnodes to scan into znodes by the
+     * ratio of the total znodes to the total dnodes.  A znode may span
+     * multiple dnodes, but a precise estimate of the span is both
+     * complicated and opaque to the znode and vnode layers.
+     *
+     * Assume that the numbers of znodes and dnodes fit within the
+     * 32-bit integer type, so that the product below does not overflow.
+     */
+    zn_to_scan = dn_to_scan * zn_total;
+    dn_total = wmsum_value(&arc_sums.arcstat_dnode_size) / sizeof(dnode_t);
+    zn_to_scan /= dn_total;
+
+    update_ts_last_withwaiter = B_FALSE;
+
+    if (arc_is_waiting_evict()) {
+        /*
+         * Someone is waiting for ARC eviction.  Prune everything
+         * unless there are no prunable vnodes at all.
+         *
+         * Limit the rate to at most 1Hz because this mode of
+         * eviction makes vnode allocation expensive.
+         */
+        wmsum_add(&zfs_znode_pruning_withwaiter, 1);
+        getnanotime(&ts_now);
+        timespecsub(&ts_now, &ts_last_withwaiter, &ts_delta);
+        if (timespeccmp(&ts_delta, &ts_pause_withwaiter, >=)) {
+            if (zn_prunable < zn_to_scan)
+                zn_to_scan = zn_prunable;
+            update_ts_last_withwaiter = B_TRUE;
+        } else
+            wmsum_add(&zfs_znode_pruning_withwaiter_throttled, 1);
+    }
+    if ((zn_prunable < zn_to_scan) || (zn_to_scan == 0)) {
+        wmsum_add(&zfs_znode_pruning_skipped, 1);
+        return;
+    }
+
+    if (zn_to_scan > INT_MAX)
+        zn_to_scan = INT_MAX;
+
+    if (zn_to_scan > 0) {
 #if __FreeBSD_version >= 1300139
-    sx_xlock(&zfs_vnlru_lock);
-    vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
-    sx_xunlock(&zfs_vnlru_lock);
+        sx_xlock(&zfs_vnlru_lock);
+        vnlru_free_vfsops(zn_to_scan, &zfs_vfsops, zfs_vnlru_marker);
+        sx_xunlock(&zfs_vnlru_lock);
 #else
-    vnlru_free(nr_to_scan, &zfs_vfsops);
+        vnlru_free(zn_to_scan, &zfs_vfsops);
 #endif
+    }
+
+    if (update_ts_last_withwaiter)
+        getnanotime(&ts_last_withwaiter);
 }
 
 void
@@ -2113,6 +2232,11 @@
      */
     zfs_vnodes_adjust();
 
+    wmsum_init(&zfs_znode_pruning_requested, 0);
+    wmsum_init(&zfs_znode_pruning_skipped, 0);
+    wmsum_init(&zfs_znode_pruning_withwaiter, 0);
+    wmsum_init(&zfs_znode_pruning_withwaiter_throttled, 0);
+
     dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
 
     zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
@@ -2133,6 +2257,11 @@
     sx_destroy(&zfs_vnlru_lock);
 #endif
 
+    wmsum_fini(&zfs_znode_pruning_requested);
+    wmsum_fini(&zfs_znode_pruning_skipped);
+    wmsum_fini(&zfs_znode_pruning_withwaiter);
+    wmsum_fini(&zfs_znode_pruning_withwaiter_throttled);
+
     taskq_destroy(zfsvfs_taskq);
 
     zfsctl_fini();
     zfs_znode_fini();
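The dnode-to-znode scaling above, restated as a worked example with a
hypothetical helper (not part of the patch): with 200,000 znodes, a dnode
footprint worth 600,000 dnodes, and a request to scan 3,000 dnodes, the
task scans 3,000 * 200,000 / 600,000 = 1,000 znodes. The intermediate
product is why the comment assumes 32-bit counts; two values near 2^32
would overflow the 64-bit product.

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    scale_dnodes_to_znodes(uint64_t dn_to_scan, uint64_t zn_total,
        uint64_t dn_total)
    {
        /* Guard the division; the kernel path assumes dn_total > 0. */
        if (dn_total == 0)
            return (0);
        return (dn_to_scan * zn_total / dn_total);
    }

    int
    main(void)
    {
        /* Prints 1000. */
        printf("%ju\n",
            (uintmax_t)scale_dnodes_to_znodes(3000, 200000, 600000));
        return (0);
    }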
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -82,6 +82,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -5178,6 +5179,7 @@
     ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
 
     vp->v_data = NULL;
+    atomic_subtract_rel_64(&zfs_znode_count, 1);
 
     return (0);
 }
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
@@ -50,6 +50,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #endif /* _KERNEL */
@@ -547,6 +548,12 @@
     if (vp->v_type != VFIFO)
         VN_LOCK_ASHARE(vp);
 
+    atomic_add_rel_64(&zfs_znode_count, 1);
+    /*
+     * Defer the increment of zfs_znode_inuse_count until vp gets
+     * inserted into mp.
+     */
+
     return (zp);
 }
 
@@ -827,6 +834,7 @@
         vp->v_vflag &= ~VV_FORCEINSMQ;
         (void) err;
         KASSERT(err == 0, ("insmntque() failed: error %d", err));
+        atomic_add_rel_64(&zfs_znode_inuse_count, 1);
     }
     kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
     ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
@@ -1056,6 +1064,7 @@
     if (err == 0) {
         vp->v_hash = obj_num;
         VOP_UNLOCK1(vp);
+        atomic_add_rel_64(&zfs_znode_inuse_count, 1);
     } else {
         zp->z_vnode = NULL;
         zfs_znode_dmu_fini(zp);
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -7765,6 +7765,18 @@
     ASSERT0(arc_loaned_bytes);
 }
 
+boolean_t
+arc_is_waiting_evict(void)
+{
+    boolean_t is_empty;
+
+    mutex_enter(&arc_evict_lock);
+    is_empty = list_is_empty(&arc_evict_waiters);
+    mutex_exit(&arc_evict_lock);
+
+    return (!is_empty);
+}
+
 /*
  * Level 2 ARC
 *
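arc_is_waiting_evict() above is the predicate that selects the boosted
pruning path in zfs_prune_task(), and that path is gated to at most once
per second. A minimal sketch of the gate using FreeBSD's three-argument
timespecsub()/timespeccmp() macros (hypothetical helper; the kernel code
obtains ts_now with getnanotime(9) and keeps ts_last in static storage):

    #include <sys/time.h>
    #include <stdbool.h>

    static bool
    once_per_second(struct timespec *ts_last, const struct timespec *ts_now)
    {
        const struct timespec ts_pause = { .tv_sec = 1, .tv_nsec = 0 };
        struct timespec ts_delta;

        timespecsub(ts_now, ts_last, &ts_delta);
        if (timespeccmp(&ts_delta, &ts_pause, >=)) {
            *ts_last = *ts_now;
            return (true);      /* Boost allowed. */
        }
        return (false);         /* Throttled. */
    }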
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1311,11 +1311,74 @@
 }
 
 static int max_free_per_call = 10000;
+static bool recycle_vnode_bufs_pages = true;
+static bool recycle_vnode_nc_src = true;
 SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call,
     0,
     "limit on vnode free requests per call to the vnlru_free routine (legacy)");
 SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW,
     &max_free_per_call, 0,
     "limit on vnode free requests per call to the vnlru_free routine");
+SYSCTL_BOOL(_vfs_vnode_vnlru, OID_AUTO, recycle_bufs_pages, CTLFLAG_RW,
+    &recycle_vnode_bufs_pages, 0,
+    "enable recycling vnodes with clean buffers and clean/dirty VM pages");
+SYSCTL_BOOL(_vfs_vnode_vnlru, OID_AUTO, recycle_nc_src, CTLFLAG_RW,
+    &recycle_vnode_nc_src, 0,
+    "enable recycling vnodes acting as a namecache source");
+
+/*
+ * Count the hold sources on a regular file vnode.
+ */
+static void
+vnlru_count_hold_sources_reg(struct vnode * restrict vp,
+    int * restrict vn_holdcnt,
+    int * restrict cleanbuf_holdcnt,
+    int * restrict dirtybuf_holdcnt,
+    int * restrict vmpage_holdcnt,
+    int * restrict unknown_holdcnt)
+{
+    struct vm_object *object;
+    struct bufobj *bo;
+
+    VNPASS(vp->v_type == VREG, vp);
+
+    *vn_holdcnt = atomic_load_int(&vp->v_holdcnt);
+
+    bo = &vp->v_bufobj;
+    *cleanbuf_holdcnt = atomic_load_int(&bo->bo_clean.bv_cnt);
+    *dirtybuf_holdcnt = atomic_load_int(&bo->bo_dirty.bv_cnt);
+
+    object = atomic_load_ptr(&vp->v_object);
+    if (object != NULL &&
+        object->type == OBJT_VNODE &&
+        object->resident_page_count > 0)
+        *vmpage_holdcnt = 1;
+    else
+        *vmpage_holdcnt = 0;
+
+    *unknown_holdcnt = *vn_holdcnt -
+        (*cleanbuf_holdcnt + *dirtybuf_holdcnt + *vmpage_holdcnt);
+}
+
+/*
+ * Count the hold sources on a directory vnode.
+ */
+static void
+vnlru_count_hold_sources_dir(struct vnode * restrict vp,
+    int * restrict vn_holdcnt,
+    int * restrict nc_src_holdcnt,
+    int * restrict unknown_holdcnt)
+{
+    VNPASS(vp->v_type == VDIR, vp);
+
+    *vn_holdcnt = atomic_load_int(&vp->v_holdcnt);
+
+    if (LIST_EMPTY(&vp->v_cache_src))
+        *nc_src_holdcnt = 0;
+    else
+        *nc_src_holdcnt = 1;
+
+    *unknown_holdcnt = *vn_holdcnt - *nc_src_holdcnt;
+}
 
 /*
  * Attempt to recycle requested amount of free vnodes.
 */
@@ -1325,8 +1388,9 @@
 {
     struct vnode *vp;
     struct mount *mp;
-    int ocount;
-    bool retried;
+    int ocount, vn_holdcnt, cleanbuf_holdcnt, dirtybuf_holdcnt,
+        vmpage_holdcnt, nc_src_holdcnt, unknown_holdcnt;
+    bool retried, *phase2_go_toggle, phase2_go;
 
     mtx_assert(&vnode_list_mtx, MA_OWNED);
     if (count > max_free_per_call)
@@ -1366,8 +1430,6 @@
         }
         if (__predict_false(vp->v_type == VMARKER))
             continue;
-        if (vp->v_holdcnt > 0)
-            continue;
         /*
          * Don't recycle if our vnode is from different type
          * of mount point.  Note that mp is type-safe, the
@@ -1378,9 +1440,71 @@
             mp->mnt_op != mnt_op) {
             continue;
         }
-        if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
+        if (vp->v_type == VBAD || __predict_false(vp->v_type == VNON)) {
             continue;
         }
+        vn_holdcnt = atomic_load_int(&vp->v_holdcnt);
+        if (vn_holdcnt > 0) {
+            phase2_go_toggle = NULL;
+            phase2_go = false;
+
+            switch (vp->v_type) {
+            case VREG:
+                phase2_go_toggle = &recycle_vnode_bufs_pages;
+
+                /*
+                 * Count the holds by the bufs and the VM pages
+                 * in the object, and compare them to the actual
+                 * hold count.
+                 */
+                vnlru_count_hold_sources_reg(vp,
+                    &vn_holdcnt,
+                    &cleanbuf_holdcnt,
+                    &dirtybuf_holdcnt,
+                    &vmpage_holdcnt,
+                    &unknown_holdcnt);
+
+                if ((cleanbuf_holdcnt == vn_holdcnt) &&
+                    (dirtybuf_holdcnt == 0) &&
+                    (vmpage_holdcnt == 0)) {
+                    phase2_go = true;
+                } else if (((cleanbuf_holdcnt +
+                    vmpage_holdcnt) == vn_holdcnt) &&
+                    (dirtybuf_holdcnt == 0)) {
+                    phase2_go = true;
+                }
+                break;
+
+            case VDIR:
+                phase2_go_toggle = &recycle_vnode_nc_src;
+
+                /*
+                 * Count the holds by the namecache entries
+                 * sourced at this vnode, and compare them to
+                 * the actual hold count.
+                 */
+                vnlru_count_hold_sources_dir(vp,
+                    &vn_holdcnt,
+                    &nc_src_holdcnt,
+                    &unknown_holdcnt);
+
+                if (nc_src_holdcnt == vn_holdcnt)
+                    phase2_go = true;
+                break;
+
+            default:
+                /*
+                 * NOP; the other vnode types are comparatively
+                 * rare.
+                 */
+                break;
+            }
+
+            if ((phase2_go_toggle == NULL) ||
+                !(*phase2_go_toggle) || !phase2_go)
+                continue;
+        }
         if (!vhold_recycle_free(vp))
             continue;
         TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
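The phase 2 heuristic for regular files, restated as a hypothetical helper
(not part of the patch): a held vnode remains recyclable when every hold
is explained by droppable state, i.e. clean buffers, optionally plus the
single hold of a VM object with resident pages, and nothing dirty or
unexplained remains.

    #include <stdbool.h>

    static bool
    reg_vnode_recyclable(int holdcnt, int cleanbufs, int dirtybufs,
        int vmpage_ref /* 0 or 1, as computed above */)
    {
        if (dirtybufs != 0)
            return (false);
        return (holdcnt == cleanbufs ||
            holdcnt == cleanbufs + vmpage_ref);
    }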
@@ -3753,7 +3877,9 @@
 static bool
 vhold_recycle_free(struct vnode *vp)
 {
-    int count;
+    int count, vn_holdcnt, cleanbuf_holdcnt, dirtybuf_holdcnt,
+        vmpage_holdcnt, nc_src_holdcnt, unknown_holdcnt;
+    bool *phase2_go_toggle, phase2_go;
 
     mtx_assert(&vnode_list_mtx, MA_OWNED);
 
@@ -3766,10 +3892,61 @@
     }
     VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
     if (count > 0) {
-        return (false);
+        /*
+         * Check the vnode holds again.  Refer to the phase 2 test
+         * in vnlru_free_impl() for details.
+         */
+        phase2_go_toggle = NULL;
+        phase2_go = false;
+
+        switch (vp->v_type) {
+        case VREG:
+            phase2_go_toggle = &recycle_vnode_bufs_pages;
+
+            vnlru_count_hold_sources_reg(vp,
+                &vn_holdcnt,
+                &cleanbuf_holdcnt,
+                &dirtybuf_holdcnt,
+                &vmpage_holdcnt,
+                &unknown_holdcnt);
+
+            if ((cleanbuf_holdcnt == vn_holdcnt) &&
+                (vmpage_holdcnt == 0) &&
+                (dirtybuf_holdcnt == 0)) {
+                phase2_go = true;
+            } else if (((cleanbuf_holdcnt +
+                vmpage_holdcnt) == vn_holdcnt) &&
+                (dirtybuf_holdcnt == 0)) {
+                phase2_go = true;
+            }
+            break;
+
+        case VDIR:
+            phase2_go_toggle = &recycle_vnode_nc_src;
+
+            vnlru_count_hold_sources_dir(vp,
+                &vn_holdcnt,
+                &nc_src_holdcnt,
+                &unknown_holdcnt);
+
+            if (nc_src_holdcnt == vn_holdcnt)
+                phase2_go = true;
+            break;
+
+        default:
+            return (false);
+        }
+
+        if ((phase2_go_toggle == NULL) ||
+            !(*phase2_go_toggle) || !phase2_go)
+            return (false);
     }
     if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
-        vfs_freevnodes_dec();
+        if (count == 0)
+            vfs_freevnodes_dec();
         return (true);
     }
 }
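Both heuristics default to enabled; either can be turned off at run time
through the new knobs. From userland (hypothetical snippet, equivalent to
"sysctl vfs.vnode.vnlru.recycle_bufs_pages=0" and the nc_src analogue):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdbool.h>
    #include <stdio.h>

    int
    main(void)
    {
        bool off = false;    /* SYSCTL_BOOL is backed by a 1-byte bool. */

        if (sysctlbyname("vfs.vnode.vnlru.recycle_bufs_pages", NULL, NULL,
            &off, sizeof(off)) != 0)
            perror("recycle_bufs_pages");
        if (sysctlbyname("vfs.vnode.vnlru.recycle_nc_src", NULL, NULL,
            &off, sizeof(off)) != 0)
            perror("recycle_nc_src");
        return (0);
    }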