Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F109640114
D44173.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
21 KB
Referenced Files
None
Subscribers
None
D44173.diff
View Options
diff --git a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
--- a/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
+++ b/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h
@@ -285,6 +285,12 @@
#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
extern int zfs_super_owner;
+extern uint64_t zfs_znode_count;
+extern uint64_t zfs_znode_inuse_count;
+extern wmsum_t zfs_znode_pruning_requested;
+extern wmsum_t zfs_znode_pruning_skipped;
+extern wmsum_t zfs_znode_pruning_withwaiter;
+extern wmsum_t zfs_znode_pruning_withwaiter_throttled;
extern void zfs_init(void);
extern void zfs_fini(void);
diff --git a/sys/contrib/openzfs/include/sys/arc.h b/sys/contrib/openzfs/include/sys/arc.h
--- a/sys/contrib/openzfs/include/sys/arc.h
+++ b/sys/contrib/openzfs/include/sys/arc.h
@@ -321,6 +321,7 @@
void arc_set_limits(uint64_t);
void arc_init(void);
void arc_fini(void);
+boolean_t arc_is_waiting_evict(void);
/*
* Level 2 ARC
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
@@ -51,6 +51,7 @@
#include <machine/vmparam.h>
#include <sys/vm.h>
#include <sys/vmmeter.h>
+#include <vm/vm_pageout.h>
extern struct vfsops zfs_vfsops;
@@ -146,18 +147,45 @@
static eventhandler_tag arc_event_lowmem = NULL;
+/*
+ * The vm_lowmem event counters.
+ */
+wmsum_t zfs_arc_vm_lowmem_events;
+wmsum_t zfs_arc_vm_lowmem_kmem;
+wmsum_t zfs_arc_vm_lowmem_pages;
+wmsum_t zfs_arc_vm_lowmem_nofree;
+wmsum_t zfs_arc_vm_lowmem_pagedaemon;
+
static void
-arc_lowmem(void *arg __unused, int howto __unused)
+arc_lowmem(void *arg __unused, int howto)
{
int64_t free_memory, to_free;
+ wmsum_add(&zfs_arc_vm_lowmem_events, 1);
+ switch (howto) {
+ case VM_LOW_KMEM:
+ wmsum_add(&zfs_arc_vm_lowmem_kmem, 1);
+ break;
+
+ case VM_LOW_PAGES:
+ wmsum_add(&zfs_arc_vm_lowmem_pages, 1);
+ break;
+
+ default:
+ break;
+ }
+ if (curproc == pageproc)
+ wmsum_add(&zfs_arc_vm_lowmem_pagedaemon, 1);
+
arc_no_grow = B_TRUE;
arc_warm = B_TRUE;
arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
free_memory = arc_available_memory();
int64_t can_free = arc_c - arc_c_min;
- if (can_free <= 0)
+ if (can_free <= 0) {
+ wmsum_add(&zfs_arc_vm_lowmem_nofree, 1);
return;
+ }
to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0);
DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
arc_reduce_target_size(to_free);
@@ -174,6 +202,11 @@
void
arc_lowmem_init(void)
{
+ wmsum_init(&zfs_arc_vm_lowmem_events, 0);
+ wmsum_init(&zfs_arc_vm_lowmem_kmem, 0);
+ wmsum_init(&zfs_arc_vm_lowmem_pages, 0);
+ wmsum_init(&zfs_arc_vm_lowmem_nofree, 0);
+ wmsum_init(&zfs_arc_vm_lowmem_pagedaemon, 0);
arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
EVENTHANDLER_PRI_FIRST);
}
@@ -183,6 +216,11 @@
{
if (arc_event_lowmem != NULL)
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+ wmsum_fini(&zfs_arc_vm_lowmem_events);
+ wmsum_fini(&zfs_arc_vm_lowmem_kmem);
+ wmsum_fini(&zfs_arc_vm_lowmem_pages);
+ wmsum_fini(&zfs_arc_vm_lowmem_nofree);
+ wmsum_fini(&zfs_arc_vm_lowmem_pagedaemon);
}
void
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -121,6 +121,7 @@
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL");
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, znode, CTLFLAG_RW, 0, "ZFS znode");
SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
"ZFS livelist condense");
@@ -468,6 +469,35 @@
"size of l2c_only state");
/* END CSTYLED */
+/* arc_os.c */
+
+extern counter_u64_t zfs_arc_vm_lowmem_events;
+extern counter_u64_t zfs_arc_vm_lowmem_kmem;
+extern counter_u64_t zfs_arc_vm_lowmem_pages;
+extern counter_u64_t zfs_arc_vm_lowmem_nofree;
+extern counter_u64_t zfs_arc_vm_lowmem_pagedaemon;
+
+SYSCTL_NODE(_vfs_zfs_arc, OID_AUTO, vm_lowmem, CTLFLAG_RW, 0,
+ "vm_lowmem kernel event received by ARC");
+
+/* BEGIN CSTYLED */
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, events,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_events,
+ "total vm_lowmem events");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, kmem,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_kmem,
+ "low kernel memory events");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, pages,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_pages,
+ "low page events");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, nofree,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_nofree,
+ "ARC memory not freed");
+SYSCTL_COUNTER_U64(_vfs_zfs_arc_vm_lowmem, OID_AUTO, pagedaemon,
+ CTLFLAG_RD, &zfs_arc_vm_lowmem_pagedaemon,
+ "calls by pagedaemon");
+/* END CSTYLED */
+
/* dbuf.c */
/* dmu.c */
@@ -885,3 +915,43 @@
CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
"Exclude metadata buffers from dumps as well");
/* END CSTYLED */
+
+/* zfs_vfsops.c */
+
+static int
+param_get_znode_prunable_count(SYSCTL_HANDLER_ARGS)
+{
+ int64_t val;
+ uint64_t count, inuse;
+
+ count = atomic_load_acq_64(&zfs_znode_count);
+ inuse = atomic_load_acq_64(&zfs_znode_inuse_count);
+
+ val = count - inuse;
+ return (sysctl_handle_64(oidp, &val, 0, req));
+}
+
+/* BEGIN CSTYLED */
+SYSCTL_UQUAD(_vfs_zfs_znode, OID_AUTO, count,
+ CTLFLAG_RD, &zfs_znode_count, 0,
+ "number of zfs vnodes");
+SYSCTL_UQUAD(_vfs_zfs_znode, OID_AUTO, inuse,
+ CTLFLAG_RD, &zfs_znode_inuse_count, 0,
+ "number of zfs vnodes in use");
+SYSCTL_PROC(_vfs_zfs_znode, OID_AUTO, prunable,
+ CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ NULL, 0, param_get_znode_prunable_count, "Q",
+ "number of ARC-prunable zfs vnodes");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_requested,
+ CTLFLAG_RD, &zfs_znode_pruning_requested,
+ "number of ARC pruning requests");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_skipped,
+ CTLFLAG_RD, &zfs_znode_pruning_skipped,
+ "number of ARC pruning skips");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_withwaiter,
+ CTLFLAG_RD, &zfs_znode_pruning_withwaiter,
+ "number of ARC pruning executed due to waiters");
+SYSCTL_COUNTER_U64(_vfs_zfs_znode, OID_AUTO, pruning_withwaiter_throttled,
+ CTLFLAG_RD, &zfs_znode_pruning_withwaiter_throttled,
+ "number of ARC pruning with waiters, throttled");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -185,6 +185,11 @@
return (error);
}
+ /*
+ * Do not account for the vnodes of the ZFS sfs; such vnodes are not
+ * subject to ARC pruning.
+ */
+
/*
* Exclusively lock the vnode vnode while it's being constructed.
*/
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -40,6 +40,7 @@
#include <sys/vfs.h>
#include <sys/mntent.h>
#include <sys/mount.h>
+#include <sys/arc_impl.h>
#include <sys/cmn_err.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_vnops.h>
@@ -167,6 +168,36 @@
*/
static uint32_t zfs_active_fs_count = 0;
+/*
+ * The counts of znodes and of those in use (vp->v_usecount > 0).
+ * They are used to estimate the number of the ARC-prunable [vz]nodes and
+ * dnodes.
+ */
+uint64_t zfs_znode_count;
+uint64_t zfs_znode_inuse_count;
+
+/*
+ * The stats of the ARC pruning.
+ *
+ * - zfs_znode_pruning_requested
+ * The requests of the ARC pruning.
+ *
+ * - zfs_znode_pruning_skipped
+ * The skipped ARC pruning attempts because the prunable znodes do not meet
+ * the requested size.
+ *
+ * - zfs_znode_pruning_withwaiter
+ * The ARC pruning attempts executed because there is at least one thread
+ * waiting for the ARC eviction.
+ *
+ * - zfs_znode_pruning_withwaiter_throttled
+ * The ARC pruning attempts not boosted due to the rate limit.
+ */
+wmsum_t zfs_znode_pruning_requested;
+wmsum_t zfs_znode_pruning_skipped;
+wmsum_t zfs_znode_pruning_withwaiter;
+wmsum_t zfs_znode_pruning_withwaiter_throttled;
+
int
zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
char *setpoint)
@@ -1208,6 +1239,9 @@
#if defined(_KERNEL) && !defined(KMEM_DEBUG)
vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
#endif
+
+ vfsp->mnt_fsvninusep = &zfs_znode_inuse_count;
+
/*
* The fsid is 64 bits, composed of an 8-bit fs type, which
* separates our fsid from any other filesystem types, and a
@@ -2077,17 +2111,102 @@
static arc_prune_t *zfs_prune;
static void
-zfs_prune_task(uint64_t nr_to_scan, void *arg __unused)
+zfs_prune_task(uint64_t dn_to_scan, void *arg __unused)
{
- if (nr_to_scan > INT_MAX)
- nr_to_scan = INT_MAX;
+ boolean_t update_ts_last_withwaiter;
+ int64_t zn_prunable, dn_total, zn_delta;
+ uint64_t zn_total, zn_inuse, zn_to_scan;
+ struct timespec ts_now, ts_delta;
+ static struct timespec ts_last_withwaiter;
+ static const struct timespec ts_pause_withwaiter =
+ {.tv_sec = 1, .tv_nsec = 0};
+
+ wmsum_add(&zfs_znode_pruning_requested, 1);
+
+ zn_total = atomic_load_acq_64(&zfs_znode_count);
+ zn_inuse = atomic_load_acq_64(&zfs_znode_inuse_count);
+
+ /*
+ * Work around the in-use counter error that may happen under a heavy load.
+ *
+ * Fix the in-use counter value only when the counters are stable, i.e.
+ * their values do not change across multiple reads. Otherwise, defer the
+ * fix to a later attempt.
+ */
+ if (__predict_false(zn_total < zn_inuse))
+ zn_delta = zn_inuse - zn_total;
+ else if (__predict_false(((int64_t)zn_inuse) < 0))
+ zn_delta = (int64_t)zn_inuse;
+ else
+ zn_delta = 0;
+
+ if (__predict_false(0 != zn_delta)) {
+ if (zn_total == atomic_load_64(&zfs_znode_count)) {
+ if (atomic_cmpset_64(&zfs_znode_inuse_count, zn_inuse,
+ zn_inuse - zn_delta)) {
+ if (__predict_false(
+ zn_total != atomic_load_64(&zfs_znode_count))) {
+ atomic_add_64(&zfs_znode_inuse_count, zn_delta);
+ }
+ }
+ }
+ }
+
+ zn_prunable = zn_total - zn_inuse - zn_delta;
+
+ /*
+ * Scale the number of prunable dnodes into znodes using the total
+ * counts of znodes and dnodes. A znode may span multiple dnodes, but
+ * a precise span estimate is both complicated and opaque to the znode
+ * and vnode sides.
+ *
+ * Assume that the numbers of the znodes and dnodes fit within the 32 bit
+ * integer type.
+ */
+ zn_to_scan = dn_to_scan * zn_total;
+ dn_total = wmsum_value(&arc_sums.arcstat_dnode_size) / sizeof(dnode_t);
+ zn_to_scan /= dn_total;
+
+ update_ts_last_withwaiter = B_FALSE;
+
+ if (arc_is_waiting_evict()) {
+ /*
+ * Someone wants the ARC eviction. Prune everything unless there are
+ * no prunable vnodes at all.
+ *
+ * Limit the rate to at most 1 Hz because this eviction makes vnode
+ * allocation very expensive.
+ */
+ wmsum_add(&zfs_znode_pruning_withwaiter, 1);
+ getnanotime(&ts_now);
+ timespecsub(&ts_now, &ts_last_withwaiter, &ts_delta);
+ if (timespeccmp(&ts_delta, &ts_pause_withwaiter, >=)) {
+ if (zn_prunable < zn_to_scan)
+ zn_to_scan = zn_prunable;
+ update_ts_last_withwaiter = B_TRUE;
+ } else
+ wmsum_add(&zfs_znode_pruning_withwaiter_throttled, 1);
+ }
+ if ((zn_prunable < zn_to_scan) || (0 == zn_to_scan)) {
+ wmsum_add(&zfs_znode_pruning_skipped, 1);
+ return;
+ }
+
+ if (zn_to_scan > INT_MAX)
+ zn_to_scan = INT_MAX;
+
+ if (zn_to_scan > 0) {
#if __FreeBSD_version >= 1300139
- sx_xlock(&zfs_vnlru_lock);
- vnlru_free_vfsops(nr_to_scan, &zfs_vfsops, zfs_vnlru_marker);
- sx_xunlock(&zfs_vnlru_lock);
+ sx_xlock(&zfs_vnlru_lock);
+ vnlru_free_vfsops(zn_to_scan, &zfs_vfsops, zfs_vnlru_marker);
+ sx_xunlock(&zfs_vnlru_lock);
#else
- vnlru_free(nr_to_scan, &zfs_vfsops);
+ vnlru_free(zn_to_scan, &zfs_vfsops);
#endif
+ }
+
+ if (update_ts_last_withwaiter)
+ getnanotime(&ts_last_withwaiter);
}
void
@@ -2113,6 +2232,11 @@
*/
zfs_vnodes_adjust();
+ wmsum_init(&zfs_znode_pruning_requested, 0);
+ wmsum_init(&zfs_znode_pruning_skipped, 0);
+ wmsum_init(&zfs_znode_pruning_withwaiter, 0);
+ wmsum_init(&zfs_znode_pruning_withwaiter_throttled, 0);
+
dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
@@ -2133,6 +2257,11 @@
sx_destroy(&zfs_vnlru_lock);
#endif
+ wmsum_fini(&zfs_znode_pruning_requested);
+ wmsum_fini(&zfs_znode_pruning_skipped);
+ wmsum_fini(&zfs_znode_pruning_withwaiter);
+ wmsum_fini(&zfs_znode_pruning_withwaiter_throttled);
+
taskq_destroy(zfsvfs_taskq);
zfsctl_fini();
zfs_znode_fini();
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -82,6 +82,7 @@
#include <sys/vmmeter.h>
#include <vm/vm_param.h>
#include <sys/zil.h>
+#include <sys/zfs_vfsops.h>
#include <sys/zfs_vnops.h>
#include <sys/module.h>
#include <sys/sysent.h>
@@ -5178,6 +5179,7 @@
ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
vp->v_data = NULL;
+ atomic_subtract_rel_64(&zfs_znode_count, 1);
return (0);
}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
@@ -50,6 +50,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zfs_rlock.h>
#include <sys/zfs_fuid.h>
+#include <sys/zfs_vfsops.h>
#include <sys/dnode.h>
#include <sys/fs/zfs.h>
#endif /* _KERNEL */
@@ -547,6 +548,12 @@
if (vp->v_type != VFIFO)
VN_LOCK_ASHARE(vp);
+ atomic_add_rel_64(&zfs_znode_count, 1);
+ /*
+ * Defer the increment of zfs_znode_inuse_count until vp gets inserted into
+ * mp.
+ */
+
return (zp);
}
@@ -827,6 +834,7 @@
vp->v_vflag &= ~VV_FORCEINSMQ;
(void) err;
KASSERT(err == 0, ("insmntque() failed: error %d", err));
+ atomic_add_rel_64(&zfs_znode_inuse_count, 1);
}
kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
@@ -1056,6 +1064,7 @@
if (err == 0) {
vp->v_hash = obj_num;
VOP_UNLOCK1(vp);
+ atomic_add_rel_64(&zfs_znode_inuse_count, 1);
} else {
zp->z_vnode = NULL;
zfs_znode_dmu_fini(zp);
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -7765,6 +7765,18 @@
ASSERT0(arc_loaned_bytes);
}
+boolean_t
+arc_is_waiting_evict(void)
+{
+ boolean_t is_empty;
+
+ mutex_enter(&arc_evict_lock);
+ is_empty = list_is_empty(&arc_evict_waiters);
+ mutex_exit(&arc_evict_lock);
+
+ return (!is_empty);
+}
+
/*
* Level 2 ARC
*
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1311,11 +1311,74 @@
}
static int max_free_per_call = 10000;
+static bool recycle_vnode_bufs_pages = true;
+static bool recycle_vnode_nc_src = true;
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_free_per_call, 0,
"limit on vnode free requests per call to the vnlru_free routine (legacy)");
SYSCTL_INT(_vfs_vnode_vnlru, OID_AUTO, max_free_per_call, CTLFLAG_RW,
&max_free_per_call, 0,
"limit on vnode free requests per call to the vnlru_free routine");
+SYSCTL_BOOL(_vfs_vnode_vnlru, OID_AUTO, recycle_bufs_pages, CTLFLAG_RW,
+ &recycle_vnode_bufs_pages, 0,
+ "enable recycling vnodes with clean buffers and clean/dirty VM pages");
+SYSCTL_BOOL(_vfs_vnode_vnlru, OID_AUTO, recycle_nc_src, CTLFLAG_RW,
+ &recycle_vnode_nc_src, 0,
+ "enable recycling vnodes acting as namecache source");
+
+/*
+ * Count the hold sources on a regular file vnode.
+ */
+static void
+vnlru_count_hold_sources_reg(struct vnode * restrict vp,
+ int * restrict vn_holdcnt,
+ int * restrict cleanbuf_holdcnt,
+ int * restrict dirtybuf_holdcnt,
+ int * restrict vmpage_holdcnt,
+ int * restrict unknown_holdcnt)
+{
+ struct vm_object *object;
+ struct bufobj *bo;
+
+ VNPASS(VREG == vp->v_type, vp);
+
+ *vn_holdcnt = atomic_load_int(&vp->v_holdcnt);
+
+ bo = &vp->v_bufobj;
+ *cleanbuf_holdcnt = atomic_load_int(&bo->bo_clean.bv_cnt);
+ *dirtybuf_holdcnt = atomic_load_int(&bo->bo_dirty.bv_cnt);
+
+ object = atomic_load_ptr(&vp->v_object);
+ if (object != NULL &&
+ object->type == OBJT_VNODE &&
+ object->resident_page_count > 0)
+ *vmpage_holdcnt = 1;
+ else
+ *vmpage_holdcnt = 0;
+
+ *unknown_holdcnt = *vn_holdcnt -
+ (*cleanbuf_holdcnt + *dirtybuf_holdcnt + *vmpage_holdcnt);
+}
+
+/*
+ * Count the hold sources on a directory vnode.
+ */
+static void
+vnlru_count_hold_sources_dir(struct vnode * restrict vp,
+ int * restrict vn_holdcnt,
+ int * restrict nc_src_holdcnt,
+ int * restrict unknown_holdcnt)
+{
+ VNPASS(VDIR == vp->v_type, vp);
+
+ *vn_holdcnt = atomic_load_int(&vp->v_holdcnt);
+
+ if (LIST_EMPTY(&vp->v_cache_src))
+ *nc_src_holdcnt = 0;
+ else
+ *nc_src_holdcnt = 1;
+
+ *unknown_holdcnt = *vn_holdcnt - *nc_src_holdcnt;
+}
/*
* Attempt to recycle requested amount of free vnodes.
@@ -1325,8 +1388,9 @@
{
struct vnode *vp;
struct mount *mp;
- int ocount;
- bool retried;
+ int ocount, vn_holdcnt, cleanbuf_holdcnt, dirtybuf_holdcnt, vmpage_holdcnt,
+ nc_src_holdcnt, unknown_holdcnt;
+ bool retried, *phase2_go_toggle, phase2_go;
mtx_assert(&vnode_list_mtx, MA_OWNED);
if (count > max_free_per_call)
@@ -1366,8 +1430,6 @@
}
if (__predict_false(vp->v_type == VMARKER))
continue;
- if (vp->v_holdcnt > 0)
- continue;
/*
* Don't recycle if our vnode is from different type
* of mount point. Note that mp is type-safe, the
@@ -1378,9 +1440,71 @@
mp->mnt_op != mnt_op) {
continue;
}
- if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
+ if (vp->v_type == VBAD || __predict_false(vp->v_type == VNON)) {
continue;
}
+ vn_holdcnt = atomic_load_int(&vp->v_holdcnt);
+ if (vn_holdcnt > 0) {
+ phase2_go_toggle = NULL;
+ phase2_go = false;
+
+ switch (vp->v_type) {
+ case VREG:
+ phase2_go_toggle = &recycle_vnode_bufs_pages;
+
+ /*
+ * Count the holds by the bufs and VM pages in the object,
+ * and compare them to the actual hold count.
+ */
+ vnlru_count_hold_sources_reg(vp,
+ &vn_holdcnt,
+ &cleanbuf_holdcnt,
+ &dirtybuf_holdcnt,
+ &vmpage_holdcnt,
+ &unknown_holdcnt);
+
+ if ((cleanbuf_holdcnt == vn_holdcnt) &&
+ (0 == dirtybuf_holdcnt) && (0 == vmpage_holdcnt)) {
+ phase2_go = true;
+ } else if (
+ ((cleanbuf_holdcnt + vmpage_holdcnt) == vn_holdcnt) &&
+ (0 == dirtybuf_holdcnt)) {
+ phase2_go = true;
+ }
+ break;
+
+ case VDIR:
+ phase2_go_toggle = &recycle_vnode_nc_src;
+
+ /*
+ * Count the holds by the namecache entries from this
+ * vnode, and compare them to the actual hold count.
+ */
+
+ vnlru_count_hold_sources_dir(vp,
+ &vn_holdcnt,
+ &nc_src_holdcnt,
+ &unknown_holdcnt);
+
+ if (nc_src_holdcnt == vn_holdcnt) {
+ phase2_go = true;
+ }
+
+ break;
+
+ default:
+ /*
+ * NOP; the remaining vnode types are expected to occur
+ * infrequently.
+ */
+ break;
+ }
+
+ if ((NULL == phase2_go_toggle) ||
+ !(*phase2_go_toggle) ||
+ !phase2_go)
+ continue;
+ }
if (!vhold_recycle_free(vp))
continue;
TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
@@ -3753,7 +3877,9 @@
static bool
vhold_recycle_free(struct vnode *vp)
{
- int count;
+ int count, vn_holdcnt, cleanbuf_holdcnt, dirtybuf_holdcnt, vmpage_holdcnt,
+ nc_src_holdcnt, unknown_holdcnt;
+ bool *phase2_go_toggle, phase2_go;
mtx_assert(&vnode_list_mtx, MA_OWNED);
@@ -3766,10 +3892,61 @@
}
VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
if (count > 0) {
- return (false);
+ /*
+ * Check for the vnode holds again. Refer to the phase 2 test in
+ * vnlru_free_impl() for the detail.
+ */
+ phase2_go_toggle = NULL;
+ phase2_go = false;
+
+ switch (vp->v_type) {
+ case VREG:
+ phase2_go_toggle = &recycle_vnode_bufs_pages;
+
+ vnlru_count_hold_sources_reg(vp,
+ &vn_holdcnt,
+ &cleanbuf_holdcnt,
+ &dirtybuf_holdcnt,
+ &vmpage_holdcnt,
+ &unknown_holdcnt);
+
+ if ((cleanbuf_holdcnt == vn_holdcnt) &&
+ (0 == vmpage_holdcnt) && (0 == dirtybuf_holdcnt)) {
+ phase2_go = true;
+ } else if (
+ ((cleanbuf_holdcnt + vmpage_holdcnt) == vn_holdcnt) &&
+ (0 == dirtybuf_holdcnt)) {
+ phase2_go = true;
+ }
+
+ break;
+
+ case VDIR:
+ phase2_go_toggle = &recycle_vnode_nc_src;
+
+ vnlru_count_hold_sources_dir(vp,
+ &vn_holdcnt,
+ &nc_src_holdcnt,
+ &unknown_holdcnt);
+
+ if (nc_src_holdcnt == vn_holdcnt) {
+ phase2_go = true;
+ }
+
+ break;
+
+ default:
+ return (false);
+ }
+
+ if ((NULL == phase2_go_toggle) ||
+ !(*phase2_go_toggle) ||
+ !phase2_go)
+ return (false);
}
if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
- vfs_freevnodes_dec();
+ if (0 == count)
+ vfs_freevnodes_dec();
return (true);
}
}
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Sat, Feb 8, 8:48 PM (21 h, 24 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
16531916
Default Alt Text
D44173.diff (21 KB)
Attached To
Mode
D44173: kern/openzfs: Regulate the ZFS ARC pruning process precisely.
Attached
Detach File
Event Timeline
Log In to Comment