D44178.id54425.diff

diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h
--- a/sys/fs/nullfs/null.h
+++ b/sys/fs/nullfs/null.h
@@ -63,12 +63,43 @@
#define VTONULL(vp) ((struct null_node *)(vp)->v_data)
#define NULLTOV(xp) ((xp)->null_vnode)
+struct cv;
+struct mtx;
+struct thread;
+#if __FreeBSD_version >= 1300139
+struct vnode;
+struct sx;
+#endif
+
+/*
+ * The recycle request types.
+ */
+enum {
+ NULL_RECYCLE_REQ_NONE,
+ NULL_RECYCLE_REQ_LOW_PAGES,
+ NULL_RECYCLE_REQ_LOW_KMEM,
+};
+
+extern uint64_t null_node_num;
+extern uint64_t null_node_inuse_num;
+extern int null_recycle_lowpages;
+extern int null_recycle_lowkmem;
+#if __FreeBSD_version >= 1300139
+extern struct vnode *null_recycle_marker;
+extern struct sx null_recycle_sx;
+#endif
+extern struct mtx null_recycle_lock;
+extern struct cv null_recycle_cv;
+extern int null_recycle_request;
+extern struct thread *null_recycle_td;
+
int nullfs_init(struct vfsconf *vfsp);
int nullfs_uninit(struct vfsconf *vfsp);
int null_nodeget(struct mount *mp, struct vnode *target, struct vnode **vpp);
struct vnode *null_hashget(struct mount *mp, struct vnode *lowervp);
void null_hashrem(struct null_node *xp);
int null_bypass(struct vop_generic_args *ap);
+void null_recycle_thread(void);
#ifdef DIAGNOSTIC
struct vnode *null_checkvp(struct vnode *vp, char *fil, int lno);
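
The counters declared above back the read-only vfs.nullfs.nodes and
vfs.nullfs.inuse sysctls added in null_subr.c below. A minimal userspace
sketch (illustrative only, assuming a kernel with this patch applied) can
read them with sysctlbyname(3):

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t nodes, inuse;
		size_t len;

		/* Both sysctls are added by this patch in null_subr.c. */
		len = sizeof(nodes);
		if (sysctlbyname("vfs.nullfs.nodes", &nodes, &len,
		    NULL, 0) != 0)
			return (1);
		len = sizeof(inuse);
		if (sysctlbyname("vfs.nullfs.inuse", &inuse, &len,
		    NULL, 0) != 0)
			return (1);
		printf("nodes %ju inuse %ju free %ju\n", (uintmax_t)nodes,
		    (uintmax_t)inuse, (uintmax_t)(nodes - inuse));
		return (0);
	}
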
diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -36,14 +36,26 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/counter.h>
+#include <sys/eventhandler.h>
#include <sys/kernel.h>
+#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
+#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
#include <sys/vnode.h>
+#include <machine/atomic.h>
+
+#include <vm/vm.h>
+#include <vm/vm_pageout.h>
+
#include <fs/nullfs/null.h>
/*
@@ -59,14 +71,77 @@
static LIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl;
static struct rwlock null_hash_lock;
static u_long null_hash_mask;
+uint64_t null_node_num;
+uint64_t null_node_inuse_num;
+int null_recycle_lowpages = 20;
+int null_recycle_lowkmem = 80;
+counter_u64_t null_recycle_calls;
+
+#if __FreeBSD_version >= 1300139
+struct vnode *null_recycle_marker;
+struct sx null_recycle_sx;
+#endif
+struct mtx null_recycle_lock;
+struct cv null_recycle_cv;
+int null_recycle_request = NULL_RECYCLE_REQ_NONE;
+static eventhandler_tag null_event_lowmem = NULL;
+struct thread *null_recycle_td;
static MALLOC_DEFINE(M_NULLFSHASH, "nullfs_hash", "NULLFS hash table");
MALLOC_DEFINE(M_NULLFSNODE, "nullfs_node", "NULLFS vnode private part");
static struct vnode * null_hashins(struct mount *, struct null_node *);
+static void null_lowmem(void *, int);
+
+static struct kthread_desc null_recycle_ktd = {
+ .arg0 = "nullfs recycle",
+ .func = null_recycle_thread,
+ .global_threadpp = &null_recycle_td,
+};
+SYSINIT(nullfs_recycle, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kthread_start,
+ &null_recycle_ktd);
+
+static int
+null_recycle_sysctl_handle_percent(SYSCTL_HANDLER_ARGS)
+{
+ int val, err;
+
+ val = atomic_load_int((int *)arg1);
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if ((val < 0) || (val > 100))
+ return (EINVAL);
+
+ atomic_store_int((int *)arg1, val);
+
+ return (0);
+}
+
+SYSCTL_DECL(_vfs);
+
+SYSCTL_NODE(_vfs, OID_AUTO, nullfs, CTLFLAG_RW, 0, "nullfs");
+SYSCTL_UQUAD(_vfs_nullfs, OID_AUTO, nodes, CTLFLAG_RD,
+ &null_node_num, 0, "number of nodes");
+SYSCTL_UQUAD(_vfs_nullfs, OID_AUTO, inuse, CTLFLAG_RD,
+ &null_node_inuse_num, 0, "number of nodes in use");
+
+SYSCTL_NODE(_vfs_nullfs, OID_AUTO, recycle, CTLFLAG_RW, 0, "nullfs recycle");
+SYSCTL_PROC(_vfs_nullfs_recycle, OID_AUTO, lowpages,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &null_recycle_lowpages, 0, null_recycle_sysctl_handle_percent, "I",
+ "node ratio to recycle upon low pages, in percent");
+SYSCTL_PROC(_vfs_nullfs_recycle, OID_AUTO, lowkmem,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &null_recycle_lowkmem, 0, null_recycle_sysctl_handle_percent, "I",
+ "node ratio to recycle upon low kernel memory, in percent");
+SYSCTL_COUNTER_U64(_vfs_nullfs_recycle, OID_AUTO, calls,
+ CTLFLAG_RD, &null_recycle_calls,
+ "nullfs recycle calls");
/*
- * Initialise cache headers
+ * Initialise cache headers and the nullfs recycle state
*/
int
nullfs_init(vfsp)
@@ -76,6 +151,15 @@
null_node_hashtbl = hashinit(desiredvnodes, M_NULLFSHASH,
&null_hash_mask);
rw_init(&null_hash_lock, "nullhs");
+ null_recycle_calls = counter_u64_alloc(M_WAITOK);
+ null_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, null_lowmem, NULL,
+ EVENTHANDLER_PRI_FIRST);
+#if __FreeBSD_version >= 1300139
+ null_recycle_marker = vnlru_alloc_marker();
+ sx_init(&null_recycle_sx, "nullfs recycle sx");
+#endif
+ mtx_init(&null_recycle_lock, "nullfs recycle lock", NULL, MTX_DEF);
+ cv_init(&null_recycle_cv, "nullfs recycle cv");
return (0);
}
@@ -86,6 +170,16 @@
rw_destroy(&null_hash_lock);
hashdestroy(null_node_hashtbl, M_NULLFSHASH, null_hash_mask);
+#if __FreeBSD_version >= 1300139
+ if (null_recycle_marker != NULL)
+ vnlru_free_marker(null_recycle_marker);
+ sx_destroy(&null_recycle_sx);
+#endif
+ cv_destroy(&null_recycle_cv);
+ mtx_destroy(&null_recycle_lock);
+ if (null_event_lowmem != NULL)
+ EVENTHANDLER_DEREGISTER(vm_lowmem, null_event_lowmem);
+ counter_u64_free(null_recycle_calls);
return (0);
}
@@ -162,6 +256,7 @@
}
LIST_INSERT_HEAD(hd, xp, null_hash);
rw_wunlock(&null_hash_lock);
+ atomic_add_rel_64(&null_node_num, 1);
return (NULLVP);
}
@@ -253,6 +348,7 @@
error = insmntque1(vp, mp, null_insmntque_dtr, xp);
if (error != 0)
return (error);
+ atomic_add_rel_64(&null_node_inuse_num, 1);
if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp)
vp->v_vflag |= VV_ROOT;
@@ -301,6 +397,7 @@
rw_wlock(&null_hash_lock);
LIST_REMOVE(xp, null_hash);
rw_wunlock(&null_hash_lock);
+ atomic_subtract_rel_64(&null_node_num, 1);
}
#ifdef DIAGNOSTIC
@@ -341,3 +438,38 @@
return (a->null_lowervp);
}
#endif
+
+/*
+ * Nullfs(5) adds a use count to the lower vnode, which prevents the lower
+ * vnode from being recycled. This design blocks vnode recycling triggered
+ * by a filesystem, typically zfs(4). In that case, perform the pruning on
+ * nullfs(5) in the hope of releasing the lower vnodes.
+ *
+ * Distinguish the degree of memory pressure. Low-page kernel events are
+ * not abnormal when the working set size of the kernel and user processes
+ * exceeds the physical memory. Low kernel memory events, on the other
+ * hand, may lead to a system stall.
+ */
+static void
+null_lowmem(void *arg __unused, int howto)
+{
+ int req;
+
+ switch (howto) {
+ case VM_LOW_KMEM:
+ req = NULL_RECYCLE_REQ_LOW_KMEM;
+ break;
+
+ case VM_LOW_PAGES:
+ default: /* XXX */
+ req = NULL_RECYCLE_REQ_LOW_PAGES;
+ break;
+ }
+
+ mtx_lock(&null_recycle_lock);
+ if (null_recycle_request < req) {
+ null_recycle_request = req;
+ cv_broadcast(&null_recycle_cv);
+ }
+ mtx_unlock(&null_recycle_lock);
+}
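
Note that null_lowmem() above only upgrades the pending request: the enum
values are ordered NULL_RECYCLE_REQ_NONE < NULL_RECYCLE_REQ_LOW_PAGES <
NULL_RECYCLE_REQ_LOW_KMEM, so a severe request already posted is never
downgraded by a later, milder event. A minimal userspace sketch of the same
monotonic-upgrade pattern (names illustrative, locking elided):

	#include <stdio.h>

	enum { REQ_NONE, REQ_LOW_PAGES, REQ_LOW_KMEM };

	static int pending = REQ_NONE;

	static void
	post_request(int req)
	{
		/* In the kernel this runs under null_recycle_lock. */
		if (pending < req)
			pending = req;	/* upgrade only, never downgrade */
	}

	int
	main(void)
	{
		post_request(REQ_LOW_KMEM);
		post_request(REQ_LOW_PAGES);	/* milder event is ignored */
		printf("pending = %d\n", pending);	/* 2: REQ_LOW_KMEM */
		return (0);
	}
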
diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c
--- a/sys/fs/nullfs/null_vfsops.c
+++ b/sys/fs/nullfs/null_vfsops.c
@@ -43,13 +43,19 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/condvar.h>
+#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
+#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
+#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
+#include <sys/sx.h>
#include <sys/vnode.h>
#include <sys/jail.h>
@@ -67,6 +73,8 @@
static vfs_vget_t nullfs_vget;
static vfs_extattrctl_t nullfs_extattrctl;
+static struct vfsops null_vfsops;
+
/*
* Mount null layer
*/
@@ -208,6 +216,7 @@
mp->mnt_kern_flag |= MNTK_LOOKUP_EXCL_DOTDOT | MNTK_NOMSYNC;
mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
(MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS);
+ mp->mnt_fsvninusep = &null_node_inuse_num;
MNT_IUNLOCK(mp);
vfs_getnewfsid(mp);
if ((xmp->nullm_flags & NULLM_CACHE) != 0) {
@@ -436,6 +445,112 @@
vdrop(vp);
}
+void
+null_recycle_thread(void)
+{
+ int hz_rem, recycle_percent;
+ int64_t vn_scan, node_inuse_delta;
+ uint64_t node_num, node_inuse_num;
+ struct timeval tv_now, tv_delta, tv_rem;
+ static struct timeval tv_last;
+ static const struct timeval tv_pause =
+ {.tv_sec = 1, .tv_usec = 0};
+ extern counter_u64_t null_recycle_calls;
+
+ EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown, null_recycle_td,
+ SHUTDOWN_PRI_FIRST);
+
+ for (;;) {
+ kthread_suspend_check();
+
+ node_num = atomic_load_acq_64(&null_node_num);
+ node_inuse_num = atomic_load_acq_64(&null_node_inuse_num);
+
+ /*
+ * Work around the in-use counter error that may occur under a heavy
+ * load.
+ *
+ * Fix the in-use counter value only when the counters are stable, i.e.
+ * their values do not change across multiple reads. Otherwise, defer
+ * the fix to the next pass.
+ */
+ if (__predict_false(node_num < node_inuse_num))
+ node_inuse_delta = node_inuse_num - node_num;
+ else if (__predict_false(((int64_t)node_inuse_num) < 0))
+ node_inuse_delta = (int64_t)node_inuse_num;
+ else
+ node_inuse_delta = 0;
+
+ if (__predict_false(0 != node_inuse_delta)) {
+ if (node_num == atomic_load_64(&null_node_num)) {
+ if (atomic_cmpset_64(&null_node_inuse_num,
+ node_inuse_num,
+ node_inuse_num - node_inuse_delta)) {
+ if (__predict_false(node_num != atomic_load_64(&null_node_num))) {
+ atomic_add_64(&null_node_inuse_num, node_inuse_delta);
+ }
+ }
+ }
+ }
+
+ getmicrotime(&tv_now);
+ tv_delta = tv_now;
+ timevalsub(&tv_delta, &tv_last);
+ if (timevalcmp(&tv_pause, &tv_delta, >=)) {
+ tv_rem = tv_pause;
+ timevalsub(&tv_rem, &tv_delta);
+ } else
+ timevalclear(&tv_rem);
+
+ mtx_lock(&null_recycle_lock);
+
+ if ((NULL_RECYCLE_REQ_NONE == null_recycle_request) || timevalisset(&tv_rem)) {
+ if (NULL_RECYCLE_REQ_NONE == null_recycle_request)
+ hz_rem = hz;
+ else
+ hz_rem = tvtohz(&tv_rem);
+ cv_timedwait(&null_recycle_cv, &null_recycle_lock, hz_rem);
+ mtx_unlock(&null_recycle_lock);
+ continue;
+ }
+
+ mtx_unlock(&null_recycle_lock);
+
+ counter_u64_add(null_recycle_calls, 1);
+
+ vn_scan = node_num - node_inuse_num - node_inuse_delta;
+
+ switch (null_recycle_request) {
+ case NULL_RECYCLE_REQ_LOW_KMEM:
+ recycle_percent = null_recycle_lowkmem;
+ break;
+
+ case NULL_RECYCLE_REQ_LOW_PAGES:
+ default: /* XXX */
+ recycle_percent = null_recycle_lowpages;
+ break;
+ }
+
+ vn_scan *= recycle_percent;
+ vn_scan /= 100;
+
+ if (vn_scan > 0) {
+#if __FreeBSD_version >= 1300139
+ sx_xlock(&null_recycle_sx);
+ vnlru_free_vfsops(vn_scan, &null_vfsops, null_recycle_marker);
+ sx_xunlock(&null_recycle_sx);
+#else
+ vnlru_free(vn_scan, &null_vfsops);
+#endif
+ }
+
+ mtx_lock(&null_recycle_lock);
+ null_recycle_request = NULL_RECYCLE_REQ_NONE;
+ mtx_unlock(&null_recycle_lock);
+ getmicrotime(&tv_last);
+ }
+}
+
static struct vfsops null_vfsops = {
.vfs_extattrctl = nullfs_extattrctl,
.vfs_fhtovp = nullfs_fhtovp,

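With the default tunables, the scan budget null_recycle_thread() passes to
vnlru is the number of free nullfs nodes scaled by the per-event percentage
(vfs.nullfs.recycle.lowpages or vfs.nullfs.recycle.lowkmem). A worked
example of that arithmetic with made-up numbers:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t node_num = 10000;	/* total nullfs nodes */
		uint64_t node_inuse_num = 4000;	/* nodes still in use */
		int lowpages = 20, lowkmem = 80; /* default sysctl values */
		int64_t free_nodes = node_num - node_inuse_num;

		/* vn_scan = free nodes * percent / 100, as in the thread. */
		printf("low pages: scan %jd vnodes\n",
		    (intmax_t)(free_nodes * lowpages / 100));	/* 1200 */
		printf("low kmem:  scan %jd vnodes\n",
		    (intmax_t)(free_nodes * lowkmem / 100));	/* 4800 */
		return (0);
	}
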