diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h
--- a/sys/fs/nullfs/null.h
+++ b/sys/fs/nullfs/null.h
@@ -63,12 +63,43 @@
 #define VTONULL(vp) ((struct null_node *)(vp)->v_data)
 #define NULLTOV(xp) ((xp)->null_vnode)
 
+struct cv;
+struct mtx;
+struct thread;
+#if __FreeBSD_version >= 1300139
+struct vnode;
+struct sx;
+#endif
+
+/*
+ * The recycle request types, ordered by severity so that a pending
+ * request is only ever escalated, never downgraded.
+ */
+enum {
+	NULL_RECYCLE_REQ_NONE,
+	NULL_RECYCLE_REQ_LOW_PAGES,
+	NULL_RECYCLE_REQ_LOW_KMEM,
+};
+
+extern uint64_t null_node_num;
+extern uint64_t null_node_inuse_num;
+extern int null_recycle_lowpages;
+extern int null_recycle_lowkmem;
+#if __FreeBSD_version >= 1300139
+extern struct vnode *null_recycle_marker;
+extern struct sx null_recycle_sx;
+#endif
+extern struct mtx null_recycle_lock;
+extern struct cv null_recycle_cv;
+extern int null_recycle_request;
+extern struct thread *null_recycle_td;
+
 int nullfs_init(struct vfsconf *vfsp);
 int nullfs_uninit(struct vfsconf *vfsp);
 int null_nodeget(struct mount *mp, struct vnode *target, struct vnode **vpp);
 struct vnode *null_hashget(struct mount *mp, struct vnode *lowervp);
 void null_hashrem(struct null_node *xp);
 int null_bypass(struct vop_generic_args *ap);
+void null_recycle_thread(void);
 
 #ifdef DIAGNOSTIC
 struct vnode *null_checkvp(struct vnode *vp, char *fil, int lno);
diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -34,14 +34,26 @@
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/counter.h>
+#include <sys/eventhandler.h>
 #include <sys/kernel.h>
+#include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
+#include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
 #include <sys/vnode.h>
+#include <machine/atomic.h>
+
+#include <vm/vm.h>
+#include <vm/vm_pageout.h>
+
 #include <fs/nullfs/null.h>
 
 /*
@@ -57,14 +69,77 @@
 static LIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl;
 static struct rwlock null_hash_lock;
 static u_long null_hash_mask;
 
+uint64_t null_node_num;
+uint64_t null_node_inuse_num;
+int null_recycle_lowpages = 20;
+int null_recycle_lowkmem = 80;
+counter_u64_t null_recycle_calls;
+
+#if __FreeBSD_version >= 1300139
+struct vnode *null_recycle_marker;
+struct sx null_recycle_sx;
+#endif
+struct mtx null_recycle_lock;
+struct cv null_recycle_cv;
+int null_recycle_request = NULL_RECYCLE_REQ_NONE;
+static eventhandler_tag null_event_lowmem = NULL;
+struct thread *null_recycle_td;
 
 static MALLOC_DEFINE(M_NULLFSHASH, "nullfs_hash", "NULLFS hash table");
 MALLOC_DEFINE(M_NULLFSNODE, "nullfs_node", "NULLFS vnode private part");
 
 static void null_hashins(struct mount *, struct null_node *);
+static void null_lowmem(void *, int);
+
+static struct kthread_desc null_recycle_ktd = {
+	.arg0 = "nullfs recycle",
+	.func = null_recycle_thread,
+	.global_threadpp = &null_recycle_td,
+};
+SYSINIT(nullfs_recycle, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST,
+    kthread_start, &null_recycle_ktd);
+
+/*
+ * Accept only a percentage, i.e. an integer in [0, 100].
+ */
+static int
+null_recycle_sysctl_handle_percent(SYSCTL_HANDLER_ARGS)
+{
+	int val, err;
+
+	val = atomic_load_int((int *)arg1);
+	err = sysctl_handle_int(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+	if (val < 0 || val > 100)
+		return (EINVAL);
+	atomic_store_int((int *)arg1, val);
+	return (0);
+}
+
+SYSCTL_DECL(_vfs);
+
+SYSCTL_NODE(_vfs, OID_AUTO, nullfs, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
+    "nullfs");
+SYSCTL_UQUAD(_vfs_nullfs, OID_AUTO, nodes, CTLFLAG_RD,
+    &null_node_num, 0, "number of nodes");
+SYSCTL_UQUAD(_vfs_nullfs, OID_AUTO, inuse, CTLFLAG_RD,
+    &null_node_inuse_num, 0, "number of nodes in use");
+
+SYSCTL_NODE(_vfs_nullfs, OID_AUTO, recycle, CTLFLAG_RW | CTLFLAG_MPSAFE,
+    NULL, "nullfs recycle");
+SYSCTL_PROC(_vfs_nullfs_recycle, OID_AUTO, lowpages,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    &null_recycle_lowpages, 0, null_recycle_sysctl_handle_percent, "I",
+    "node ratio to recycle upon low pages, in percent");
+SYSCTL_PROC(_vfs_nullfs_recycle, OID_AUTO, lowkmem,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    &null_recycle_lowkmem, 0, null_recycle_sysctl_handle_percent, "I",
+    "node ratio to recycle upon low kernel memory, in percent");
+SYSCTL_COUNTER_U64(_vfs_nullfs_recycle, OID_AUTO, calls,
+    CTLFLAG_RD | CTLFLAG_MPSAFE, &null_recycle_calls,
+    "nullfs recycle calls");
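Not part of the patch: the counters and knobs declared above can be inspected
and tuned from userland with sysctlbyname(3). A minimal sketch, assuming the
patch is applied and using the OID names defined above (run the write part as
root):

	/* nullstat.c -- illustration only. */
	#include <sys/types.h>
	#include <sys/sysctl.h>

	#include <err.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t nodes, inuse, calls;
		size_t len;
		int pct;

		len = sizeof(nodes);
		if (sysctlbyname("vfs.nullfs.nodes", &nodes, &len,
		    NULL, 0) == -1)
			err(1, "vfs.nullfs.nodes");
		len = sizeof(inuse);
		if (sysctlbyname("vfs.nullfs.inuse", &inuse, &len,
		    NULL, 0) == -1)
			err(1, "vfs.nullfs.inuse");
		len = sizeof(calls);
		if (sysctlbyname("vfs.nullfs.recycle.calls", &calls, &len,
		    NULL, 0) == -1)
			err(1, "vfs.nullfs.recycle.calls");
		printf("nodes %ju inuse %ju recycle calls %ju\n",
		    (uintmax_t)nodes, (uintmax_t)inuse, (uintmax_t)calls);

		/* Raise the low-pages ratio to 30%; 101 would get EINVAL. */
		pct = 30;
		if (sysctlbyname("vfs.nullfs.recycle.lowpages", NULL, NULL,
		    &pct, sizeof(pct)) == -1)
			err(1, "set vfs.nullfs.recycle.lowpages");
		return (0);
	}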
 
 /*
- * Initialise cache headers
+ * Initialise cache headers and the nullfs recycle machinery
  */
 int
 nullfs_init(struct vfsconf *vfsp)
@@ -73,6 +148,15 @@
 	null_node_hashtbl = hashinit(desiredvnodes, M_NULLFSHASH,
 	    &null_hash_mask);
 	rw_init(&null_hash_lock, "nullhs");
+	null_recycle_calls = counter_u64_alloc(M_WAITOK);
+#if __FreeBSD_version >= 1300139
+	null_recycle_marker = vnlru_alloc_marker();
+	sx_init(&null_recycle_sx, "nullfs recycle sx");
+#endif
+	mtx_init(&null_recycle_lock, "nullfs recycle lock", NULL, MTX_DEF);
+	cv_init(&null_recycle_cv, "nullfs recycle cv");
+	null_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, null_lowmem,
+	    NULL, EVENTHANDLER_PRI_FIRST);
 	return (0);
 }
 
@@ -82,6 +166,16 @@
 	rw_destroy(&null_hash_lock);
 	hashdestroy(null_node_hashtbl, M_NULLFSHASH, null_hash_mask);
+	/* Deregister first: null_lowmem() uses the lock and cv below. */
+	if (null_event_lowmem != NULL)
+		EVENTHANDLER_DEREGISTER(vm_lowmem, null_event_lowmem);
+#if __FreeBSD_version >= 1300139
+	if (null_recycle_marker != NULL)
+		vnlru_free_marker(null_recycle_marker);
+	sx_destroy(&null_recycle_sx);
+#endif
+	cv_destroy(&null_recycle_cv);
+	mtx_destroy(&null_recycle_lock);
+	counter_u64_free(null_recycle_calls);
 	return (0);
 }
 
@@ -160,6 +254,7 @@
 	}
 #endif
 	LIST_INSERT_HEAD(hd, xp, null_hash);
+	atomic_add_rel_64(&null_node_num, 1);
 }
 
 static void
@@ -256,6 +351,7 @@
 		null_destroy_proto(vp, xp);
 		return (error);
 	}
+	atomic_add_rel_64(&null_node_inuse_num, 1);
 	null_hashins(mp, xp);
 	vn_set_state(vp, VSTATE_CONSTRUCTED);
@@ -275,6 +371,7 @@
 	rw_wlock(&null_hash_lock);
 	LIST_REMOVE(xp, null_hash);
 	rw_wunlock(&null_hash_lock);
+	atomic_subtract_rel_64(&null_node_num, 1);
 }
 
 #ifdef DIAGNOSTIC
@@ -312,3 +409,38 @@
 	return (a->null_lowervp);
 }
 #endif
+
+/*
+ * A nullfs(5) node holds a use count on its lower vnode, which prevents
+ * the lower vnode from being recycled.  This blocks vnode recycling
+ * initiated by the lower filesystem, typically zfs(4).  In that case,
+ * prune the nullfs(5) vnodes instead, in the hope of releasing the
+ * lower vnodes as well.
+ *
+ * Distinguish the severity of the memory shortage: low-page events are
+ * normal when the working set of the kernel and the user processes
+ * exceeds the physical memory, whereas low-kernel-memory events may
+ * stall the whole system.
+ */
+static void
+null_lowmem(void *arg __unused, int flags)
+{
+	int req;
+
+	switch (flags) {
+	case VM_LOW_KMEM:
+		req = NULL_RECYCLE_REQ_LOW_KMEM;
+		break;
+	case VM_LOW_PAGES:
+	default:
+		/* Treat unknown events like a plain page shortage. */
+		req = NULL_RECYCLE_REQ_LOW_PAGES;
+		break;
+	}
+
+	/* Escalate a pending request; never downgrade it. */
+	mtx_lock(&null_recycle_lock);
+	if (null_recycle_request < req) {
+		null_recycle_request = req;
+		cv_broadcast(&null_recycle_cv);
+	}
+	mtx_unlock(&null_recycle_lock);
+}
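For readers unfamiliar with the hook used above: vm_lowmem is the kernel's
low-memory notification, delivered through eventhandler(9) with a flag that
describes the shortage. A minimal, self-contained sketch of the same pattern
as a throwaway module (the demo_* names are invented for illustration; only
the eventhandler(9), module(9) and vm_lowmem APIs are real):

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/kernel.h>
	#include <sys/module.h>
	#include <sys/eventhandler.h>

	#include <vm/vm.h>
	#include <vm/vm_pageout.h>

	static eventhandler_tag demo_tag;

	static void
	demo_lowmem(void *arg __unused, int flags)
	{
		/* Anything other than VM_LOW_KMEM is a page shortage. */
		printf("vm_lowmem: %s shortage\n",
		    (flags & VM_LOW_KMEM) != 0 ? "kmem" : "page");
	}

	static int
	demo_modevent(module_t mod __unused, int type, void *data __unused)
	{
		switch (type) {
		case MOD_LOAD:
			demo_tag = EVENTHANDLER_REGISTER(vm_lowmem,
			    demo_lowmem, NULL, EVENTHANDLER_PRI_FIRST);
			return (0);
		case MOD_UNLOAD:
			EVENTHANDLER_DEREGISTER(vm_lowmem, demo_tag);
			return (0);
		default:
			return (EOPNOTSUPP);
		}
	}

	static moduledata_t demo_mod = {
		"lowmem_demo", demo_modevent, NULL
	};
	DECLARE_MODULE(lowmem_demo, demo_mod, SI_SUB_DRIVERS, SI_ORDER_ANY);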
diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c
--- a/sys/fs/nullfs/null_vfsops.c
+++ b/sys/fs/nullfs/null_vfsops.c
@@ -39,13 +39,19 @@
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/condvar.h>
+#include <sys/counter.h>
+#include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
+#include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
+#include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
+#include <sys/sx.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 
@@ -63,6 +69,8 @@
 static vfs_vget_t	nullfs_vget;
 static vfs_extattrctl_t	nullfs_extattrctl;
 
+static struct vfsops null_vfsops;
+
 /*
  * Mount null layer
  */
@@ -222,6 +230,7 @@
 	mp->mnt_kern_flag |= MNTK_NOMSYNC | MNTK_UNLOCKED_INSMNTQUE;
 	mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
 	    (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS);
+	mp->mnt_fsvninusep = &null_node_inuse_num;
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
 	vfs_mountedfrom(mp, target);
@@ -467,6 +476,112 @@
 	vdrop(vp);
 }
 
+void
+null_recycle_thread(void)
+{
+	int hz_rem, recycle_percent, req;
+	int64_t vn_scan, node_inuse_delta;
+	uint64_t node_num, node_inuse_num;
+	struct timeval tv_now, tv_delta, tv_rem;
+	static struct timeval tv_last;
+	static const struct timeval tv_pause = { .tv_sec = 1, .tv_usec = 0 };
+	extern counter_u64_t null_recycle_calls;
+
+	EVENTHANDLER_REGISTER(shutdown_pre_sync, kthread_shutdown,
+	    null_recycle_td, SHUTDOWN_PRI_FIRST);
+
+	for (;;) {
+		kthread_suspend_check();
+
+		node_num = atomic_load_acq_64(&null_node_num);
+		node_inuse_num = atomic_load_acq_64(&null_node_inuse_num);
+
+		/*
+		 * Work around the in-use counter drift that may happen
+		 * under a heavy load.
+		 *
+		 * Fix the in-use counter only while both counters are
+		 * stable, i.e. their values do not change across multiple
+		 * reads.  Otherwise, defer the fix to the next iteration.
+		 */
+		if (__predict_false(node_num < node_inuse_num))
+			node_inuse_delta = node_inuse_num - node_num;
+		else if (__predict_false((int64_t)node_inuse_num < 0))
+			node_inuse_delta = (int64_t)node_inuse_num;
+		else
+			node_inuse_delta = 0;
+
+		if (__predict_false(node_inuse_delta != 0) &&
+		    node_num == atomic_load_64(&null_node_num) &&
+		    atomic_cmpset_64(&null_node_inuse_num, node_inuse_num,
+		    node_inuse_num - node_inuse_delta)) {
+			/*
+			 * If node_num moved while we were fixing the
+			 * in-use counter, the fix may be stale; undo it.
+			 */
+			if (__predict_false(node_num !=
+			    atomic_load_64(&null_node_num)))
+				atomic_add_64(&null_node_inuse_num,
+				    node_inuse_delta);
+		}
+
+		getmicrotime(&tv_now);
+		tv_delta = tv_now;
+		timevalsub(&tv_delta, &tv_last);
+		if (timevalcmp(&tv_pause, &tv_delta, >=)) {
+			tv_rem = tv_pause;
+			timevalsub(&tv_rem, &tv_delta);
+		} else
+			timevalclear(&tv_rem);
+
+		mtx_lock(&null_recycle_lock);
+		req = null_recycle_request;
+		if (req == NULL_RECYCLE_REQ_NONE || timevalisset(&tv_rem)) {
+			/*
+			 * No request pending, or the last run was less
+			 * than a second ago: sleep until woken up or the
+			 * pause expires.
+			 */
+			if (req == NULL_RECYCLE_REQ_NONE)
+				hz_rem = hz;
+			else
+				hz_rem = tvtohz(&tv_rem);
+			cv_timedwait(&null_recycle_cv, &null_recycle_lock,
+			    hz_rem);
+			mtx_unlock(&null_recycle_lock);
+			continue;
+		}
+		mtx_unlock(&null_recycle_lock);
+
+		counter_u64_add(null_recycle_calls, 1);
+
+		switch (req) {
+		case NULL_RECYCLE_REQ_LOW_KMEM:
+			recycle_percent = null_recycle_lowkmem;
+			break;
+		case NULL_RECYCLE_REQ_LOW_PAGES:
+		default:
+			/* Unknown requests get the milder ratio. */
+			recycle_percent = null_recycle_lowpages;
+			break;
+		}
+
+		/*
+		 * Scan the configured share of the nodes not in use,
+		 * based on the corrected in-use count.
+		 */
+		vn_scan = node_num - node_inuse_num + node_inuse_delta;
+		vn_scan = vn_scan * recycle_percent / 100;
+
+		if (vn_scan > 0) {
+#if __FreeBSD_version >= 1300139
+			sx_xlock(&null_recycle_sx);
+			vnlru_free_vfsops(vn_scan, &null_vfsops,
+			    null_recycle_marker);
+			sx_xunlock(&null_recycle_sx);
+#else
+			vnlru_free(vn_scan, &null_vfsops);
+#endif
+		}
+
+		mtx_lock(&null_recycle_lock);
+		null_recycle_request = NULL_RECYCLE_REQ_NONE;
+		mtx_unlock(&null_recycle_lock);
+		getmicrotime(&tv_last);
+	}
+}
+
 static struct vfsops null_vfsops = {
 	.vfs_extattrctl =	nullfs_extattrctl,
 	.vfs_fhtovp =		nullfs_fhtovp,
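Two details in the thread above are easy to misread. First, the scan size:
with vfs.nullfs.nodes at 100000, vfs.nullfs.inuse at 60000 and
recycle.lowpages at the default 20, a VM_LOW_PAGES event asks vnlru to
reclaim (100000 - 60000) * 20 / 100 = 8000 nullfs vnodes; a VM_LOW_KMEM
event with the default 80 targets 32000. Second, the counter
reconciliation: the sketch below replays its double-check scheme in
userland, with C11 atomics standing in for machine/atomic.h (illustration
only, not part of the patch):

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	static _Atomic uint64_t node_num = 100;
	static _Atomic uint64_t node_inuse_num = 110; /* drifted past total */

	int
	main(void)
	{
		uint64_t num, inuse, expected;
		int64_t delta;

		num = atomic_load(&node_num);
		inuse = atomic_load(&node_inuse_num);

		if (num < inuse)
			delta = inuse - num;	/* in-use overshot the total */
		else if ((int64_t)inuse < 0)
			delta = (int64_t)inuse;	/* in-use underflowed zero */
		else
			delta = 0;

		expected = inuse;
		if (delta != 0 &&
		    num == atomic_load(&node_num) &&	/* still stable? */
		    atomic_compare_exchange_strong(&node_inuse_num,
		    &expected, inuse - delta) &&
		    num != atomic_load(&node_num))	/* raced after fix? */
			atomic_fetch_add(&node_inuse_num, delta); /* undo */

		/* Prints 100: the in-use count snaps back to the total. */
		printf("in-use after reconciliation: %ju\n",
		    (uintmax_t)atomic_load(&node_inuse_num));
		return (0);
	}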