Changeset View
Changeset View
Standalone View
Standalone View
sys/kern/vfs_subr.c
Show First 20 Lines • Show All 159 Lines • ▼ Show 20 Lines | int vttoif_tab[10] = { | ||||
S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT | S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT | ||||
}; | }; | ||||
/* | /* | ||||
* List of allocated vnodes in the system. | * List of allocated vnodes in the system. | ||||
*/ | */ | ||||
static TAILQ_HEAD(freelst, vnode) vnode_list; | static TAILQ_HEAD(freelst, vnode) vnode_list; | ||||
static struct vnode *vnode_list_free_marker; | static struct vnode *vnode_list_free_marker; | ||||
static struct vnode *vnode_list_reclaim_marker; | |||||
/* | /* | ||||
* "Free" vnode target. Free vnodes are rarely completely free, but are | * "Free" vnode target. Free vnodes are rarely completely free, but are | ||||
* just ones that are cheap to recycle. Usually they are for files which | * just ones that are cheap to recycle. Usually they are for files which | ||||
* have been stat'd but not read; these usually have inode and namecache | * have been stat'd but not read; these usually have inode and namecache | ||||
* data attached to them. This target is the preferred minimum size of a | * data attached to them. This target is the preferred minimum size of a | ||||
* sub-cache consisting mostly of such files. The system balances the size | * sub-cache consisting mostly of such files. The system balances the size | ||||
* of this sub-cache with its complement to try to prevent either from | * of this sub-cache with its complement to try to prevent either from | ||||
▲ Show 20 Lines • Show All 429 Lines • ▼ Show 20 Lines | if (desiredvnodes > MAXVNODES_MAX) { | ||||
desiredvnodes = MAXVNODES_MAX; | desiredvnodes = MAXVNODES_MAX; | ||||
} | } | ||||
wantfreevnodes = desiredvnodes / 4; | wantfreevnodes = desiredvnodes / 4; | ||||
mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); | mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); | ||||
TAILQ_INIT(&vnode_list); | TAILQ_INIT(&vnode_list); | ||||
mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); | mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); | ||||
vnode_list_free_marker = vn_alloc_marker(NULL); | vnode_list_free_marker = vn_alloc_marker(NULL); | ||||
TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); | TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); | ||||
vnode_list_reclaim_marker = vn_alloc_marker(NULL); | |||||
TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); | |||||
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, | vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, | ||||
vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); | vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); | ||||
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), | vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), | ||||
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); | NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); | ||||
/* | /* | ||||
* Preallocate enough nodes to support one-per buf so that | * Preallocate enough nodes to support one-per buf so that | ||||
* we can not fail an insert. reassignbuf() callers can not | * we can not fail an insert. reassignbuf() callers can not | ||||
* tolerate the insertion failure. | * tolerate the insertion failure. | ||||
▲ Show 20 Lines • Show All 388 Lines • ▼ Show 20 Lines | vattr_null(struct vattr *vap) | ||||
vap->va_birthtime.tv_sec = VNOVAL; | vap->va_birthtime.tv_sec = VNOVAL; | ||||
vap->va_birthtime.tv_nsec = VNOVAL; | vap->va_birthtime.tv_nsec = VNOVAL; | ||||
vap->va_flags = VNOVAL; | vap->va_flags = VNOVAL; | ||||
vap->va_gen = VNOVAL; | vap->va_gen = VNOVAL; | ||||
vap->va_vaflags = 0; | vap->va_vaflags = 0; | ||||
} | } | ||||
/* | /* | ||||
* Try to reduce the total number of vnodes. | |||||
* | |||||
* This routine (and its user) are buggy in at least the following ways: | |||||
* - all parameters were picked years ago when RAM sizes were significantly | |||||
* smaller | |||||
* - it can pick vnodes based on pages used by the vm object, but filesystems | |||||
* like ZFS don't use it, making the pick broken | |||||
* - since ZFS has its own aging policy it gets partially combated by this one | |||||
* - a dedicated method should be provided for filesystems to let them decide | |||||
* whether the vnode should be recycled | |||||
* | |||||
* This routine is called when we have too many vnodes. It attempts | * This routine is called when we have too many vnodes. It attempts | ||||
* to free <count> vnodes and will potentially free vnodes that still | * to free <count> vnodes and will potentially free vnodes that still | ||||
* have VM backing store (VM backing store is typically the cause | * have VM backing store (VM backing store is typically the cause | ||||
* of a vnode blowout so we want to do this). Therefore, this operation | * of a vnode blowout so we want to do this). Therefore, this operation | ||||
* is not considered cheap. | * is not considered cheap. | ||||
* | * | ||||
* A number of conditions may prevent a vnode from being reclaimed. | * A number of conditions may prevent a vnode from being reclaimed. | ||||
* The buffer cache may have references on the vnode, a directory | * The buffer cache may have references on the vnode, a directory | ||||
* vnode may still have references due to the namei cache representing | * vnode may still have references due to the namei cache representing | ||||
* underlying files, or the vnode may be in active use. It is not | * underlying files, or the vnode may be in active use. It is not | ||||
* desirable to reuse such vnodes. These conditions may cause the | * desirable to reuse such vnodes. These conditions may cause the | ||||
* number of vnodes to reach some minimum value regardless of what | * number of vnodes to reach some minimum value regardless of what | ||||
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low. | * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. | ||||
* | * | ||||
* @param mp Try to reclaim vnodes from this mountpoint | |||||
* @param reclaim_nc_src Only reclaim directories with outgoing namecache | * @param reclaim_nc_src Only reclaim directories with outgoing namecache | ||||
* entries if this argument is true | * entries if this argument is true | ||||
* @param trigger Only reclaim vnodes with fewer than this many resident | * @param trigger Only reclaim vnodes with fewer than this many resident | ||||
* pages. | * pages. | ||||
* @return The number of vnodes that were reclaimed. | * @return The number of vnodes that were reclaimed. | ||||
*/ | */ | ||||
static int | static int | ||||
vlrureclaim(struct mount *mp, bool reclaim_nc_src, int trigger) | vlrureclaim(bool reclaim_nc_src, int trigger) | ||||
{ | { | ||||
struct vnode *vp; | struct vnode *vp, *mvp; | ||||
int count, done, target; | struct mount *mp; | ||||
u_long done, target; | |||||
bool retried; | |||||
retried = false; | |||||
done = 0; | done = 0; | ||||
vn_start_write(NULL, &mp, V_WAIT); | target = numvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); | ||||
MNT_ILOCK(mp); | |||||
count = mp->mnt_nvnodelistsize; | |||||
target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); | |||||
target = target / 10 + 1; | target = target / 10 + 1; | ||||
while (count != 0 && done < target) { | |||||
vp = TAILQ_FIRST(&mp->mnt_nvnodelist); | mtx_lock(&vnode_list_mtx); | ||||
while (vp != NULL && vp->v_type == VMARKER) | mvp = vnode_list_reclaim_marker; | ||||
vp = TAILQ_NEXT(vp, v_nmntvnodes); | restart: | ||||
if (vp == NULL) | vp = mvp; | ||||
while (done < target) { | |||||
vp = TAILQ_NEXT(vp, v_vnodelist); | |||||
if (__predict_false(vp == NULL)) | |||||
break; | break; | ||||
if (__predict_false(vp->v_type == VMARKER || vp->v_type == VBAD || | |||||
vp->v_type == VNON)) | |||||
continue; | |||||
/* | /* | ||||
* XXX LRU is completely broken for non-free vnodes. First | |||||
* by calling here in mountpoint order, then by moving | |||||
* unselected vnodes to the end here, and most grossly by | |||||
* removing the vlruvp() function that was supposed to | |||||
* maintain the order. (This function was born broken | |||||
* since syncer problems prevented it doing anything.) The | |||||
* order is closer to LRC (C = Created). | |||||
* | |||||
* LRU reclaiming of vnodes seems to have last worked in | |||||
* FreeBSD-3 where LRU wasn't mentioned under any spelling. | |||||
* Then there was no hold count, and inactive vnodes were | |||||
* simply put on the free list in LRU order. The separate | |||||
* lists also break LRU. We prefer to reclaim from the | |||||
* free list for technical reasons. This tends to thrash | |||||
* the free list to keep very unrecently used held vnodes. | |||||
* The problem is mitigated by keeping the free list large. | |||||
*/ | |||||
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); | |||||
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); | |||||
--count; | |||||
if (!VI_TRYLOCK(vp)) | |||||
goto next_iter; | |||||
/* | |||||
* If it's been deconstructed already, it's still | * If it's been deconstructed already, it's still | ||||
* referenced, or it exceeds the trigger, skip it. | * referenced, or it exceeds the trigger, skip it. | ||||
* Also skip free vnodes. We are trying to make space | * Also skip free vnodes. We are trying to make space | ||||
* to expand the free list, not reduce it. | * to expand the free list, not reduce it. | ||||
*/ | */ | ||||
if (vp->v_usecount || | if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || | ||||
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) | |||||
goto next_iter; | |||||
if (!VI_TRYLOCK(vp)) | |||||
goto next_iter; | |||||
if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || | |||||
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || | (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || | ||||
vp->v_holdcnt == 0 || | vp->v_type == VBAD || vp->v_type == VNON || | ||||
VN_IS_DOOMED(vp) || (vp->v_object != NULL && | (vp->v_object != NULL && | ||||
vp->v_object->resident_page_count > trigger)) { | vp->v_object->resident_page_count > trigger)) { | ||||
VI_UNLOCK(vp); | VI_UNLOCK(vp); | ||||
goto next_iter; | goto next_iter; | ||||
} | } | ||||
MNT_IUNLOCK(mp); | |||||
vholdl(vp); | vholdl(vp); | ||||
if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) { | VI_UNLOCK(vp); | ||||
TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); | |||||
TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); | |||||
mtx_unlock(&vnode_list_mtx); | |||||
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { | |||||
vdrop(vp); | vdrop(vp); | ||||
goto next_iter_mntunlocked; | goto next_iter_unlocked; | ||||
} | } | ||||
if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { | |||||
vdrop(vp); | |||||
vn_finished_write(mp); | |||||
goto next_iter_unlocked; | |||||
} | |||||
VI_LOCK(vp); | VI_LOCK(vp); | ||||
/* | if (vp->v_usecount > 0 || | ||||
* v_usecount may have been bumped after VOP_LOCK() dropped | |||||
* the vnode interlock and before it was locked again. | |||||
* | |||||
* It is not necessary to recheck VIRF_DOOMED because it can | |||||
* only be set by another thread that holds both the vnode | |||||
* lock and vnode interlock. If another thread has the | |||||
* vnode lock before we get to VOP_LOCK() and obtains the | |||||
* vnode interlock after VOP_LOCK() drops the vnode | |||||
* interlock, the other thread will be unable to drop the | |||||
* vnode lock before our VOP_LOCK() call fails. | |||||
*/ | |||||
if (vp->v_usecount || | |||||
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || | (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || | ||||
(vp->v_object != NULL && | (vp->v_object != NULL && | ||||
vp->v_object->resident_page_count > trigger)) { | vp->v_object->resident_page_count > trigger)) { | ||||
VOP_UNLOCK(vp); | VOP_UNLOCK(vp); | ||||
vdropl(vp); | vdropl(vp); | ||||
goto next_iter_mntunlocked; | vn_finished_write(mp); | ||||
goto next_iter_unlocked; | |||||
} | } | ||||
KASSERT(!VN_IS_DOOMED(vp), | |||||
("VIRF_DOOMED unexpectedly detected in vlrureclaim()")); | |||||
counter_u64_add(recycles_count, 1); | counter_u64_add(recycles_count, 1); | ||||
vgonel(vp); | vgonel(vp); | ||||
VOP_UNLOCK(vp); | VOP_UNLOCK(vp); | ||||
vdropl(vp); | vdropl(vp); | ||||
vn_finished_write(mp); | |||||
done++; | done++; | ||||
next_iter_mntunlocked: | next_iter_unlocked: | ||||
if (!should_yield()) | if (should_yield()) | ||||
goto relock_mnt; | kern_yield(PRI_USER); | ||||
goto yield; | mtx_lock(&vnode_list_mtx); | ||||
goto restart; | |||||
next_iter: | next_iter: | ||||
MPASS(vp->v_type != VMARKER); | |||||
if (!should_yield()) | if (!should_yield()) | ||||
continue; | continue; | ||||
MNT_IUNLOCK(mp); | TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); | ||||
yield: | TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); | ||||
mtx_unlock(&vnode_list_mtx); | |||||
kern_yield(PRI_USER); | kern_yield(PRI_USER); | ||||
relock_mnt: | mtx_lock(&vnode_list_mtx); | ||||
MNT_ILOCK(mp); | goto restart; | ||||
} | } | ||||
MNT_IUNLOCK(mp); | if (done == 0 && !retried) { | ||||
vn_finished_write(mp); | TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); | ||||
return done; | TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); | ||||
retried = true; | |||||
goto restart; | |||||
} | } | ||||
mtx_unlock(&vnode_list_mtx); | |||||
return (done); | |||||
} | |||||
static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ | static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ | ||||
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, | SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, | ||||
0, | 0, | ||||
"limit on vnode free requests per call to the vnlru_free routine"); | "limit on vnode free requests per call to the vnlru_free routine"); | ||||
/* | /* | ||||
* Attempt to reduce the free list by the requested amount. | * Attempt to reduce the free list by the requested amount. | ||||
▲ Show 20 Lines • Show All 86 Lines • ▼ Show 20 Lines | |||||
*/ | */ | ||||
static struct proc *vnlruproc; | static struct proc *vnlruproc; | ||||
static int vnlruproc_sig; | static int vnlruproc_sig; | ||||
static void | static void | ||||
vnlru_proc(void) | vnlru_proc(void) | ||||
{ | { | ||||
u_long rnumvnodes, rfreevnodes; | u_long rnumvnodes, rfreevnodes; | ||||
struct mount *mp, *nmp; | |||||
unsigned long onumvnodes; | unsigned long onumvnodes; | ||||
int done, force, trigger, usevnodes, vsp; | int done, force, trigger, usevnodes, vsp; | ||||
bool reclaim_nc_src; | bool reclaim_nc_src; | ||||
EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, | EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, | ||||
SHUTDOWN_PRI_FIRST); | SHUTDOWN_PRI_FIRST); | ||||
force = 0; | force = 0; | ||||
Show All 23 Lines | for (;;) { | ||||
if (vsp >= vlowat && force == 0) { | if (vsp >= vlowat && force == 0) { | ||||
vnlruproc_sig = 0; | vnlruproc_sig = 0; | ||||
wakeup(&vnlruproc_sig); | wakeup(&vnlruproc_sig); | ||||
msleep(vnlruproc, &vnode_list_mtx, | msleep(vnlruproc, &vnode_list_mtx, | ||||
PVFS|PDROP, "vlruwt", hz); | PVFS|PDROP, "vlruwt", hz); | ||||
continue; | continue; | ||||
} | } | ||||
mtx_unlock(&vnode_list_mtx); | mtx_unlock(&vnode_list_mtx); | ||||
done = 0; | |||||
rnumvnodes = atomic_load_long(&numvnodes); | rnumvnodes = atomic_load_long(&numvnodes); | ||||
rfreevnodes = atomic_load_long(&freevnodes); | rfreevnodes = atomic_load_long(&freevnodes); | ||||
onumvnodes = rnumvnodes; | onumvnodes = rnumvnodes; | ||||
/* | /* | ||||
* Calculate parameters for recycling. These are the same | * Calculate parameters for recycling. These are the same | ||||
* throughout the loop to give some semblance of fairness. | * throughout the loop to give some semblance of fairness. | ||||
* The trigger point is to avoid recycling vnodes with lots | * The trigger point is to avoid recycling vnodes with lots | ||||
Show All 13 Lines | for (;;) { | ||||
* it is effectively infinite in some congested and | * it is effectively infinite in some congested and | ||||
* misconfigured cases, and this is necessary. Normally | * misconfigured cases, and this is necessary. Normally | ||||
* it is about 8 to 100 (pages), which is quite large. | * it is about 8 to 100 (pages), which is quite large. | ||||
*/ | */ | ||||
trigger = vm_cnt.v_page_count * 2 / usevnodes; | trigger = vm_cnt.v_page_count * 2 / usevnodes; | ||||
if (force < 2) | if (force < 2) | ||||
trigger = vsmalltrigger; | trigger = vsmalltrigger; | ||||
reclaim_nc_src = force >= 3; | reclaim_nc_src = force >= 3; | ||||
mtx_lock(&mountlist_mtx); | done = vlrureclaim(reclaim_nc_src, trigger); | ||||
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { | |||||
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { | |||||
nmp = TAILQ_NEXT(mp, mnt_list); | |||||
continue; | |||||
} | |||||
done += vlrureclaim(mp, reclaim_nc_src, trigger); | |||||
mtx_lock(&mountlist_mtx); | |||||
nmp = TAILQ_NEXT(mp, mnt_list); | |||||
vfs_unbusy(mp); | |||||
} | |||||
mtx_unlock(&mountlist_mtx); | |||||
if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) | if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) | ||||
uma_reclaim(UMA_RECLAIM_DRAIN); | uma_reclaim(UMA_RECLAIM_DRAIN); | ||||
if (done == 0) { | if (done == 0) { | ||||
if (force == 0 || force == 1) { | if (force == 0 || force == 1) { | ||||
force = 2; | force = 2; | ||||
continue; | continue; | ||||
} | } | ||||
if (force == 2) { | if (force == 2) { | ||||
▲ Show 20 Lines • Show All 4,931 Lines • Show Last 20 Lines |