sys/kern/vfs_subr.c
[... 1,431 lines not shown ...]
         if (vsp < vlowat && vnlruproc_sig == 0) {
                 wakeup(vnlruproc);
         }
 }

 /*
  * Wait if necessary for space for a new vnode.
  */
 static int
-getnewvnode_wait(int suspended)
+vn_alloc_wait(int suspended)
 {

         mtx_assert(&vnode_list_mtx, MA_OWNED);
         if (numvnodes >= desiredvnodes) {
                 if (suspended) {
                         /*
                          * The file system is being suspended.  We cannot
                          * risk a deadlock here, so allow allocation of
[... 9 lines not shown ...]
                 msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS,
                     "vlruwk", hz);
         }
         /* Post-adjust like the pre-adjust in getnewvnode(). */
         if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
                 vnlru_free_locked(1, NULL);
         return (numvnodes >= desiredvnodes ? ENFILE : 0);
 }
-/*
- * This hack is fragile, and probably not needed any more now that the
- * watermark handling works.
- */
-void
-getnewvnode_reserve(u_int count)
-{
-        u_long rnumvnodes, rfreevnodes;
-        struct thread *td;
-
-        /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
-        /* XXX no longer so quick, but this part is not racy. */
-        mtx_lock(&vnode_list_mtx);
-        rnumvnodes = atomic_load_long(&numvnodes);
-        rfreevnodes = atomic_load_long(&freevnodes);
-        if (rnumvnodes + count > desiredvnodes && rfreevnodes > wantfreevnodes)
-                vnlru_free_locked(ulmin(rnumvnodes + count - desiredvnodes,
-                    rfreevnodes - wantfreevnodes), NULL);
-        mtx_unlock(&vnode_list_mtx);
-
-        td = curthread;
-        /* First try to be quick and racy. */
-        if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
-                td->td_vp_reserv += count;
-                vcheckspace();        /* XXX no longer so quick, but more racy */
-                return;
-        } else
-                atomic_subtract_long(&numvnodes, count);
-
-        mtx_lock(&vnode_list_mtx);
-        while (count > 0) {
-                if (getnewvnode_wait(0) == 0) {
-                        count--;
-                        td->td_vp_reserv++;
-                        atomic_add_long(&numvnodes, 1);
-                }
-        }
-        vcheckspace();
-        mtx_unlock(&vnode_list_mtx);
-}
-
-/*
- * This hack is fragile, especially if desiredvnodes or wantvnodes are
- * misconfgured or changed significantly.  Reducing desiredvnodes below
- * the reserved amount should cause bizarre behaviour like reducing it
- * below the number of active vnodes -- the system will try to reduce
- * numvnodes to match, but should fail, so the subtraction below should
- * not overflow.
- */
-void
-getnewvnode_drop_reserve(void)
-{
-        struct thread *td;
-
-        td = curthread;
-        atomic_subtract_long(&numvnodes, td->td_vp_reserv);
-        td->td_vp_reserv = 0;
-}
-
-/*
- * Return the next vnode from the free list.
- */
-int
-getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
-    struct vnode **vpp)
+static struct vnode *
+vn_alloc(struct mount *mp)
 {
         struct vnode *vp;
-        struct thread *td;
-        struct lock_object *lo;
         static int cyclecount;
         int error __unused;

-        CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
-
-        KASSERT(vops->registered,
-            ("%s: not registered vector op %p\n", __func__, vops));
-
-        vp = NULL;
-        td = curthread;
-        if (td->td_vp_reserv > 0) {
-                td->td_vp_reserv -= 1;
-                goto alloc;
-        }
+        vp = uma_zalloc(vnode_zone, M_NOWAIT);
    [Inline comment thread on the new uma_zalloc(vnode_zone, M_NOWAIT) call]

    kib: I disagree with this, completely. The vnode limit is not there to
    enforce some arbitrary limit on vnodes; it is there to allow the kernel
    to avoid KVA exhaustion. The problem comes mostly from the secondary
    allocations made for vnodes: instantiating a UFS vnode, for instance,
    also allocates a vm_object, an inode, a dinode, and (often) an extended
    attribute area, and probably more. From what I saw, it is even more
    severe for ZFS. Too-high values of desiredvnodes killed 32-bit machines
    (see e.g. 295971), and your change makes the limit not effective at all.

    mjg (author): This is part of why this allocation uses M_NOWAIT. Should
    there be any RAM shortage, it is expected to fail, at which point we go
    back to the old way.

    kib: Were vnodes the only memory consumer, then yes, that would mostly
    work. But they are not, and there are very significant allocations done
    after each vnode allocation, as I mentioned above. Both factors make
    overriding the limit very undesirable, and actually fatal on KVA-starved
    arches (regardless of how much memory they have). Also, the existing
    override of the vnode count limit is applied only when the request comes
    from the suspender. (That is not completely true: the suspension check
    was added before the suspension owner field was added to struct mount.)
    It exists because blocking the suspender there causes a deadlock, and it
    could be improved by checking that curthread == mp->mnt_susp_owner.
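    As a rough sketch of the refinement kib suggests, the suspension
    override could be keyed to the thread that actually owns the suspension
    rather than to MNTK_SUSPEND alone. This is not part of the diff: the
    helper name vn_alloc_suspend_owner() is hypothetical, while
    mnt_susp_owner, mnt_kern_flag, MNTK_SUSPEND, and curthread are the
    existing names in struct mount and the kernel.

    /*
     * Hypothetical helper: report whether the calling thread owns a file
     * system suspension on mp.  Only that thread would be allowed to exceed
     * the vnode limit, since blocking it in vn_alloc_wait() would deadlock
     * the suspension; every other thread waits for space as usual.
     */
    static bool
    vn_alloc_suspend_owner(struct mount *mp)
    {

            return (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
                mp->mnt_susp_owner == curthread);
    }

    Under that assumption, the vn_alloc_wait() call in vn_alloc() would take
    vn_alloc_suspend_owner(mp) as its argument instead of the plain
    MNTK_SUSPEND test.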
+        if (vp != NULL) {
+                atomic_add_long(&numvnodes, 1);
+                counter_u64_add(vnodes_created, 1);
+                return (vp);
+        }
         mtx_lock(&vnode_list_mtx);
         if (numvnodes < desiredvnodes)
                 cyclecount = 0;
         else if (cyclecount++ >= freevnodes) {
                 cyclecount = 0;
                 vstir = 1;
         }
         /*
          * Grow the vnode cache if it will not be above its target max
          * after growing.  Otherwise, if the free list is nonempty, try
          * to reclaim 1 item from it before growing the cache (possibly
          * above its target max if the reclamation failed or is delayed).
          * Otherwise, wait for some space.  In all cases, schedule
          * vnlru_proc() if we are getting short of space.  The watermarks
          * should be chosen so that we never wait or even reclaim from
          * the free list to below its target minimum.
          */
         if (numvnodes + 1 <= desiredvnodes)
                 ;
         else if (freevnodes > 0)
                 vnlru_free_locked(1, NULL);
         else {
-                error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
+                error = vn_alloc_wait(mp != NULL && (mp->mnt_kern_flag &
                     MNTK_SUSPEND));
 #if 0   /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
                 if (error != 0) {
                         mtx_unlock(&vnode_list_mtx);
                         return (error);
                 }
 #endif
         }
         vcheckspace();
         atomic_add_long(&numvnodes, 1);
         mtx_unlock(&vnode_list_mtx);
-alloc:
         counter_u64_add(vnodes_created, 1);
-        vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
+        return (uma_zalloc(vnode_zone, M_WAITOK));
+}
+
+static void
+vn_free(struct vnode *vp)
+{
+
+        atomic_subtract_long(&numvnodes, 1);
+        uma_zfree(vnode_zone, vp);
+}
+
+void
+getnewvnode_reserve(void)
+{
+        struct thread *td;
+
+        td = curthread;
+        MPASS(td->td_vp_reserved == NULL);
+        td->td_vp_reserved = vn_alloc(NULL);
+}
+
+void
+getnewvnode_drop_reserve(void)
+{
+        struct thread *td;
+
+        td = curthread;
+        if (td->td_vp_reserved != NULL) {
+                vn_free(td->td_vp_reserved);
+                td->td_vp_reserved = NULL;
+        }
+}
+/*
+ * Return the next vnode from the free list.
+ */
+int
+getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
+    struct vnode **vpp)
+{
+        struct vnode *vp;
+        struct thread *td;
+        struct lock_object *lo;
+
+        CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
+
+        KASSERT(vops->registered,
+            ("%s: not registered vector op %p\n", __func__, vops));
+
+        td = curthread;
+        if (td->td_vp_reserved != NULL) {
+                vp = td->td_vp_reserved;
+                td->td_vp_reserved = NULL;
+        } else {
+                vp = vn_alloc(mp);
+        }
         /*
          * Locks are given the generic name "vnode" when created.
          * Follow the historic practice of using the filesystem
          * name when they allocated, e.g., "zfs", "ufs", "nfs, etc.
          *
          * Locks live in a witness group keyed on their name. Thus,
          * when a lock is renamed, it must also move from the witness
          * group of its old name to the witness group of its new name.
          *
[... 59 lines not shown; the diff resumes inside freevnode(struct vnode *vp) ...]
          *
          * The vnode will be returned to the zone where it will
          * normally remain until it is needed for another vnode. We
          * need to cleanup (or verify that the cleanup has already
          * been done) any residual data left from its current use
          * so as not to contaminate the freshly allocated vnode.
          */
         CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
-        atomic_subtract_long(&numvnodes, 1);
         bo = &vp->v_bufobj;
         VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
         VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
         VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
         VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
         VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
         VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
         VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
[... 22 lines not shown ...]
 #endif
         vp->v_unpcb = NULL;
         vp->v_rdev = NULL;
         vp->v_fifoinfo = NULL;
         vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
         vp->v_irflag = 0;
         vp->v_iflag = 0;
         vp->v_vflag = 0;
         bo->bo_flag = 0;
-        uma_zfree(vnode_zone, vp);
+        vn_free(vp);
 }

 /*
  * Delete from old mount point vnode list, if on one.
  */
 static void
 delmntque(struct vnode *vp)
 {
[... 4,580 lines not shown ...]