Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -296,6 +296,15 @@
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
     "Number of times I/O speeded up (rush requests)");
 
+#define	VDBATCH_SIZE 8
+struct vdbatch {
+	u_int index;
+	u_int count;
+	struct mtx lock;
+	struct vnode *tab[VDBATCH_SIZE];
+};
+DPCPU_DEFINE_STATIC(struct vdbatch, vd);
+
 /*
  * When shutting down the syncer, run it at four times normal speed.
  */
@@ -313,6 +322,8 @@
 static int vstir;		/* nonzero to stir non-free vnodes */
 static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
 
+static void vdbatch_dequeue(struct vnode *vp);
+
 static int
 sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
 {
@@ -514,6 +525,8 @@
 	 */
 	rangelock_init(&vp->v_rl);
 
+	vp->v_dbatchcpu = NOCPU;
+
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_INSERT_BEFORE(vnode_list_marker, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
@@ -530,6 +543,7 @@
 	struct bufobj *bo;
 
 	vp = mem;
+	vdbatch_dequeue(vp);
 	mtx_lock(&vnode_list_mtx);
 	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
 	mtx_unlock(&vnode_list_mtx);
@@ -564,8 +578,9 @@
 static void
 vntblinit(void *dummy __unused)
 {
+	struct vdbatch *vd;
+	int cpu, physvnodes, virtvnodes;
 	u_int i;
-	int physvnodes, virtvnodes;
 
 	/*
 	 * Desiredvnodes is a function of the physical memory size and the
@@ -625,6 +640,12 @@
 	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
 		vnsz2log++;
 	vnsz2log--;
+
+	CPU_FOREACH(cpu) {
+		vd = DPCPU_ID_PTR((cpu), vd);
+		bzero(vd, sizeof(*vd));
+		mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
+	}
 }
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
@@ -3175,6 +3196,106 @@
 #endif
 }
 
+static void
+vdbatch_process(struct vdbatch *vd)
+{
+	struct vnode *vp;
+	int i;
+
+	mtx_assert(&vd->lock, MA_OWNED);
+
+	mtx_lock(&vnode_list_mtx);
+	for (i = 0; i < VDBATCH_SIZE; i++) {
+		vp = vd->tab[i];
+		if (vp == NULL)
+			continue;
+		TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
+		TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
+		MPASS(vp->v_dbatchcpu != NOCPU);
+		vp->v_dbatchcpu = NOCPU;
+		vd->tab[i] = NULL;
+	}
+	vd->index = 0;
+	vd->count = 0;
+	mtx_unlock(&vnode_list_mtx);
+}
+
+static void
+vdbatch_enqueue(struct vnode *vp)
+{
+	struct vdbatch *vd;
+	int i;
+
+	ASSERT_VI_LOCKED(vp, __func__);
+	VNASSERT(!VN_IS_DOOMED(vp), vp,
+	    ("%s: deferring requeue of a doomed vnode", __func__));
+
+	if (vp->v_dbatchcpu != NOCPU) {
+		VI_UNLOCK(vp);
+		return;
+	}
+	/*
+	 * A hack: pin us to the current CPU so that we know what to put in
+	 * ->v_dbatchcpu.
+	 */
+	sched_pin();
+	vd = DPCPU_PTR(vd);
+	mtx_lock(&vd->lock);
+	KASSERT(vd->index < VDBATCH_SIZE && vd->count < VDBATCH_SIZE,
+	    ("%s: invalid state of vdbatch (index %u, count %u)\n",
+	    __func__, vd->index, vd->count));
+	for (i = vd->index; i < VDBATCH_SIZE; i++) {
+		if (vd->tab[i] != NULL)
+			continue;
+		MPASS(curthread->td_pinned);
+		vp->v_dbatchcpu = curcpu;
+		vd->tab[i] = vp;
+		vd->index = i + 1;
+		vd->count++;
+		break;
+	}
+	KASSERT(vp->v_dbatchcpu != NOCPU,
+	    ("%s: failed to fit the vnode (index %u, count %u)\n",
+	    __func__, vd->index, vd->count));
+	VI_UNLOCK(vp);
+	sched_unpin();
+	if (vd->count == VDBATCH_SIZE)
+		vdbatch_process(vd);
+	mtx_unlock(&vd->lock);
+}
+
+/*
+ * This routine must only be called for vnodes which are about to be
+ * deallocated. Supporting dequeue for arbitrary vnodes would require
+ * validating that the locked batch matches.
+ */
+static void
+vdbatch_dequeue(struct vnode *vp)
+{
+	struct vdbatch *vd;
+	int i;
+	short cpu;
+
+	VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
+	    ("%s: called for a used vnode\n", __func__));
+	cpu = atomic_load_short(&vp->v_dbatchcpu);
+	if (cpu == NOCPU)
+		return;
+	vd = DPCPU_ID_PTR(cpu, vd);
+	mtx_lock(&vd->lock);
+	for (i = 0; i < VDBATCH_SIZE; i++) {
+		if (vd->tab[i] != vp)
+			continue;
+		vp->v_dbatchcpu = NOCPU;
+		vd->tab[i] = NULL;
+		if (i < vd->index)
+			vd->index = i;
+		vd->count--;
+		break;
+	}
+	mtx_unlock(&vd->lock);
+}
+
 /*
  * Drop the hold count of the vnode. If this is the last reference to
  * the vnode we place it on the free list unless it has been vgone'd
@@ -3212,12 +3333,8 @@
 			mp->mnt_dirtyvnodelistsize--;
 			mtx_unlock(&mp->mnt_listmtx);
 		}
-	mtx_lock(&vnode_list_mtx);
-	TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
-	TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
-	mtx_unlock(&vnode_list_mtx);
 	atomic_add_long(&freevnodes, 1);
-	VI_UNLOCK(vp);
+	vdbatch_enqueue(vp);
 }
 
 void
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -171,7 +171,8 @@
 	u_int	v_usecount;		/* I ref count of users */
 	u_int	v_iflag;		/* i vnode flags (see below) */
 	u_int	v_vflag;		/* v vnode flags */
-	u_int	v_mflag;		/* l mnt-specific vnode flags */
+	u_short	v_mflag;		/* l mnt-specific vnode flags */
+	short	v_dbatchcpu;		/* i LRU requeue deferral batch */
 	int	v_writecount;		/* I ref count of writers or
 					   (negative) text users */
 	u_int	v_hash;
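
For reference, below is a minimal userspace sketch of the batching scheme the patch introduces: instead of taking the global vnode_list lock on every deferred LRU requeue, requeues are staged in a small fixed-size table and the list lock is taken once per VDBATCH_SIZE deferrals. Everything in the sketch is illustrative rather than part of the patch: DPCPU is collapsed to a single static batch, pthread mutexes stand in for struct mtx, mock nodes stand in for vnodes, and the program is single-threaded for brevity.

#include <pthread.h>
#include <stdio.h>
#include <sys/queue.h>

#define	VDBATCH_SIZE	8
#define	NOCPU		(-1)

struct node {
	TAILQ_ENTRY(node) entry;
	int	dbatchcpu;	/* batch holding us, or NOCPU */
	int	id;
};

static TAILQ_HEAD(, node) lru = TAILQ_HEAD_INITIALIZER(lru);
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

struct vdbatch {
	unsigned index;		/* first slot worth scanning */
	unsigned count;		/* live entries in tab[] */
	pthread_mutex_t lock;
	struct node *tab[VDBATCH_SIZE];
};

/* One batch total; the kernel keeps one per CPU via DPCPU. */
static struct vdbatch vd = { .lock = PTHREAD_MUTEX_INITIALIZER };

/* Flush the batch: requeue every staged node to the LRU tail in one pass. */
static void
vdbatch_process(struct vdbatch *b)
{
	struct node *n;
	unsigned i;

	pthread_mutex_lock(&lru_lock);
	for (i = 0; i < VDBATCH_SIZE; i++) {
		if ((n = b->tab[i]) == NULL)
			continue;
		TAILQ_REMOVE(&lru, n, entry);
		TAILQ_INSERT_TAIL(&lru, n, entry);
		n->dbatchcpu = NOCPU;
		b->tab[i] = NULL;
	}
	b->index = 0;
	b->count = 0;
	pthread_mutex_unlock(&lru_lock);
}

/* Defer a requeue; lru_lock is only taken once per VDBATCH_SIZE calls. */
static void
vdbatch_enqueue(struct node *n)
{
	unsigned i;

	pthread_mutex_lock(&vd.lock);
	if (n->dbatchcpu == NOCPU) {
		for (i = vd.index; i < VDBATCH_SIZE; i++) {
			if (vd.tab[i] != NULL)
				continue;
			n->dbatchcpu = 0;	/* the mock's only "CPU" */
			vd.tab[i] = n;
			vd.index = i + 1;
			vd.count++;
			break;
		}
		if (vd.count == VDBATCH_SIZE)
			vdbatch_process(&vd);
	}
	pthread_mutex_unlock(&vd.lock);
}

int
main(void)
{
	static struct node nodes[20];
	int i;

	for (i = 0; i < 20; i++) {
		nodes[i].id = i;
		nodes[i].dbatchcpu = NOCPU;
		TAILQ_INSERT_TAIL(&lru, &nodes[i], entry);
	}

	/* 16 deferred requeues trigger exactly two batch flushes. */
	for (i = 0; i < 16; i++)
		vdbatch_enqueue(&nodes[i]);
	printf("entries still pending in the batch: %u\n", vd.count);
	return (0);
}

Even in the mock, the dbatchcpu field earns its keep twice: it prevents a node that is already staged from being inserted a second time, and in the kernel version it additionally records which CPU's batch vdbatch_dequeue must search when a vnode is freed before its batch fills.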