Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c +++ head/sys/kern/vfs_subr.c @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -238,6 +239,8 @@ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; +__read_frequently smr_t vfs_smr; + /* * The workitem queue. * @@ -661,7 +664,8 @@ vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, - vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); + vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR); + vfs_smr = uma_zone_get_smr(vnode_zone); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* @@ -1603,7 +1607,7 @@ if (vnlru_under(rnumvnodes, vlowat)) vnlru_kick(); mtx_unlock(&vnode_list_mtx); - return (uma_zalloc(vnode_zone, M_WAITOK)); + return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * @@ -1619,7 +1623,7 @@ return (vn_alloc_hard(mp)); } - return (uma_zalloc(vnode_zone, M_WAITOK)); + return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void @@ -1627,7 +1631,7 @@ { atomic_subtract_long(&numvnodes, 1); - uma_zfree(vnode_zone, vp); + uma_zfree_smr(vnode_zone, vp); } /* @@ -1758,7 +1762,7 @@ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); - VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); + VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); @@ -2848,8 +2852,30 @@ * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. 
+ * + * Consumers which don't guarantee liveness of the vnode can use SMR to + * try to get a reference. Note this operation can fail since the vnode + * may already be waiting to be freed by the time they get to it. */ enum vgetstate +vget_prep_smr(struct vnode *vp) +{ + enum vgetstate vs; + + VFS_SMR_ASSERT_ENTERED(); + + if (refcount_acquire_if_not_zero(&vp->v_usecount)) { + vs = VGET_USECOUNT; + } else { + if (vhold_smr(vp)) + vs = VGET_HOLDCNT; + else + vs = VGET_NONE; + } + return (vs); +} + +enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; @@ -2919,6 +2945,7 @@ ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); + VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); @@ -3380,7 +3407,8 @@ CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); - VNASSERT(old >= 0, vp, ("%s: wrong hold count %d", __func__, old)); + VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, + ("%s: wrong hold count %d", __func__, old)); if (old != 0) return; critical_enter(); @@ -3405,12 +3433,40 @@ CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); - VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old)); + VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, + ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } +/* + * Grab a hold count as long as the vnode is not being freed. + * + * Only use this routine if VFS SMR is the only protection you have against + * freeing the vnode. Returns true if a hold was obtained, false otherwise. 
+ */ +bool +vhold_smr(struct vnode *vp) +{ + int count; + + VFS_SMR_ASSERT_ENTERED(); + + count = atomic_load_int(&vp->v_holdcnt); + for (;;) { + if (count & VHOLD_NO_SMR) { + VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, + ("non-zero hold count with flags %d\n", count)); + return (false); + } + + VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); + if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) + return (true); + } +} + static void __noinline vdbatch_process(struct vdbatch *vd) { @@ -3581,11 +3637,25 @@ VI_UNLOCK(vp); return; } - if (VN_IS_DOOMED(vp)) { - freevnode(vp); + if (!VN_IS_DOOMED(vp)) { + vdrop_deactivate(vp); return; } - vdrop_deactivate(vp); + /* + * We may be racing against vhold_smr. Setting VHOLD_NO_SMR makes + * any later vhold_smr fail, so the vnode can then be freed. + * If they win we can just pretend we never got this far; they will + * vdrop later. + */ + if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) { + /* + * We lost the aforementioned race. Note that any subsequent + * access to vp is invalid as they might have managed to vdropl + * on their own. + */ + return; + } + freevnode(vp); } /* @@ -4041,20 +4111,25 @@ {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; +_Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, + "new hold count flag not added to vn_printf"); + void vn_printf(struct vnode *vp, const char *fmt, ...) 
{ va_list ap; char buf[256], buf2[16]; u_long flags; + u_int holdcnt; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s\n", typename[vp->v_type]); + holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d", - vp->v_usecount, vp->v_writecount, vp->v_holdcnt); + vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); @@ -4072,6 +4147,12 @@ printf("\n"); break; } + buf[0] = '\0'; + buf[1] = '\0'; + if (holdcnt & VHOLD_NO_SMR) + strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); + printf(" hold count flags (%s)\n", buf + 1); + buf[0] = '\0'; buf[1] = '\0'; if (vp->v_irflag & VIRF_DOOMED) Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h +++ head/sys/sys/vnode.h @@ -58,7 +58,7 @@ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; -enum vgetstate { VGET_HOLDCNT, VGET_USECOUNT }; +enum vgetstate { VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). @@ -236,6 +236,9 @@ * VIRF_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. 
*/ +#define VHOLD_NO_SMR (1<<29) /* Disable vhold_smr */ +#define VHOLD_ALL_FLAGS (VHOLD_NO_SMR) + #define VIRF_DOOMED 0x0001 /* This vnode is being recycled */ #define VI_TEXT_REF 0x0001 /* Text ref grabbed use ref */ @@ -657,12 +660,14 @@ void vdropl(struct vnode *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); int vget(struct vnode *vp, int flags, struct thread *td); +enum vgetstate vget_prep_smr(struct vnode *vp); enum vgetstate vget_prep(struct vnode *vp); int vget_finish(struct vnode *vp, int flags, enum vgetstate vs); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdl(struct vnode *); void vholdnz(struct vnode *); +bool vhold_smr(struct vnode *); void vinactive(struct vnode *vp); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, off_t length, int blksize); @@ -973,6 +978,16 @@ #define VFS_VOP_VECTOR_REGISTER(vnodeops) \ SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \ vfs_vector_op_register, &vnodeops) + +#define VFS_SMR_DECLARE \ + extern smr_t vfs_smr + +#define VFS_SMR() vfs_smr +#define vfs_smr_enter() smr_enter(VFS_SMR()) +#define vfs_smr_exit() smr_exit(VFS_SMR()) +#define VFS_SMR_ASSERT_ENTERED() SMR_ASSERT_ENTERED(VFS_SMR()) +#define VFS_SMR_ASSERT_NOT_ENTERED() SMR_ASSERT_NOT_ENTERED(VFS_SMR()) +#define VFS_SMR_ZONE_SET(zone) uma_zone_set_smr((zone), VFS_SMR()) #endif /* _KERNEL */