Index: sys/kern/vfs_cache.c
===================================================================
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -56,11 +56,13 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
 #ifdef KTRACE
 #include
 #endif
@@ -104,7 +106,7 @@
  */
 struct namecache {
-	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
+	CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct vnode *nc_dvp;		/* vnode of parent of name */
@@ -143,6 +145,25 @@
 #define NCF_DVDROP	0x10
 #define NCF_NEGATIVE	0x20
 #define NCF_HOTNEGATIVE	0x40
+#define NCF_INVALID	0x80
+
+static bool
+cache_ncp_invalid(struct namecache *ncp)
+{
+
+	atomic_thread_fence_acq();
+	return ((ncp->nc_flag & NCF_INVALID) != 0);
+}
+
+static void
+cache_ncp_invalidate(struct namecache *ncp)
+{
+
+	atomic_thread_fence_rel();
+	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
+	    ("%s: entry %p already invalid", __func__, ncp));
+	ncp->nc_flag |= NCF_INVALID;
+}
 
 /*
  * Name caching works as follows:
@@ -192,12 +213,14 @@
  * the first node, locking everything in order and revalidating the state.
  */

+VFS_SMR_DECLARE;
+
 /*
  * Structures associated with name caching.
  */
 #define NCHHASH(hash) \
	(&nchashtbl[(hash) & nchash])
-static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
+static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 static u_long __read_mostly	nchash;			/* size of hash table */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
     "Size of namecache hash table");
@@ -275,15 +298,15 @@

 	if (__predict_false(ts)) {
 		if (len <= CACHE_PATH_CUTOFF)
-			ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
+			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 		else
-			ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
+			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 		ncp = &ncp_ts->nc_nc;
 	} else {
 		if (len <= CACHE_PATH_CUTOFF)
-			ncp = uma_zalloc(cache_zone_small, M_WAITOK);
+			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 		else
-			ncp = uma_zalloc(cache_zone_large, M_WAITOK);
+			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 	}
 	return (ncp);
 }
@@ -300,14 +323,14 @@

 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
-			uma_zfree(cache_zone_small_ts, ncp_ts);
+			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 		else
-			uma_zfree(cache_zone_large_ts, ncp_ts);
+			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 	} else {
 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
-			uma_zfree(cache_zone_small, ncp);
+			uma_zfree_smr(cache_zone_small, ncp);
 		else
-			uma_zfree(cache_zone_large, ncp);
+			uma_zfree_smr(cache_zone_large, ncp);
 	}
 }
@@ -336,6 +359,9 @@
     "VFS namecache enabled");
 #endif

+static bool __read_mostly cache_try_smr = true;
+SYSCTL_BOOL(_debug, OID_AUTO, cache_try_smr, CTLFLAG_RW, &cache_try_smr, 0, "");
+
 /* Export size information to userland */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
     sizeof(struct namecache), "sizeof(struct namecache)");
@@ -605,7 +631,7 @@
 	}
 	/* Scan hash tables counting entries */
 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
-		LIST_FOREACH(ncp, ncpp, nc_hash)
+		CK_LIST_FOREACH(ncp, ncpp, nc_hash)
 			cntbuf[i]++;
 	cache_unlock_all_buckets();
 	for (error = 0, i = 0; i < n_nchash; i++)
@@ -638,7 +664,7 @@
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
-		LIST_FOREACH(ncp, ncpp, nc_hash) {
+		CK_LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		if (count)
@@ -859,7 +885,10 @@

 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
 	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
-	LIST_REMOVE(ncp, nc_hash);
+
+	cache_ncp_invalidate(ncp);
+
+	CK_LIST_REMOVE(ncp, nc_hash);
 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
 		    ncp->nc_name, ncp->nc_vp);
@@ -1012,7 +1041,7 @@
 	cache_sort_vnodes(&dvlp, &vlp);
 	cache_lock_vnodes(dvlp, vlp);
 	rw_wlock(blp);
-	LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
+	CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
 		if (rncp == ncp && rncp->nc_dvp == dvp &&
 		    rncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
@@ -1224,12 +1253,12 @@
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 	blp = HASH2BUCKETLOCK(hash);
 retry:
-	if (LIST_EMPTY(NCHHASH(hash)))
+	if (CK_LIST_EMPTY(NCHHASH(hash)))
 		goto out_no_entry;

 	rw_wlock(blp);

-	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		counter_u64_add(numchecks, 1);
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
@@ -1304,6 +1333,7 @@
 	uint32_t hash;
 	enum vgetstate vs;
 	int error, ltype;
+	bool try_smr, doing_smr;

 #ifdef DEBUG_CACHE
 	if (__predict_false(!doingcache)) {
@@ -1320,7 +1350,11 @@
 	if ((cnp->cn_flags & MAKEENTRY) == 0)
 		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));

+	try_smr = atomic_load_char(&cache_try_smr);
+	if (cnp->cn_nameiop == CREATE)
+		try_smr = false;
 retry:
+	doing_smr = false;
 	blp = NULL;
 	dvlp = NULL;
 	error = 0;
@@ -1360,10 +1394,17 @@
 	}

 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
-	blp = HASH2BUCKETLOCK(hash);
-	rw_rlock(blp);
+retry_hashed:
+	if (try_smr) {
+		vfs_smr_enter();
+		doing_smr = true;
+		try_smr = false;
+	} else {
+		blp = HASH2BUCKETLOCK(hash);
+		rw_rlock(blp);
+	}

-	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		counter_u64_add(numchecks, 1);
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
@@ -1372,15 +1413,28 @@

 	/* We failed to find an entry */
 	if (__predict_false(ncp == NULL)) {
-		rw_runlock(blp);
+		if (doing_smr)
+			vfs_smr_exit();
+		else
+			rw_runlock(blp);
 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 		    NULL);
 		counter_u64_add(nummiss, 1);
 		return (0);
 	}

-	if (ncp->nc_flag & NCF_NEGATIVE)
+	/*
+	 * Negative entries may need locks to be taken and are temporarily not
+	 * handled for simplicity.
+	 */
+	if (ncp->nc_flag & NCF_NEGATIVE) {
+		if (doing_smr) {
+			vfs_smr_exit();
+			doing_smr = false;
+			goto retry_hashed;
+		}
 		goto negative_success;
+	}

 	/* We found a "positive" match, return the vnode */
 	counter_u64_add(numposhits, 1);
@@ -1401,8 +1455,22 @@
 		ltype = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp);
 	}
-	vs = vget_prep(*vpp);
-	cache_lookup_unlock(blp, dvlp);
+	if (doing_smr) {
+		if (cache_ncp_invalid(ncp)) {
+			vfs_smr_exit();
+			*vpp = NULL;
+			goto retry;
+		}
+		vs = vget_prep_smr(*vpp);
+		vfs_smr_exit();
+		if (vs == VGET_NONE) {
+			*vpp = NULL;
+			goto retry;
+		}
+	} else {
+		vs = vget_prep(*vpp);
+		cache_lookup_unlock(blp, dvlp);
+	}
 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
@@ -1424,6 +1492,7 @@
 	return (-1);

negative_success:
+	MPASS(!doing_smr);
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		counter_u64_add(numnegzaps, 1);
@@ -1441,6 +1510,7 @@
 	return (ENOENT);

zap_and_exit:
+	MPASS(!doing_smr);
 	if (blp != NULL)
 		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
 	else
@@ -1774,7 +1844,7 @@
 	 * the same path name.
 	 */
 	ncpp = NCHHASH(hash);
-	LIST_FOREACH(n2, ncpp, nc_hash) {
+	CK_LIST_FOREACH(n2, ncpp, nc_hash) {
 		if (n2->nc_dvp == dvp &&
 		    n2->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
@@ -1838,12 +1908,6 @@
 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	}

-	/*
-	 * Insert the new namecache entry into the appropriate chain
-	 * within the cache entries table.
-	 */
-	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
-
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
@@ -1860,6 +1924,14 @@
 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 		    ncp->nc_name);
 	}
+
+	atomic_thread_fence_rel();
+	/*
+	 * Insert the new namecache entry into the appropriate chain
+	 * within the cache entries table.
+	 */
+	CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
 	cache_enter_unlock(&cel);
 	if (numneg * ncnegfactor > lnumcache)
 		cache_negative_zap_one();
@@ -1907,6 +1979,11 @@
 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
 	    UMA_ZONE_ZINIT);

+	VFS_SMR_ZONE_SET(cache_zone_small);
+	VFS_SMR_ZONE_SET(cache_zone_small_ts);
+	VFS_SMR_ZONE_SET(cache_zone_large);
+	VFS_SMR_ZONE_SET(cache_zone_large_ts);
+
 	ncsize = desiredvnodes * ncsizefactor;
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
@@ -1992,11 +2069,11 @@
 	nchashtbl = new_nchashtbl;
 	nchash = new_nchash;
 	for (i = 0; i <= old_nchash; i++) {
-		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
+		while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) {
 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 			    ncp->nc_dvp);
-			LIST_REMOVE(ncp, nc_hash);
-			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
+			CK_LIST_REMOVE(ncp, nc_hash);
+			CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 		}
 	}
 	ncsize = newncsize;
@@ -2110,7 +2187,7 @@
 	for (j = i; j < n_nchash; j += numbucketlocks) {
retry:
 		bucket = &nchashtbl[j];
-		LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+		CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
 			cache_assert_bucket_locked(ncp, RA_WLOCKED);
 			if (ncp->nc_dvp->v_mount != mp)
 				continue;
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -76,6 +76,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -238,6 +239,8 @@
 static uma_zone_t vnode_zone;
 static uma_zone_t vnodepoll_zone;

+__read_frequently smr_t vfs_smr;
+
 /*
  * The workitem queue.
 *
@@ -661,7 +664,8 @@
 	vnode_list_reclaim_marker = vn_alloc_marker(NULL);
 	TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
-	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+	    vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
+	vfs_smr = uma_zone_get_smr(vnode_zone);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	/*
@@ -1608,7 +1612,7 @@
 	if (vnlru_under(rnumvnodes, vlowat))
 		vnlru_kick();
 	mtx_unlock(&vnode_list_mtx);
-	return (uma_zalloc(vnode_zone, M_WAITOK));
+	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }

 static struct vnode *
@@ -1624,7 +1628,7 @@
 		return (vn_alloc_hard(mp));
 	}

-	return (uma_zalloc(vnode_zone, M_WAITOK));
+	return (uma_zalloc_smr(vnode_zone, M_WAITOK));
 }

 static void
@@ -1632,7 +1636,7 @@
 {

 	atomic_subtract_long(&numvnodes, 1);
-	uma_zfree(vnode_zone, vp);
+	uma_zfree_smr(vnode_zone, vp);
 }

 /*
@@ -1763,7 +1767,7 @@
 	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
 	bo = &vp->v_bufobj;
 	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
-	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
+	VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
 	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
 	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
 	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
@@ -2851,7 +2855,29 @@
 *
 * holdcnt can be manipulated using atomics without holding any locks,
 * except when transitioning 1<->0, in which case the interlock is held.
+ *
+ * Consumers which don't guarantee liveness of the vnode can use SMR to
+ * try to get a reference.  Note this operation can fail since the vnode
+ * may be awaiting getting freed by the time they get to it.
 */
+enum vgetstate
+vget_prep_smr(struct vnode *vp)
+{
+	enum vgetstate vs;
+
+	VFS_SMR_ASSERT_ENTERED();
+
+	if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
+		vs = VGET_USECOUNT;
+	} else {
+		if (vhold_smr(vp))
+			vs = VGET_HOLDCNT;
+		else
+			vs = VGET_NONE;
+	}
+	return (vs);
+}
+
 enum vgetstate
 vget_prep(struct vnode *vp)
 {
@@ -2922,6 +2948,7 @@
 		ASSERT_VI_LOCKED(vp, __func__);
 	else
 		ASSERT_VI_UNLOCKED(vp, __func__);
+	VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
 	VNPASS(vp->v_holdcnt > 0, vp);
 	VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);

@@ -3383,7 +3410,8 @@
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
-	VNASSERT(old >= 0, vp, ("%s: wrong hold count %d", __func__, old));
+	VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
+	    ("%s: wrong hold count %d", __func__, old));
 	if (old != 0)
 		return;
 	critical_enter();
@@ -3408,12 +3436,40 @@
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 #ifdef INVARIANTS
 	int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
-	VNASSERT(old > 0, vp, ("%s: wrong hold count %d", __func__, old));
+	VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
+	    ("%s: wrong hold count %d", __func__, old));
 #else
 	atomic_add_int(&vp->v_holdcnt, 1);
 #endif
 }

+/*
+ * Grab a hold count as long as the vnode is not getting freed.
+ *
+ * Only use this routine if vfs smr is the only protection you have against
+ * freeing the vnode.
+ */
+bool
+vhold_smr(struct vnode *vp)
+{
+	int count;
+
+	VFS_SMR_ASSERT_ENTERED();
+
+	count = atomic_load_int(&vp->v_holdcnt);
+	for (;;) {
+		if (count & VHOLD_NO_SMR) {
+			VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
+			    ("non-zero hold count with flags %d\n", count));
+			return (false);
+		}
+
+		VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
+		if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1))
+			return (true);
+	}
+}
+
 static void __noinline
 vdbatch_process(struct vdbatch *vd)
 {
@@ -3584,11 +3640,22 @@
 		VI_UNLOCK(vp);
 		return;
 	}
-	if (VN_IS_DOOMED(vp)) {
-		freevnode(vp);
+	if (!VN_IS_DOOMED(vp)) {
+		vdrop_deactivate(vp);
 		return;
 	}
-	vdrop_deactivate(vp);
+	/*
+	 * We may be racing against vhold_smr.
+	 *
+	 * If they win we can just pretend we never got this far, they will
+	 * vdrop later.
+	 */
+	if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) {
+		VNPASS(vp->v_holdcnt > 0, vp);
+		VNPASS((vp->v_holdcnt & VHOLD_NO_SMR) == 0, vp);
+		return;
+	}
+	freevnode(vp);
 }

 /*
@@ -4044,20 +4111,25 @@
     {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
     "VMARKER"};

+_Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
+    "new hold count flag not added to vn_printf");
+
 void
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list ap;
 	char buf[256], buf2[16];
 	u_long flags;
+	u_int holdcnt;

 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("%p: ", (void *)vp);
 	printf("type %s\n", typename[vp->v_type]);
+	holdcnt = atomic_load_int(&vp->v_holdcnt);
 	printf("    usecount %d, writecount %d, refcount %d",
-	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
+	    vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS);
 	switch (vp->v_type) {
 	case VDIR:
 		printf(" mountedhere %p\n", vp->v_mountedhere);
@@ -4075,6 +4147,12 @@
 		printf("\n");
 		break;
 	}
+	buf[0] = '\0';
+	buf[1] = '\0';
+	if (holdcnt & VHOLD_NO_SMR)
+		strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
+	printf("    hold count flags (%s)\n", buf + 1);
+
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_irflag & VIRF_DOOMED)
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -58,7 +58,7 @@
 enum vtype	{ VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD,
		  VMARKER };
-enum vgetstate	{ VGET_HOLDCNT, VGET_USECOUNT };
+enum vgetstate	{ VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT };

 /*
 * Each underlying filesystem allocates its own private area and hangs
 * it from v_data.  If non-null, this area is freed in getnewvnode().
@@ -236,6 +239,9 @@
 * VIRF_DOOMED is doubly protected by the interlock and vnode lock.  Both
 * are required for writing but the status may be checked with either.
 */
+#define	VHOLD_NO_SMR	(1<<29)	/* Disable vhold_smr */
+#define	VHOLD_ALL_FLAGS	(VHOLD_NO_SMR)
+
 #define	VIRF_DOOMED	0x0001	/* This vnode is being recycled */

 #define	VI_TEXT_REF	0x0001	/* Text ref grabbed use ref */
@@ -657,12 +660,14 @@
 void	vdropl(struct vnode *);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int flags, struct thread *td);
+enum vgetstate	vget_prep_smr(struct vnode *vp);
 enum vgetstate	vget_prep(struct vnode *vp);
 int	vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
 void	vgone(struct vnode *vp);
 void	vhold(struct vnode *);
 void	vholdl(struct vnode *);
 void	vholdnz(struct vnode *);
+bool	vhold_smr(struct vnode *);
 void	vinactive(struct vnode *vp);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, off_t length, int blksize);
@@ -974,6 +979,16 @@
	SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY,	\
	    vfs_vector_op_register, &vnodeops)

+#define	VFS_SMR_DECLARE				\
+	extern smr_t vfs_smr
+
+#define	VFS_SMR()	vfs_smr
+#define	vfs_smr_enter()	smr_enter(VFS_SMR())
+#define	vfs_smr_exit()	smr_exit(VFS_SMR())
+#define	VFS_SMR_ASSERT_ENTERED()	SMR_ASSERT_ENTERED(VFS_SMR())
+#define	VFS_SMR_ASSERT_NOT_ENTERED()	SMR_ASSERT_NOT_ENTERED(VFS_SMR())
+#define	VFS_SMR_ZONE_SET(zone)	uma_zone_set_smr((zone), VFS_SMR())
+
 #endif /* _KERNEL */

 #endif /* !_SYS_VNODE_H_ */
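
For reference, the consumer-side pattern the cache_lookup() hunks above wire in can be summarized as the sketch below. This is illustrative only and not part of the patch: the helper name cache_lookup_smr_sketch and its simplified argument list and return convention are hypothetical, and the fallback/retry handling is reduced to "return 0 so the caller takes the locked path". It only uses primitives introduced or already present in the patch (vfs_smr_enter/exit, CK_LIST_FOREACH, cache_ncp_invalid, vget_prep_smr, vget_finish).

/*
 * Illustrative sketch (not part of the patch): the lockless fast path of
 * cache_lookup() in one place.  Enter the SMR section, find the entry,
 * re-check NCF_INVALID, take a reference with vget_prep_smr(), leave the
 * SMR section and only then sleep in vget_finish().
 */
static int
cache_lookup_smr_sketch(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, uint32_t hash)
{
	struct namecache *ncp;
	enum vgetstate vs;

	vfs_smr_enter();
	CK_LIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}
	if (ncp == NULL || (ncp->nc_flag & NCF_NEGATIVE) != 0) {
		/* Miss or negative entry: fall back to the locked lookup. */
		vfs_smr_exit();
		return (0);
	}
	*vpp = ncp->nc_vp;
	if (cache_ncp_invalid(ncp)) {
		/* The entry was invalidated concurrently; fall back. */
		vfs_smr_exit();
		*vpp = NULL;
		return (0);
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (vs == VGET_NONE) {
		/* The vnode is already being freed; fall back. */
		*vpp = NULL;
		return (0);
	}
	/* Safe to sleep now that the SMR section has been left. */
	if (vget_finish(*vpp, cnp->cn_lkflags, vs) != 0) {
		*vpp = NULL;
		return (0);
	}
	return (-1);	/* cache hit, *vpp referenced and locked */
}

The key ordering constraint shown here is that vget_prep_smr() must run inside the SMR section (it asserts as much), while vget_finish() may sleep and therefore runs only after vfs_smr_exit(); failure at any step simply degrades to the pre-existing locked lookup.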