Index: sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c =================================================================== --- sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c +++ sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c @@ -156,6 +156,8 @@ } VOP_UNLOCK(vp); + vfs_seqc_write_begin(vp); + /* * Allocate and initialize the filesystem. * We don't want regular user that triggered snapshot mount to be able @@ -206,6 +208,7 @@ VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); + vfs_seqc_write_end(vp); vput(vp); vfs_unbusy(mp); vfs_freeopts(mp->mnt_optnew); @@ -242,6 +245,7 @@ if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp)) panic("mount: lost mount"); VOP_UNLOCK(vp); + vfs_seqc_write_end(vp); vfs_op_exit(mp); vfs_unbusy(mp); *vpp = mvp; Index: sys/fs/tmpfs/tmpfs_subr.c =================================================================== --- sys/fs/tmpfs/tmpfs_subr.c +++ sys/fs/tmpfs/tmpfs_subr.c @@ -75,6 +75,7 @@ static uma_zone_t tmpfs_dirent_pool; static uma_zone_t tmpfs_node_pool; +extern smr_t vfs_smr; static int tmpfs_node_ctor(void *mem, int size, void *arg, int flags) @@ -131,6 +132,7 @@ tmpfs_node_pool = uma_zcreate("TMPFS node", sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); + uma_zone_set_smr(tmpfs_node_pool, vfs_smr); } void @@ -288,7 +290,7 @@ if ((mp->mnt_kern_flag & MNT_RDONLY) != 0) return (EROFS); - nnode = uma_zalloc_arg(tmpfs_node_pool, tmp, M_WAITOK); + nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK); /* Generic initialization. 
*/ nnode->tn_type = type; @@ -435,7 +437,7 @@ panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type); } - uma_zfree(tmpfs_node_pool, node); + uma_zfree_smr(tmpfs_node_pool, node); TMPFS_LOCK(tmp); tmpfs_free_tmp(tmp); return (true); @@ -1622,10 +1624,11 @@ int tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p) { - int error; + int error, newmode; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chmod"); + ASSERT_VOP_IN_SEQC(vp); node = VP_TO_TMPFS_NODE(vp); @@ -1659,9 +1662,10 @@ return (error); } - - node->tn_mode &= ~ALLPERMS; - node->tn_mode |= mode & ALLPERMS; + newmode = node->tn_mode & ~ALLPERMS; + newmode |= mode & ALLPERMS; + /* tn_mode is a 16-bit mode_t; the store width must match it. */ + atomic_store_short(&node->tn_mode, newmode); node->tn_status |= TMPFS_NODE_CHANGED; @@ -1687,6 +1691,7 @@ gid_t ogid; ASSERT_VOP_ELOCKED(vp, "chown"); + ASSERT_VOP_IN_SEQC(vp); node = VP_TO_TMPFS_NODE(vp); @@ -1733,7 +1738,8 @@ if ((node->tn_mode & (S_ISUID | S_ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) - node->tn_mode &= ~(S_ISUID | S_ISGID); + atomic_store_short(&node->tn_mode, + node->tn_mode & ~(S_ISUID | S_ISGID)); } ASSERT_VOP_ELOCKED(vp, "chown2"); Index: sys/fs/tmpfs/tmpfs_vfsops.c =================================================================== --- sys/fs/tmpfs/tmpfs_vfsops.c +++ sys/fs/tmpfs/tmpfs_vfsops.c @@ -462,6 +462,8 @@ mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_TEXT_REFS | MNTK_NOMSYNC; + if (!nonc) + mp->mnt_kern_flag |= MNTK_FPLOOKUP; MNT_IUNLOCK(mp); mp->mnt_data = tmp; Index: sys/fs/tmpfs/tmpfs_vnops.h =================================================================== --- sys/fs/tmpfs/tmpfs_vnops.h +++ sys/fs/tmpfs/tmpfs_vnops.h @@ -49,6 +49,7 @@ extern struct vop_vector tmpfs_vnodeop_nonc_entries; vop_access_t tmpfs_access; +vop_fplookup_vexec_t tmpfs_fplookup_vexec; vop_getattr_t tmpfs_getattr; vop_setattr_t tmpfs_setattr; vop_pathconf_t tmpfs_pathconf; Index: 
sys/fs/tmpfs/tmpfs_vnops.c =================================================================== --- sys/fs/tmpfs/tmpfs_vnops.c +++ sys/fs/tmpfs/tmpfs_vnops.c @@ -317,6 +317,24 @@ return (0); } +/* + * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see + * the comment above cache_fplookup for details. + */ +int +tmpfs_fplookup_vexec(struct vop_fplookup_vexec_args *v) +{ + struct vnode *vp; + struct tmpfs_node *node; + struct ucred *cred; + + cred = v->a_cred; + vp = v->a_vp; + node = VP_TO_TMPFS_NODE(vp); + + return (vaccess_vexec_smr(node->tn_mode, node->tn_uid, node->tn_gid, cred)); +} + int tmpfs_access(struct vop_access_args *v) { @@ -428,6 +446,8 @@ MPASS(VOP_ISLOCKED(vp)); + vfs_seqc_write_begin(vp); + error = 0; /* Abort if any unsettable attribute is given. */ @@ -466,6 +486,8 @@ * from tmpfs_update. */ tmpfs_update(vp); + vfs_seqc_write_end(vp); + MPASS(VOP_ISLOCKED(vp)); return error; @@ -806,12 +828,15 @@ struct tmpfs_node *tnode; struct tmpfs_node *tdnode; int error; + bool want_seqc_end; MPASS(VOP_ISLOCKED(tdvp)); MPASS(IMPLIES(tvp != NULL, VOP_ISLOCKED(tvp))); MPASS(fcnp->cn_flags & HASBUF); MPASS(tcnp->cn_flags & HASBUF); + want_seqc_end = false; + /* * Disallow cross-device renames. * XXX Why isn't this done by the caller? @@ -852,6 +877,13 @@ } } + if (tvp != NULL) + vfs_seqc_write_begin(tvp); + vfs_seqc_write_begin(tdvp); + vfs_seqc_write_begin(fvp); + vfs_seqc_write_begin(fdvp); + want_seqc_end = true; + tmp = VFS_TO_TMPFS(tdvp->v_mount); tdnode = VP_TO_TMPFS_DIR(tdvp); tnode = (tvp == NULL) ? NULL : VP_TO_TMPFS_NODE(tvp); @@ -1065,6 +1097,14 @@ VOP_UNLOCK(fdvp); out: + if (want_seqc_end) { + if (tvp != NULL) + vfs_seqc_write_end(tvp); + vfs_seqc_write_end(tdvp); + vfs_seqc_write_end(fvp); + vfs_seqc_write_end(fdvp); + } + /* * Release target nodes. 
* XXX: I don't understand when tdvp can be the same as tvp, but @@ -1609,6 +1649,7 @@ .vop_mknod = tmpfs_mknod, .vop_open = tmpfs_open, .vop_close = tmpfs_close, + .vop_fplookup_vexec = tmpfs_fplookup_vexec, .vop_access = tmpfs_access, .vop_getattr = tmpfs_getattr, .vop_setattr = tmpfs_setattr, Index: sys/kern/kern_descrip.c =================================================================== --- sys/kern/kern_descrip.c +++ sys/kern/kern_descrip.c @@ -102,8 +102,8 @@ static __read_mostly uma_zone_t file_zone; static __read_mostly uma_zone_t filedesc0_zone; -static __read_mostly uma_zone_t pwd_zone; -static __read_mostly smr_t pwd_smr; +__read_mostly uma_zone_t pwd_zone; +extern smr_t vfs_smr; static int closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td, int holdleaders); @@ -3297,14 +3297,24 @@ fdp = td->td_proc->p_fd; - smr_enter(pwd_smr); + smr_enter(vfs_smr); for (;;) { - pwd = smr_entered_load(&fdp->fd_pwd, pwd_smr); + pwd = smr_entered_load(&fdp->fd_pwd, vfs_smr); MPASS(pwd != NULL); if (refcount_acquire_if_not_zero(&pwd->pwd_refcount)) break; } - smr_exit(pwd_smr); + smr_exit(vfs_smr); + return (pwd); +} + +struct pwd * +pwd_get_smr(void) +{ + struct pwd *pwd; + + pwd = smr_entered_load(&curproc->p_fd->fd_pwd, vfs_smr); + MPASS(pwd != NULL); return (pwd); } @@ -4293,7 +4303,7 @@ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR); - pwd_smr = uma_zone_get_smr(pwd_zone); + vfs_smr = uma_zone_get_smr(pwd_zone); mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF); } SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL); Index: sys/kern/vfs_cache.c =================================================================== --- sys/kern/vfs_cache.c +++ sys/kern/vfs_cache.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -67,6 +68,11 @@ #include #endif +#include + +#include +#include + #ifdef DDB #include #endif 
@@ -100,6 +106,8 @@ SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", "char *"); +SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata *", "int", "bool"); + /* * This structure describes the elements in the cache of recent * names looked up by namei. @@ -2817,3 +2825,619 @@ } #endif + +static void +cached_namei_handle_root(struct nameidata *ndp, struct vnode **dpp) +{ + struct componentname *cnp; + + cnp = &ndp->ni_cnd; + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } +} + +static void +cached_namei_handle_root_initial(struct nameidata *ndp, struct vnode **dpp) +{ + cached_namei_handle_root(ndp, dpp); + *dpp = ndp->ni_rootdir; +} + +static bool __read_frequently cache_fast_lookup = true; +SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, + &cache_fast_lookup, 0, ""); + +#define CACHE_FPL_UNHANDLED -2020 + +struct cache_fpl { + int line; + bool handled; + bool in_smr; + struct nameidata *ndp; + struct componentname *cnp; + struct vnode *vp; + seqc_t vp_seqc; + struct vnode *next_vp; + seqc_t next_vp_seqc; +}; + +static void +cache_fpl_smr_assert_not_entered(struct cache_fpl *fpl) +{ + + SMR_ASSERT_NOT_ENTERED(vfs_smr); + MPASS(fpl->in_smr == false); +} + +static void +cache_fpl_smr_enter(struct cache_fpl *fpl) +{ + + MPASS(fpl->in_smr == false); + smr_enter(vfs_smr); + fpl->in_smr = true; +} + +static void +cache_fpl_smr_exit(struct cache_fpl *fpl) +{ + + MPASS(fpl->in_smr == true); + fpl->in_smr = false; + smr_exit(vfs_smr); +} + +static void +cache_fpl_unhandled_impl(struct cache_fpl *fpl, int line) +{ + + fpl->handled = false; + fpl->line = line; +} + +#define cache_fpl_unhandled(x) cache_fpl_unhandled_impl((x), __LINE__) + +static void +cache_fpl_handled_impl(struct cache_fpl *fpl, int line) +{ + + fpl->handled = true; + fpl->line = line; +} + +#define cache_fpl_handled(x) cache_fpl_handled_impl((x), __LINE__) + +static bool +cache_can_fplookup(struct cache_fpl *fpl) +{ + struct 
nameidata *ndp; + struct componentname *cnp; + struct thread *td; + + ndp = fpl->ndp; + cnp = &ndp->ni_cnd; + td = cnp->cn_thread; + + if (!cache_fast_lookup) + return (false); + + /* + * XXXMJG missing MAC checks + */ + + if (cnp->cn_flags & ~(LOCKLEAF | FOLLOW | LOCKSHARED | ISOPEN | AUDITVNODE1)) { + cache_fpl_unhandled(fpl); + return (false); + } + if ((cnp->cn_flags & LOCKLEAF) == 0) { + cache_fpl_unhandled(fpl); + return (false); + } + if (IN_CAPABILITY_MODE(td)) { + cache_fpl_unhandled(fpl); + return (false); + } + if (AUDITING_TD(td)) { + cache_fpl_unhandled(fpl); + return (false); + } + if (ndp->ni_dirfd != AT_FDCWD) { + cache_fpl_unhandled(fpl); + return (false); + } + if (cnp->cn_nameiop != LOOKUP) { + cache_fpl_unhandled(fpl); + return (false); + } + + return (true); +} + +static void +cache_save_nameidata(struct nameidata *ndp, struct nameidata *saved) +{ + + *saved = *ndp; +} + +static void +cache_restore_nameidata(struct nameidata *ndp, struct nameidata *saved) +{ + + *ndp = *saved; +} + +static bool +cache_fplookup_vnode_supported(struct vnode *vp) +{ + + switch (vp->v_type) { + case VLNK: + return (false); + default: + break; + } + return (true); +} + +static int +cache_fplookup_final(struct cache_fpl *fpl) +{ + struct componentname *cnp; + enum vgetstate vs; + struct vnode *dvp, *tvp; + seqc_t dvp_seqc, tvp_seqc; + int error; + + cnp = fpl->cnp; + dvp = fpl->vp; + dvp_seqc = fpl->vp_seqc; + tvp = fpl->next_vp; + tvp_seqc = fpl->next_vp_seqc; + + VNPASS(cache_fplookup_vnode_supported(dvp), dvp); + + vs = vget_prep_smr(tvp); + cache_fpl_smr_exit(fpl); + if (vs == VGET_NONE) { + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + + if (!seqc_consistent(&dvp->v_seqc, dvp_seqc)) { + cache_fpl_unhandled(fpl); + /* + * Release whichever reference vget_prep_smr() took; a + * use reference must be dropped with vrele(), not vdrop(). + */ + if (vs == VGET_USECOUNT) + vrele(tvp); + else + vdrop(tvp); + return (CACHE_FPL_UNHANDLED); + } + + MPASS((cnp->cn_flags & LOCKLEAF) != 0); + + error = vget_finish(tvp, cnp->cn_lkflags, vs); + if (error != 0) { + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + + if 
(!seqc_consistent(&tvp->v_seqc, tvp_seqc)) { + cache_fpl_unhandled(fpl); + vput(tvp); + return (CACHE_FPL_UNHANDLED); + } + + cache_fpl_handled(fpl); + return (0); +} + +static int +cache_fplookup_next(struct cache_fpl *fpl) +{ + struct componentname *cnp; + struct namecache *ncp; + struct vnode *dvp, *tvp; + u_char nc_flag; + uint32_t hash; + + cnp = fpl->cnp; + dvp = fpl->vp; + + if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { + fpl->next_vp = dvp; + fpl->next_vp_seqc = seqc_read_any(&dvp->v_seqc); + return (0); + } + + hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); + + CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + counter_u64_add(numchecks, 1); + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) + break; + } + + fpl->next_vp = (void *)0xdeadbeef; + + /* + * If there is no entry we have to punt to the slow path to perform + * actual lookup. Should there be nothing with this name a negative + * entry will be created. 
+ */ + if (__predict_false(ncp == NULL)) { + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + + tvp = atomic_load_ptr(&ncp->nc_vp); + nc_flag = atomic_load_char(&ncp->nc_flag); + if (__predict_false(cache_ncp_invalid(ncp))) { + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + + fpl->next_vp = tvp; + if (nc_flag & NCF_NEGATIVE) { + counter_u64_add(numneghits, 1); + cache_fpl_smr_exit(fpl); + cache_fpl_handled(fpl); + return (ENOENT); + } + + fpl->next_vp_seqc = seqc_read_any(&tvp->v_seqc); + if (seqc_in_modify(fpl->next_vp_seqc)) { + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + + if (!cache_fplookup_vnode_supported(tvp)) { + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + + counter_u64_add(numposhits, 1); + SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); + return (0); +} + +static bool +cache_fplookup_mp_supported(struct vnode *dp) +{ + struct mount *mp; + + MPASS(dp != NULL); + mp = atomic_load_ptr(&dp->v_mount); + if (mp == NULL || (mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) + return (false); + return (true); +} + +static int +cache_fplookup_climb_mount(struct cache_fpl *fpl) +{ + struct mount *mp, *prev_mp; + struct vnode *vp; + seqc_t vp_seqc; + + vp = fpl->next_vp; + vp_seqc = fpl->next_vp_seqc; + if (vp->v_type != VDIR) + return (0); + + mp = atomic_load_ptr(&vp->v_mountedhere); + if (mp == NULL) + return (0); + + prev_mp = NULL; + for (;;) { + if (!vfs_op_thread_enter(mp)) { + if (prev_mp != NULL) + vfs_op_thread_exit(prev_mp); + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + if (prev_mp != NULL) + vfs_op_thread_exit(prev_mp); + if (!seqc_consistent(&vp->v_seqc, vp_seqc)) { + vfs_op_thread_exit(mp); + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) { + vfs_op_thread_exit(mp); + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + vp = atomic_load_ptr(&mp->mnt_rootvnode); + if (vp == NULL || 
VN_IS_DOOMED(vp)) { + vfs_op_thread_exit(mp); + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + vp_seqc = seqc_read_any(&vp->v_seqc); + if (seqc_in_modify(vp_seqc)) { + vfs_op_thread_exit(mp); + cache_fpl_unhandled(fpl); + return (CACHE_FPL_UNHANDLED); + } + prev_mp = mp; + mp = atomic_load_ptr(&vp->v_mountedhere); + if (mp == NULL) + break; + } + + vfs_op_thread_exit(prev_mp); + fpl->next_vp = vp; + fpl->next_vp_seqc = vp_seqc; + return (0); +} + +static int +cache_fplookup_impl(struct vnode *dp, struct cache_fpl *fpl) +{ + struct nameidata *ndp; + struct componentname *cnp; + char *cp; + char *prev_ni_next; /* saved ndp->ni_next */ + size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */ + int error; + + error = CACHE_FPL_UNHANDLED; + ndp = fpl->ndp; + ndp->ni_lcf = 0; + cnp = fpl->cnp; + cnp->cn_lkflags = LK_SHARED; + if ((cnp->cn_flags & LOCKSHARED) == 0) + cnp->cn_lkflags = LK_EXCLUSIVE; + + fpl->vp = dp; + fpl->vp_seqc = seqc_read_any(&fpl->vp->v_seqc); + if (seqc_in_modify(fpl->vp_seqc)) { + cache_fpl_unhandled(fpl); + goto out; + } + +next: + if (!cache_fplookup_mp_supported(fpl->vp)) + goto out; + + /* + * Search a new directory. + * + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ + for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + continue; + cnp->cn_namelen = cp - cnp->cn_nameptr; + if (cnp->cn_namelen > NAME_MAX) { + error = ENAMETOOLONG; + goto out; + } + prev_ni_pathlen = ndp->ni_pathlen; + ndp->ni_pathlen -= cnp->cn_namelen; + KASSERT(ndp->ni_pathlen <= PATH_MAX, + ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); + prev_ni_next = ndp->ni_next; + ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. 
This must be done before VOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + if (*cp == '\0') { + *ndp->ni_next = '\0'; + cnp->cn_flags |= TRAILINGSLASH; + } + } + ndp->ni_next = cp; + + cnp->cn_flags |= MAKEENTRY; + + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') + cnp->cn_flags |= ISDOTDOT; + else + cnp->cn_flags &= ~ISDOTDOT; + if (*ndp->ni_next == 0) + cnp->cn_flags |= ISLASTCN; + else + cnp->cn_flags &= ~ISLASTCN; + + if (cnp->cn_flags & ISDOTDOT) { + cache_fpl_unhandled(fpl); + goto out; + } + + if (!cache_fplookup_vnode_supported(fpl->vp)) { + cache_fpl_unhandled(fpl); + goto out; + } + + error = VOP_FPLOOKUP_VEXEC(fpl->vp, cnp->cn_cred, cnp->cn_thread); + if (__predict_false(error != 0)) { + VNPASS(error != EOPNOTSUPP, fpl->vp); + if (error == EAGAIN) + cache_fpl_unhandled(fpl); + else { + panic("untested"); + cache_fpl_handled(fpl); + } + goto out; + } + + error = cache_fplookup_next(fpl); + if (error != 0) { + goto out; + } + + VNPASS(!seqc_in_modify(fpl->next_vp_seqc), fpl->next_vp); + + error = cache_fplookup_climb_mount(fpl); + if (error != 0) { + goto out; + } + + VNPASS(!seqc_in_modify(fpl->next_vp_seqc), fpl->next_vp); + + if (cnp->cn_flags & ISLASTCN) { + error = cache_fplookup_final(fpl); + goto out; + } + + if (!seqc_consistent(&fpl->vp->v_seqc, fpl->vp_seqc)) { + cache_fpl_unhandled(fpl); + goto out; + } + + fpl->vp = fpl->next_vp; + fpl->vp_seqc = fpl->next_vp_seqc; + + cnp->cn_nameptr = ndp->ni_next; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + + goto next; + +out: + if (fpl->handled) { + cache_fpl_smr_assert_not_entered(fpl); + if (error == 0) { + ndp->ni_dvp = fpl->vp; + ndp->ni_vp = fpl->next_vp; 
+ } + return (error); + } + + if (fpl->in_smr) + cache_fpl_smr_exit(fpl); + return (error); +} + +/* + * Fast path lookup. + * + * An opt-in functionality for filesystems. It can be enabled by setting the + * MNTK_FPLOOKUP flag on mount and providing VOP_FPLOOKUP_VEXEC routine. + * + * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. + * + * Traditional vnode lookup conceptually looks like this: + * + * for (;;) { + * vn_lock(current); + * next = find(); + * vn_lock(next); + * vn_unlock(current); + * current = next; + * if (!last) + * continue; + * break; + * } + * + * Each jump to the next vnode is safe memory-wise and atomic with respect to + * any modifications thanks to holding respective locks. + * + * The same guarantee can be provided with a combination of safe memory reclamation + * and sequence counters instead. If all places which affect the relationship + * or permissions also modify the counter, the following provides the same guarantee. + * + * smr_enter(vfs_smr); + * for (;;) { + * current_seqc = seqc_read_any(current); + * next = find(); + * next_seqc = seqc_read_any(next); + * if (!seqc_consistent(current, current_seqc)) + * failure(); + * current = next; // if anything changed the above would fail + * if (!last) + * continue; + * } + * + * API contract for VOP_FPLOOKUP_VEXEC routines is as follows: + * - they are called while within vfs_smr protection + * - passed vnode is not locked in any manner + * - EAGAIN can be returned to denote checking could not be performed, it is + * always valid to return it + * - if the sequence counter has not changed the result must be valid + * - if the sequence counter has changed both false positives and false negatives + * are permitted (since the result will be rejected later) + * - either safely checking permissions as they are modified or guaranteeing + * their stability is left to the routine + * - for simple cases of unix permission checks vaccess_vexec_smr can be used + */ +int 
+cache_fplookup(struct nameidata *ndp, bool *handled) +{ + struct nameidata saved_ndp; + struct cache_fpl fpl; + struct pwd *pwd; + struct vnode *dp, *startdir; + struct componentname *cnp; + int error; + + *handled = false; + bzero(&fpl, sizeof(fpl)); + fpl.ndp = ndp; + fpl.cnp = &ndp->ni_cnd; + MPASS(curthread == fpl.cnp->cn_thread); + + if (!cache_can_fplookup(&fpl)) { + SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.handled); + return (EOPNOTSUPP); + } + + cache_save_nameidata(ndp, &saved_ndp); + + cache_fpl_smr_enter(&fpl); + pwd = pwd_get_smr(); + ndp->ni_rootdir = pwd->pwd_rdir; + ndp->ni_topdir = pwd->pwd_jdir; + + cnp = fpl.cnp; + cnp->cn_nameptr = cnp->cn_pnbuf; + startdir = ndp->ni_startdir; + if (cnp->cn_pnbuf[0] == '/') { + cached_namei_handle_root_initial(ndp, &dp); + } else { + if (ndp->ni_startdir != NULL) { + dp = ndp->ni_startdir; + } else { + MPASS(ndp->ni_dirfd == AT_FDCWD); + dp = pwd->pwd_cdir; + } + } + + error = cache_fplookup_impl(dp, &fpl); + cache_fpl_smr_assert_not_entered(&fpl); + SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.handled); + + if (fpl.handled) { + if (error == CACHE_FPL_UNHANDLED) + panic("cache: bad error code from line %d\n", fpl.line); + *handled = true; + if (startdir != NULL) + vrele(startdir); + } else { + cache_restore_nameidata(ndp, &saved_ndp); + } + + return (error); +} Index: sys/kern/vfs_lookup.c =================================================================== --- sys/kern/vfs_lookup.c +++ sys/kern/vfs_lookup.c @@ -315,6 +315,7 @@ struct filecaps dirfd_caps; struct uio auio; int error, linklen, startdir_used; + bool handled; cnp = &ndp->ni_cnd; td = cnp->cn_thread; @@ -329,10 +330,15 @@ ndp->ni_startdir->v_type == VBAD); TAILQ_INIT(&ndp->ni_cap_tracker); ndp->ni_lcf = 0; + ndp->ni_loopcnt = 0; + startdir_used = 0; + dp = NULL; /* We will set this ourselves if we need it. 
*/ cnp->cn_flags &= ~TRAILINGSLASH; + ndp->ni_vp = NULL; + /* * Get a buffer for the name to be translated, and copy the * name into the buffer. @@ -346,12 +352,31 @@ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, &ndp->ni_pathlen); + if (error != 0) { + namei_cleanup_cnp(cnp); + return (error); + } + + cnp->cn_nameptr = cnp->cn_pnbuf; + /* * Don't allow empty pathnames. */ - if (error == 0 && *cnp->cn_pnbuf == '\0') - error = ENOENT; + if (*cnp->cn_pnbuf == '\0') { + namei_cleanup_cnp(cnp); + return (ENOENT); + } + error = cache_fplookup(ndp, &handled); + if (handled) { + namei_cleanup_cnp(cnp); + return (error); + } + + /* + * Ignore fast path. + */ + error = 0; #ifdef CAPABILITY_MODE /* * In capability mode, lookups must be restricted to happen in @@ -380,10 +405,8 @@ #endif if (error != 0) { namei_cleanup_cnp(cnp); - ndp->ni_vp = NULL; return (error); } - ndp->ni_loopcnt = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_NAMEI)) { KASSERT(cnp->cn_thread == curthread, @@ -402,9 +425,6 @@ ndp->ni_rootdir = pwd->pwd_rdir; ndp->ni_topdir = pwd->pwd_jdir; - startdir_used = 0; - dp = NULL; - cnp->cn_nameptr = cnp->cn_pnbuf; if (cnp->cn_pnbuf[0] == '/') { ndp->ni_resflags |= NIRES_ABS; error = namei_handle_root(ndp, &dp); Index: sys/kern/vfs_mount.c =================================================================== --- sys/kern/vfs_mount.c +++ sys/kern/vfs_mount.c @@ -949,6 +949,8 @@ } VOP_UNLOCK(vp); + vfs_seqc_write_begin(vp); + /* Allocate and initialize the filesystem. */ mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred); /* XXXMAC: pass to vfs_mount_alloc? 
*/ @@ -976,9 +978,11 @@ VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; VI_UNLOCK(vp); + vfs_seqc_write_end(vp); vrele(vp); return (error); } + vfs_seqc_write_begin(newdp); VOP_UNLOCK(newdp); if (mp->mnt_opt != NULL) @@ -1015,6 +1019,8 @@ EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td); VOP_UNLOCK(newdp); mountcheckdirs(vp, newdp); + vfs_seqc_write_end(vp); + vfs_seqc_write_end(newdp); vrele(newdp); if ((mp->mnt_flag & MNT_RDONLY) == 0) vfs_allocate_syncvnode(mp); @@ -1089,7 +1095,9 @@ VOP_UNLOCK(vp); vfs_op_enter(mp); + vfs_seqc_write_begin(vp); + rootvp = NULL; MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { MNT_IUNLOCK(mp); @@ -1101,10 +1109,10 @@ MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY); if ((mp->mnt_flag & MNT_ASYNC) == 0) mp->mnt_kern_flag &= ~MNTK_ASYNC; + if (mp->mnt_rootvnode != NULL) + vfs_seqc_write_begin(mp->mnt_rootvnode); rootvp = vfs_cache_root_clear(mp); MNT_IUNLOCK(mp); - if (rootvp != NULL) - vrele(rootvp); mp->mnt_optnew = *optlist; vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt); @@ -1175,6 +1183,11 @@ vfs_deallocate_syncvnode(mp); end: vfs_op_exit(mp); + if (rootvp != NULL) { + vfs_seqc_write_end(rootvp); + vrele(rootvp); + } + vfs_seqc_write_end(vp); vfs_unbusy(mp); VI_LOCK(vp); vp->v_iflag &= ~VI_MOUNT; @@ -1664,12 +1677,21 @@ return (EBUSY); } mp->mnt_kern_flag |= MNTK_UNMOUNT; + if (mp->mnt_rootvnode != NULL) + vfs_seqc_write_begin(mp->mnt_rootvnode); rootvp = vfs_cache_root_clear(mp); + if (coveredvp != NULL) + vfs_seqc_write_begin(coveredvp); if (flags & MNT_NONBUSY) { MNT_IUNLOCK(mp); error = vfs_check_usecounts(mp); MNT_ILOCK(mp); if (error != 0) { + /* Mirror the NULL guards used for the write-begins. */ + if (coveredvp != NULL) + vfs_seqc_write_end(coveredvp); dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT); + if (rootvp != NULL) + vfs_seqc_write_end(rootvp); if (rootvp != NULL) vrele(rootvp); @@ -1707,16 +1729,6 @@ if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); - /* - * From now, we can claim that the use reference on the - * coveredvp is ours, and the ref can be released only by - * successfull unmount by us, or left for 
later unmount - * attempt. The previously acquired hold reference is no - * longer needed to protect the vnode from reuse. - */ - if (coveredvp != NULL) - vdrop(coveredvp); - vfs_periodic(mp, MNT_WAIT); MNT_ILOCK(mp); async_flag = mp->mnt_flag & MNT_ASYNC; @@ -1751,8 +1763,11 @@ } vfs_op_exit_locked(mp); MNT_IUNLOCK(mp); - if (coveredvp) + if (coveredvp) { + vfs_seqc_write_end(coveredvp); VOP_UNLOCK(coveredvp); + vdrop(coveredvp); + } return (error); } mtx_lock(&mountlist_mtx); @@ -1761,7 +1776,9 @@ EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td); if (coveredvp != NULL) { coveredvp->v_mountedhere = NULL; + vfs_seqc_write_end(coveredvp); VOP_UNLOCK(coveredvp); + vdrop(coveredvp); } vfs_event_signal(NULL, VQ_UNMOUNT, 0); if (rootvnode != NULL && mp == rootvnode->v_mount) { Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -664,8 +664,8 @@ vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, - vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR); - vfs_smr = uma_zone_get_smr(vnode_zone); + vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); + uma_zone_set_smr(vnode_zone, vfs_smr); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* @@ -1766,6 +1766,7 @@ */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); bo = &vp->v_bufobj; + VNPASS(vp->v_seqc_users == 0, vp); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == VHOLD_NO_SMR, vp, ("Invalid hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); @@ -4124,6 +4125,7 @@ printf(" usecount %d, writecount %d, refcount %d (flags %s)", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_NO_SMR, holdcnt & VHOLD_NO_SMR ? 
"VHOLD_NO_SMR" : "none"); + printf(", seqc users %d", vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); @@ -5184,6 +5186,43 @@ return (error == 0); } +/* + * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see + * the comment above cache_fplookup for details. + * + * We never deny as priv_check_cred calls are not yet supported. + */ +int +vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) +{ + mode_t all_x; + + SMR_ASSERT_ENTERED(vfs_smr); + + all_x = S_IXUSR | S_IXGRP | S_IXOTH; + if (__predict_true((file_mode & all_x) == all_x)) + return (0); + + /* Check the owner. */ + if (cred->cr_uid == file_uid) { + if (file_mode & S_IXUSR) + return (0); + return (EAGAIN); + } + + /* Otherwise, check the groups (first match) */ + if (groupmember(file_gid, cred)) { + if (file_mode & S_IXGRP) + return (0); + return (EAGAIN); + } + + /* Otherwise, check everyone else. */ + if (file_mode & S_IXOTH) + return (0); + return (EAGAIN); +} + /* * Common filesystem object access control check routine. 
Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, @@ -6533,3 +6572,29 @@ return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); } + +void +vfs_seqc_write_begin(struct vnode *vp) +{ + + VI_LOCK(vp); + VNPASS(vp->v_holdcnt > 0, vp); + VNPASS(vp->v_seqc_users >= 0, vp); + vp->v_seqc_users++; + if (vp->v_seqc_users == 1) + seqc_sleepable_write_begin(&vp->v_seqc); + VI_UNLOCK(vp); +} + +void +vfs_seqc_write_end(struct vnode *vp) +{ + + VI_LOCK(vp); + VNPASS(vp->v_holdcnt > 0, vp); + VNPASS(vp->v_seqc_users > 0, vp); + vp->v_seqc_users--; + if (vp->v_seqc_users == 0) + seqc_sleepable_write_end(&vp->v_seqc); + VI_UNLOCK(vp); +} Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -142,6 +142,15 @@ }; +%% fplookup_vexec vp - - - + +vop_fplookup_vexec { + IN struct vnode *vp; + IN struct ucred *cred; + IN struct thread *td; +}; + + %% access vp L L L vop_access { Index: sys/sys/_seqc.h =================================================================== --- /dev/null +++ sys/sys/_seqc.h @@ -0,0 +1,6 @@ +#ifndef _SYS__SEQC_H_ +#define _SYS__SEQC_H_ + +typedef uint32_t seqc_t; + +#endif /* _SYS__SEQC_H_ */ Index: sys/sys/filedesc.h =================================================================== --- sys/sys/filedesc.h +++ sys/sys/filedesc.h @@ -289,6 +289,7 @@ smr_serialized_store(&fdp->fd_pwd, newpwd, (FILEDESC_XLOCK_ASSERT(fdp), true)); } +struct pwd *pwd_get_smr(void); #endif /* _KERNEL */ Index: sys/sys/mount.h =================================================================== --- sys/sys/mount.h +++ sys/sys/mount.h @@ -415,6 +415,7 @@ #define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */ #define MNTK_VMSETSIZE_BUG 0x00010000 #define MNTK_UNIONFS 0x00020000 /* A hack for F_ISUNIONSTACK */ +#define MNTK_FPLOOKUP 0x00040000 /* fast path lookup is supported */ #define MNTK_NOASYNC 0x00800000 /* disable async */ #define 
MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ Index: sys/sys/seqc.h =================================================================== --- sys/sys/seqc.h +++ sys/sys/seqc.h @@ -36,7 +36,7 @@ /* * seqc_t may be included in structs visible to userspace */ -typedef uint32_t seqc_t; +#include #ifdef _KERNEL @@ -110,5 +110,22 @@ return (seqc_consistent_nomb(seqcp, oldseqc)); } +static __inline void +seqc_sleepable_write_begin(seqc_t *seqcp) +{ + + MPASS(!seqc_in_modify(*seqcp)); + *seqcp += 1; + atomic_thread_fence_rel(); +} + +static __inline void +seqc_sleepable_write_end(seqc_t *seqcp) +{ + + atomic_store_rel_int(seqcp, *seqcp + 1); + MPASS(!seqc_in_modify(*seqcp)); +} + #endif /* _KERNEL */ #endif /* _SYS_SEQC_H_ */ Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -45,6 +45,7 @@ #include #include #include +#include /* * The vnode is the focus of all file activity in UNIX. 
There is a @@ -66,6 +67,10 @@ struct namecache; +struct cache_rename_state { + struct vnode *vp[4]; +}; + struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ @@ -105,6 +110,7 @@ */ enum vtype v_type:8; /* u vnode type */ short v_irflag; /* i frequently read flags */ + seqc_t v_seqc; /* i modification count */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ @@ -175,6 +181,7 @@ short v_dbatchcpu; /* i LRU requeue deferral batch */ int v_writecount; /* I ref count of writers or (negative) text users */ + int v_seqc_users; /* i modifications pending */ u_int v_hash; }; @@ -538,6 +545,18 @@ #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) +#define ASSERT_VOP_IN_SEQC(vp) do { \ + struct vnode *_vp = (vp); \ + \ + VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \ +} while (0) + +#define ASSERT_VOP_NOT_IN_SEQC(vp) do { \ + struct vnode *_vp = (vp); \ + \ + VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \ +} while (0) + #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) @@ -545,6 +564,10 @@ #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) + +#define ASSERT_VOP_IN_SEQC(vp) ((void)0) +#define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0) + #endif /* DEBUG_VFS_LOCKS */ @@ -601,6 +624,7 @@ struct vattr; struct vfsops; struct vnode; +struct pwd; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); @@ -613,11 +637,14 @@ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp); +int cache_fplookup(struct nameidata *ndp, bool *handled); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_purge(struct vnode *vp); 
void cache_purge_negative(struct vnode *vp); void cache_purgevfs(struct mount *mp, bool force); +void vfs_seqc_write_begin(struct vnode *vp); +void vfs_seqc_write_end(struct vnode *vp); int change_dir(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb); @@ -643,6 +670,8 @@ int vn_commname(struct vnode *vn, char *buf, u_int buflen); int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen); +int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, + struct ucred *cred); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred, int *privused);