D23915.id69198.diff

Index: sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
===================================================================
--- sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
+++ sys/cddl/compat/opensolaris/kern/opensolaris_vfs.c
@@ -154,6 +154,7 @@
vput(vp);
return (error);
}
+ vfs_seqc_write_begin(vp);
VOP_UNLOCK(vp);
/*
@@ -206,6 +207,7 @@
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
+ vfs_seqc_write_end(vp);
vput(vp);
vfs_unbusy(mp);
vfs_freeopts(mp->mnt_optnew);
@@ -241,6 +243,7 @@
vfs_event_signal(NULL, VQ_MOUNT, 0);
if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
panic("mount: lost mount");
+ vfs_seqc_write_end(vp);
VOP_UNLOCK(vp);
vfs_op_exit(mp);
vfs_unbusy(mp);
Index: sys/kern/kern_descrip.c
===================================================================
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -102,8 +102,8 @@
static __read_mostly uma_zone_t file_zone;
static __read_mostly uma_zone_t filedesc0_zone;
-static __read_mostly uma_zone_t pwd_zone;
-static __read_mostly smr_t pwd_smr;
+__read_mostly uma_zone_t pwd_zone;
+extern smr_t vfs_smr;
static int closefp(struct filedesc *fdp, int fd, struct file *fp,
struct thread *td, int holdleaders);
@@ -3297,14 +3297,24 @@
fdp = td->td_proc->p_fd;
- smr_enter(pwd_smr);
+ smr_enter(vfs_smr);
for (;;) {
- pwd = smr_entered_load(&fdp->fd_pwd, pwd_smr);
+ pwd = smr_entered_load(&fdp->fd_pwd, vfs_smr);
MPASS(pwd != NULL);
if (refcount_acquire_if_not_zero(&pwd->pwd_refcount))
break;
}
- smr_exit(pwd_smr);
+ smr_exit(vfs_smr);
+ return (pwd);
+}
+
+struct pwd *
+pwd_get_smr(void)
+{
+ struct pwd *pwd;
+
+ pwd = smr_entered_load(&curproc->p_fd->fd_pwd, vfs_smr);
+ MPASS(pwd != NULL);
return (pwd);
}
@@ -4293,7 +4303,7 @@
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
pwd_zone = uma_zcreate("PWD", sizeof(struct pwd), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_SMR);
- pwd_smr = uma_zone_get_smr(pwd_zone);
+ vfs_smr = uma_zone_get_smr(pwd_zone);
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
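
The new pwd_get_smr takes no reference, unlike pwd_hold: the returned struct pwd is
only stable while the caller remains inside the vfs_smr section, which is exactly how
cache_fplookup consumes it further down in this diff. A minimal sketch of the intended
calling pattern, assuming a hypothetical wrapper function:

/*
 * Illustrative only: read a few pwd fields without taking a reference.
 * The pwd pointer must not be used after smr_exit().
 */
static bool
cwd_is_root_sketch(void)
{
	struct pwd *pwd;
	bool result;

	smr_enter(vfs_smr);
	pwd = pwd_get_smr();		/* no reference acquired */
	result = (pwd->pwd_cdir == pwd->pwd_rdir);
	smr_exit(vfs_smr);		/* pwd is stale past this point */
	return (result);
}
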
Index: sys/kern/vfs_cache.c
===================================================================
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -55,6 +55,7 @@
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
+#include <sys/seqc.h>
#include <sys/sdt.h>
#include <sys/smr.h>
#include <sys/smp.h>
@@ -67,6 +68,11 @@
#include <sys/ktrace.h>
#endif
+#include <sys/capsicum.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
#ifdef DDB
#include <ddb/ddb.h>
#endif
@@ -100,6 +106,8 @@
SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
"char *");
+SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
+
/*
* This structure describes the elements in the cache of recent
* names looked up by namei.
@@ -2817,3 +2825,736 @@
}
#endif
+
+extern uma_zone_t namei_zone;
+
+static bool __read_frequently cache_fast_lookup = true;
+SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW,
+ &cache_fast_lookup, 0, "");
+
+#define CACHE_FPL_UNHANDLED -2020
+
+static void
+cache_fpl_cleanup_cnp(struct componentname *cnp)
+{
+
+ uma_zfree(namei_zone, cnp->cn_pnbuf);
+#ifdef DIAGNOSTIC
+ cnp->cn_pnbuf = NULL;
+ cnp->cn_nameptr = NULL;
+#endif
+}
+
+static void
+cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp)
+{
+ struct componentname *cnp;
+
+ cnp = &ndp->ni_cnd;
+ while (*(cnp->cn_nameptr) == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+}
+
+static void
+cache_fpl_handle_root_initial(struct nameidata *ndp, struct vnode **dpp)
+{
+
+ cache_fpl_handle_root(ndp, dpp);
+ *dpp = ndp->ni_rootdir;
+}
+
+enum cache_fpl_status { CACHE_FPL_STATUS_UNSET, CACHE_FPL_STATUS_HANDLED,
+ CACHE_FPL_STATUS_UNHANDLED };
+
+struct cache_fpl {
+ int line;
+ enum cache_fpl_status handled;
+ bool in_smr;
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct vnode *dvp;
+ seqc_t dvp_seqc;
+ struct vnode *tvp;
+ seqc_t tvp_seqc;
+};
+
+static void
+cache_fpl_smr_assert_not_entered(struct cache_fpl *fpl)
+{
+
+ SMR_ASSERT_NOT_ENTERED(vfs_smr);
+ MPASS(fpl->in_smr == false);
+}
+
+static void
+cache_fpl_smr_enter(struct cache_fpl *fpl)
+{
+
+ MPASS(fpl->in_smr == false);
+ smr_enter(vfs_smr);
+ fpl->in_smr = true;
+}
+
+static void
+cache_fpl_smr_exit(struct cache_fpl *fpl)
+{
+
+ MPASS(fpl->in_smr == true);
+ fpl->in_smr = false;
+ smr_exit(vfs_smr);
+}
+
+static int
+cache_fpl_unhandled_impl(struct cache_fpl *fpl, int line)
+{
+
+ KASSERT(fpl->handled == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ fpl->handled = CACHE_FPL_STATUS_UNHANDLED;
+ fpl->line = line;
+ return (CACHE_FPL_UNHANDLED);
+}
+
+#define cache_fpl_unhandled(x) cache_fpl_unhandled_impl((x), __LINE__)
+
+static int
+cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line)
+{
+
+ KASSERT(fpl->handled == CACHE_FPL_STATUS_UNSET,
+ ("%s: lookup status already set at %d\n", __func__, fpl->line));
+ fpl->handled = CACHE_FPL_STATUS_HANDLED;
+ fpl->line = line;
+ return (error);
+}
+
+#define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__)
+
+#define CACHE_FPL_SUPPORTED_CN_FLAGS \
+ (LOCKLEAF | FOLLOW | LOCKSHARED | SAVENAME | ISOPEN | AUDITVNODE1)
+
+static bool
+cache_can_fplookup(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct thread *td;
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+ td = cnp->cn_thread;
+
+ if (!cache_fast_lookup) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (mac_vnode_check_lookup_enabled()) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if ((cnp->cn_flags & LOCKLEAF) == 0) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (cnp->cn_nameiop != LOOKUP) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (ndp->ni_dirfd != AT_FDCWD) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (IN_CAPABILITY_MODE(td)) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ if (AUDITING_TD(td)) {
+ cache_fpl_unhandled(fpl);
+ return (false);
+ }
+ return (true);
+}
+
+/*
+ * TODO
+ *
+ * Save and restore nameidata in case we need to fall back to the regular lookup.
+ *
+ * Everything which is modified (some direct fields and cnp) is covered, but the
+ * copy below is overzealous for simplicity. It can be modified to save only a
+ * few fields.
+ */
+static void
+cache_save_nameidata(struct nameidata *ndp, struct nameidata *saved)
+{
+
+ *saved = *ndp;
+}
+
+static void
+cache_restore_nameidata(struct nameidata *ndp, struct nameidata *saved)
+{
+
+ *ndp = *saved;
+}
+
+static bool
+cache_fplookup_vnode_supported(struct vnode *vp)
+{
+
+ switch (vp->v_type) {
+ case VLNK:
+ return (false);
+ default:
+ break;
+ }
+ return (true);
+}
+
+static int
+cache_fplookup_final(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ enum vgetstate tvs;
+ struct vnode *dvp, *tvp;
+ seqc_t dvp_seqc, tvp_seqc;
+ int error;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+ dvp_seqc = fpl->dvp_seqc;
+ tvp = fpl->tvp;
+ tvp_seqc = fpl->tvp_seqc;
+
+ VNPASS(cache_fplookup_vnode_supported(dvp), dvp);
+
+ tvs = vget_prep_smr(tvp);
+ cache_fpl_smr_exit(fpl);
+ if (tvs == VGET_NONE) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ if (!seqc_consistent(&dvp->v_seqc, dvp_seqc)) {
+ vget_abort(tvp, tvs);
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ MPASS((cnp->cn_flags & LOCKLEAF) != 0);
+
+ error = vget_finish(tvp, cnp->cn_lkflags, tvs);
+ if (error != 0) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ if (!seqc_consistent(&tvp->v_seqc, tvp_seqc)) {
+ vput(tvp);
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ return (cache_fpl_handled(fpl, 0));
+}
+
+static int
+cache_fplookup_next(struct cache_fpl *fpl)
+{
+ struct componentname *cnp;
+ struct namecache *ncp;
+ struct vnode *dvp, *tvp;
+ u_char nc_flag;
+ uint32_t hash;
+
+ cnp = fpl->cnp;
+ dvp = fpl->dvp;
+
+ if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) {
+ fpl->tvp = dvp;
+ fpl->tvp_seqc = seqc_read_any(&dvp->v_seqc);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+ return (0);
+ }
+
+ hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+
+ CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+ counter_u64_add(numchecks, 1);
+ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
+ !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+ break;
+ }
+
+ fpl->tvp = (void *)0xdeadbeef;
+
+ /*
+ * If there is no entry we have to punt to the slow path to perform
+ * actual lookup. Should there be nothing with this name a negative
+ * entry will be created.
+ */
+ if (__predict_false(ncp == NULL)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ tvp = atomic_load_ptr(&ncp->nc_vp);
+ nc_flag = atomic_load_char(&ncp->nc_flag);
+ if (__predict_false(cache_ncp_invalid(ncp))) {
+ return (cache_fpl_unhandled(fpl));
+ }
+ if (__predict_false(nc_flag & NCF_WHITE)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ fpl->tvp = tvp;
+ if (nc_flag & NCF_NEGATIVE) {
+ if ((nc_flag & NCF_HOTNEGATIVE) == 0) {
+ /*
+ * TODO
+ * Promoting to hot negative requires locks which are
+ * not yet supported for simplicity.
+ */
+ return (cache_fpl_unhandled(fpl));
+ }
+ SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
+ ncp->nc_name);
+ counter_u64_add(numneghits, 1);
+ cache_fpl_smr_exit(fpl);
+ return (cache_fpl_handled(fpl, ENOENT));
+ }
+
+ fpl->tvp_seqc = seqc_read_any(&tvp->v_seqc);
+ if (seqc_in_modify(fpl->tvp_seqc)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ if (!cache_fplookup_vnode_supported(tvp)) {
+ return (cache_fpl_unhandled(fpl));
+ }
+
+ counter_u64_add(numposhits, 1);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp);
+ return (0);
+}
+
+static bool
+cache_fplookup_mp_supported(struct mount *mp)
+{
+
+ if (mp == NULL)
+ return (false);
+ if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0)
+ return (false);
+ if (mp->mnt_flag & MNT_UNION)
+ return (false);
+ return (true);
+}
+
+/*
+ * Walk up the mount stack (if any).
+ *
+ * Correctness is provided in the following ways:
+ * - all vnodes are protected from freeing with SMR
+ * - struct mount objects are type stable making them always safe to access
+ * - stability of the particular mount is provided by busying it
+ * - relationship between the vnode which is mounted on and the mount is
+ * verified with the vnode sequence counter after busying
+ * - association between root vnode of the mount and the mount is protected
+ * by busy
+ *
+ * From that point on we can read the sequence counter of the root vnode
+ * and get the next mount on the stack (if any) using the same protection.
+ *
+ * By the end of successful walk we are guaranteed the reached state was
+ * indeed present at least at some point which matches the regular lookup.
+ */
+static int
+cache_fplookup_climb_mount(struct cache_fpl *fpl)
+{
+ struct mount *mp, *prev_mp;
+ struct vnode *vp;
+ seqc_t vp_seqc;
+
+ vp = fpl->tvp;
+ vp_seqc = fpl->tvp_seqc;
+ if (vp->v_type != VDIR)
+ return (0);
+
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ return (0);
+
+ prev_mp = NULL;
+ for (;;) {
+ if (!vfs_op_thread_enter(mp)) {
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ if (prev_mp != NULL)
+ vfs_op_thread_exit(prev_mp);
+ if (!seqc_consistent(&vp->v_seqc, vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ if (!cache_fplookup_mp_supported(mp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ vp = atomic_load_ptr(&mp->mnt_rootvnode);
+ if (vp == NULL || VN_IS_DOOMED(vp)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ vp_seqc = seqc_read_any(&vp->v_seqc);
+ if (seqc_in_modify(vp_seqc)) {
+ vfs_op_thread_exit(mp);
+ return (cache_fpl_unhandled(fpl));
+ }
+ prev_mp = mp;
+ mp = atomic_load_ptr(&vp->v_mountedhere);
+ if (mp == NULL)
+ break;
+ }
+
+ vfs_op_thread_exit(prev_mp);
+ fpl->tvp = vp;
+ fpl->tvp_seqc = vp_seqc;
+ return (0);
+}
+
+static int
+cache_fplookup_parse(struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ char *cp;
+ char *prev_ni_next; /* saved ndp->ni_next */
+ size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */
+
+ ndp = fpl->ndp;
+ cnp = fpl->cnp;
+
+ /*
+ * Search a new directory.
+ *
+ * The last component of the filename is left accessible via
+ * cnp->cn_nameptr for callers that need the name. Callers needing
+ * the name set the SAVENAME flag. When done, they assume
+ * responsibility for freeing the pathname buffer.
+ */
+ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++)
+ continue;
+ cnp->cn_namelen = cp - cnp->cn_nameptr;
+ if (cnp->cn_namelen > NAME_MAX) {
+ return (cache_fpl_handled(fpl, ENAMETOOLONG));
+ }
+ prev_ni_pathlen = ndp->ni_pathlen;
+ ndp->ni_pathlen -= cnp->cn_namelen;
+ KASSERT(ndp->ni_pathlen <= PATH_MAX,
+ ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen));
+ prev_ni_next = ndp->ni_next;
+ ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+ * fs's don't know about trailing slashes. Remember if there were
+ * trailing slashes to handle symlinks, existing non-directories
+ * and non-existing files that won't be directories specially later.
+ */
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ /*
+ * TODO
+ * Regular lookup performs the following:
+ * *ndp->ni_next = '\0';
+ * cnp->cn_flags |= TRAILINGSLASH;
+ *
+ * Which is problematic since it modifies data read
+ * from userspace. Then if fast path lookup was to
+ * abort we would have to either restore it or convey
+ * the flag. Since this is a corner case just ignore
+ * it for simplicity.
+ */
+ return (cache_fpl_unhandled(fpl));
+ }
+ }
+ ndp->ni_next = cp;
+
+ cnp->cn_flags |= MAKEENTRY;
+
+ if (cnp->cn_namelen == 2 &&
+ cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
+ cnp->cn_flags |= ISDOTDOT;
+ else
+ cnp->cn_flags &= ~ISDOTDOT;
+ if (*ndp->ni_next == 0)
+ cnp->cn_flags |= ISLASTCN;
+ else
+ cnp->cn_flags &= ~ISLASTCN;
+
+ /*
+ * Check for degenerate name (e.g. / or "")
+ * which is a way of talking about a directory,
+ * e.g. like "/." or ".".
+ *
+ * TODO
+ * Another corner case handled by the regular lookup
+ */
+ if (__predict_false(cnp->cn_nameptr[0] == '\0')) {
+ return (cache_fpl_unhandled(fpl));
+ }
+ return (0);
+}
+
+static int
+cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl)
+{
+ struct nameidata *ndp;
+ struct componentname *cnp;
+ struct mount *mp;
+ int error;
+
+ error = CACHE_FPL_UNHANDLED;
+ ndp = fpl->ndp;
+ ndp->ni_lcf = 0;
+ cnp = fpl->cnp;
+ cnp->cn_lkflags = LK_SHARED;
+ if ((cnp->cn_flags & LOCKSHARED) == 0)
+ cnp->cn_lkflags = LK_EXCLUSIVE;
+
+ fpl->dvp = dvp;
+ fpl->dvp_seqc = seqc_read_any(&fpl->dvp->v_seqc);
+ if (seqc_in_modify(fpl->dvp_seqc)) {
+ cache_fpl_unhandled(fpl);
+ goto out;
+ }
+ mp = atomic_load_ptr(&fpl->dvp->v_mount);
+ if (!cache_fplookup_mp_supported(mp)) {
+ cache_fpl_unhandled(fpl);
+ goto out;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ for (;;) {
+ error = cache_fplookup_parse(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ if (cnp->cn_flags & ISDOTDOT) {
+ error = cache_fpl_unhandled(fpl);
+ break;
+ }
+
+ VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp);
+
+ error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred, cnp->cn_thread);
+ if (__predict_false(error != 0)) {
+ switch (error) {
+ case EAGAIN:
+ case EOPNOTSUPP: /* can happen when racing against vgone */
+ cache_fpl_unhandled(fpl);
+ break;
+ default:
+ /*
+ * TODO
+ */
+ panic("%s: untested handling of error code %d", __func__, error);
+ cache_fpl_handled(fpl, error);
+ break;
+ }
+ break;
+ }
+
+ error = cache_fplookup_next(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ error = cache_fplookup_climb_mount(fpl);
+ if (__predict_false(error != 0)) {
+ break;
+ }
+
+ VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp);
+
+ if (cnp->cn_flags & ISLASTCN) {
+ error = cache_fplookup_final(fpl);
+ break;
+ }
+
+ if (!seqc_consistent(&fpl->dvp->v_seqc, fpl->dvp_seqc)) {
+ error = cache_fpl_unhandled(fpl);
+ break;
+ }
+
+ fpl->dvp = fpl->tvp;
+ fpl->dvp_seqc = fpl->tvp_seqc;
+
+ if (!cache_fplookup_vnode_supported(fpl->dvp)) {
+ error = cache_fpl_unhandled(fpl);
+ break;
+ }
+
+ cnp->cn_nameptr = ndp->ni_next;
+ while (*cnp->cn_nameptr == '/') {
+ cnp->cn_nameptr++;
+ ndp->ni_pathlen--;
+ }
+ }
+out:
+ MPASS(fpl->handled != CACHE_FPL_STATUS_UNSET);
+ if (fpl->handled == CACHE_FPL_STATUS_UNHANDLED) {
+ if (fpl->in_smr)
+ cache_fpl_smr_exit(fpl);
+ return (error);
+ }
+
+ cache_fpl_smr_assert_not_entered(fpl);
+ if (__predict_false(error != 0)) {
+ ndp->ni_dvp = NULL;
+ ndp->ni_vp = NULL;
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+ }
+ ndp->ni_dvp = fpl->dvp;
+ ndp->ni_vp = fpl->tvp;
+ if (cnp->cn_flags & SAVENAME)
+ cnp->cn_flags |= HASBUF;
+ else
+ cache_fpl_cleanup_cnp(cnp);
+ return (error);
+}
+
+/*
+ * Fast path lookup protected with SMR and sequence counters.
+ *
+ * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one.
+ *
+ * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria
+ * outlined below.
+ *
+ * Traditional vnode lookup conceptually looks like this:
+ *
+ * vn_lock(current);
+ * for (;;) {
+ * next = find();
+ * vn_lock(next);
+ * vn_unlock(current);
+ * current = next;
+ * if (last)
+ * break;
+ * }
+ *
+ * Each jump to the next vnode is safe memory-wise and atomic with respect to
+ * any modifications thanks to holding respective locks.
+ *
+ * The same guarantee can be provided with a combination of safe memory
+ * reclamation and sequence counters instead. If all places which affect the
+ * relationship or permissions also modify the counter, the following provides the
+ * same guarantee.
+ *
+ * smr_enter(vfs_smr);
+ * current_seqc = seqc_read_any(current);
+ * if (seqc_in_modify(current_seqc))
+ * abort();
+ * for (;;) {
+ * next = find();
+ * next_seqc = seqc_read_any(next);
+ * if (!seqc_consistent(current, current_seqc))
+ * abort();
+ * current = next; // if anything changed the above would fail
+ * current_seqc = next_seqc;
+ * if (last)
+ * break;
+ * }
+ *
+ * API contract for VOP_FPLOOKUP_VEXEC routines is as follows:
+ * - they are called while within vfs_smr protection which they must never exit
+ * - EAGAIN can be returned to denote checking could not be performed, it is
+ * always valid to return it
+ * - if the sequence counter has not changed the result must be valid
+ * - if the sequence counter has changed both false positives and false negatives
+ * are permitted (since the result will be rejected later)
+ * - for simple cases of unix permission checks vaccess_vexec_smr can be used
+ *
+ * Caveats to watch out for:
+ * - vnodes are passed unlocked and unreferenced with nothing stopping
+ * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised
+ * to use atomic_load_ptr to fetch it.
+ * - aforementioned object can also get freed, meaning absent other means it
+ * should be protected with vfs_smr
+ * - either safely checking permissions as they are modified or guaranteeing
+ * their stability is left to the routine
+ */
+int
+cache_fplookup(struct nameidata *ndp, bool *handled)
+{
+ struct nameidata saved_ndp;
+ struct cache_fpl fpl;
+ struct pwd *pwd;
+ struct vnode *dvp, *startdir;
+ struct componentname *cnp;
+ int error;
+
+ *handled = false;
+ bzero(&fpl, sizeof(fpl));
+ fpl.handled = CACHE_FPL_STATUS_UNSET;
+ fpl.ndp = ndp;
+ fpl.cnp = &ndp->ni_cnd;
+ MPASS(curthread == fpl.cnp->cn_thread);
+
+ if (!cache_can_fplookup(&fpl)) {
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.handled);
+ return (EOPNOTSUPP);
+ }
+
+ cache_save_nameidata(ndp, &saved_ndp);
+
+ cache_fpl_smr_enter(&fpl);
+ pwd = pwd_get_smr();
+ ndp->ni_rootdir = pwd->pwd_rdir;
+ ndp->ni_topdir = pwd->pwd_jdir;
+
+ cnp = fpl.cnp;
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+ startdir = ndp->ni_startdir;
+ if (cnp->cn_pnbuf[0] == '/') {
+ cache_fpl_handle_root_initial(ndp, &dvp);
+ } else {
+ if (ndp->ni_startdir != NULL) {
+ dvp = ndp->ni_startdir;
+ } else {
+ MPASS(ndp->ni_dirfd == AT_FDCWD);
+ dvp = pwd->pwd_cdir;
+ }
+ }
+
+ error = cache_fplookup_impl(dvp, &fpl);
+ cache_fpl_smr_assert_not_entered(&fpl);
+ SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.handled);
+
+ MPASS(fpl.handled != CACHE_FPL_STATUS_UNSET);
+ if (fpl.handled == CACHE_FPL_STATUS_HANDLED) {
+ if (error == CACHE_FPL_UNHANDLED)
+ panic("cache: bad error code from line %d\n", fpl.line);
+ *handled = true;
+ if (startdir != NULL)
+ vrele(startdir);
+ return (error);
+ } else {
+ cache_restore_nameidata(ndp, &saved_ndp);
+ return (error);
+ }
+}
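
The loop in cache_fplookup_impl above is an instance of the SMR plus sequence counter
walk described in the block comment preceding cache_fplookup. Condensed into a
standalone sketch, with lookup_next() and is_last_component() as hypothetical stand-ins
for the namecache hit and the ISLASTCN check, the invariant looks like this:

/*
 * Illustrative sketch, not part of the patch: every jump from dvp to tvp is
 * only trusted if dvp's sequence counter did not move in the meantime.
 */
static struct vnode *
fplookup_walk_sketch(struct vnode *dvp)
{
	struct vnode *tvp;
	seqc_t dvp_seqc, tvp_seqc;

	smr_enter(vfs_smr);
	dvp_seqc = seqc_read_any(&dvp->v_seqc);
	if (seqc_in_modify(dvp_seqc))
		goto abort;
	for (;;) {
		tvp = lookup_next(dvp);		/* hypothetical namecache hit */
		if (tvp == NULL)
			goto abort;
		tvp_seqc = seqc_read_any(&tvp->v_seqc);
		if (seqc_in_modify(tvp_seqc))
			goto abort;
		/* If dvp changed since we sampled it, the hit may be stale. */
		if (!seqc_consistent(&dvp->v_seqc, dvp_seqc))
			goto abort;
		if (is_last_component())	/* hypothetical ISLASTCN check */
			break;
		dvp = tvp;
		dvp_seqc = tvp_seqc;
	}
	/* The real code vgets tvp and re-validates both counters before smr_exit. */
	smr_exit(vfs_smr);
	return (tvp);
abort:
	smr_exit(vfs_smr);
	return (NULL);				/* fall back to the locked lookup */
}
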
Index: sys/kern/vfs_lookup.c
===================================================================
--- sys/kern/vfs_lookup.c
+++ sys/kern/vfs_lookup.c
@@ -315,6 +315,7 @@
struct filecaps dirfd_caps;
struct uio auio;
int error, linklen, startdir_used;
+ bool handled;
cnp = &ndp->ni_cnd;
td = cnp->cn_thread;
@@ -329,10 +330,15 @@
ndp->ni_startdir->v_type == VBAD);
TAILQ_INIT(&ndp->ni_cap_tracker);
ndp->ni_lcf = 0;
+ ndp->ni_loopcnt = 0;
+ startdir_used = 0;
+ dp = NULL;
/* We will set this ourselves if we need it. */
cnp->cn_flags &= ~TRAILINGSLASH;
+ ndp->ni_vp = NULL;
+
/*
* Get a buffer for the name to be translated, and copy the
* name into the buffer.
@@ -346,12 +352,29 @@
error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
&ndp->ni_pathlen);
+ if (error != 0) {
+ namei_cleanup_cnp(cnp);
+ return (error);
+ }
+
+ cnp->cn_nameptr = cnp->cn_pnbuf;
+
/*
* Don't allow empty pathnames.
*/
- if (error == 0 && *cnp->cn_pnbuf == '\0')
- error = ENOENT;
+ if (*cnp->cn_pnbuf == '\0') {
+ namei_cleanup_cnp(cnp);
+ return (ENOENT);
+ }
+ error = cache_fplookup(ndp, &handled);
+ if (handled)
+ return (error);
+
+ /*
+ * Ignore fast path.
+ */
+ error = 0;
#ifdef CAPABILITY_MODE
/*
* In capability mode, lookups must be restricted to happen in
@@ -380,10 +403,8 @@
#endif
if (error != 0) {
namei_cleanup_cnp(cnp);
- ndp->ni_vp = NULL;
return (error);
}
- ndp->ni_loopcnt = 0;
#ifdef KTRACE
if (KTRPOINT(td, KTR_NAMEI)) {
KASSERT(cnp->cn_thread == curthread,
@@ -402,9 +423,6 @@
ndp->ni_rootdir = pwd->pwd_rdir;
ndp->ni_topdir = pwd->pwd_jdir;
- startdir_used = 0;
- dp = NULL;
- cnp->cn_nameptr = cnp->cn_pnbuf;
if (cnp->cn_pnbuf[0] == '/') {
ndp->ni_resflags |= NIRES_ABS;
error = namei_handle_root(ndp, &dp);
Index: sys/kern/vfs_mount.c
===================================================================
--- sys/kern/vfs_mount.c
+++ sys/kern/vfs_mount.c
@@ -949,6 +949,8 @@
}
VOP_UNLOCK(vp);
+ vfs_seqc_write_begin(vp);
+
/* Allocate and initialize the filesystem. */
mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
/* XXXMAC: pass to vfs_mount_alloc? */
@@ -976,9 +978,11 @@
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
+ vfs_seqc_write_end(vp);
vrele(vp);
return (error);
}
+ vfs_seqc_write_begin(newdp);
VOP_UNLOCK(newdp);
if (mp->mnt_opt != NULL)
@@ -1015,6 +1019,8 @@
EVENTHANDLER_DIRECT_INVOKE(vfs_mounted, mp, newdp, td);
VOP_UNLOCK(newdp);
mountcheckdirs(vp, newdp);
+ vfs_seqc_write_end(vp);
+ vfs_seqc_write_end(newdp);
vrele(newdp);
if ((mp->mnt_flag & MNT_RDONLY) == 0)
vfs_allocate_syncvnode(mp);
@@ -1089,7 +1095,9 @@
VOP_UNLOCK(vp);
vfs_op_enter(mp);
+ vfs_seqc_write_begin(vp);
+ rootvp = NULL;
MNT_ILOCK(mp);
if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
MNT_IUNLOCK(mp);
@@ -1103,8 +1111,6 @@
mp->mnt_kern_flag &= ~MNTK_ASYNC;
rootvp = vfs_cache_root_clear(mp);
MNT_IUNLOCK(mp);
- if (rootvp != NULL)
- vrele(rootvp);
mp->mnt_optnew = *optlist;
vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
@@ -1175,6 +1181,11 @@
vfs_deallocate_syncvnode(mp);
end:
vfs_op_exit(mp);
+ if (rootvp != NULL) {
+ vfs_seqc_write_end(rootvp);
+ vrele(rootvp);
+ }
+ vfs_seqc_write_end(vp);
vfs_unbusy(mp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
@@ -1665,14 +1676,19 @@
}
mp->mnt_kern_flag |= MNTK_UNMOUNT;
rootvp = vfs_cache_root_clear(mp);
+ if (coveredvp != NULL)
+ vfs_seqc_write_begin(coveredvp);
if (flags & MNT_NONBUSY) {
MNT_IUNLOCK(mp);
error = vfs_check_usecounts(mp);
MNT_ILOCK(mp);
if (error != 0) {
+ vfs_seqc_write_end(coveredvp);
dounmount_cleanup(mp, coveredvp, MNTK_UNMOUNT);
- if (rootvp != NULL)
+ if (rootvp != NULL) {
+ vfs_seqc_write_end(rootvp);
vrele(rootvp);
+ }
return (error);
}
}
@@ -1701,22 +1717,14 @@
("%s: invalid return value for msleep in the drain path @ %s:%d",
__func__, __FILE__, __LINE__));
- if (rootvp != NULL)
+ if (rootvp != NULL) {
+ vhold(rootvp);
vrele(rootvp);
+ }
if (mp->mnt_flag & MNT_EXPUBLIC)
vfs_setpublicfs(NULL, NULL, NULL);
- /*
- * From now, we can claim that the use reference on the
- * coveredvp is ours, and the ref can be released only by
- * successfull unmount by us, or left for later unmount
- * attempt. The previously acquired hold reference is no
- * longer needed to protect the vnode from reuse.
- */
- if (coveredvp != NULL)
- vdrop(coveredvp);
-
vfs_periodic(mp, MNT_WAIT);
MNT_ILOCK(mp);
async_flag = mp->mnt_flag & MNT_ASYNC;
@@ -1751,8 +1759,15 @@
}
vfs_op_exit_locked(mp);
MNT_IUNLOCK(mp);
- if (coveredvp)
+ if (coveredvp) {
+ vfs_seqc_write_end(coveredvp);
VOP_UNLOCK(coveredvp);
+ vdrop(coveredvp);
+ }
+ if (rootvp != NULL) {
+ vfs_seqc_write_end(rootvp);
+ vdrop(rootvp);
+ }
return (error);
}
mtx_lock(&mountlist_mtx);
@@ -1761,7 +1776,13 @@
EVENTHANDLER_DIRECT_INVOKE(vfs_unmounted, mp, td);
if (coveredvp != NULL) {
coveredvp->v_mountedhere = NULL;
+ vfs_seqc_write_end(coveredvp);
VOP_UNLOCK(coveredvp);
+ vdrop(coveredvp);
+ }
+ if (rootvp != NULL) {
+ vfs_seqc_write_end(rootvp);
+ vdrop(rootvp);
}
vfs_event_signal(NULL, VQ_UNMOUNT, 0);
if (rootvnode != NULL && mp == rootvnode->v_mount) {
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -664,8 +664,8 @@
vnode_list_reclaim_marker = vn_alloc_marker(NULL);
TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
- vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR);
- vfs_smr = uma_zone_get_smr(vnode_zone);
+ vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
+ uma_zone_set_smr(vnode_zone, vfs_smr);
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/*
@@ -1766,6 +1766,7 @@
*/
CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
bo = &vp->v_bufobj;
+ VNPASS(vp->v_seqc_users == 0, vp);
VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
VNASSERT(vp->v_holdcnt == VHOLD_NO_SMR, vp, ("Invalid hold count"));
VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
@@ -2892,6 +2893,17 @@
return (vs);
}
+void
+vget_abort(struct vnode *vp, enum vgetstate vs)
+{
+
+ VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
+ if (vs == VGET_USECOUNT)
+ vrele(vp);
+ else
+ vdrop(vp);
+}
+
int
vget(struct vnode *vp, int flags, struct thread *td)
{
@@ -2954,10 +2966,7 @@
error = vn_lock(vp, flags);
if (__predict_false(error != 0)) {
- if (vs == VGET_USECOUNT)
- vrele(vp);
- else
- vdrop(vp);
+ vget_abort(vp, vs);
CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
vp);
return (error);
@@ -3982,6 +3991,7 @@
*/
if (vp->v_irflag & VIRF_DOOMED)
return;
+ vfs_seqc_write_begin_locked(vp);
vunlazy_gone(vp);
vp->v_irflag |= VIRF_DOOMED;
@@ -4084,6 +4094,7 @@
vp->v_vnlock = &vp->v_lock;
vp->v_op = &dead_vnodeops;
vp->v_type = VBAD;
+ vfs_seqc_write_end_locked(vp);
}
/*
@@ -4124,6 +4135,7 @@
printf(" usecount %d, writecount %d, refcount %d (flags %s)",
vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_NO_SMR,
holdcnt & VHOLD_NO_SMR ? "VHOLD_NO_SMR" : "none");
+ printf(", seqc users %d", vp->v_seqc_users);
switch (vp->v_type) {
case VDIR:
printf(" mountedhere %p\n", vp->v_mountedhere);
@@ -4369,6 +4381,7 @@
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
MNT_KERN_FLAG(MNTK_MARKER);
MNT_KERN_FLAG(MNTK_USES_BCACHE);
+ MNT_KERN_FLAG(MNTK_FPLOOKUP);
MNT_KERN_FLAG(MNTK_NOASYNC);
MNT_KERN_FLAG(MNTK_UNMOUNT);
MNT_KERN_FLAG(MNTK_MWAIT);
@@ -5184,6 +5197,38 @@
return (error == 0);
}
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ *
+ * We never deny as priv_check_cred calls are not yet supported, see vaccess.
+ */
+int
+vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
+{
+
+ SMR_ASSERT_ENTERED(vfs_smr);
+
+ /* Check the owner. */
+ if (cred->cr_uid == file_uid) {
+ if (file_mode & S_IXUSR)
+ return (0);
+ return (EAGAIN);
+ }
+
+ /* Otherwise, check the groups (first match) */
+ if (groupmember(file_gid, cred)) {
+ if (file_mode & S_IXGRP)
+ return (0);
+ return (EAGAIN);
+ }
+
+ /* Otherwise, check everyone else. */
+ if (file_mode & S_IXOTH)
+ return (0);
+ return (EAGAIN);
+}
+
/*
* Common filesystem object access control check routine. Accepts a
* vnode's type, "mode", uid and gid, requested access mode, credentials,
@@ -5464,6 +5509,14 @@
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
+ /*
+ * It may be tempting to add vfs_seqc_write_begin/end calls here and
+ * in vop_rename_post but that's not going to work out since some
+ * filesystems relookup vnodes mid-rename. This is probably a bug.
+ *
+ * For now filesystems are expected to do the relevant calls after they
+ * decide what vnodes to operate on.
+ */
if (a->a_tdvp != a->a_fdvp)
vhold(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
@@ -5553,11 +5606,26 @@
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
+void
+vop_deleteextattr_pre(void *ap)
+{
+ struct vop_deleteextattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_begin(vp);
+}
+
void
vop_deleteextattr_post(void *ap, int rc)
{
- struct vop_deleteextattr_args *a = ap;
+ struct vop_deleteextattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_end(vp);
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
@@ -5660,22 +5728,74 @@
}
}
+void
+vop_setattr_pre(void *ap)
+{
+ struct vop_setattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_begin(vp);
+}
+
void
vop_setattr_post(void *ap, int rc)
{
- struct vop_setattr_args *a = ap;
+ struct vop_setattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_end(vp);
if (!rc)
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
+}
+
+void
+vop_setacl_pre(void *ap)
+{
+ struct vop_setacl_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_begin(vp);
+}
+
+void
+vop_setacl_post(void *ap, int rc __unused)
+{
+ struct vop_setacl_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_end(vp);
+}
+
+void
+vop_setextattr_pre(void *ap)
+{
+ struct vop_setextattr_args *a;
+ struct vnode *vp;
+
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_begin(vp);
}
void
vop_setextattr_post(void *ap, int rc)
{
- struct vop_setextattr_args *a = ap;
+ struct vop_setextattr_args *a;
+ struct vnode *vp;
+ a = ap;
+ vp = a->a_vp;
+ vfs_seqc_write_end(vp);
if (!rc)
- VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
+ VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
}
void
@@ -6237,6 +6357,8 @@
*/
MPASS(mp->mnt_vfs_ops > 0);
vp = mp->mnt_rootvnode;
+ if (vp != NULL)
+ vfs_seqc_write_begin(vp);
mp->mnt_rootvnode = NULL;
return (vp);
}
@@ -6533,3 +6655,45 @@
return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread));
}
+
+void
+vfs_seqc_write_begin_locked(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ VNPASS(vp->v_holdcnt > 0, vp);
+ VNPASS(vp->v_seqc_users >= 0, vp);
+ vp->v_seqc_users++;
+ if (vp->v_seqc_users == 1)
+ seqc_sleepable_write_begin(&vp->v_seqc);
+}
+
+void
+vfs_seqc_write_begin(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vfs_seqc_write_begin_locked(vp);
+ VI_UNLOCK(vp);
+}
+
+void
+vfs_seqc_write_end_locked(struct vnode *vp)
+{
+
+ ASSERT_VI_LOCKED(vp, __func__);
+ VNPASS(vp->v_holdcnt > 0, vp);
+ VNPASS(vp->v_seqc_users > 0, vp);
+ vp->v_seqc_users--;
+ if (vp->v_seqc_users == 0)
+ seqc_sleepable_write_end(&vp->v_seqc);
+}
+
+void
+vfs_seqc_write_end(struct vnode *vp)
+{
+
+ VI_LOCK(vp);
+ vfs_seqc_write_end_locked(vp);
+ VI_UNLOCK(vp);
+}
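
Taken together with the contract above cache_fplookup, vaccess_vexec_smr suggests the
expected shape of a filesystem's VOP_FPLOOKUP_VEXEC: load the per-filesystem data with
atomic_load_ptr, return EAGAIN if it is already gone, and let vaccess_vexec_smr perform
the plain Unix permission check. A hypothetical sketch for an opt-in filesystem; struct
foo_node and its fields are invented, and the filesystem would additionally set
MNTK_FPLOOKUP on its mount point:

/*
 * Hypothetical VOP_FPLOOKUP_VEXEC implementation. Runs within vfs_smr with
 * the vnode unlocked and unreferenced, so v_data may be cleared by a
 * concurrent reclaim at any time.
 */
static int
foo_fplookup_vexec(struct vop_fplookup_vexec_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct foo_node *ip;

	ip = atomic_load_ptr(&vp->v_data);
	if (__predict_false(ip == NULL))
		return (EAGAIN);	/* racing against reclaim, punt */
	return (vaccess_vexec_smr(ip->i_mode, ip->i_uid, ip->i_gid, ap->a_cred));
}
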
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -142,6 +142,15 @@
};
+%% fplookup_vexec vp - - -
+
+vop_fplookup_vexec {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN struct thread *td;
+};
+
+
%% access vp L L L
vop_access {
@@ -172,6 +181,7 @@
%% setattr vp E E E
+%! setattr pre vop_setattr_pre
%! setattr post vop_setattr_post
vop_setattr {
@@ -523,6 +533,8 @@
%% setacl vp E E E
+%! setacl pre vop_setacl_pre
+%! setacl post vop_setacl_post
vop_setacl {
IN struct vnode *vp;
@@ -589,6 +601,7 @@
%% deleteextattr vp E E E
+%! deleteextattr pre vop_deleteextattr_pre
%! deleteextattr post vop_deleteextattr_post
vop_deleteextattr {
@@ -601,6 +614,7 @@
%% setextattr vp E E E
+%! setextattr pre vop_setextattr_pre
%! setextattr post vop_setextattr_post
vop_setextattr {
Index: sys/security/mac/mac_framework.h
===================================================================
--- sys/security/mac/mac_framework.h
+++ sys/security/mac/mac_framework.h
@@ -422,13 +422,14 @@
int mac_vnode_check_lookup_impl(struct ucred *cred, struct vnode *dvp,
struct componentname *cnp);
extern bool mac_vnode_check_lookup_fp_flag;
+#define mac_vnode_check_lookup_enabled() __predict_false(mac_vnode_check_lookup_fp_flag)
static inline int
mac_vnode_check_lookup(struct ucred *cred, struct vnode *dvp,
struct componentname *cnp)
{
mac_vnode_assert_locked(dvp, "mac_vnode_check_lookup");
- if (__predict_false(mac_vnode_check_lookup_fp_flag))
+ if (mac_vnode_check_lookup_enabled())
return (mac_vnode_check_lookup_impl(cred, dvp, cnp));
return (0);
}
Index: sys/sys/_seqc.h
===================================================================
--- /dev/null
+++ sys/sys/_seqc.h
@@ -0,0 +1,6 @@
+#ifndef _SYS__SEQC_H_
+#define _SYS__SEQC_H_
+
+typedef uint32_t seqc_t;
+
+#endif /* _SYS__SEQC_H_ */
Index: sys/sys/filedesc.h
===================================================================
--- sys/sys/filedesc.h
+++ sys/sys/filedesc.h
@@ -289,6 +289,7 @@
smr_serialized_store(&fdp->fd_pwd, newpwd,
(FILEDESC_XLOCK_ASSERT(fdp), true));
}
+struct pwd *pwd_get_smr(void);
#endif /* _KERNEL */
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h
+++ sys/sys/mount.h
@@ -415,6 +415,7 @@
#define MNTK_TEXT_REFS 0x00008000 /* Keep use ref for text */
#define MNTK_VMSETSIZE_BUG 0x00010000
#define MNTK_UNIONFS 0x00020000 /* A hack for F_ISUNIONSTACK */
+#define MNTK_FPLOOKUP 0x00040000 /* fast path lookup is supported */
#define MNTK_NOASYNC 0x00800000 /* disable async */
#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */
#define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */
Index: sys/sys/seqc.h
===================================================================
--- sys/sys/seqc.h
+++ sys/sys/seqc.h
@@ -36,7 +36,7 @@
/*
* seqc_t may be included in structs visible to userspace
*/
-typedef uint32_t seqc_t;
+#include <sys/_seqc.h>
#ifdef _KERNEL
@@ -110,5 +110,22 @@
return (seqc_consistent_nomb(seqcp, oldseqc));
}
+static __inline void
+seqc_sleepable_write_begin(seqc_t *seqcp)
+{
+
+ MPASS(!seqc_in_modify(*seqcp));
+ *seqcp += 1;
+ atomic_thread_fence_rel();
+}
+
+static __inline void
+seqc_sleepable_write_end(seqc_t *seqcp)
+{
+
+ atomic_store_rel_int(seqcp, *seqcp + 1);
+ MPASS(!seqc_in_modify(*seqcp));
+}
+
#endif /* _KERNEL */
#endif /* _SYS_SEQC_H_ */
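
The two sleepable helpers added above follow the usual seqc discipline: the counter is
odd for the whole duration of a write (which may sleep) and even otherwise, so readers
can detect both an in-flight writer and one that completed between their reads. A
minimal sketch of the pairing, with a hypothetical struct obj standing in for the vnode:

struct obj {
	seqc_t	seqc;
	int	val;
};

static void
obj_update_sketch(struct obj *o, int newval)
{
	seqc_sleepable_write_begin(&o->seqc);	/* counter becomes odd */
	o->val = newval;			/* update; sleeping is allowed here */
	seqc_sleepable_write_end(&o->seqc);	/* counter becomes even again */
}

static bool
obj_read_sketch(struct obj *o, int *out)
{
	seqc_t seqc;

	seqc = seqc_read_any(&o->seqc);
	if (seqc_in_modify(seqc))
		return (false);			/* writer in progress */
	*out = o->val;
	return (seqc_consistent(&o->seqc, seqc)); /* snapshot valid iff unchanged */
}
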
Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h
+++ sys/sys/vnode.h
@@ -45,6 +45,7 @@
#include <sys/uio.h>
#include <sys/acl.h>
#include <sys/ktr.h>
+#include <sys/_seqc.h>
/*
* The vnode is the focus of all file activity in UNIX. There is a
@@ -105,6 +106,7 @@
*/
enum vtype v_type:8; /* u vnode type */
short v_irflag; /* i frequently read flags */
+ seqc_t v_seqc; /* i modification count */
struct vop_vector *v_op; /* u vnode operations vector */
void *v_data; /* u private data for fs */
@@ -175,6 +177,7 @@
short v_dbatchcpu; /* i LRU requeue deferral batch */
int v_writecount; /* I ref count of writers or
(negative) text users */
+ int v_seqc_users; /* i modifications pending */
u_int v_hash;
};
@@ -538,6 +541,18 @@
#define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str))
#define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str))
+#define ASSERT_VOP_IN_SEQC(vp) do { \
+ struct vnode *_vp = (vp); \
+ \
+ VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \
+} while (0)
+
+#define ASSERT_VOP_NOT_IN_SEQC(vp) do { \
+ struct vnode *_vp = (vp); \
+ \
+ VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \
+} while (0)
+
#else /* !DEBUG_VFS_LOCKS */
#define ASSERT_VI_LOCKED(vp, str) ((void)0)
@@ -545,6 +560,10 @@
#define ASSERT_VOP_ELOCKED(vp, str) ((void)0)
#define ASSERT_VOP_LOCKED(vp, str) ((void)0)
#define ASSERT_VOP_UNLOCKED(vp, str) ((void)0)
+
+#define ASSERT_VOP_IN_SEQC(vp) ((void)0)
+#define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0)
+
#endif /* DEBUG_VFS_LOCKS */
@@ -601,6 +620,7 @@
struct vattr;
struct vfsops;
struct vnode;
+struct pwd;
typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **);
@@ -613,11 +633,16 @@
void cache_enter_time(struct vnode *dvp, struct vnode *vp,
struct componentname *cnp, struct timespec *tsp,
struct timespec *dtsp);
+int cache_fplookup(struct nameidata *ndp, bool *handled);
int cache_lookup(struct vnode *dvp, struct vnode **vpp,
struct componentname *cnp, struct timespec *tsp, int *ticksp);
void cache_purge(struct vnode *vp);
void cache_purge_negative(struct vnode *vp);
void cache_purgevfs(struct mount *mp, bool force);
+void vfs_seqc_write_begin_locked(struct vnode *vp);
+void vfs_seqc_write_begin(struct vnode *vp);
+void vfs_seqc_write_end_locked(struct vnode *vp);
+void vfs_seqc_write_end(struct vnode *vp);
int change_dir(struct vnode *vp, struct thread *td);
void cvtstat(struct stat *st, struct ostat *ost);
void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb);
@@ -643,6 +668,8 @@
int vn_commname(struct vnode *vn, char *buf, u_int buflen);
int vn_path_to_global_path(struct thread *td, struct vnode *vp,
char *path, u_int pathlen);
+int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid,
+ struct ucred *cred);
int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid,
gid_t file_gid, accmode_t accmode, struct ucred *cred,
int *privused);
@@ -662,6 +689,7 @@
enum vgetstate vget_prep_smr(struct vnode *vp);
enum vgetstate vget_prep(struct vnode *vp);
int vget_finish(struct vnode *vp, int flags, enum vgetstate vs);
+void vget_abort(struct vnode *vp, enum vgetstate vs);
void vgone(struct vnode *vp);
void vhold(struct vnode *);
void vholdl(struct vnode *);
@@ -804,6 +832,7 @@
/* These are called from within the actual VOPS. */
void vop_close_post(void *a, int rc);
void vop_create_post(void *a, int rc);
+void vop_deleteextattr_pre(void *a);
void vop_deleteextattr_post(void *a, int rc);
void vop_link_post(void *a, int rc);
void vop_lookup_post(void *a, int rc);
@@ -818,7 +847,11 @@
void vop_rename_post(void *a, int rc);
void vop_rename_pre(void *a);
void vop_rmdir_post(void *a, int rc);
+void vop_setattr_pre(void *a);
void vop_setattr_post(void *a, int rc);
+void vop_setacl_pre(void *a);
+void vop_setacl_post(void *a, int rc);
+void vop_setextattr_pre(void *a);
void vop_setextattr_post(void *a, int rc);
void vop_symlink_post(void *a, int rc);
int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a);
